diff options
Diffstat (limited to 'sys/netinet')
91 files changed, 46452 insertions, 0 deletions
diff --git a/sys/netinet/fil.c b/sys/netinet/fil.c new file mode 100644 index 0000000..bb6eb52 --- /dev/null +++ b/sys/netinet/fil.c @@ -0,0 +1,1548 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)fil.c 1.36 6/5/96 (C) 1993-1996 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: fil.c,v 2.3.2.7 1999/10/21 14:21:40 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if defined(KERNEL) && defined(__FreeBSD_version) && \ + (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#else +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#include <sys/uio.h> +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# if SOLARIS2 < 5 +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +#endif +#ifndef linux +# include <sys/protosw.h> +# include <sys/socket.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#if defined(__sgi) && defined(IFF_DRVRLOCK) /* IRIX 6 */ +# include <sys/hashing.h> +# include <netinet/in_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +# endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#include "netinet/ipl.h" + +#ifndef _KERNEL +# include "ipf.h" +# include "ipt.h" +extern int opts; + +# define FR_IFVERBOSE(ex,second,verb_pr) if (ex) { verbose verb_pr; \ + second; } +# define FR_IFDEBUG(ex,second,verb_pr) if (ex) { debug verb_pr; \ + second; } +# define FR_VERBOSE(verb_pr) verbose verb_pr +# define FR_DEBUG(verb_pr) debug verb_pr +# define SEND_RESET(ip, qif, if, m, fin) send_reset(ip, if) +# define IPLLOG(a, c, d, e) ipllog() +# define FR_NEWAUTH(m, fi, ip, qif) fr_newauth((mb_t *)m, fi, ip) +#else /* #ifndef _KERNEL */ +# define FR_IFVERBOSE(ex,second,verb_pr) ; +# define FR_IFDEBUG(ex,second,verb_pr) ; +# define FR_VERBOSE(verb_pr) +# define FR_DEBUG(verb_pr) +# define IPLLOG(a, c, d, e) ipflog(a, c, d, e) +# if SOLARIS || defined(__sgi) +extern KRWLOCK_T ipf_mutex, ipf_auth, ipf_nat; +extern kmutex_t ipf_rw; +# endif +# if SOLARIS +# define FR_NEWAUTH(m, fi, ip, qif) fr_newauth((mb_t *)m, fi, \ + ip, qif) +# define SEND_RESET(ip, qif, if, fin) send_reset(fin, ip, qif) +# define ICMP_ERROR(b, ip, t, c, if, dst) \ + icmp_error(ip, t, c, if, dst) +# else /* SOLARIS */ +# define FR_NEWAUTH(m, fi, ip, qif) fr_newauth((mb_t *)m, fi, ip) +# ifdef linux +# define SEND_RESET(ip, qif, if, fin) send_reset(ip, ifp) +# define ICMP_ERROR(b, ip, t, c, if, dst) icmp_send(b,t,c,0,if) +# else +# define SEND_RESET(ip, qif, if, fin) send_reset(fin, ip) +# define ICMP_ERROR(b, ip, t, c, if, dst) \ + send_icmp_err(ip, t, c, if, dst) +# endif /* linux */ +# endif /* SOLARIS || __sgi */ +#endif /* _KERNEL */ + + +struct filterstats frstats[2] = {{0,0,0,0,0},{0,0,0,0,0}}; +struct frentry *ipfilter[2][2] = { { NULL, NULL }, { NULL, NULL } }, + *ipacct[2][2] = { { NULL, NULL }, { NULL, NULL } }; +struct frgroup *ipfgroups[3][2]; +int fr_flags = IPF_LOGGING, fr_active = 0; +#if defined(IPFILTER_DEFAULT_BLOCK) +int fr_pass = FR_NOMATCH|FR_BLOCK; +#else +int fr_pass = (IPF_DEFAULT_PASS|FR_NOMATCH); +#endif +char ipfilter_version[] = IPL_VERSION; + +fr_info_t frcache[2]; + +static int fr_tcpudpchk __P((frentry_t *, fr_info_t *)); +static int frflushlist __P((int, minor_t, int *, frentry_t **)); + + +/* + * bit values for identifying presence of individual IP options + */ +struct optlist ipopts[20] = { + { IPOPT_NOP, 0x000001 }, + { IPOPT_RR, 0x000002 }, + { IPOPT_ZSU, 0x000004 }, + { IPOPT_MTUP, 0x000008 }, + { IPOPT_MTUR, 0x000010 }, + { IPOPT_ENCODE, 0x000020 }, + { IPOPT_TS, 0x000040 }, + { IPOPT_TR, 0x000080 }, + { IPOPT_SECURITY, 0x000100 }, + { IPOPT_LSRR, 0x000200 }, + { IPOPT_E_SEC, 0x000400 }, + { IPOPT_CIPSO, 0x000800 }, + { IPOPT_SATID, 0x001000 }, + { IPOPT_SSRR, 0x002000 }, + { IPOPT_ADDEXT, 0x004000 }, + { IPOPT_VISA, 0x008000 }, + { IPOPT_IMITD, 0x010000 }, + { IPOPT_EIP, 0x020000 }, + { IPOPT_FINN, 0x040000 }, + { 0, 0x000000 } +}; + +/* + * bit values for identifying presence of individual IP security options + */ +struct optlist secopt[8] = { + { IPSO_CLASS_RES4, 0x01 }, + { IPSO_CLASS_TOPS, 0x02 }, + { IPSO_CLASS_SECR, 0x04 }, + { IPSO_CLASS_RES3, 0x08 }, + { IPSO_CLASS_CONF, 0x10 }, + { IPSO_CLASS_UNCL, 0x20 }, + { IPSO_CLASS_RES2, 0x40 }, + { IPSO_CLASS_RES1, 0x80 } +}; + + +/* + * compact the IP header into a structure which contains just the info. + * which is useful for comparing IP headers with. + */ +void fr_makefrip(hlen, ip, fin) +int hlen; +ip_t *ip; +fr_info_t *fin; +{ + struct optlist *op; + tcphdr_t *tcp; + fr_ip_t *fi = &fin->fin_fi; + u_short optmsk = 0, secmsk = 0, auth = 0; + int i, mv, ol, off; + u_char *s, opt; + + fin->fin_rev = 0; + fin->fin_fr = NULL; + fin->fin_tcpf = 0; + fin->fin_data[0] = 0; + fin->fin_data[1] = 0; + fin->fin_rule = -1; + fin->fin_group = -1; + fin->fin_id = ip->ip_id; +#ifdef _KERNEL + fin->fin_icode = ipl_unreach; +#endif + fi->fi_v = ip->ip_v; + fi->fi_tos = ip->ip_tos; + fin->fin_hlen = hlen; + fin->fin_dlen = ip->ip_len - hlen; + tcp = (tcphdr_t *)((char *)ip + hlen); + fin->fin_dp = (void *)tcp; + (*(((u_short *)fi) + 1)) = (*(((u_short *)ip) + 4)); + fi->fi_src.s_addr = ip->ip_src.s_addr; + fi->fi_dst.s_addr = ip->ip_dst.s_addr; + + fi->fi_fl = (hlen > sizeof(ip_t)) ? FI_OPTIONS : 0; + off = (ip->ip_off & IP_OFFMASK) << 3; + if (ip->ip_off & 0x3fff) + fi->fi_fl |= FI_FRAG; + switch (ip->ip_p) + { + case IPPROTO_ICMP : + { + int minicmpsz = sizeof(struct icmp); + icmphdr_t *icmp; + + icmp = (icmphdr_t *)tcp; + + if (!off && (icmp->icmp_type == ICMP_ECHOREPLY || + icmp->icmp_type == ICMP_ECHO)) + minicmpsz = ICMP_MINLEN; + if ((!(ip->ip_len >= hlen + minicmpsz) && !off) || + (off && off < sizeof(struct icmp))) + fi->fi_fl |= FI_SHORT; + if (fin->fin_dlen > 1) + fin->fin_data[0] = *(u_short *)tcp; + break; + } + case IPPROTO_TCP : + fi->fi_fl |= FI_TCPUDP; + if ((!IPMINLEN(ip, tcphdr) && !off) || + (off && off < sizeof(struct tcphdr))) + fi->fi_fl |= FI_SHORT; + if (!(fi->fi_fl & FI_SHORT) && !off) + fin->fin_tcpf = tcp->th_flags; + goto getports; + case IPPROTO_UDP : + fi->fi_fl |= FI_TCPUDP; + if ((!IPMINLEN(ip, udphdr) && !off) || + (off && off < sizeof(struct udphdr))) + fi->fi_fl |= FI_SHORT; +getports: + if (!off && (fin->fin_dlen > 3)) { + fin->fin_data[0] = ntohs(tcp->th_sport); + fin->fin_data[1] = ntohs(tcp->th_dport); + } + break; + default : + break; + } + + + for (s = (u_char *)(ip + 1), hlen -= (int)sizeof(*ip); hlen; ) { + opt = *s; + if (opt == '\0') + break; + ol = (opt == IPOPT_NOP) ? 1 : (int)*(s+1); + if (opt > 1 && (ol < 2 || ol > hlen)) + break; + for (i = 9, mv = 4; mv >= 0; ) { + op = ipopts + i; + if (opt == (u_char)op->ol_val) { + optmsk |= op->ol_bit; + if (opt == IPOPT_SECURITY) { + struct optlist *sp; + u_char sec; + int j, m; + + sec = *(s + 2); /* classification */ + for (j = 3, m = 2; m >= 0; ) { + sp = secopt + j; + if (sec == sp->ol_val) { + secmsk |= sp->ol_bit; + auth = *(s + 3); + auth *= 256; + auth += *(s + 4); + break; + } + if (sec < sp->ol_val) + j -= m--; + else + j += m--; + } + } + break; + } + if (opt < op->ol_val) + i -= mv--; + else + i += mv--; + } + hlen -= ol; + s += ol; + } + if (auth && !(auth & 0x0100)) + auth &= 0xff00; + fi->fi_optmsk = optmsk; + fi->fi_secmsk = secmsk; + fi->fi_auth = auth; +} + + +/* + * check an IP packet for TCP/UDP characteristics such as ports and flags. + */ +static int fr_tcpudpchk(fr, fin) +frentry_t *fr; +fr_info_t *fin; +{ + register u_short po, tup; + register char i; + register int err = 1; + + /* + * Both ports should *always* be in the first fragment. + * So far, I cannot find any cases where they can not be. + * + * compare destination ports + */ + if ((i = (int)fr->fr_dcmp)) { + po = fr->fr_dport; + tup = fin->fin_data[1]; + /* + * Do opposite test to that required and + * continue if that succeeds. + */ + if (!--i && tup != po) /* EQUAL */ + err = 0; + else if (!--i && tup == po) /* NOTEQUAL */ + err = 0; + else if (!--i && tup >= po) /* LESSTHAN */ + err = 0; + else if (!--i && tup <= po) /* GREATERTHAN */ + err = 0; + else if (!--i && tup > po) /* LT or EQ */ + err = 0; + else if (!--i && tup < po) /* GT or EQ */ + err = 0; + else if (!--i && /* Out of range */ + (tup >= po && tup <= fr->fr_dtop)) + err = 0; + else if (!--i && /* In range */ + (tup <= po || tup >= fr->fr_dtop)) + err = 0; + } + /* + * compare source ports + */ + if (err && (i = (int)fr->fr_scmp)) { + po = fr->fr_sport; + tup = fin->fin_data[0]; + if (!--i && tup != po) + err = 0; + else if (!--i && tup == po) + err = 0; + else if (!--i && tup >= po) + err = 0; + else if (!--i && tup <= po) + err = 0; + else if (!--i && tup > po) + err = 0; + else if (!--i && tup < po) + err = 0; + else if (!--i && /* Out of range */ + (tup >= po && tup <= fr->fr_stop)) + err = 0; + else if (!--i && /* In range */ + (tup <= po || tup >= fr->fr_stop)) + err = 0; + } + + /* + * If we don't have all the TCP/UDP header, then how can we + * expect to do any sort of match on it ? If we were looking for + * TCP flags, then NO match. If not, then match (which should + * satisfy the "short" class too). + */ + if (err && (fin->fin_fi.fi_p == IPPROTO_TCP)) { + if (fin->fin_fi.fi_fl & FI_SHORT) + return !(fr->fr_tcpf | fr->fr_tcpfm); + /* + * Match the flags ? If not, abort this match. + */ + if (fr->fr_tcpfm && + fr->fr_tcpf != (fin->fin_tcpf & fr->fr_tcpfm)) { + FR_DEBUG(("f. %#x & %#x != %#x\n", fin->fin_tcpf, + fr->fr_tcpfm, fr->fr_tcpf)); + err = 0; + } + } + return err; +} + +/* + * Check the input/output list of rules for a match and result. + * Could be per interface, but this gets real nasty when you don't have + * kernel sauce. + */ +int fr_scanlist(pass, ip, fin, m) +u_32_t pass; +ip_t *ip; +register fr_info_t *fin; +void *m; +{ + register struct frentry *fr; + register fr_ip_t *fi = &fin->fin_fi; + int rulen, portcmp = 0, off, skip = 0, logged = 0; + u_32_t passt; + + fr = fin->fin_fr; + fin->fin_fr = NULL; + fin->fin_rule = 0; + fin->fin_group = 0; + off = ip->ip_off & IP_OFFMASK; + pass |= (fi->fi_fl << 24); + + if ((fi->fi_fl & FI_TCPUDP) && (fin->fin_dlen > 3) && !off) + portcmp = 1; + + for (rulen = 0; fr; fr = fr->fr_next, rulen++) { + if (skip) { + skip--; + continue; + } + /* + * In all checks below, a null (zero) value in the + * filter struture is taken to mean a wildcard. + * + * check that we are working for the right interface + */ +#ifdef _KERNEL +# if BSD >= 199306 + if (fin->fin_out != 0) { + if ((fr->fr_oifa && + fr->fr_oifa != ((mb_t *)m)->m_pkthdr.rcvif) || + (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp)) + continue; + } else +# endif + if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) + continue; +#else + if (opts & (OPT_VERBOSE|OPT_DEBUG)) + printf("\n"); + FR_VERBOSE(("%c", (pass & FR_PASS) ? 'p' : + (pass & FR_AUTH) ? 'a' : 'b')); + if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) + continue; + FR_VERBOSE((":i")); +#endif + { + register u_32_t *ld, *lm, *lip; + register int i; + + lip = (u_32_t *)fi; + lm = (u_32_t *)&fr->fr_mip; + ld = (u_32_t *)&fr->fr_ip; + i = ((lip[0] & lm[0]) != ld[0]); + FR_IFDEBUG(i,continue,("0. %#08x & %#08x != %#08x\n", + lip[0], lm[0], ld[0])); + i |= ((lip[1] & lm[1]) != ld[1]) << 19; + i ^= (fr->fr_flags & FR_NOTSRCIP); + FR_IFDEBUG(i,continue,("1. %#08x & %#08x != %#08x\n", + lip[1], lm[1], ld[1])); + i |= ((lip[2] & lm[2]) != ld[2]) << 20; + i ^= (fr->fr_flags & FR_NOTDSTIP); + FR_IFDEBUG(i,continue,("2. %#08x & %#08x != %#08x\n", + lip[2], lm[2], ld[2])); + i |= ((lip[3] & lm[3]) != ld[3]); + FR_IFDEBUG(i,continue,("3. %#08x & %#08x != %#08x\n", + lip[3], lm[3], ld[3])); + i |= ((lip[4] & lm[4]) != ld[4]); + FR_IFDEBUG(i,continue,("4. %#08x & %#08x != %#08x\n", + lip[4], lm[4], ld[4])); + if (i) + continue; + } + + /* + * If a fragment, then only the first has what we're looking + * for here... + */ + if (!portcmp && (fr->fr_dcmp || fr->fr_scmp || fr->fr_tcpf || + fr->fr_tcpfm)) + continue; + if (fi->fi_fl & FI_TCPUDP) { + if (!fr_tcpudpchk(fr, fin)) + continue; + } else if (fr->fr_icmpm || fr->fr_icmp) { + if ((fi->fi_p != IPPROTO_ICMP) || off || + (fin->fin_dlen < 2)) + continue; + if ((fin->fin_data[0] & fr->fr_icmpm) != fr->fr_icmp) { + FR_DEBUG(("i. %#x & %#x != %#x\n", + fin->fin_data[0], fr->fr_icmpm, + fr->fr_icmp)); + continue; + } + } + FR_VERBOSE(("*")); + /* + * Just log this packet... + */ + passt = fr->fr_flags; + if ((passt & FR_CALLNOW) && fr->fr_func) + passt = (*fr->fr_func)(passt, ip, fin); + fin->fin_fr = fr; +#ifdef IPFILTER_LOG + if ((passt & FR_LOGMASK) == FR_LOG) { + if (!IPLLOG(passt, ip, fin, m)) { + ATOMIC_INC(frstats[fin->fin_out].fr_skip); + } + ATOMIC_INC(frstats[fin->fin_out].fr_pkl); + logged = 1; + } +#endif /* IPFILTER_LOG */ + if (!(skip = fr->fr_skip) && (passt & FR_LOGMASK) != FR_LOG) + pass = passt; + FR_DEBUG(("pass %#x\n", pass)); + ATOMIC_INC(fr->fr_hits); + if (pass & FR_ACCOUNT) + fr->fr_bytes += (U_QUAD_T)ip->ip_len; + else + fin->fin_icode = fr->fr_icode; + fin->fin_rule = rulen; + fin->fin_group = fr->fr_group; + if (fr->fr_grp) { + fin->fin_fr = fr->fr_grp; + pass = fr_scanlist(pass, ip, fin, m); + if (fin->fin_fr == NULL) { + fin->fin_rule = rulen; + fin->fin_group = fr->fr_group; + fin->fin_fr = fr; + } + if (pass & FR_DONTCACHE) + logged = 1; + } + if (pass & FR_QUICK) + break; + } + if (logged) + pass |= FR_DONTCACHE; + return pass; +} + + +/* + * frcheck - filter check + * check using source and destination addresses/ports in a packet whether + * or not to pass it on or not. + */ +int fr_check(ip, hlen, ifp, out +#if defined(_KERNEL) && SOLARIS +, qif, mp) +qif_t *qif; +#else +, mp) +#endif +mb_t **mp; +ip_t *ip; +int hlen; +void *ifp; +int out; +{ + /* + * The above really sucks, but short of writing a diff + */ + fr_info_t frinfo, *fc; + register fr_info_t *fin = &frinfo; + frentry_t *fr = NULL; + int changed, error = EHOSTUNREACH; + u_32_t pass, apass; +#if !SOLARIS || !defined(_KERNEL) + register mb_t *m = *mp; +#endif + +#ifdef _KERNEL + mb_t *mc = NULL; +# if !defined(__SVR4) && !defined(__svr4__) +# ifdef __sgi + char hbuf[(0xf << 2) + sizeof(struct icmp) + sizeof(ip_t) + 8]; +# endif + int up; + +# ifdef M_CANFASTFWD + /* + * XXX For now, IP Filter and fast-forwarding of cached flows + * XXX are mutually exclusive. Eventually, IP Filter should + * XXX get a "can-fast-forward" filter rule. + */ + m->m_flags &= ~M_CANFASTFWD; +# endif /* M_CANFASTFWD */ + + if ((ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_UDP || + ip->ip_p == IPPROTO_ICMP)) { + int plen = 0; + + if ((ip->ip_off & IP_OFFMASK) == 0) + switch(ip->ip_p) + { + case IPPROTO_TCP: + plen = sizeof(tcphdr_t); + break; + case IPPROTO_UDP: + plen = sizeof(udphdr_t); + break; + /* 96 - enough for complete ICMP error IP header */ + case IPPROTO_ICMP: + plen = ICMPERR_MAXPKTLEN - sizeof(ip_t); + break; + } + up = MIN(hlen + plen, ip->ip_len); + + if (up > m->m_len) { +# ifdef __sgi + /* Under IRIX, avoid m_pullup as it makes ping <hostname> panic */ + if ((up > sizeof(hbuf)) || (m_length(m) < up)) { + ATOMIC_INC(frstats[out].fr_pull[1]); + return -1; + } + m_copydata(m, 0, up, hbuf); + ATOMIC_INC(frstats[out].fr_pull[0]); + ip = (ip_t *)hbuf; +# else /* __ sgi */ +# ifndef linux + if ((*mp = m_pullup(m, up)) == 0) { + ATOMIC_INC(frstats[out].fr_pull[1]); + return -1; + } else { + ATOMIC_INC(frstats[out].fr_pull[0]); + m = *mp; + ip = mtod(m, ip_t *); + } +# endif /* !linux */ +# endif /* __sgi */ + } else + up = 0; + } else + up = 0; +# endif /* !defined(__SVR4) && !defined(__svr4__) */ +# if SOLARIS + mb_t *m = qif->qf_m; + + if ((u_int)ip & 0x3) + return 2; + fin->fin_qfm = m; + fin->fin_qif = qif; +# endif +#endif /* _KERNEL */ + fr_makefrip(hlen, ip, fin); + fin->fin_ifp = ifp; + fin->fin_out = out; + fin->fin_mp = mp; + pass = fr_pass; + + READ_ENTER(&ipf_mutex); + + /* + * Check auth now. This, combined with the check below to see if apass + * is 0 is to ensure that we don't count the packet twice, which can + * otherwise occur when we reprocess it. As it is, we only count it + * after it has no auth. table matchup. This also stops NAT from + * occuring until after the packet has been auth'd. + */ + apass = fr_checkauth(ip, fin); + + if (!out) { + changed = ip_natin(ip, fin); + if (!apass && (fin->fin_fr = ipacct[0][fr_active]) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INC(frstats[0].fr_acct); + } + } + + if (apass || (!(fr = ipfr_knownfrag(ip, fin)) && + !(fr = fr_checkstate(ip, fin)))) { + /* + * If a packet is found in the auth table, then skip checking + * the access lists for permission but we do need to consider + * the result as if it were from the ACL's. + */ + if (!apass) { + fc = frcache + out; + if (!bcmp((char *)fin, (char *)fc, FI_CSIZE)) { + /* + * copy cached data so we can unlock the mutex + * earlier. + */ + bcopy((char *)fc, (char *)fin, FI_COPYSIZE); + ATOMIC_INC(frstats[out].fr_chit); + if ((fr = fin->fin_fr)) { + ATOMIC_INC(fr->fr_hits); + pass = fr->fr_flags; + } + } else { + if ((fin->fin_fr = ipfilter[out][fr_active])) + pass = fr_scanlist(fr_pass, ip, fin, m); + if (!(pass & (FR_KEEPSTATE|FR_DONTCACHE))) + bcopy((char *)fin, (char *)fc, + FI_COPYSIZE); + if (pass & FR_NOMATCH) { + ATOMIC_INC(frstats[out].fr_nom); + } + } + fr = fin->fin_fr; + } else + pass = apass; + + /* + * If we fail to add a packet to the authorization queue, + * then we drop the packet later. However, if it was added + * then pretend we've dropped it already. + */ + if ((pass & FR_AUTH)) + if (FR_NEWAUTH(m, fin, ip, qif) != 0) +#ifdef _KERNEL + m = *mp = NULL; +#else + ; +#endif + + if (pass & FR_PREAUTH) { + READ_ENTER(&ipf_auth); + if ((fin->fin_fr = ipauth) && + (pass = fr_scanlist(0, ip, fin, m))) { + ATOMIC_INC(fr_authstats.fas_hits); + } else { + ATOMIC_INC(fr_authstats.fas_miss); + } + RWLOCK_EXIT(&ipf_auth); + } + + fin->fin_fr = fr; + if ((pass & (FR_KEEPFRAG|FR_KEEPSTATE)) == FR_KEEPFRAG) { + if (fin->fin_fi.fi_fl & FI_FRAG) { + if (ipfr_newfrag(ip, fin, pass) == -1) { + ATOMIC_INC(frstats[out].fr_bnfr); + } else { + ATOMIC_INC(frstats[out].fr_nfr); + } + } else { + ATOMIC_INC(frstats[out].fr_cfr); + } + } + if (pass & FR_KEEPSTATE) { + if (fr_addstate(ip, fin, 0) == NULL) { + ATOMIC_INC(frstats[out].fr_bads); + } else { + ATOMIC_INC(frstats[out].fr_ads); + } + } + } else if (fr != NULL) { + pass = fr->fr_flags; + if (pass & FR_LOGFIRST) + pass &= ~(FR_LOGFIRST|FR_LOG); + } + + if (fr && fr->fr_func && !(pass & FR_CALLNOW)) + pass = (*fr->fr_func)(pass, ip, fin); + + /* + * Only count/translate packets which will be passed on, out the + * interface. + */ + if (out && (pass & FR_PASS)) { + if ((fin->fin_fr = ipacct[1][fr_active]) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INC(frstats[1].fr_acct); + } + fin->fin_fr = fr; + changed = ip_natout(ip, fin); + } else + fin->fin_fr = fr; + RWLOCK_EXIT(&ipf_mutex); + +#ifdef IPFILTER_LOG + if ((fr_flags & FF_LOGGING) || (pass & FR_LOGMASK)) { + if ((fr_flags & FF_LOGNOMATCH) && (pass & FR_NOMATCH)) { + pass |= FF_LOGNOMATCH; + ATOMIC_INC(frstats[out].fr_npkl); + goto logit; + } else if (((pass & FR_LOGMASK) == FR_LOGP) || + ((pass & FR_PASS) && (fr_flags & FF_LOGPASS))) { + if ((pass & FR_LOGMASK) != FR_LOGP) + pass |= FF_LOGPASS; + ATOMIC_INC(frstats[out].fr_ppkl); + goto logit; + } else if (((pass & FR_LOGMASK) == FR_LOGB) || + ((pass & FR_BLOCK) && (fr_flags & FF_LOGBLOCK))) { + if ((pass & FR_LOGMASK) != FR_LOGB) + pass |= FF_LOGBLOCK; + ATOMIC_INC(frstats[out].fr_bpkl); +logit: + if (!IPLLOG(pass, ip, fin, m)) { + ATOMIC_INC(frstats[out].fr_skip); + if ((pass & (FR_PASS|FR_LOGORBLOCK)) == + (FR_PASS|FR_LOGORBLOCK)) + pass ^= FR_PASS|FR_BLOCK; + } + } + } +#endif /* IPFILTER_LOG */ +#ifdef _KERNEL + /* + * Only allow FR_DUP to work if a rule matched - it makes no sense to + * set FR_DUP as a "default" as there are no instructions about where + * to send the packet. + */ + if (fr && (pass & FR_DUP)) +# if SOLARIS + mc = dupmsg(m); +# else +# ifndef linux + mc = m_copy(m, 0, M_COPYALL); +# else + ; +# endif +# endif +#endif + if (pass & FR_PASS) { + ATOMIC_INC(frstats[out].fr_pass); + } else if (pass & FR_BLOCK) { + ATOMIC_INC(frstats[out].fr_block); + /* + * Should we return an ICMP packet to indicate error + * status passing through the packet filter ? + * WARNING: ICMP error packets AND TCP RST packets should + * ONLY be sent in repsonse to incoming packets. Sending them + * in response to outbound packets can result in a panic on + * some operating systems. + */ + if (!out) { +#ifdef _KERNEL + if (pass & FR_RETICMP) { + struct in_addr dst; + + if ((pass & FR_RETMASK) == FR_FAKEICMP) + dst = ip->ip_dst; + else + dst.s_addr = 0; +# if SOLARIS + ICMP_ERROR(q, ip, ICMP_UNREACH, fin->fin_icode, + qif, dst); +# else + ICMP_ERROR(m, ip, ICMP_UNREACH, fin->fin_icode, + ifp, dst); +# endif + ATOMIC_INC(frstats[0].fr_ret); + } else if (((pass & FR_RETMASK) == FR_RETRST) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (SEND_RESET(ip, qif, ifp, fin) == 0) { + ATOMIC_INC(frstats[1].fr_ret); + } + } +#else + if ((pass & FR_RETMASK) == FR_RETICMP) { + verbose("- ICMP unreachable sent\n"); + ATOMIC_INC(frstats[0].fr_ret); + } else if ((pass & FR_RETMASK) == FR_FAKEICMP) { + verbose("- forged ICMP unreachable sent\n"); + ATOMIC_INC(frstats[0].fr_ret); + } else if (((pass & FR_RETMASK) == FR_RETRST) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + verbose("- TCP RST sent\n"); + ATOMIC_INC(frstats[1].fr_ret); + } +#endif + } else { + if (pass & FR_RETRST) + error = ECONNRESET; + } + } + + /* + * If we didn't drop off the bottom of the list of rules (and thus + * the 'current' rule fr is not NULL), then we may have some extra + * instructions about what to do with a packet. + * Once we're finished return to our caller, freeing the packet if + * we are dropping it (* BSD ONLY *). + */ +#if defined(_KERNEL) +# if !SOLARIS +# if !defined(linux) + if (fr) { + frdest_t *fdp = &fr->fr_tif; + + if (((pass & FR_FASTROUTE) && !out) || + (fdp->fd_ifp && fdp->fd_ifp != (struct ifnet *)-1)) { + if (ipfr_fastroute(m, fin, fdp) == 0) + m = *mp = NULL; + } + if (mc) + ipfr_fastroute(mc, fin, &fr->fr_dif); + } + if (!(pass & FR_PASS) && m) + m_freem(m); +# ifdef __sgi + else if (changed && up && m) + m_copyback(m, 0, up, hbuf); +# endif +# endif /* !linux */ +# else /* !SOLARIS */ + if (fr) { + frdest_t *fdp = &fr->fr_tif; + + if (((pass & FR_FASTROUTE) && !out) || + (fdp->fd_ifp && fdp->fd_ifp != (struct ifnet *)-1)) { + if (ipfr_fastroute(qif, ip, m, mp, fin, fdp) == 0) + m = *mp = NULL; + } + if (mc) + ipfr_fastroute(qif, ip, mc, mp, fin, &fr->fr_dif); + } +# endif /* !SOLARIS */ + return (pass & FR_PASS) ? 0 : error; +#else /* _KERNEL */ + if (pass & FR_NOMATCH) + return 1; + if (pass & FR_PASS) + return 0; + if (pass & FR_AUTH) + return -2; + return -1; +#endif /* _KERNEL */ +} + + +/* + * ipf_cksum + * addr should be 16bit aligned and len is in bytes. + * length is in bytes + */ +u_short ipf_cksum(addr, len) +register u_short *addr; +register int len; +{ + register u_32_t sum = 0; + + for (sum = 0; len > 1; len -= 2) + sum += *addr++; + + /* mop up an odd byte, if necessary */ + if (len == 1) + sum += *(u_char *)addr; + + /* + * add back carry outs from top 16 bits to low 16 bits + */ + sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ + sum += (sum >> 16); /* add carry */ + return (u_short)(~sum); +} + + +/* + * NB: This function assumes we've pullup'd enough for all of the IP header + * and the TCP header. We also assume that data blocks aren't allocated in + * odd sizes. + */ +u_short fr_tcpsum(m, ip, tcp) +mb_t *m; +ip_t *ip; +tcphdr_t *tcp; +{ + u_short *sp, slen, ts; + u_int sum, sum2; + int hlen; + + /* + * Add up IP Header portion + */ + hlen = ip->ip_hl << 2; + slen = ip->ip_len - hlen; + sum = htons((u_short)ip->ip_p); + sum += htons(slen); + sp = (u_short *)&ip->ip_src; + sum += *sp++; /* ip_src */ + sum += *sp++; + sum += *sp++; /* ip_dst */ + sum += *sp++; + ts = tcp->th_sum; + tcp->th_sum = 0; +#ifdef KERNEL +# if SOLARIS + sum2 = ip_cksum(m, hlen, sum); /* hlen == offset */ + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + sum2 = ~sum2 & 0xffff; +# else /* SOLARIS */ +# if defined(BSD) || defined(sun) +# if BSD >= 199306 + m->m_data += hlen; +# else + m->m_off += hlen; +# endif + m->m_len -= hlen; + sum2 = in_cksum(m, slen); + m->m_len += hlen; +# if BSD >= 199306 + m->m_data -= hlen; +# else + m->m_off -= hlen; +# endif + /* + * Both sum and sum2 are partial sums, so combine them together. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = ~sum & 0xffff; + sum2 += sum; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); +# else /* defined(BSD) || defined(sun) */ +{ + union { + u_char c[2]; + u_short s; + } bytes; + u_short len = ip->ip_len; +# if defined(__sgi) + int add; +# endif + + /* + * Add up IP Header portion + */ + sp = (u_short *)&ip->ip_src; + len -= (ip->ip_hl << 2); + sum = ntohs(IPPROTO_TCP); + sum += htons(len); + sum += *sp++; /* ip_src */ + sum += *sp++; + sum += *sp++; /* ip_dst */ + sum += *sp++; + if (sp != (u_short *)tcp) + sp = (u_short *)tcp; + sum += *sp++; /* sport */ + sum += *sp++; /* dport */ + sum += *sp++; /* seq */ + sum += *sp++; + sum += *sp++; /* ack */ + sum += *sp++; + sum += *sp++; /* off */ + sum += *sp++; /* win */ + sum += *sp++; /* Skip over checksum */ + sum += *sp++; /* urp */ + +# ifdef __sgi + /* + * In case we had to copy the IP & TCP header out of mbufs, + * skip over the mbuf bits which are the header + */ + if ((caddr_t)ip != mtod(m, caddr_t)) { + hlen = (caddr_t)sp - (caddr_t)ip; + while (hlen) { + add = MIN(hlen, m->m_len); + sp = (u_short *)(mtod(m, caddr_t) + add); + hlen -= add; + if (add == m->m_len) { + m = m->m_next; + if (!hlen) { + if (!m) + break; + sp = mtod(m, u_short *); + } + PANIC((!m),("fr_tcpsum(1): not enough data")); + } + } + } +# endif + + if (!(len -= sizeof(*tcp))) + goto nodata; + while (len > 1) { + if (((caddr_t)sp - mtod(m, caddr_t)) >= m->m_len) { + m = m->m_next; + PANIC((!m),("fr_tcpsum(2): not enough data")); + sp = mtod(m, u_short *); + } + if (((caddr_t)(sp + 1) - mtod(m, caddr_t)) > m->m_len) { + bytes.c[0] = *(u_char *)sp; + m = m->m_next; + PANIC((!m),("fr_tcpsum(3): not enough data")); + sp = mtod(m, u_short *); + bytes.c[1] = *(u_char *)sp; + sum += bytes.s; + sp = (u_short *)((u_char *)sp + 1); + } + if ((u_long)sp & 1) { + bcopy((char *)sp++, (char *)&bytes.s, sizeof(bytes.s)); + sum += bytes.s; + } else + sum += *sp++; + len -= 2; + } + if (len) + sum += ntohs(*(u_char *)sp << 8); +nodata: + while (sum > 0xffff) + sum = (sum & 0xffff) + (sum >> 16); + sum2 = (u_short)(~sum & 0xffff); +} +# endif /* defined(BSD) || defined(sun) */ +# endif /* SOLARIS */ +#else /* KERNEL */ + sum2 = 0; +#endif /* KERNEL */ + tcp->th_sum = ts; + return sum2; +} + + +#if defined(_KERNEL) && ( ((BSD < 199306) && !SOLARIS) || defined(__sgi) ) +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: fil.c,v 2.3.2.7 1999/10/21 14:21:40 darrenr Exp $ + */ +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. + */ +void +m_copydata(m, off, len, cp) + register mb_t *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = MIN(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + + +# ifndef linux +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. + */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: +#if 0 + if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +#endif + return; +} +# endif /* linux */ +#endif /* (_KERNEL) && ( ((BSD < 199306) && !SOLARIS) || __sgi) */ + + +frgroup_t *fr_findgroup(num, flags, which, set, fgpp) +u_int num; +u_32_t flags; +minor_t which; +int set; +frgroup_t ***fgpp; +{ + frgroup_t *fg, **fgp; + + if (which == IPL_LOGAUTH) + fgp = &ipfgroups[2][set]; + else if (flags & FR_ACCOUNT) + fgp = &ipfgroups[1][set]; + else if (flags & (FR_OUTQUE|FR_INQUE)) + fgp = &ipfgroups[0][set]; + else + return NULL; + num &= 0xffff; + + while ((fg = *fgp)) + if (fg->fg_num == num) + break; + else + fgp = &fg->fg_next; + if (fgpp) + *fgpp = fgp; + return fg; +} + + +frgroup_t *fr_addgroup(num, fp, which, set) +u_int num; +frentry_t *fp; +minor_t which; +int set; +{ + frgroup_t *fg, **fgp; + + if ((fg = fr_findgroup(num, fp->fr_flags, which, set, &fgp))) + return fg; + + KMALLOC(fg, frgroup_t *); + if (fg) { + fg->fg_num = num & 0xffff; + fg->fg_next = *fgp; + fg->fg_head = fp; + fg->fg_start = &fp->fr_grp; + *fgp = fg; + } + return fg; +} + + +void fr_delgroup(num, flags, which, set) +u_int num; +u_32_t flags; +minor_t which; +int set; +{ + frgroup_t *fg, **fgp; + + if (!(fg = fr_findgroup(num, flags, which, set, &fgp))) + return; + + *fgp = fg->fg_next; + KFREE(fg); +} + + + +/* + * recursively flush rules from the list, descending groups as they are + * encountered. if a rule is the head of a group and it has lost all its + * group members, then also delete the group reference. + */ +static int frflushlist(set, unit, nfreedp, listp) +int set; +minor_t unit; +int *nfreedp; +frentry_t **listp; +{ + register int freed = 0, i; + register frentry_t *fp; + + while ((fp = *listp)) { + *listp = fp->fr_next; + if (fp->fr_grp) { + i = frflushlist(set, unit, nfreedp, &fp->fr_grp); + MUTEX_ENTER(&ipf_rw); + fp->fr_ref -= i; + MUTEX_EXIT(&ipf_rw); + } + + ATOMIC_DEC(fp->fr_ref); + if (fp->fr_ref == 0) { + if (fp->fr_grhead) + fr_delgroup((u_int)fp->fr_grhead, fp->fr_flags, + unit, set); + KFREE(fp); + } else + fp->fr_next = NULL; + freed++; + } + *nfreedp += freed; + return freed; +} + + +int frflush(unit, flags) +minor_t unit; +int flags; +{ + int flushed = 0, set; + + if (unit != IPL_LOGIPF) + return 0; + WRITE_ENTER(&ipf_mutex); + bzero((char *)frcache, sizeof(frcache[0]) * 2); + + set = fr_active; + if (flags & FR_INACTIVE) + set = 1 - set; + + if (flags & FR_OUTQUE) { + (void) frflushlist(set, unit, &flushed, &ipfilter[1][set]); + (void) frflushlist(set, unit, &flushed, &ipacct[1][set]); + } + if (flags & FR_INQUE) { + (void) frflushlist(set, unit, &flushed, &ipfilter[0][set]); + (void) frflushlist(set, unit, &flushed, &ipacct[0][set]); + } + RWLOCK_EXIT(&ipf_mutex); + return flushed; +} + + +char *memstr(src, dst, slen, dlen) +char *src, *dst; +int slen, dlen; +{ + char *s = NULL; + + while (dlen >= slen) { + if (bcmp(src, dst, slen) == 0) { + s = dst; + break; + } + dst++; + dlen--; + } + return s; +} + + +void fixskip(listp, rp, addremove) +frentry_t **listp, *rp; +int addremove; +{ + frentry_t *fp; + int rules = 0, rn = 0; + + for (fp = *listp; fp && (fp != rp); fp = fp->fr_next, rules++) + ; + + if (!fp) + return; + + for (fp = *listp; fp && (fp != rp); fp = fp->fr_next, rn++) + if (fp->fr_skip && (rn + fp->fr_skip >= rules)) + fp->fr_skip += addremove; +} + + +#ifdef _KERNEL +/* + * count consecutive 1's in bit mask. If the mask generated by counting + * consecutive 1's is different to that passed, return -1, else return # + * of bits. + */ +int countbits(ip) +u_32_t ip; +{ + u_32_t ipn; + int cnt = 0, i, j; + + ip = ipn = ntohl(ip); + for (i = 32; i; i--, ipn *= 2) + if (ipn & 0x80000000) + cnt++; + else + break; + ipn = 0; + for (i = 32, j = cnt; i; i--, j--) { + ipn *= 2; + if (j > 0) + ipn++; + } + if (ipn == ip) + return cnt; + return -1; +} + + +/* + * return the first IP Address associated with an interface + */ +int fr_ifpaddr(ifptr, inp) +void *ifptr; +struct in_addr *inp; +{ +# if SOLARIS + ill_t *ill = ifptr; +# else + struct ifnet *ifp = ifptr; +# endif + struct in_addr in; + +# if SOLARIS + in.s_addr = ill->ill_ipif->ipif_local_addr; +# else /* SOLARIS */ +# if linux + ; +# else /* linux */ + struct ifaddr *ifa; + struct sockaddr_in *sin; + +# if (__FreeBSD_version >= 300000) + ifa = TAILQ_FIRST(&ifp->if_addrhead); +# else +# if defined(__NetBSD__) || defined(__OpenBSD__) + ifa = ifp->if_addrlist.tqh_first; +# else +# if defined(__sgi) && defined(IFF_DRVRLOCK) /* IRIX 6 */ + ifa = &((struct in_ifaddr *)ifp->in_ifaddr)->ia_ifa; +# else + ifa = ifp->if_addrlist; +# endif +# endif /* __NetBSD__ || __OpenBSD__ */ +# endif /* __FreeBSD_version >= 300000 */ +# if (BSD < 199306) && !(/*IRIX6*/defined(__sgi) && defined(IFF_DRVRLOCK)) + sin = (struct sockaddr_in *)&ifa->ifa_addr; +# else + sin = (struct sockaddr_in *)ifa->ifa_addr; + while (sin && ifa && + sin->sin_family != AF_INET) { +# if (__FreeBSD_version >= 300000) + ifa = TAILQ_NEXT(ifa, ifa_link); +# else +# if defined(__NetBSD__) || defined(__OpenBSD__) + ifa = ifa->ifa_list.tqe_next; +# else + ifa = ifa->ifa_next; +# endif +# endif /* __FreeBSD_version >= 300000 */ + if (ifa) + sin = (struct sockaddr_in *)ifa->ifa_addr; + } + if (ifa == NULL) + sin = NULL; + if (sin == NULL) + return -1; +# endif /* (BSD < 199306) && (!__sgi && IFF_DRVLOCK) */ + in = sin->sin_addr; +# endif /* linux */ +# endif /* SOLARIS */ + in.s_addr = ntohl(in.s_addr); + *inp = in; + return 0; +} +#else + + +/* + * return the first IP Address associated with an interface + */ +int fr_ifpaddr(ifptr, inp) +void *ifptr; +struct in_addr *inp; +{ + return 0; +} +#endif diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h new file mode 100644 index 0000000..6d418d6 --- /dev/null +++ b/sys/netinet/icmp6.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_ICMP6_H_ +#define _NETINET_ICMP6_H_ + +#include <netinet6/icmp6.h> + +#endif /* !_NETINET_ICMP6_H_ */ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h new file mode 100644 index 0000000..02d147c --- /dev/null +++ b/sys/netinet/icmp_var.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_ICMP_VAR_H_ +#define _NETINET_ICMP_VAR_H_ + +#ifdef KERNEL +#include "opt_icmp_bandlim.h" /* for ICMP_BANDLIM */ +#endif + +/* + * Variables related to this implementation + * of the internet control message protocol. + */ +struct icmpstat { +/* statistics related to icmp packets generated */ + u_long icps_error; /* # of calls to icmp_error */ + u_long icps_oldshort; /* no error 'cuz old ip too short */ + u_long icps_oldicmp; /* no error 'cuz old was icmp */ + u_long icps_outhist[ICMP_MAXTYPE + 1]; +/* statistics related to input messages processed */ + u_long icps_badcode; /* icmp_code out of range */ + u_long icps_tooshort; /* packet < ICMP_MINLEN */ + u_long icps_checksum; /* bad checksum */ + u_long icps_badlen; /* calculated bound mismatch */ + u_long icps_reflect; /* number of responses */ + u_long icps_inhist[ICMP_MAXTYPE + 1]; + u_long icps_bmcastecho; /* b/mcast echo requests dropped */ + u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */ +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_STATS 2 /* statistics (read-only) */ +#define ICMPCTL_ICMPLIM 3 +#define ICMPCTL_MAXID 4 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "icmplim", CTLTYPE_INT }, \ +} + +#ifdef KERNEL +SYSCTL_DECL(_net_inet_icmp); +#ifdef ICMP_BANDLIM +extern int badport_bandlim __P((int)); +#endif +#endif + +#endif diff --git a/sys/netinet/if_atm.c b/sys/netinet/if_atm.c new file mode 100644 index 0000000..04b49bf --- /dev/null +++ b/sys/netinet/if_atm.c @@ -0,0 +1,280 @@ +/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IP <=> ATM address resolution. + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_natm.h" + +#if defined(INET) || defined(INET6) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/if_atm.h> + +#include <netinet/in.h> +#include <netinet/if_atm.h> + +#ifdef NATM +#include <netnatm/natm.h> +#endif + + +#define SDL(s) ((struct sockaddr_dl *)s) + +/* + * atm_rtrequest: handle ATM rt request (in support of generic code) + * inputs: "req" = request code + * "rt" = route entry + * "sa" = sockaddr + */ + +void +atm_rtrequest(req, rt, sa) + int req; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct sockaddr *gate = rt->rt_gateway; + struct atm_pseudoioctl api; +#ifdef NATM + struct sockaddr_in *sin; + struct natmpcb *npcb = NULL; + struct atm_pseudohdr *aph; +#endif + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ + return; + + switch (req) { + + case RTM_RESOLVE: /* resolve: only happens when cloning */ + printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); + break; + + case RTM_ADD: + + /* + * route added by a command (e.g. ifconfig, route, arp...). + * + * first check to see if this is not a host route, in which + * case we are being called via "ifconfig" to set the address. + */ + + if ((rt->rt_flags & RTF_HOST) == 0) { + rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + break; + } + + if ((rt->rt_flags & RTF_CLONING) != 0) { + printf("atm_rtrequest: cloning route detected?\n"); + break; + } + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); + break; + } + +#ifdef DIAGNOSTIC + if (rt->rt_ifp->if_ioctl == NULL) panic("atm null ioctl"); +#endif + +#ifdef NATM + /* + * let native ATM know we are using this VCI/VPI + * (i.e. reserve it) + */ + sin = (struct sockaddr_in *) rt_key(rt); + if (sin->sin_family != AF_INET) + goto failed; + aph = (struct atm_pseudohdr *) LLADDR(SDL(gate)); + npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph), + ATM_PH_VPI(aph)); + if (npcb == NULL) + goto failed; + npcb->npcb_flags |= NPCB_IP; + npcb->ipaddr.s_addr = sin->sin_addr.s_addr; + /* XXX: move npcb to llinfo when ATM ARP is ready */ + rt->rt_llinfo = (caddr_t) npcb; + rt->rt_flags |= RTF_LLINFO; +#endif + /* + * let the lower level know this circuit is active + */ + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMENA, + (caddr_t)&api) != 0) { + printf("atm: couldn't add VC\n"); + goto failed; + } + + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + + break; + +failed: +#ifdef NATM + if (npcb) { + npcb_free(npcb, NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, + rt_mask(rt), 0, (struct rtentry **) 0); + break; + + case RTM_DELETE: + +#ifdef NATM + /* + * tell native ATM we are done with this VC + */ + + if (rt->rt_flags & RTF_LLINFO) { + npcb_free((struct natmpcb *)rt->rt_llinfo, + NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + /* + * tell the lower layer to disable this circuit + */ + + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMDIS, + (caddr_t)&api); + + break; + } +} + +/* + * atmresolve: + * inputs: + * [1] "rt" = the link level route to use (or null if need to look one up) + * [2] "m" = mbuf containing the data to be sent + * [3] "dst" = sockaddr_in (IP) address of dest. + * output: + * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info + * return: + * 0 == resolve FAILED; note that "m" gets m_freem'd in this case + * 1 == resolve OK; desten contains result + * + * XXX: will need more work if we wish to support ATMARP in the kernel, + * but this is enough for PVCs entered via the "route" command. + */ + +int +atmresolve(rt, m, dst, desten) + +register struct rtentry *rt; +struct mbuf *m; +register struct sockaddr *dst; +register struct atm_pseudohdr *desten; /* OUT */ + +{ + struct sockaddr_dl *sdl; + + if (m->m_flags & (M_BCAST|M_MCAST)) { + log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped"); + goto bad; + } + + if (rt == NULL) { + rt = RTALLOC1(dst, 0); + if (rt == NULL) goto bad; /* failed */ + rt->rt_refcnt--; /* don't keep LL references */ + if ((rt->rt_flags & RTF_GATEWAY) != 0 || + (rt->rt_flags & RTF_LLINFO) == 0 || + /* XXX: are we using LLINFO? */ + rt->rt_gateway->sa_family != AF_LINK) { + goto bad; + } + } + + /* + * note that rt_gateway is a sockaddr_dl which contains the + * atm_pseudohdr data structure for this route. we currently + * don't need any rt_llinfo info (but will if we want to support + * ATM ARP [c.f. if_ether.c]). + */ + + sdl = SDL(rt->rt_gateway); + + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + + + if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return(1); /* ok, go for it! */ + } + + /* + * we got an entry, but it doesn't have valid link address + * info in it (it is prob. the interface route, which has + * sdl_alen == 0). dump packet. (fall through to "bad"). + */ + +bad: + m_freem(m); + return(0); +} +#endif /* INET */ diff --git a/sys/netinet/if_atm.h b/sys/netinet/if_atm.h new file mode 100644 index 0000000..e0d26c0 --- /dev/null +++ b/sys/netinet/if_atm.h @@ -0,0 +1,46 @@ +/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * if_atm.h + */ + +struct atm_pseudohdr; +struct mbuf; +struct rtentry; +struct sockaddr; + +void atm_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +int atmresolve __P((struct rtentry *, struct mbuf *, struct sockaddr *, + struct atm_pseudohdr *)); diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c new file mode 100644 index 0000000..8c92de3 --- /dev/null +++ b/sys/netinet/if_ether.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Ethernet address resolution protocol. + * TODO: + * add "inuse/lock" bit (or ref. count) along with valid bit + */ + +#include "opt_inet.h" +#include "opt_bdg.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/route.h> +#include <net/netisr.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/if_ether.h> + +#include <net/iso88025.h> + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); + +/* timer values */ +static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +static int arpt_down = 20; /* once declared down, don't send for 20 sec */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, + &arpt_prune, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &arpt_keep, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, + &arpt_down, 0, ""); + +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + LIST_ENTRY(llinfo_arp) la_le; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* last time we QUERIED for this addr */ +#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ +}; + +static LIST_HEAD(, llinfo_arp) llinfo_arp; + +struct ifqueue arpintrq = {0, 0, 0, 50}; +static int arp_inuse, arp_allocated; + +static int arp_maxtries = 5; +static int useloopback = 1; /* use loopback interface for local traffic */ +static int arp_proxyall = 0; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &arp_maxtries, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, + &useloopback, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, + &arp_proxyall, 0, ""); + +static void arp_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +static void arprequest __P((struct arpcom *, + struct in_addr *, struct in_addr *, u_char *)); +static void arpintr __P((void)); +static void arptfree __P((struct llinfo_arp *)); +static void arptimer __P((void *)); +static struct llinfo_arp + *arplookup __P((u_long, int, int)); +#ifdef INET +static void in_arpinput __P((struct mbuf *)); +#endif + +/* + * Timeout routine. Age arp_tab entries periodically. + */ +/* ARGSUSED */ +static void +arptimer(ignored_arg) + void *ignored_arg; +{ + int s = splnet(); + register struct llinfo_arp *la = llinfo_arp.lh_first; + struct llinfo_arp *ola; + + timeout(arptimer, (caddr_t)0, arpt_prune * hz); + while ((ola = la) != 0) { + register struct rtentry *rt = la->la_rt; + la = la->la_le.le_next; + if (rt->rt_expire && rt->rt_expire <= time_second) + arptfree(ola); /* timer has expired, clear */ + } + splx(s); +} + +/* + * Parallel to llc_rtrequest. + */ +static void +arp_rtrequest(req, rt, sa) + int req; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct sockaddr *gate = rt->rt_gateway; + register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + static int arpinit_done; + + if (!arpinit_done) { + arpinit_done = 1; + LIST_INIT(&llinfo_arp); + timeout(arptimer, (caddr_t)0, hz); + } + if (rt->rt_flags & RTF_GATEWAY) + return; + switch (req) { + + case RTM_ADD: + /* + * XXX: If this is a manually added route to interface + * such as older version of routed or gated might provide, + * restore cloning bit. + */ + if ((rt->rt_flags & RTF_HOST) == 0 && + SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) + rt->rt_flags |= RTF_CLONING; + if (rt->rt_flags & RTF_CLONING) { + /* + * Case 1: This route should come from a route to iface. + */ + rt_setgate(rt, rt_key(rt), + (struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + rt->rt_expire = time_second; + break; + } + /* Announce a new entry if requested. */ + if (rt->rt_flags & RTF_ANNOUNCE) + arprequest((struct arpcom *)rt->rt_ifp, + &SIN(rt_key(rt))->sin_addr, + &SIN(rt_key(rt))->sin_addr, + (u_char *)LLADDR(SDL(gate))); + /*FALLTHROUGH*/ + case RTM_RESOLVE: + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n"); + break; + } + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + if (la != 0) + break; /* This happens on a route change */ + /* + * Case 2: This route may come from cloning, or a manual route + * add with a LL address. + */ + R_Malloc(la, struct llinfo_arp *, sizeof(*la)); + rt->rt_llinfo = (caddr_t)la; + if (la == 0) { + log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); + break; + } + arp_inuse++, arp_allocated++; + Bzero(la, sizeof(*la)); + la->la_rt = rt; + rt->rt_flags |= RTF_LLINFO; + LIST_INSERT_HEAD(&llinfo_arp, la, la_le); + +#ifdef INET + /* + * This keeps the multicast addresses from showing up + * in `arp -a' listings as unresolved. It's not actually + * functional. Then the same for broadcast. + */ + if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { + ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, + LLADDR(SDL(gate))); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } + if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { + memcpy(LLADDR(SDL(gate)), etherbroadcastaddr, 6); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } +#endif + + if (SIN(rt_key(rt))->sin_addr.s_addr == + (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced + * through the hardware by configuring the loopback down. + * However, it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and remove + * the route to force traffic out to the hardware. + */ + rt->rt_expire = 0; + Bcopy(((struct arpcom *)rt->rt_ifp)->ac_enaddr, + LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + if (useloopback) + rt->rt_ifp = loif; + + } + break; + + case RTM_DELETE: + if (la == 0) + break; + arp_inuse--; + LIST_REMOVE(la, la_le); + rt->rt_llinfo = 0; + rt->rt_flags &= ~RTF_LLINFO; + if (la->la_hold) + m_freem(la->la_hold); + Free((caddr_t)la); + } +} + +/* + * Broadcast an ARP request. Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +static void +arprequest(ac, sip, tip, enaddr) + register struct arpcom *ac; + register struct in_addr *sip, *tip; + register u_char *enaddr; +{ + register struct mbuf *m; + register struct ether_header *eh; + register struct ether_arp *ea; + struct sockaddr sa; + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_pkthdr.rcvif = (struct ifnet *)0; + switch (ac->ac_if.if_type) { + case IFT_ISO88025: + m->m_len = sizeof(*ea) + 10; + m->m_pkthdr.len = sizeof(*ea) + 10; + MH_ALIGN(m, sizeof(*ea) + 10); + (void)memcpy(mtod(m, caddr_t), + "\x82\x40\xaa\xaa\x03\x00\x00\x00\x08\x06", 10); + (void)memcpy(sa.sa_data, etherbroadcastaddr, 6); + (void)memcpy(sa.sa_data + 6, enaddr, 6); + sa.sa_data[6] |= 0x80; + sa.sa_data[12] = 0x10; + sa.sa_data[13] = 0x40; + ea = (struct ether_arp *)(mtod(m, char *) + 10); + bzero((caddr_t)ea, sizeof (*ea)); + ea->arp_hrd = htons(ARPHRD_IEEE802); + break; + case IFT_FDDI: + case IFT_ETHER: + /* + * This may not be correct for types not explicitly + * listed, but this is our best guess + */ + default: + m->m_len = sizeof(*ea); + m->m_pkthdr.len = sizeof(*ea); + MH_ALIGN(m, sizeof(*ea)); + ea = mtod(m, struct ether_arp *); + eh = (struct ether_header *)sa.sa_data; + bzero((caddr_t)ea, sizeof (*ea)); + /* if_output will not swap */ + eh->ether_type = htons(ETHERTYPE_ARP); + (void)memcpy(eh->ether_dhost, etherbroadcastaddr, + sizeof(eh->ether_dhost)); + ea->arp_hrd = htons(ARPHRD_ETHER); + break; + } + ea->arp_pro = htons(ETHERTYPE_IP); + ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */ + ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */ + ea->arp_op = htons(ARPOP_REQUEST); + (void)memcpy(ea->arp_sha, enaddr, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa)); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); +} + +/* + * Resolve an IP address into an ethernet address. If success, + * desten is filled in. If there is no entry in arptab, + * set one up and broadcast a request for the IP address. + * Hold onto this mbuf and resend it once the address + * is finally resolved. A return value of 1 indicates + * that desten has been filled in and the packet should be sent + * normally; a 0 return indicates that the packet has been + * taken over here, either now or for later transmission. + */ +int +arpresolve(ac, rt, m, dst, desten, rt0) + register struct arpcom *ac; + register struct rtentry *rt; + struct mbuf *m; + register struct sockaddr *dst; + register u_char *desten; + struct rtentry *rt0; +{ + register struct llinfo_arp *la = 0; + struct sockaddr_dl *sdl; + + if (m->m_flags & M_BCAST) { /* broadcast */ + (void)memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr)); + return (1); + } + if (m->m_flags & M_MCAST) { /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return(1); + } + if (rt) + la = (struct llinfo_arp *)rt->rt_llinfo; + if (la == 0) { + la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0); + if (la) + rt = la->la_rt; + } + if (la == 0 || rt == 0) { + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s%s%s\n", + inet_ntoa(SIN(dst)->sin_addr), la ? "la" : "", + rt ? "rt" : ""); + m_freem(m); + return (0); + } + sdl = SDL(rt->rt_gateway); + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if ((rt->rt_expire == 0 || rt->rt_expire > time_second) && + sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return 1; + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. + */ + if (la->la_hold) + m_freem(la->la_hold); + la->la_hold = m; + if (rt->rt_expire) { + rt->rt_flags &= ~RTF_REJECT; + if (la->la_asked == 0 || rt->rt_expire != time_second) { + rt->rt_expire = time_second; + if (la->la_asked++ < arp_maxtries) + arprequest(ac, + &SIN(rt->rt_ifa->ifa_addr)->sin_addr, + &SIN(dst)->sin_addr, ac->ac_enaddr); + else { + rt->rt_flags |= RTF_REJECT; + rt->rt_expire += arpt_down; + la->la_asked = 0; + } + + } + } + return (0); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. + */ +static void +arpintr() +{ + register struct mbuf *m; + register struct arphdr *ar; + int s; + + while (arpintrq.ifq_head) { + s = splimp(); + IF_DEQUEUE(&arpintrq, m); + splx(s); + if (m == 0 || (m->m_flags & M_PKTHDR) == 0) + panic("arpintr"); + if (m->m_len >= sizeof(struct arphdr) && + (ar = mtod(m, struct arphdr *)) && + (ntohs(ar->ar_hrd) == ARPHRD_ETHER || + ntohs(ar->ar_hrd) == ARPHRD_IEEE802) && + m->m_len >= + sizeof(struct arphdr) + 2 * ar->ar_hln + 2 * ar->ar_pln) + + switch (ntohs(ar->ar_pro)) { + +#ifdef INET + case ETHERTYPE_IP: + in_arpinput(m); + continue; +#endif + } + m_freem(m); + } +} + +NETISR_SET(NETISR_ARP, arpintr); + + +#ifdef INET +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. + */ +static void +in_arpinput(m) + struct mbuf *m; +{ + register struct ether_arp *ea; + register struct arpcom *ac = (struct arpcom *)m->m_pkthdr.rcvif; + struct ether_header *eh; + struct iso88025_header *th = (struct iso88025_header *)0; + register struct llinfo_arp *la = 0; + register struct rtentry *rt; + struct in_ifaddr *ia, *maybe_ia = 0; + struct sockaddr_dl *sdl; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + int op; + + ea = mtod(m, struct ether_arp *); + op = ntohs(ea->arp_op); + (void)memcpy(&isaddr, ea->arp_spa, sizeof (isaddr)); + (void)memcpy(&itaddr, ea->arp_tpa, sizeof (itaddr)); + for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) +#ifdef BRIDGE + /* + * For a bridge, we want to check the address irrespective + * of the receive interface. (This will change slightly + * when we have clusters of interfaces). + */ + { +#else + if (ia->ia_ifp == &ac->ac_if) { +#endif + maybe_ia = ia; + if ((itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) || + (isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)) + break; + } + if (maybe_ia == 0) { + m_freem(m); + return; + } + myaddr = ia ? ia->ia_addr.sin_addr : maybe_ia->ia_addr.sin_addr; + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)ac->ac_enaddr, + sizeof (ea->arp_sha))) { + m_freem(m); /* it's from me, ignore it. */ + return; + } + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)etherbroadcastaddr, + sizeof (ea->arp_sha))) { + log(LOG_ERR, + "arp: ether address is broadcast for IP address %s!\n", + inet_ntoa(isaddr)); + m_freem(m); + return; + } + if (isaddr.s_addr == myaddr.s_addr) { + log(LOG_ERR, + "arp: %6D is using my IP address %s!\n", + ea->arp_sha, ":", inet_ntoa(isaddr)); + itaddr = myaddr; + goto reply; + } + la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); + if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { +#ifndef BRIDGE /* the following is not an error when doing bridging */ + if (rt->rt_ifp != &ac->ac_if) { + log(LOG_ERR, "arp: %s is on %s%d but got reply from %6D on %s%d\n", + inet_ntoa(isaddr), + rt->rt_ifp->if_name, rt->rt_ifp->if_unit, + ea->arp_sha, ":", + ac->ac_if.if_name, ac->ac_if.if_unit); + goto reply; + } +#endif + if (sdl->sdl_alen && + bcmp((caddr_t)ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) { + if (rt->rt_expire) + log(LOG_INFO, "arp: %s moved from %6D to %6D on %s%d\n", + inet_ntoa(isaddr), (u_char *)LLADDR(sdl), ":", + ea->arp_sha, ":", + ac->ac_if.if_name, ac->ac_if.if_unit); + else { + log(LOG_ERR, + "arp: %6D attempts to modify permanent entry for %s on %s%d\n", + ea->arp_sha, ":", inet_ntoa(isaddr), + ac->ac_if.if_name, ac->ac_if.if_unit); + goto reply; + } + } + (void)memcpy(LLADDR(sdl), ea->arp_sha, sizeof(ea->arp_sha)); + sdl->sdl_alen = sizeof(ea->arp_sha); + sdl->sdl_rcf = NULL; + /* + * If we receive an arp from a token-ring station over + * a token-ring nic then try to save the source + * routing info. + */ + if (ac->ac_if.if_type == IFT_ISO88025) { + th = (struct iso88025_header *)m->m_pkthdr.header; + if ((th->iso88025_shost[0] & 0x80) && + ((th->rcf & 0x001f) > 2)) { + sdl->sdl_rcf = (th->rcf & 0x8000) ? + (th->rcf & 0x7fff) : + (th->rcf | 0x8000); + memcpy(sdl->sdl_route, th->rseg, + (th->rcf & 0x001f) - 2); + sdl->sdl_rcf = sdl->sdl_rcf & 0xff1f; + /* + * Set up source routing information for + * reply packet (XXX) + */ + m->m_data -= (th->rcf & 0x001f); + m->m_len += (th->rcf & 0x001f); + m->m_pkthdr.len += (th->rcf & 0x001f); + } else { + th->iso88025_shost[0] &= 0x7f; + } + m->m_data -= 8; + m->m_len += 8; + m->m_pkthdr.len += 8; + th->rcf = sdl->sdl_rcf; + } else { + sdl->sdl_rcf = NULL; + } + if (rt->rt_expire) + rt->rt_expire = time_second + arpt_keep; + rt->rt_flags &= ~RTF_REJECT; + la->la_asked = 0; + if (la->la_hold) { + (*ac->ac_if.if_output)(&ac->ac_if, la->la_hold, + rt_key(rt), rt); + la->la_hold = 0; + } + } +reply: + if (op != ARPOP_REQUEST) { + m_freem(m); + return; + } + if (itaddr.s_addr == myaddr.s_addr) { + /* I am the target */ + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + } else { + la = arplookup(itaddr.s_addr, 0, SIN_PROXY); + if (la == NULL) { + struct sockaddr_in sin; + + if (!arp_proxyall) { + m_freem(m); + return; + } + + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = itaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + /* + * Don't send proxies for nodes on the same interface + * as this one came out of, or we'll get into a fight + * over who claims what Ether address. + */ + if (rt->rt_ifp == &ac->ac_if) { + rtfree(rt); + m_freem(m); + return; + } + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + rtfree(rt); +#ifdef DEBUG_PROXY + printf("arp: proxying for %s\n", + inet_ntoa(itaddr)); +#endif + } else { + rt = la->la_rt; + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + sdl = SDL(rt->rt_gateway); + (void)memcpy(ea->arp_sha, LLADDR(sdl), sizeof(ea->arp_sha)); + } + } + + (void)memcpy(ea->arp_tpa, ea->arp_spa, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_spa, &itaddr, sizeof(ea->arp_spa)); + ea->arp_op = htons(ARPOP_REPLY); + ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */ + switch (ac->ac_if.if_type) { + case IFT_ISO88025: + /* Re-arrange the source/dest address */ + memcpy(th->iso88025_dhost, th->iso88025_shost, + sizeof(th->iso88025_dhost)); + memcpy(th->iso88025_shost, ac->ac_enaddr, + sizeof(th->iso88025_shost)); + /* Set the source routing bit if neccesary */ + if (th->iso88025_dhost[0] & 0x80) { + th->iso88025_dhost[0] &= 0x7f; + if ((th->rcf & 0x001f) - 2) + th->iso88025_shost[0] |= 0x80; + } + /* Copy the addresses, ac and fc into sa_data */ + memcpy(sa.sa_data, th->iso88025_dhost, + sizeof(th->iso88025_dhost) * 2); + sa.sa_data[(sizeof(th->iso88025_dhost) * 2)] = 0x10; + sa.sa_data[(sizeof(th->iso88025_dhost) * 2) + 1] = 0x40; + break; + case IFT_ETHER: + case IFT_FDDI: + /* + * May not be correct for types not explictly + * listed, but it is our best guess. + */ + default: + eh = (struct ether_header *)sa.sa_data; + (void)memcpy(eh->ether_dhost, ea->arp_tha, + sizeof(eh->ether_dhost)); + eh->ether_type = htons(ETHERTYPE_ARP); + break; + } + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); + return; +} +#endif + +/* + * Free an arp entry. + */ +static void +arptfree(la) + register struct llinfo_arp *la; +{ + register struct rtentry *rt = la->la_rt; + register struct sockaddr_dl *sdl; + if (rt == 0) + panic("arptfree"); + if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && + sdl->sdl_family == AF_LINK) { + sdl->sdl_alen = 0; + la->la_asked = 0; + rt->rt_flags &= ~RTF_REJECT; + return; + } + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), + 0, (struct rtentry **)0); +} +/* + * Lookup or enter a new address in arptab. + */ +static struct llinfo_arp * +arplookup(addr, create, proxy) + u_long addr; + int create, proxy; +{ + register struct rtentry *rt; + static struct sockaddr_inarp sin = {sizeof(sin), AF_INET }; + const char *why = 0; + + sin.sin_addr.s_addr = addr; + sin.sin_other = proxy ? SIN_PROXY : 0; + rt = rtalloc1((struct sockaddr *)&sin, create, 0UL); + if (rt == 0) + return (0); + rt->rt_refcnt--; + + if (rt->rt_flags & RTF_GATEWAY) + why = "host is not on local network"; + else if ((rt->rt_flags & RTF_LLINFO) == 0) + why = "could not allocate llinfo"; + else if (rt->rt_gateway->sa_family != AF_LINK) + why = "gateway route is not ours"; + + if (why && create) { + log(LOG_DEBUG, "arplookup %s failed: %s\n", + inet_ntoa(sin.sin_addr), why); + return 0; + } else if (why) { + return 0; + } + return ((struct llinfo_arp *)rt->rt_llinfo); +} + +void +arp_ifinit(ac, ifa) + struct arpcom *ac; + struct ifaddr *ifa; +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ac, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, ac->ac_enaddr); + ifa->ifa_rtrequest = arp_rtrequest; + ifa->ifa_flags |= RTF_CLONING; +} diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h new file mode 100644 index 0000000..09caa31 --- /dev/null +++ b/sys/netinet/if_ether.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.h 8.3 (Berkeley) 5/2/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_ETHER_H_ +#define _NETINET_IF_ETHER_H_ + +#include <net/ethernet.h> +#include <net/if_arp.h> + +/* + * Macro to map an IP multicast address to an Ethernet multicast address. + * The high-order 25 bits of the Ethernet address are statically assigned, + * and the low-order 23 bits are taken from the low end of the IP address. + */ +#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ + /* struct in_addr *ipaddr; */ \ + /* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x01; \ + (enaddr)[1] = 0x00; \ + (enaddr)[2] = 0x5e; \ + (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((u_char *)ipaddr)[3]; \ +} +/* + * Macro to map an IP6 multicast address to an Ethernet multicast address. + * The high-order 16 bits of the Ethernet address are statically assigned, + * and the low-order 32 bits are taken from the low end of the IP6 address. + */ +#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \ +/* struct in6_addr *ip6addr; */ \ +/* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x33; \ + (enaddr)[1] = 0x33; \ + (enaddr)[2] = ((u_char *)ip6addr)[12]; \ + (enaddr)[3] = ((u_char *)ip6addr)[13]; \ + (enaddr)[4] = ((u_char *)ip6addr)[14]; \ + (enaddr)[5] = ((u_char *)ip6addr)[15]; \ +} + +/* + * Ethernet Address Resolution Protocol. + * + * See RFC 826 for protocol description. Structure below is adapted + * to resolving internet addresses. Field names used correspond to + * RFC 826. + */ +struct ether_arp { + struct arphdr ea_hdr; /* fixed-size header */ + u_char arp_sha[ETHER_ADDR_LEN]; /* sender hardware address */ + u_char arp_spa[4]; /* sender protocol address */ + u_char arp_tha[ETHER_ADDR_LEN]; /* target hardware address */ + u_char arp_tpa[4]; /* target protocol address */ +}; +#define arp_hrd ea_hdr.ar_hrd +#define arp_pro ea_hdr.ar_pro +#define arp_hln ea_hdr.ar_hln +#define arp_pln ea_hdr.ar_pln +#define arp_op ea_hdr.ar_op + +struct sockaddr_inarp { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + struct in_addr sin_srcaddr; + u_short sin_tos; + u_short sin_other; +#define SIN_PROXY 1 +}; +/* + * IP and ethernet specific routing flags + */ +#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ +#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ + +#ifdef KERNEL +extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; +extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; +extern struct ifqueue arpintrq; + +int arpresolve __P((struct arpcom *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *, struct rtentry *)); +void arp_ifinit __P((struct arpcom *, struct ifaddr *)); +#endif + +#endif diff --git a/sys/netinet/if_fddi.h b/sys/netinet/if_fddi.h new file mode 100644 index 0000000..fc6acc6 --- /dev/null +++ b/sys/netinet/if_fddi.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Matt Thomas (thomas@lkg.dec.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_fddi.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_FDDI_H_ +#define _NETINET_IF_FDDI_H_ + +/* + * Structure of an 100Mb/s FDDI header. + */ +struct fddi_header { + u_char fddi_fc; + u_char fddi_dhost[6]; + u_char fddi_shost[6]; +}; + +#define FDDIIPMTU 4352 +#define FDDIMTU 4470 +#define FDDIMIN 3 + +#define FDDIFC_C 0x80 /* 0b10000000 */ +#define FDDIFC_L 0x40 /* 0b01000000 */ +#define FDDIFC_F 0x30 /* 0b00110000 */ +#define FDDIFC_Z 0x0F /* 0b00001111 */ + +#define FDDIFC_LLC_ASYNC 0x50 +#define FDDIFC_LLC_PRIO0 0 +#define FDDIFC_LLC_PRIO1 1 +#define FDDIFC_LLC_PRIO2 2 +#define FDDIFC_LLC_PRIO3 3 +#define FDDIFC_LLC_PRIO4 4 +#define FDDIFC_LLC_PRIO5 5 +#define FDDIFC_LLC_PRIO6 6 +#define FDDIFC_LLC_PRIO7 7 +#define FDDIFC_LLC_SYNC 0xd0 +#define FDDIFC_SMT 0x40 + +#if defined(KERNEL) || defined(_KERNEL) +#define fddibroadcastaddr etherbroadcastaddr +#define fddi_ipmulticast_min ether_ipmulticast_min +#define fddi_ipmulticast_max ether_ipmulticast_max +#define fddi_addmulti ether_addmulti +#define fddi_delmulti ether_delmulti +#define fddi_sprintf ether_sprintf + +void fddi_ifattach __P((struct ifnet *)); +void fddi_input __P((struct ifnet *, struct fddi_header *, struct mbuf *)); +int fddi_output __P((struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *)); + +#endif + +#endif diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c new file mode 100644 index 0000000..83cc650 --- /dev/null +++ b/sys/netinet/igmp.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +/* + * Internet Group Management Protocol (IGMP) routines. + * + * Written by Steve Deering, Stanford, May 1988. + * Modified by Rosen Sharma, Stanford, Aug 1994. + * Modified by Bill Fenner, Xerox PARC, Feb 1995. + * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * + * MULTICAST Revision: 3.5.1.4 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/igmp.h> +#include <netinet/igmp_var.h> + +static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); + +static struct router_info * + find_rti __P((struct ifnet *ifp)); + +static struct igmpstat igmpstat; + +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD, + &igmpstat, igmpstat, ""); + +static int igmp_timers_are_running; +static u_long igmp_all_hosts_group; +static u_long igmp_all_rtrs_group; +static struct mbuf *router_alert; +static struct router_info *Head; + +static void igmp_sendpkt __P((struct in_multi *, int, unsigned long)); + +void +igmp_init() +{ + struct ipoption *ra; + + /* + * To avoid byte-swapping the same value over and over again. + */ + igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); + igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); + + igmp_timers_are_running = 0; + + /* + * Construct a Router Alert option to use in outgoing packets + */ + MGET(router_alert, M_DONTWAIT, MT_DATA); + ra = mtod(router_alert, struct ipoption *); + ra->ipopt_dst.s_addr = 0; + ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + ra->ipopt_list[1] = 0x04; /* 4 bytes long */ + ra->ipopt_list[2] = 0x00; + ra->ipopt_list[3] = 0x00; + router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; + + Head = (struct router_info *) 0; +} + +static struct router_info * +find_rti(ifp) + struct ifnet *ifp; +{ + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> entering \n"); +#endif + while (rti) { + if (rti->rti_ifp == ifp) { +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> found old entry \n"); +#endif + return rti; + } + rti = rti->rti_next; + } + MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); + rti->rti_ifp = ifp; + rti->rti_type = IGMP_V2_ROUTER; + rti->rti_time = 0; + rti->rti_next = Head; + Head = rti; +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> created an entry \n"); +#endif + return rti; +} + +void +igmp_input(m, iphlen) + register struct mbuf *m; + register int iphlen; +{ + register struct igmp *igmp; + register struct ip *ip; + register int igmplen; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + register int minlen; + register struct in_multi *inm; + register struct in_ifaddr *ia; + struct in_multistep step; + struct router_info *rti; + + int timer; /** timer value in the igmp query header **/ + + ++igmpstat.igps_rcv_total; + + ip = mtod(m, struct ip *); + igmplen = ip->ip_len; + + /* + * Validate lengths + */ + if (igmplen < IGMP_MINLEN) { + ++igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } + minlen = iphlen + IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + ++igmpstat.igps_rcv_tooshort; + return; + } + + /* + * Validate checksum + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + ++igmpstat.igps_rcv_badsum; + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + + ip = mtod(m, struct ip *); + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + rti = find_rti(ifp); + + /* + * In the IGMPv2 specification, there are 3 states and a flag. + * + * In Non-Member state, we simply don't have a membership record. + * In Delaying Member state, our timer is running (inm->inm_timer) + * In Idle Member state, our timer is not running (inm->inm_timer==0) + * + * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if + * we have heard a report from another member, or IGMP_IREPORTEDLAST + * if I sent the last report. + */ + switch (igmp->igmp_type) { + + case IGMP_MEMBERSHIP_QUERY: + ++igmpstat.igps_rcv_queries; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (igmp->igmp_code == 0) { + /* + * Old router. Remember that the querier on this + * interface is old, and set the timer to the + * value in RFC 1112. + */ + + rti->rti_type = IGMP_V1_ROUTER; + rti->rti_time = 0; + + timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + + if (ip->ip_dst.s_addr != igmp_all_hosts_group || + igmp->igmp_group.s_addr != 0) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } else { + /* + * New router. Simply do the new validity check. + */ + + if (igmp->igmp_group.s_addr != 0 && + !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to the "all-hosts" group (224.0.0.1). + * - Restart any timer that is already running but has + * a value longer than the requested timeout. + * - Use the value specified in the query message as + * the maximum timeout. + */ + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_ifp == ifp && + inm->inm_addr.s_addr != igmp_all_hosts_group && + (igmp->igmp_group.s_addr == 0 || + igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { + if (inm->inm_timer == 0 || + inm->inm_timer > timer) { + inm->inm_timer = + IGMP_RANDOM_DELAY(timer); + igmp_timers_are_running = 1; + } + } + IN_NEXT_MULTI(step, inm); + } + + break; + + case IGMP_V1_MEMBERSHIP_REPORT: + case IGMP_V2_MEMBERSHIP_REPORT: + /* + * For fast leave to work, we have to know that we are the + * last person to send a report for this group. Reports + * can potentially get looped back if we are a multicast + * router, so discard reports sourced by me. + */ + IFP_TO_IA(ifp, ia); + if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + + ++igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badreports; + m_freem(m); + return; + } + + /* + * KLUDGE: if the IP source address of the report has an + * unspecified (i.e., zero) subnet number, as is allowed for + * a booting host, replace it with the correct subnet number + * so that a process-level multicast routing demon can + * determine which subnet it arrived from. This is necessary + * to compensate for the lack of any way for a process to + * determine the arrival interface of an incoming packet. + */ + if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) + if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); + + /* + * If we belong to the group being reported, stop + * our timer for that group. + */ + IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + + if (inm != NULL) { + inm->inm_timer = 0; + ++igmpstat.igps_rcv_ourreports; + + inm->inm_state = IGMP_OTHERMEMBER; + } + + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening + * on a raw IGMP socket. + */ + rip_input(m, iphlen); +} + +void +igmp_joingroup(inm) + struct in_multi *inm; +{ + int s = splnet(); + + if (inm->inm_addr.s_addr == igmp_all_hosts_group + || inm->inm_ifp->if_flags & IFF_LOOPBACK) { + inm->inm_timer = 0; + inm->inm_state = IGMP_OTHERMEMBER; + } else { + inm->inm_rti = find_rti(inm->inm_ifp); + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); + inm->inm_state = IGMP_IREPORTEDLAST; + igmp_timers_are_running = 1; + } + splx(s); +} + +void +igmp_leavegroup(inm) + struct in_multi *inm; +{ + if (inm->inm_state == IGMP_IREPORTEDLAST && + inm->inm_addr.s_addr != igmp_all_hosts_group && + !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && + inm->inm_rti->rti_type != IGMP_V1_ROUTER) + igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); +} + +void +igmp_fasttimo() +{ + register struct in_multi *inm; + struct in_multistep step; + int s; + + /* + * Quick check to see if any work needs to be done, in order + * to minimize the overhead of fasttimo processing. + */ + + if (!igmp_timers_are_running) + return; + + s = splnet(); + igmp_timers_are_running = 0; + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_timer == 0) { + /* do nothing */ + } else if (--inm->inm_timer == 0) { + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_state = IGMP_IREPORTEDLAST; + } else { + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + splx(s); +} + +void +igmp_slowtimo() +{ + int s = splnet(); + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > entering \n"); +#endif + while (rti) { + if (rti->rti_type == IGMP_V1_ROUTER) { + rti->rti_time++; + if (rti->rti_time >= IGMP_AGE_THRESHOLD) { + rti->rti_type = IGMP_V2_ROUTER; + } + } + rti = rti->rti_next; + } +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > exiting \n"); +#endif + splx(s); +} + +static struct route igmprt; + +static void +igmp_sendpkt(inm, type, addr) + struct in_multi *inm; + int type; + unsigned long addr; +{ + struct mbuf *m; + struct igmp *igmp; + struct ip *ip; + struct ip_moptions imo; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = loif; + m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; + MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); + m->m_data += sizeof(struct ip); + m->m_len = IGMP_MINLEN; + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; + + imo.imo_multicast_ifp = inm->inm_ifp; + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; + /* + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. + */ + imo.imo_multicast_loop = (ip_mrouter != NULL); + + /* + * XXX + * Do we have to worry about reentrancy here? Don't think so. + */ + ip_output(m, router_alert, &igmprt, 0, &imo); + + ++igmpstat.igps_snd_reports; +} diff --git a/sys/netinet/igmp.h b/sys/netinet/igmp.h new file mode 100644 index 0000000..7d943d6 --- /dev/null +++ b/sys/netinet/igmp.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_H_ +#define _NETINET_IGMP_H_ + +/* + * Internet Group Management Protocol (IGMP) definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.2 + */ + +/* + * IGMP packet format. + */ +struct igmp { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ +}; /* (zero for queries) */ + +#define IGMP_MINLEN 8 + +/* + * Message types, including version number. + */ +#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ +#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ +#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ + +#define IGMP_DVMRP 0x13 /* DVMRP routing message */ +#define IGMP_PIM 0x14 /* PIM routing message */ + +#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ +#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ + +#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ + /* query (in seconds) according */ + /* to RFC1112 */ + + +#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ + /* specifies time in 10th of seconds*/ + +/* + * The following four defininitions are for backwards compatibility. + * They should be removed as soon as all applications are updated to + * use the new constant names. + */ +#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY +#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT +#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP + +#endif /* _NETINET_IGMP_H_ */ diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h new file mode 100644 index 0000000..0dd330fb --- /dev/null +++ b/sys/netinet/igmp_var.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_VAR_H_ +#define _NETINET_IGMP_VAR_H_ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.3 + */ + +struct igmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ +}; + +#ifdef KERNEL +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) + +/* + * States for IGMPv2's leave processing + */ +#define IGMP_OTHERMEMBER 0 +#define IGMP_IREPORTEDLAST 1 + +/* + * We must remember what version the subnet's querier is. + * We conveniently use the IGMP message type for the proper + * membership report to keep this state. + */ +#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT + +/* + * Revert to new router if we haven't heard from an old router in + * this amount of time. + */ +#define IGMP_AGE_THRESHOLD 540 + +void igmp_init __P((void)); +void igmp_input __P((struct mbuf *, int)); +void igmp_joingroup __P((struct in_multi *)); +void igmp_leavegroup __P((struct in_multi *)); +void igmp_fasttimo __P((void)); +void igmp_slowtimo __P((void)); + +SYSCTL_DECL(_net_inet_igmp); + +#endif + +/* + * Names for IGMP sysctl objects + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMPCTL_MAXID 2 + +#define IGMPCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT }, \ +} +#endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c new file mode 100644 index 0000000..4562672 --- /dev/null +++ b/sys/netinet/in.c @@ -0,0 +1,600 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.c 8.4 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> + +#include <netinet/igmp_var.h> + +static MALLOC_DEFINE(M_IPMADDR, "in_multi", "internet multicast address"); + +static void in_socktrim __P((struct sockaddr_in *)); +static int in_ifinit __P((struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int)); + +static int subnetsarelocal = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, + &subnetsarelocal, 0, ""); + +struct in_multihead in_multihead; /* XXX BSS initialization */ + +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +int +in_localaddr(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + if (subnetsarelocal) { + for (ia = in_ifaddrhead.tqh_first; ia; + ia = ia->ia_link.tqe_next) + if ((i & ia->ia_netmask) == ia->ia_net) + return (1); + } else { + for (ia = in_ifaddrhead.tqh_first; ia; + ia = ia->ia_link.tqe_next) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) + return (1); + } + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. + */ +int +in_canforward(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +static void +in_socktrim(ap) +struct sockaddr_in *ap; +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp >= cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +static int in_interfaces; /* number of external internet interfaces */ + +/* + * Generic internet control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +/* ARGSUSED */ +int +in_control(so, cmd, data, ifp, p) + struct socket *so; + u_long cmd; + caddr_t data; + register struct ifnet *ifp; + struct proc *p; +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia = 0, *iap; + register struct ifaddr *ifa; + struct in_ifaddr *oia; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, maskIsNew, s; + u_long i; + + /* + * Find address for this interface, if it exists. + * + * If an alias address was specified, find that one instead of + * the first one on the interface. + */ + if (ifp) + for (iap = in_ifaddrhead.tqh_first; iap; + iap = iap->ia_link.tqe_next) + if (iap->ia_ifp == ifp) { + if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == + iap->ia_addr.sin_addr.s_addr) { + ia = iap; + break; + } else if (ia == NULL) { + ia = iap; + if (ifr->ifr_addr.sa_family != AF_INET) + break; + } + } + + switch (cmd) { + + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ifra->ifra_addr.sin_family == AF_INET) { + for (oia = ia; ia; ia = ia->ia_link.tqe_next) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if ((ifp->if_flags & IFF_POINTOPOINT) + && (cmd == SIOCAIFADDR) + && (ifra->ifra_dstaddr.sin_addr.s_addr + == INADDR_ANY)) { + return EDESTADDRREQ; + } + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if (p && (error = suser(p)) != 0) + return error; + + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ia == (struct in_ifaddr *)0) { + ia = (struct in_ifaddr *) + malloc(sizeof *ia, M_IFADDR, M_WAITOK); + if (ia == (struct in_ifaddr *)NULL) + return (ENOBUFS); + bzero((caddr_t)ia, sizeof *ia); + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + ifa = &ia->ia_ifa; + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + + ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; + ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; + ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_sockmask.sin_len = 8; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + if (!(ifp->if_flags & IFF_LOOPBACK)) + in_interfaces++; + splx(s); + } + break; + + case SIOCSIFBRDADDR: + if (p && (error = suser(p)) != 0) + return error; + /* FALLTHROUGH */ + + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == (struct in_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + break; + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + break; + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + break; + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + break; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl && (error = (*ifp->if_ioctl) + (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) { + ia->ia_dstaddr = oldaddr; + return (error); + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + break; + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + break; + + case SIOCSIFADDR: + return (in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1)); + + case SIOCSIFNETMASK: + i = ifra->ifra_addr.sin_addr.s_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); + break; + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + return (error); + + case SIOCDIFADDR: + in_ifscrub(ifp, ia); + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + ifa = &ia->ia_ifa; + TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + oia = ia; + TAILQ_REMOVE(&in_ifaddrhead, oia, ia_link); + IFAFREE(&oia->ia_ifa); + splx(s); + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + return (0); +} + +/* + * Delete any existing route for an interface. + */ +void +in_ifscrub(ifp, ia) + register struct ifnet *ifp; + register struct in_ifaddr *ia; +{ + + if ((ia->ia_flags & IFA_ROUTE) == 0) + return; + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; +} + +/* + * Initialize an interface's internet address + * and routing table entry. + */ +static int +in_ifinit(ifp, ia, sin, scrub) + register struct ifnet *ifp; + register struct in_ifaddr *ia; + struct sockaddr_in *sin; + int scrub; +{ + register u_long i = ntohl(sin->sin_addr.s_addr); + struct sockaddr_in oldaddr; + int s = splimp(), flags = RTF_UP, error; + + oldaddr = ia->ia_addr; + ia->ia_addr = *sin; + /* + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. + */ + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) { + splx(s); + ia->ia_addr = oldaddr; + return (error); + } + splx(s); + if (scrub) { + ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + in_ifscrub(ifp, ia); + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + } + if (IN_CLASSA(i)) + ia->ia_netmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_netmask = IN_CLASSB_NET; + else + ia->ia_netmask = IN_CLASSC_NET; + /* + * The subnet mask usually includes at least the standard network part, + * but may may be smaller in the case of supernetting. + * If it is set, we believe it. + */ + if (ia->ia_subnetmask == 0) { + ia->ia_subnetmask = ia->ia_netmask; + ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); + } else + ia->ia_netmask &= ia->ia_subnetmask; + ia->ia_net = i & ia->ia_netmask; + ia->ia_subnet = i & ia->ia_subnetmask; + in_socktrim(&ia->ia_sockmask); + /* + * Add route for the network. + */ + ia->ia_ifa.ifa_metric = ifp->if_metric; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); + ia->ia_netbroadcast.s_addr = + htonl(ia->ia_net | ~ ia->ia_netmask); + } else if (ifp->if_flags & IFF_LOOPBACK) { + ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr; + flags |= RTF_HOST; + } else if (ifp->if_flags & IFF_POINTOPOINT) { + if (ia->ia_dstaddr.sin_family != AF_INET) + return (0); + flags |= RTF_HOST; + } + if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) + ia->ia_flags |= IFA_ROUTE; + + /* + * If the interface supports multicast, join the "all hosts" + * multicast group on that interface. + */ + if (ifp->if_flags & IFF_MULTICAST) { + struct in_addr addr; + + addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + in_addmulti(&addr, ifp); + } + return (error); +} + + +/* + * Return 1 if the address might be a local broadcast address. + */ +int +in_broadcast(in, ifp) + struct in_addr in; + struct ifnet *ifp; +{ + register struct ifaddr *ifa; + u_long t; + + if (in.s_addr == INADDR_BROADCAST || + in.s_addr == INADDR_ANY) + return 1; + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return 0; + t = ntohl(in.s_addr); + /* + * Look through the list of addresses for a match + * with a broadcast address. + */ +#define ia ((struct in_ifaddr *)ifa) + for (ifa = ifp->if_addrhead.tqh_first; ifa; + ifa = ifa->ifa_link.tqe_next) + if (ifa->ifa_addr->sa_family == AF_INET && + (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || + in.s_addr == ia->ia_netbroadcast.s_addr || + /* + * Check for old-style (host 0) broadcast. + */ + t == ia->ia_subnet || t == ia->ia_net) && + /* + * Check for an all one subnetmask. These + * only exist when an interface gets a secondary + * address. + */ + ia->ia_subnetmask != (u_long)0xffffffff) + return 1; + return (0); +#undef ia +} +/* + * Add an address to the list of IP multicast addresses for a given interface. + */ +struct in_multi * +in_addmulti(ap, ifp) + register struct in_addr *ap; + register struct ifnet *ifp; +{ + register struct in_multi *inm; + int error; + struct sockaddr_in sin; + struct ifmultiaddr *ifma; + int s = splnet(); + + /* + * Call generic routine to add membership or increment + * refcount. It wants addresses in the form of a sockaddr, + * so we build one here (being careful to zero the unused bytes). + */ + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = *ap; + error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); + if (error) { + splx(s); + return 0; + } + + /* + * If ifma->ifma_protospec is null, then if_addmulti() created + * a new record. Otherwise, we are done. + */ + if (ifma->ifma_protospec != 0) + return ifma->ifma_protospec; + + /* XXX - if_addmulti uses M_WAITOK. Can this really be called + at interrupt time? If so, need to fix if_addmulti. XXX */ + inm = (struct in_multi *)malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT); + if (inm == NULL) { + splx(s); + return (NULL); + } + + bzero(inm, sizeof *inm); + inm->inm_addr = *ap; + inm->inm_ifp = ifp; + inm->inm_ifma = ifma; + ifma->ifma_protospec = inm; + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); + + /* + * Let IGMP know that we have joined a new IP multicast group. + */ + igmp_joingroup(inm); + splx(s); + return (inm); +} + +/* + * Delete a multicast address record. + */ +void +in_delmulti(inm) + register struct in_multi *inm; +{ + struct ifmultiaddr *ifma = inm->inm_ifma; + struct in_multi my_inm; + int s = splnet(); + + my_inm.inm_ifp = NULL ; /* don't send the leave msg */ + if (ifma->ifma_refcount == 1) { + /* + * No remaining claims to this record; let IGMP know that + * we are leaving the multicast group. + * But do it after the if_delmulti() which might reset + * the interface and nuke the packet. + */ + my_inm = *inm ; + ifma->ifma_protospec = 0; + LIST_REMOVE(inm, inm_link); + free(inm, M_IPMADDR); + } + /* XXX - should be separate API for when we have an ifma? */ + if_delmulti(ifma->ifma_ifp, ifma->ifma_addr); + if (my_inm.inm_ifp != NULL) + igmp_leavegroup(&my_inm); + splx(s); +} diff --git a/sys/netinet/in.h b/sys/netinet/in.h new file mode 100644 index 0000000..9e8a652 --- /dev/null +++ b/sys/netinet/in.h @@ -0,0 +1,462 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.h 8.3 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_H_ +#define _NETINET_IN_H_ + +/* + * Constants and structures defined by the internet system, + * Per RFC 790, September 1981, and numerous additions. + */ + +/* + * Protocols (RFC 1700) + */ +#define IPPROTO_IP 0 /* dummy for IP */ +#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#define IPPROTO_ICMP 1 /* control message protocol */ +#define IPPROTO_IGMP 2 /* group mgmt protocol */ +#define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ +#define IPPROTO_IPV4 4 /* IPv4 encapsulation */ +#define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ +#define IPPROTO_TCP 6 /* tcp */ +#define IPPROTO_ST 7 /* Stream protocol II */ +#define IPPROTO_EGP 8 /* exterior gateway protocol */ +#define IPPROTO_PIGP 9 /* private interior gateway */ +#define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ +#define IPPROTO_NVPII 11 /* network voice protocol*/ +#define IPPROTO_PUP 12 /* pup */ +#define IPPROTO_ARGUS 13 /* Argus */ +#define IPPROTO_EMCON 14 /* EMCON */ +#define IPPROTO_XNET 15 /* Cross Net Debugger */ +#define IPPROTO_CHAOS 16 /* Chaos*/ +#define IPPROTO_UDP 17 /* user datagram protocol */ +#define IPPROTO_MUX 18 /* Multiplexing */ +#define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ +#define IPPROTO_HMP 20 /* Host Monitoring */ +#define IPPROTO_PRM 21 /* Packet Radio Measurement */ +#define IPPROTO_IDP 22 /* xns idp */ +#define IPPROTO_TRUNK1 23 /* Trunk-1 */ +#define IPPROTO_TRUNK2 24 /* Trunk-2 */ +#define IPPROTO_LEAF1 25 /* Leaf-1 */ +#define IPPROTO_LEAF2 26 /* Leaf-2 */ +#define IPPROTO_RDP 27 /* Reliable Data */ +#define IPPROTO_IRTP 28 /* Reliable Transaction */ +#define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ +#define IPPROTO_BLT 30 /* Bulk Data Transfer */ +#define IPPROTO_NSP 31 /* Network Services */ +#define IPPROTO_INP 32 /* Merit Internodal */ +#define IPPROTO_SEP 33 /* Sequential Exchange */ +#define IPPROTO_3PC 34 /* Third Party Connect */ +#define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ +#define IPPROTO_XTP 36 /* XTP */ +#define IPPROTO_DDP 37 /* Datagram Delivery */ +#define IPPROTO_CMTP 38 /* Control Message Transport */ +#define IPPROTO_TPXX 39 /* TP++ Transport */ +#define IPPROTO_IL 40 /* IL transport protocol */ +#define IPPROTO_IPV6 41 /* IP6 header */ +#define IPPROTO_SDRP 42 /* Source Demand Routing */ +#define IPPROTO_ROUTING 43 /* IP6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ +#define IPPROTO_IDRP 45 /* InterDomain Routing*/ +#define IPPROTO_RSVP 46 /* resource reservation */ +#define IPPROTO_GRE 47 /* General Routing Encap. */ +#define IPPROTO_MHRP 48 /* Mobile Host Routing */ +#define IPPROTO_BHA 49 /* BHA */ +#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ +#define IPPROTO_AH 51 /* IP6 Auth Header */ +#define IPPROTO_INLSP 52 /* Integ. Net Layer Security */ +#define IPPROTO_SWIPE 53 /* IP with encryption */ +#define IPPROTO_NHRP 54 /* Next Hop Resolution */ +/* 55-57: Unassigned */ +#define IPPROTO_ICMPV6 58 /* ICMP6 */ +#define IPPROTO_NONE 59 /* IP6 no next header */ +#define IPPROTO_DSTOPTS 60 /* IP6 destination option */ +#define IPPROTO_AHIP 61 /* any host internal protocol */ +#define IPPROTO_CFTP 62 /* CFTP */ +#define IPPROTO_HELLO 63 /* "hello" routing protocol */ +#define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ +#define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ +#define IPPROTO_RVD 66 /* Remote Virtual Disk */ +#define IPPROTO_IPPC 67 /* Pluribus Packet Core */ +#define IPPROTO_ADFS 68 /* Any distributed FS */ +#define IPPROTO_SATMON 69 /* Satnet Monitoring */ +#define IPPROTO_VISA 70 /* VISA Protocol */ +#define IPPROTO_IPCV 71 /* Packet Core Utility */ +#define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ +#define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */ +#define IPPROTO_WSN 74 /* Wang Span Network */ +#define IPPROTO_PVP 75 /* Packet Video Protocol */ +#define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ +#define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ +#define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ +#define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ +#define IPPROTO_EON 80 /* ISO cnlp */ +#define IPPROTO_VMTP 81 /* VMTP */ +#define IPPROTO_SVMTP 82 /* Secure VMTP */ +#define IPPROTO_VINES 83 /* Banyon VINES */ +#define IPPROTO_TTP 84 /* TTP */ +#define IPPROTO_IGP 85 /* NSFNET-IGP */ +#define IPPROTO_DGP 86 /* dissimilar gateway prot. */ +#define IPPROTO_TCF 87 /* TCF */ +#define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_SRPC 90 /* Strite RPC protocol */ +#define IPPROTO_LARP 91 /* Locus Address Resoloution */ +#define IPPROTO_MTP 92 /* Multicast Transport */ +#define IPPROTO_AX25 93 /* AX.25 Frames */ +#define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ +#define IPPROTO_MICP 95 /* Mobile Int.ing control */ +#define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ +#define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ +#define IPPROTO_ENCAP 98 /* encapsulation header */ +#define IPPROTO_APES 99 /* any private encr. scheme */ +#define IPPROTO_GMTP 100 /* GMTP*/ +#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ +/* 101-254: Partly Unassigned */ +#define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_PGM 113 /* PGM */ +/* 255: Reserved */ +/* BSD Private, local use, namespace incursion */ +#define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ +#define IPPROTO_RAW 255 /* raw IP packet */ +#define IPPROTO_MAX 256 + +/* last return value of *_input(), meaning "all job for this pkt is done". */ +#define IPPROTO_DONE 257 + +/* + * Local port number conventions: + * + * When a user does a bind(2) or connect(2) with a port number of zero, + * a non-conflicting local port address is chosen. + * The default range is IPPORT_RESERVED through + * IPPORT_USERRESERVED, although that is settable by sysctl. + * + * A user may set the IPPROTO_IP option IP_PORTRANGE to change this + * default assignment range. + * + * The value IP_PORTRANGE_DEFAULT causes the default behavior. + * + * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers + * into the "high" range. These are reserved for client outbound connections + * which do not want to be filtered by any firewalls. + * + * The value IP_PORTRANGE_LOW changes the range to the "low" are + * that is (by convention) restricted to privileged processes. This + * convention is based on "vouchsafe" principles only. It is only secure + * if you trust the remote host to restrict these ports. + * + * The default range of ports and the high range can be changed by + * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto) + * + * Changing those values has bad security implications if you are + * using a a stateless firewall that is allowing packets outside of that + * range in order to allow transparent outgoing connections. + * + * Such a firewall configuration will generally depend on the use of these + * default values. If you change them, you may find your Security + * Administrator looking for you with a heavy object. + * + * For a slightly more orthodox text view on this: + * + * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers + * + * port numbers are divided into three ranges: + * + * 0 - 1023 Well Known Ports + * 1024 - 49151 Registered Ports + * 49152 - 65535 Dynamic and/or Private Ports + * + */ + +/* + * Ports < IPPORT_RESERVED are reserved for + * privileged processes (e.g. root). (IP_PORTRANGE_LOW) + * Ports > IPPORT_USERRESERVED are reserved + * for servers, not necessarily privileged. (IP_PORTRANGE_DEFAULT) + */ +#define IPPORT_RESERVED 1024 +#define IPPORT_USERRESERVED 5000 + +/* + * Default local port range to use by setting IP_PORTRANGE_HIGH + */ +#define IPPORT_HIFIRSTAUTO 49152 +#define IPPORT_HILASTAUTO 65535 + +/* + * Scanning for a free reserved port return a value below IPPORT_RESERVED, + * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was + * 512, but that conflicts with some well-known-services that firewalls may + * have a fit if we use. + */ +#define IPPORT_RESERVEDSTART 600 + +/* + * Internet address (a structure for historical reasons) + */ +struct in_addr { + u_int32_t s_addr; +}; + +/* + * Definitions of bits in internet address integers. + * On subnets, the decomposition of addresses to host and net parts + * is done according to subnet mask, not the masks here. + */ +#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) +#define IN_CLASSA_NET 0xff000000 +#define IN_CLASSA_NSHIFT 24 +#define IN_CLASSA_HOST 0x00ffffff +#define IN_CLASSA_MAX 128 + +#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000) +#define IN_CLASSB_NET 0xffff0000 +#define IN_CLASSB_NSHIFT 16 +#define IN_CLASSB_HOST 0x0000ffff +#define IN_CLASSB_MAX 65536 + +#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000) +#define IN_CLASSC_NET 0xffffff00 +#define IN_CLASSC_NSHIFT 8 +#define IN_CLASSC_HOST 0x000000ff + +#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ +#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ +#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ +#define IN_MULTICAST(i) IN_CLASSD(i) + +#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) +#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) + +#define INADDR_ANY (u_int32_t)0x00000000 +#define INADDR_LOOPBACK (u_int32_t)0x7f000001 +#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ +#ifndef KERNEL +#define INADDR_NONE 0xffffffff /* -1 return */ +#endif + +#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ + +#define IN_LOOPBACKNET 127 /* official! */ + +/* + * Socket address, internet style. + */ +struct sockaddr_in { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; + +#define INET_ADDRSTRLEN 16 + +/* + * Structure used to describe IP options. + * Used to store options internally, to pass them to a process, + * or to restore options retrieved earlier. + * The ip_dst is used for the first-hop gateway when using a source route + * (this gets put into the header proper). + */ +struct ip_opts { + struct in_addr ip_dst; /* first hop, 0 w/o src rt */ + char ip_opts[40]; /* actually variable in size */ +}; + +/* + * Options for use with [gs]etsockopt at the IP level. + * First word of comment is data type; bool is stored in int. + */ +#define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ +#define IP_HDRINCL 2 /* int; header is included with data */ +#define IP_TOS 3 /* int; IP type of service and preced. */ +#define IP_TTL 4 /* int; IP time to live */ +#define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ +#define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ +#define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ +#define IP_RETOPTS 8 /* ip_opts; set/get IP options */ +#define IP_MULTICAST_IF 9 /* u_char; set/get IP multicast i/f */ +#define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ +#define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ +#define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ +#define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ +#define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ +#define IP_RSVP_ON 15 /* enable RSVP in kernel */ +#define IP_RSVP_OFF 16 /* disable RSVP in kernel */ +#define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ +#define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ +#define IP_PORTRANGE 19 /* int; range to choose for unspec port */ +#define IP_RECVIF 20 /* bool; receive reception if w/dgram */ +/* for IPSEC */ +#define IP_IPSEC_POLICY 21 /* int; set/get security policy */ +#define IP_FAITH 22 /* bool; accept FAITH'ed connections */ + +#define IP_FW_ADD 50 /* add a firewall rule to chain */ +#define IP_FW_DEL 51 /* delete a firewall rule from chain */ +#define IP_FW_FLUSH 52 /* flush firewall rule chain */ +#define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ +#define IP_FW_GET 54 /* get entire firewall rule chain */ +#define IP_FW_RESETLOG 55 /* reset logging counters */ + +#define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ +#define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ +#define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ +#define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ + +/* + * Defaults and limits for options + */ +#define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ +#define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ +#define IP_MAX_MEMBERSHIPS 20 /* per socket */ + +/* + * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. + */ +struct ip_mreq { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +/* + * Argument for IP_PORTRANGE: + * - which range to search when port is unspecified at bind() or connect() + */ +#define IP_PORTRANGE_DEFAULT 0 /* default range */ +#define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ +#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ + +/* + * Definitions for inet sysctl operations. + * + * Third level is protocol number. + * Fourth level is desired variable within that protocol. + */ +#define IPPROTO_MAXID (IPPROTO_ESP + 1) /* don't list to IPPROTO_MAX */ + +#define CTL_IPPROTO_NAMES { \ + { "ip", CTLTYPE_NODE }, \ + { "icmp", CTLTYPE_NODE }, \ + { "igmp", CTLTYPE_NODE }, \ + { "ggp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "tcp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { "egp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "pup", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "udp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "idp", CTLTYPE_NODE }, \ +} + +/* + * Names for IP sysctl objects + */ +#define IPCTL_FORWARDING 1 /* act as router */ +#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ +#define IPCTL_DEFTTL 3 /* default TTL */ +#ifdef notyet +#define IPCTL_DEFMTU 4 /* default MTU */ +#endif +#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */ +#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */ +#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */ +#define IPCTL_SOURCEROUTE 8 /* may perform source routes */ +#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ +#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ +#define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ +#define IPCTL_STATS 12 /* ipstat structure */ +#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ +#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ +#define IPCTL_KEEPFAITH 15 +#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ +#define IPCTL_MAXID 17 + +#define IPCTL_NAMES { \ + { 0, 0 }, \ + { "forwarding", CTLTYPE_INT }, \ + { "redirect", CTLTYPE_INT }, \ + { "ttl", CTLTYPE_INT }, \ + { "mtu", CTLTYPE_INT }, \ + { "rtexpire", CTLTYPE_INT }, \ + { "rtminexpire", CTLTYPE_INT }, \ + { "rtmaxcache", CTLTYPE_INT }, \ + { "sourceroute", CTLTYPE_INT }, \ + { "directed-broadcast", CTLTYPE_INT }, \ + { "intr-queue-maxlen", CTLTYPE_INT }, \ + { "intr-queue-drops", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "accept_sourceroute", CTLTYPE_INT }, \ + { "fastforwarding", CTLTYPE_INT }, \ +} + +/* INET6 stuff */ +#include <netinet6/in6.h> + +#ifdef KERNEL +struct ifnet; struct mbuf; /* forward declarations for Standard C */ +struct proc; + +int in_broadcast __P((struct in_addr, struct ifnet *)); +int in_canforward __P((struct in_addr)); +int in_cksum __P((struct mbuf *, int)); +int in_localaddr __P((struct in_addr)); +char *inet_ntoa __P((struct in_addr)); /* in libkern */ + +int prison_ip __P((struct proc *p, int flag, u_int32_t *ip)); +void prison_remote_ip __P((struct proc *p, int flag, u_int32_t *ip)); + +#endif /* KERNEL */ + +#endif diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c new file mode 100644 index 0000000..eaf1493 --- /dev/null +++ b/sys/netinet/in_cksum.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/mbuf.h> + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int +in_cksum(m, len) + register struct mbuf *m; + register int len; +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (int) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. + */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c new file mode 100644 index 0000000..6aa3673 --- /dev/null +++ b/sys/netinet/in_gif.c @@ -0,0 +1,311 @@ +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * in_gif.c + */ + +#include "opt_mrouting.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/protosw.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_gif.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#ifdef MROUTING +#include <netinet/ip_mroute.h> +#endif /* MROUTING */ + +#include <net/if_gif.h> + +#include "gif.h" + +#include <machine/stdarg.h> + +#include <net/net_osdep.h> + +#if NGIF > 0 +int ip_gif_ttl = GIF_TTL; +#else +int ip_gif_ttl = 0; +#endif + +SYSCTL_DECL(_net_inet_ip); +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, + &ip_gif_ttl, 0, ""); + +int +in_gif_output(ifp, family, m, rt) + struct ifnet *ifp; + int family; + struct mbuf *m; + struct rtentry *rt; +{ + register struct gif_softc *sc = (struct gif_softc*)ifp; + struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; + struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; + struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; + struct ip iphdr; /* capsule IP header, host byte ordered */ + int proto, error; + u_int8_t tos; + + if (sin_src == NULL || sin_dst == NULL || + sin_src->sin_family != AF_INET || + sin_dst->sin_family != AF_INET) { + m_freem(m); + return EAFNOSUPPORT; + } + + switch (family) { + case AF_INET: + { + struct ip *ip; + + proto = IPPROTO_IPV4; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return ENOBUFS; + } + ip = mtod(m, struct ip *); + tos = ip->ip_tos; + break; + } +#ifdef INET6 + case AF_INET6: + { + struct ip6_hdr *ip6; + proto = IPPROTO_IPV6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return ENOBUFS; + } + ip6 = mtod(m, struct ip6_hdr *); + tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; + } +#endif /*INET6*/ + default: +#ifdef DIAGNOSTIC + printf("in_gif_output: warning: unknown family %d passed\n", + family); +#endif + m_freem(m); + return EAFNOSUPPORT; + } + + bzero(&iphdr, sizeof(iphdr)); + iphdr.ip_src = sin_src->sin_addr; + if (ifp->if_flags & IFF_LINK0) { + /* multi-destination mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else if (rt) { + iphdr.ip_dst = ((struct sockaddr_in *) + (rt->rt_gateway))->sin_addr; + } else { + m_freem(m); + return ENETUNREACH; + } + } else { + /* bidirectional configured tunnel mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else { + m_freem(m); + return ENETUNREACH; + } + } + iphdr.ip_p = proto; + /* version will be set in ip_output() */ + iphdr.ip_ttl = ip_gif_ttl; + iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); + + /* prepend new IP header */ + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m && m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) { + printf("ENOBUFS in in_gif_output %d\n", __LINE__); + return ENOBUFS; + } + + *(mtod(m, struct ip *)) = iphdr; + + if (dst->sin_family != sin_dst->sin_family || + dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { + /* cache route doesn't match */ + dst->sin_family = sin_dst->sin_family; + dst->sin_len = sizeof(struct sockaddr_in); + dst->sin_addr = sin_dst->sin_addr; + if (sc->gif_ro.ro_rt) { + RTFREE(sc->gif_ro.ro_rt); + sc->gif_ro.ro_rt = NULL; + } + } + + if (sc->gif_ro.ro_rt == NULL) { + rtalloc(&sc->gif_ro); + if (sc->gif_ro.ro_rt == NULL) { + m_freem(m); + return ENETUNREACH; + } + } + +#ifdef IPSEC + m->m_pkthdr.rcvif = NULL; +#endif /*IPSEC*/ + error = ip_output(m, 0, &sc->gif_ro, 0, 0); + return(error); +} + +void +#if __STDC__ +in_gif_input(struct mbuf *m, ...) +#else +in_gif_input(m, va_alist) + struct mbuf *m; + va_dcl +#endif +{ + int off, proto; + struct gif_softc *sc; + struct ifnet *gifp = NULL; + struct ip *ip; + int i, af; + va_list ap; + + va_start(ap, m); + off = va_arg(ap, int); + proto = va_arg(ap, int); + va_end(ap); + + ip = mtod(m, struct ip *); + + /* this code will be soon improved. */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) + for (i = 0, sc = gif; i < ngif; i++, sc++) { + if (sc->gif_psrc == NULL + || sc->gif_pdst == NULL + || sc->gif_psrc->sa_family != AF_INET + || sc->gif_pdst->sa_family != AF_INET) { + continue; + } + + if ((sc->gif_if.if_flags & IFF_UP) == 0) + continue; + + if ((sc->gif_if.if_flags & IFF_LINK0) + && satosin(sc->gif_psrc)->sin_addr.s_addr == ip->ip_dst.s_addr + && satosin(sc->gif_pdst)->sin_addr.s_addr == INADDR_ANY) { + gifp = &sc->gif_if; + continue; + } + + if (satosin(sc->gif_psrc)->sin_addr.s_addr == ip->ip_dst.s_addr + && satosin(sc->gif_pdst)->sin_addr.s_addr == ip->ip_src.s_addr) + { + gifp = &sc->gif_if; + break; + } + } + + if (gifp == NULL) { +#ifdef MROUTING + /* for backward compatibility */ + if (proto == IPPROTO_IPV4) { + ipip_input(m, off); + return; + } +#endif /*MROUTING*/ + m_freem(m); + ipstat.ips_nogif++; + return; + } + + m_adj(m, off); + + switch (proto) { + case IPPROTO_IPV4: + { + struct ip *ip; + af = AF_INET; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return; + } + ip = mtod(m, struct ip *); + break; + } +#ifdef INET6 + case IPPROTO_IPV6: + { + struct ip6_hdr *ip6; + af = AF_INET6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return; + } + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_flow &= ~htonl(0xff << 20); + break; + } +#endif /* INET6 */ + default: + ipstat.ips_nogif++; + m_freem(m); + return; + } + gif_input(m, af, gifp); + return; +} diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h new file mode 100644 index 0000000..b874524 --- /dev/null +++ b/sys/netinet/in_gif.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_GIF_H_ +#define _NETINET_IN_GIF_H_ + +#define GIF_TTL 30 + +extern int ip_gif_ttl; + +void in_gif_input __P((struct mbuf *, ...)); +int in_gif_output __P((struct ifnet *, int, struct mbuf *, struct rtentry *)); + +#endif /*_NETINET_IN_GIF_H_*/ diff --git a/sys/netinet/in_hostcache.c b/sys/netinet/in_hostcache.c new file mode 100644 index 0000000..33e0187 --- /dev/null +++ b/sys/netinet/in_hostcache.c @@ -0,0 +1,157 @@ +/* + * Copyright 1997 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/hostcache.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_hostcache.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> + +/* + * Manage the IP per-host cache (really a thin veneer over the generic + * per-host cache code). + */ + +/* Look up an entry -- can be called from interrupt context. */ +struct in_hcentry * +inhc_lookup(struct sockaddr_in *sin) +{ + struct hcentry *hc; + + hc = hc_get((struct sockaddr *)sin); + return ((struct in_hcentry *)hc); +} + +/* Look up and possibly create an entry -- must be called from user mode. */ +struct in_hcentry * +inhc_alloc(struct sockaddr_in *sin) +{ + struct in_hcentry *inhc; + struct rtentry *rt; + int error; + /* xxx mutual exclusion for smp */ + + inhc = inhc_lookup(sin); + if (inhc != 0) + return inhc; + + rt = rtalloc1(inhc->inhc_hc.hc_host, 1, 0); + if (rt == 0) + return 0; + + MALLOC(inhc, struct in_hcentry *, sizeof *inhc, M_HOSTCACHE, M_WAITOK); + bzero(inhc, sizeof *inhc); + inhc->inhc_hc.hc_host = dup_sockaddr((struct sockaddr *)sin, 1); + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) + inhc->inhc_flags |= INHC_BROADCAST; + else if (((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + inhc->inhc_flags |= INHC_LOCAL; + else if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + inhc->inhc_flags |= INHC_MULTICAST; + inhc->inhc_pmtu = rt->rt_rmx.rmx_mtu; + inhc->inhc_recvpipe = rt->rt_rmx.rmx_recvpipe; + inhc->inhc_sendpipe = rt->rt_rmx.rmx_sendpipe; + inhc->inhc_ssthresh = rt->rt_rmx.rmx_ssthresh; + if (rt->rt_rmx.rmx_locks & RTV_RTT) + inhc->inhc_rttmin = rt->rt_rmx.rmx_rtt + / (RTM_RTTUNIT / TCP_RTT_SCALE); + inhc->inhc_hc.hc_rt = rt; + error = hc_insert(&inhc->inhc_hc); + if (error != 0) { + RTFREE(rt); + FREE(inhc, M_HOSTCACHE); + return 0; + } + /* + * We don't return the structure directly because hc_get() needs + * to be allowed to do its own processing. + */ + return (inhc_lookup(sin)); +} + +/* + * This is Van Jacobson's hash function for IPv4 addresses. + * It is designed to work with a power-of-two-sized hash table. + */ +static u_long +inhc_hash(struct sockaddr *sa, u_long nbuckets) +{ + u_long ip; + + ip = ((struct sockaddr_in *)sa)->sin_addr.s_addr; + return ((ip ^ (ip >> 23) ^ (ip >> 17)) & ~(nbuckets - 1)); +} + +/* + * We don't need to do any special work... if there are no references, + * as the caller has already ensured, then it's OK to kill. + */ +static int +inhc_delete(struct hcentry *hc) +{ + return 0; +} + +/* + * Return the next increment for the number of buckets in the hash table. + * Zero means ``do not bump''. + */ +static u_long +inhc_bump(u_long oldsize) +{ + if (oldsize < 512) + return (oldsize << 1); + return 0; +} + +static struct hccallback inhc_cb = { + inhc_hash, inhc_delete, inhc_bump +}; + +int +inhc_init(void) +{ + + return (hc_init(AF_INET, &inhc_cb, 128, 0)); +} + diff --git a/sys/netinet/in_hostcache.h b/sys/netinet/in_hostcache.h new file mode 100644 index 0000000..98e8fd4 --- /dev/null +++ b/sys/netinet/in_hostcache.h @@ -0,0 +1,83 @@ +/* + * Copyright 1997 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_HOSTCACHE_H +#define _NETINET_IN_HOSTCACHE_H 1 + +/* + * This file defines the particular structures contained in the host cache + * for the use of IP. + */ + +/* + * An IP host cache entry. Note that we include the srtt/var here, + * with the expectation that it might be used to keep a persistent, + * cross-connection view of this statistic. + */ +struct in_hcentry { + struct hcentry inhc_hc; + u_long inhc_pmtu; + u_long inhc_recvpipe; + u_long inhc_sendpipe; + u_long inhc_pksent; + u_long inhc_flags; + u_long inhc_ssthresh; + int inhc_srtt; /* VJ RTT estimator */ + int inhc_srttvar; /* VJ */ + u_int inhc_rttmin; /* VJ */ + int inhc_rxt; /* TCP retransmit timeout */ + u_long inhc_cc; /* deliberate type pun with tcp_cc */ + u_long inhc_ccsent; /* as above */ + u_short inhc_mssopt; +}; + +#define inhc_addr(inhc) ((struct sockaddr_in *)(inhc)->inhc_hc.hc_host) + +/* Flags for inhc_flags... */ +#define INHC_LOCAL 0x0001 /* this address is local */ +#define INHC_BROADCAST 0x0002 /* this address is broadcast */ +#define INHC_MULTICAST 0x0004 /* this address is multicast */ +#define INHC_REDUCEDMTU 0x0008 /* we reduced the mtu via PMTU discovery */ + +#ifdef KERNEL +/* + * inhc_alloc can block while adding a new entry to the cache; + * inhc_lookup will does not add new entries and so can be called + * in non-process context. + */ +struct in_hcentry *inhc_alloc(struct sockaddr_in *sin); +int inhc_init(void); +struct in_hcentry *inhc_lookup(struct sockaddr_in *sin); +#define inhc_ref(inhc) (hc_ref(&(inhc)->inhc_hc)) +#define inhc_rele(inhc) (hc_rele(&(inhc)->inhc_hc)) +#endif /* KERNEL */ + +#endif /* _NETINET_IN_HOSTCACHE_H */ diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c new file mode 100644 index 0000000..71d091b --- /dev/null +++ b/sys/netinet/in_pcb.c @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/proc.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif /* INET6 */ + +#include "faith.h" + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#include <netkey/key_debug.h> +#endif /* IPSEC */ + +struct in_addr zeroin_addr; + +static void in_rtchange __P((struct inpcb *, int)); + +/* + * These configure the range of local port addresses assigned to + * "unspecified" outgoing connections/packets/whatever. + */ +int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ +int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ +int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ +int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ +int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ +int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static int +sysctl_net_ipport_check SYSCTL_HANDLER_ARGS +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) { + RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); + } + return error; +} + +#undef RANGECHK + +SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); + +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, + &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); + +/* + * in_pcb.c: manage the Protocol Control Blocks. + * + * NOTE: It is assumed that most of these functions will be called at + * splnet(). XXX - There are, unfortunately, a few exceptions to this + * rule that should be fixed. + */ + +/* + * Allocate a PCB and associate it with the socket. + */ +int +in_pcballoc(so, pcbinfo, p) + struct socket *so; + struct inpcbinfo *pcbinfo; + struct proc *p; +{ + register struct inpcb *inp; + + inp = zalloci(pcbinfo->ipi_zone); + if (inp == NULL) + return (ENOBUFS); + bzero((caddr_t)inp, sizeof(*inp)); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + inp->inp_pcbinfo = pcbinfo; + inp->inp_socket = so; + LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); + pcbinfo->ipi_count++; + so->so_pcb = (caddr_t)inp; + return (0); +} + +int +in_pcbbind(inp, nam, p) + register struct inpcb *inp; + struct sockaddr *nam; + struct proc *p; +{ + register struct socket *so = inp->inp_socket; + unsigned short *lastport; + struct sockaddr_in *sin; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + u_short lport = 0; + int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error, prison = 0; + + if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ + return (EADDRNOTAVAIL); + if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY) + return (EINVAL); + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + wild = 1; + if (nam) { + sin = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); +#ifdef notdef + /* + * We should check the family, but old programs + * incorrectly fail to initialize it. + */ + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); +#endif + if (prison_ip(p, 0, &sin->sin_addr.s_addr)) + return(EINVAL); + lport = sin->sin_port; + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + /* + * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; + * allow complete duplication of binding if + * SO_REUSEPORT is set, or if SO_REUSEADDR is set + * and a multicast address is bound on both + * new and duplicated sockets. + */ + if (so->so_options & SO_REUSEADDR) + reuseport = SO_REUSEADDR|SO_REUSEPORT; + } else if (sin->sin_addr.s_addr != INADDR_ANY) { + sin->sin_port = 0; /* yech... */ + if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) + return (EADDRNOTAVAIL); + } + if (lport) { + struct inpcb *t; + + /* GROSS */ + if (ntohs(lport) < IPPORT_RESERVED && p && + suser_xxx(0, p, PRISON_ROOT)) + return (EACCES); + if (p && p->p_prison) + prison = 1; + if (so->so_cred->cr_uid != 0 && + !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + t = in_pcblookup_local(inp->inp_pcbinfo, + sin->sin_addr, lport, + prison ? 0 : INPLOOKUP_WILDCARD); + if (t && + (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != INADDR_ANY || + (t->inp_socket->so_options & + SO_REUSEPORT) == 0) && + (so->so_cred->cr_uid != + t->inp_socket->so_cred->cr_uid)) { +#if defined(INET6) + if (ip6_mapped_addr_on == 0 || + ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + t = in_pcblookup_local(pcbinfo, sin->sin_addr, + lport, prison ? 0 : wild); + if (t && + (reuseport & t->inp_socket->so_options) == 0) { +#if defined(INET6) + if (ip6_mapped_addr_on == 0 || + ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + inp->inp_laddr = sin->sin_addr; + } + if (lport == 0) { + ushort first, last; + int count; + + if (prison_ip(p, 0, &inp->inp_laddr.s_addr )) + return (EINVAL); + inp->inp_flags |= INP_ANONPORT; + + if (inp->inp_flags & INP_HIGHPORT) { + first = ipport_hifirstauto; /* sysctl */ + last = ipport_hilastauto; + lastport = &pcbinfo->lasthi; + } else if (inp->inp_flags & INP_LOWPORT) { + if (p && (error = suser_xxx(0, p, PRISON_ROOT))) + return error; + first = ipport_lowfirstauto; /* 1023 */ + last = ipport_lowlastauto; /* 600 */ + lastport = &pcbinfo->lastlow; + } else { + first = ipport_firstauto; /* sysctl */ + last = ipport_lastauto; + lastport = &pcbinfo->lastport; + } + /* + * Simple check to ensure all ports are not used up causing + * a deadlock here. + * + * We split the two cases (up and down) so that the direction + * is not being tested on each round of the loop. + */ + if (first > last) { + /* + * counting down + */ + count = first - last; + + do { + if (count-- < 0) { /* completely used? */ + /* + * Undo any address bind that may have + * occurred above. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EAGAIN); + } + --*lastport; + if (*lastport > first || *lastport < last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } else { + /* + * counting up + */ + count = last - first; + + do { + if (count-- < 0) { /* completely used? */ + /* + * Undo any address bind that may have + * occurred above. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EAGAIN); + } + ++*lastport; + if (*lastport < first || *lastport > last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } + } + inp->inp_lport = lport; + if (in_pcbinshash(inp) != 0) { + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + return (EAGAIN); + } + return (0); +} + +/* + * Transform old in_pcbconnect() into an inner subroutine for new + * in_pcbconnect(): Do some validity-checking on the remote + * address (in mbuf 'nam') and then determine local host address + * (i.e., which interface) to use to access that remote host. + * + * This preserves definition of in_pcbconnect(), while supporting a + * slightly different version for T/TCP. (This is more than + * a bit of a kludge, but cleaning up the internal interfaces would + * have forced minor changes in every protocol). + */ + +int +in_pcbladdr(inp, nam, plocal_sin) + register struct inpcb *inp; + struct sockaddr *nam; + struct sockaddr_in **plocal_sin; +{ + struct in_ifaddr *ia; + register struct sockaddr_in *sin = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); + if (sin->sin_port == 0) + return (EADDRNOTAVAIL); + if (!TAILQ_EMPTY(&in_ifaddrhead)) { + /* + * If the destination address is INADDR_ANY, + * use the primary local address. + * If the supplied address is INADDR_BROADCAST, + * and the primary interface supports broadcast, + * choose the broadcast address for that interface. + */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr = IA_SIN(in_ifaddrhead.tqh_first)->sin_addr; + else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && + (in_ifaddrhead.tqh_first->ia_ifp->if_flags & IFF_BROADCAST)) + sin->sin_addr = satosin(&in_ifaddrhead.tqh_first->ia_broadaddr)->sin_addr; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + register struct route *ro; + + ia = (struct in_ifaddr *)0; + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. + */ + ro = &inp->inp_route; + if (ro->ro_rt && + (satosin(&ro->ro_dst)->sin_addr.s_addr != + sin->sin_addr.s_addr || + inp->inp_socket->so_options & SO_DONTROUTE)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ + (ro->ro_rt == (struct rtentry *)0 || + ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + /* No route yet, so try to acquire one */ + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + sin->sin_addr; + rtalloc(ro); + } + /* + * If we found a route, use the address + * corresponding to the outgoing interface + * unless it is the loopback (in case a route + * to our address on another net goes to loopback). + */ + if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia == 0) { + u_short fport = sin->sin_port; + + sin->sin_port = 0; + ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); + if (ia == 0) + ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + sin->sin_port = fport; + if (ia == 0) + ia = in_ifaddrhead.tqh_first; + if (ia == 0) + return (EADDRNOTAVAIL); + } + /* + * If the destination address is multicast and an outgoing + * interface has been set as a multicast option, use the + * address of that interface as our source address. + */ + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && + inp->inp_moptions != NULL) { + struct ip_moptions *imo; + struct ifnet *ifp; + + imo = inp->inp_moptions; + if (imo->imo_multicast_ifp != NULL) { + ifp = imo->imo_multicast_ifp; + for (ia = in_ifaddrhead.tqh_first; ia; + ia = ia->ia_link.tqe_next) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) + return (EADDRNOTAVAIL); + } + } + /* + * Don't do pcblookup call here; return interface in plocal_sin + * and exit to caller, that will do the lookup. + */ + *plocal_sin = &ia->ia_addr; + + } + return(0); +} + +/* + * Outer subroutine: + * Connect from a socket to a specified address. + * Both address and port must be specified in argument sin. + * If don't have a local address for this socket yet, + * then pick one. + */ +int +in_pcbconnect(inp, nam, p) + register struct inpcb *inp; + struct sockaddr *nam; + struct proc *p; +{ + struct sockaddr_in *ifaddr; + register struct sockaddr_in *sin = (struct sockaddr_in *)nam; + int error; + + /* + * Call inner routine, to assign local interface address. + */ + if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) + return(error); + + if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr, + inp->inp_lport, 0, NULL) != NULL) { + return (EADDRINUSE); + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return (error); + } + inp->inp_laddr = ifaddr->sin_addr; + } + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + return (0); +} + +void +in_pcbdisconnect(inp) + struct inpcb *inp; +{ + + inp->inp_faddr.s_addr = INADDR_ANY; + inp->inp_fport = 0; + in_pcbrehash(inp); + if (inp->inp_socket->so_state & SS_NOFDREF) + in_pcbdetach(inp); +} + +void +in_pcbdetach(inp) + struct inpcb *inp; +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *ipi = inp->inp_pcbinfo; + +#ifdef IPSEC + ipsec4_delete_pcbpolicy(inp); +#endif /*IPSEC*/ + inp->inp_gencnt = ++ipi->ipi_gencnt; + in_pcbremlists(inp); + so->so_pcb = 0; + sofree(so); + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_route.ro_rt) + rtfree(inp->inp_route.ro_rt); + ip_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + zfreei(ipi->ipi_zone, inp); +} + +/* + * The calling convention of in_setsockaddr() and in_setpeeraddr() was + * modified to match the pru_sockaddr() and pru_peeraddr() entry points + * in struct pr_usrreqs, so that protocols can just reference then directly + * without the need for a wrapper function. The socket must have a valid + * (i.e., non-nil) PCB, but it should be impossible to get an invalid one + * except through a kernel programming error, so it is acceptable to panic + * (or in this case trap) if the PCB is invalid. (Actually, we don't trap + * because there actually /is/ a programming error somewhere... XXX) + */ +int +in_setsockaddr(so, nam) + struct socket *so; + struct sockaddr **nam; +{ + int s; + register struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK); + bzero(sin, sizeof *sin); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + inp = sotoinpcb(so); + if (!inp) { + splx(s); + free(sin, M_SONAME); + return EINVAL; + } + sin->sin_port = inp->inp_lport; + sin->sin_addr = inp->inp_laddr; + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +int +in_setpeeraddr(so, nam) + struct socket *so; + struct sockaddr **nam; +{ + int s; + struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK); + bzero((caddr_t)sin, sizeof (*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + inp = sotoinpcb(so); + if (!inp) { + splx(s); + free(sin, M_SONAME); + return EINVAL; + } + sin->sin_port = inp->inp_fport; + sin->sin_addr = inp->inp_faddr; + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +/* + * Pass some notification to all connections of a protocol + * associated with address dst. The local address and/or port numbers + * may be specified to limit the search. The "usual action" will be + * taken, depending on the ctlinput cmd. The caller must filter any + * cmds that are uninteresting (e.g., no error in the map). + * Call the protocol specific routine (if any) to report + * any errors for each matching socket. + */ +void +in_pcbnotify(head, dst, fport_arg, laddr, lport_arg, cmd, notify) + struct inpcbhead *head; + struct sockaddr *dst; + u_int fport_arg, lport_arg; + struct in_addr laddr; + int cmd; + void (*notify) __P((struct inpcb *, int)); +{ + register struct inpcb *inp, *oinp; + struct in_addr faddr; + u_short fport = fport_arg, lport = lport_arg; + int errno, s; + + if ((unsigned)cmd > PRC_NCMDS || dst->sa_family != AF_INET) + return; + faddr = ((struct sockaddr_in *)dst)->sin_addr; + if (faddr.s_addr == INADDR_ANY) + return; + + /* + * Redirects go to all references to the destination, + * and use in_rtchange to invalidate the route cache. + * Dead host indications: notify all references to the destination. + * Otherwise, if we have knowledge of the local port and address, + * deliver only to that socket. + */ + if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { + fport = 0; + lport = 0; + laddr.s_addr = 0; + if (cmd != PRC_HOSTDEAD) + notify = in_rtchange; + } + errno = inetctlerrmap[cmd]; + s = splnet(); + for (inp = head->lh_first; inp != NULL;) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) { + inp = LIST_NEXT(inp, inp_list); + continue; + } +#endif + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_socket == 0 || + (lport && inp->inp_lport != lport) || + (laddr.s_addr && inp->inp_laddr.s_addr != laddr.s_addr) || + (fport && inp->inp_fport != fport)) { + inp = inp->inp_list.le_next; + continue; + } + oinp = inp; + inp = inp->inp_list.le_next; + if (notify) + (*notify)(oinp, errno); + } + splx(s); +} + +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(inp) + struct inpcb *inp; +{ + register struct rtentry *rt; + struct rt_addrinfo info; + + if ((rt = inp->inp_route.ro_rt)) { + inp->inp_route.ro_rt = 0; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = + (struct sockaddr *)&inp->inp_route.ro_dst; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); + if (rt->rt_flags & RTF_DYNAMIC) + (void) rtrequest(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, + (struct rtentry **)0); + else + /* + * A new route can be allocated + * the next time output is attempted. + */ + rtfree(rt); + } +} + +/* + * After a routing change, flush old routing + * and allocate a (hopefully) better one. + */ +static void +in_rtchange(inp, errno) + register struct inpcb *inp; + int errno; +{ + if (inp->inp_route.ro_rt) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = 0; + /* + * A new route can be allocated the next time + * output is attempted. + */ + } +} + +/* + * Lookup a PCB based on the local address and port. + */ +struct inpcb * +in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) + struct inpcbinfo *pcbinfo; + struct in_addr laddr; + u_int lport_arg; + int wild_okay; +{ + register struct inpcb *inp; + int matchwild = 3, wildcard; + u_short lport = lport_arg; + + if (!wild_okay) { + struct inpcbhead *head; + /* + * Look for an unconnected (wildcard foreign addr) PCB that + * matches the local address and port we're looking for. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_lport == lport) { + /* + * Found. + */ + return (inp); + } + } + /* + * Not found. + */ + return (NULL); + } else { + struct inpcbporthead *porthash; + struct inpcbport *phd; + struct inpcb *match = NULL; + /* + * Best fit PCB lookup. + * + * First see if this local port is in use by looking on the + * port hash list. + */ + porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport, + pcbinfo->porthashmask)]; + for (phd = porthash->lh_first; phd != NULL; phd = phd->phd_hash.le_next) { + if (phd->phd_port == lport) + break; + } + if (phd != NULL) { + /* + * Port is in use by one or more PCBs. Look for best + * fit. + */ + for (inp = phd->phd_pcblist.lh_first; inp != NULL; + inp = inp->inp_portlist.le_next) { + wildcard = 0; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) + wildcard++; + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) { + break; + } + } + } + } + return (match); + } +} + +/* + * Lookup PCB in hash list. + */ +struct inpcb * +in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, + ifp) + struct inpcbinfo *pcbinfo; + struct in_addr faddr, laddr; + u_int fport_arg, lport_arg; + int wildcard; + struct ifnet *ifp; +{ + struct inpcbhead *head; + register struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)]; + for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * Found. + */ + return (inp); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; +#if defined(INET6) + struct inpcb *local_wild_mapped = NULL; +#endif /* defined(INET6) */ + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_lport == lport) { +#if defined(NFAITH) && NFAITH > 0 + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr == laddr.s_addr) + return (inp); + else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#if defined(INET6) + if (INP_CHECK_SOCKAF(inp->inp_socket, + AF_INET6)) + local_wild_mapped = inp; + else +#endif /* defined(INET6) */ + local_wild = inp; + } + } + } +#if defined(INET6) + if (local_wild == NULL) + return (local_wild_mapped); +#endif /* defined(INET6) */ + return (local_wild); + } + + /* + * Not found. + */ + return (NULL); +} + +/* + * Insert PCB onto various hash lists. + */ +int +in_pcbinshash(inp) + struct inpcb *inp; +{ + struct inpcbhead *pcbhash; + struct inpcbporthead *pcbporthash; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbport *phd; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)]; + + pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport, + pcbinfo->porthashmask)]; + + /* + * Go through port list and look for a head for this lport. + */ + for (phd = pcbporthash->lh_first; phd != NULL; phd = phd->phd_hash.le_next) { + if (phd->phd_port == inp->inp_lport) + break; + } + /* + * If none exists, malloc one and tack it on. + */ + if (phd == NULL) { + MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT); + if (phd == NULL) { + return (ENOBUFS); /* XXX */ + } + phd->phd_port = inp->inp_lport; + LIST_INIT(&phd->phd_pcblist); + LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + } + inp->inp_phd = phd; + LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + return (0); +} + +/* + * Move PCB to the proper hash bucket when { faddr, fport } have been + * changed. NOTE: This does not handle the case of the lport changing (the + * hashed port list would have to be updated as well), so the lport must + * not change after in_pcbinshash() has been called. + */ +void +in_pcbrehash(inp) + struct inpcb *inp; +{ + struct inpcbhead *head; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + + LIST_REMOVE(inp, inp_hash); + LIST_INSERT_HEAD(head, inp, inp_hash); +} + +/* + * Remove PCB from various lists. + */ +void +in_pcbremlists(inp) + struct inpcb *inp; +{ + inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; + if (inp->inp_lport) { + struct inpcbport *phd = inp->inp_phd; + + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_portlist); + if (phd->phd_pcblist.lh_first == NULL) { + LIST_REMOVE(phd, phd_hash); + free(phd, M_PCB); + } + } + LIST_REMOVE(inp, inp_list); + inp->inp_pcbinfo->ipi_count--; +} + +int +prison_xinpcb(struct proc *p, struct inpcb *inp) +{ + if (!p->p_prison) + return (0); + if (ntohl(inp->inp_laddr.s_addr) == p->p_prison->pr_ip) + return (0); + return (1); +} diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h new file mode 100644 index 0000000..5aeb579 --- /dev/null +++ b/sys/netinet/in_pcb.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_PCB_H_ +#define _NETINET_IN_PCB_H_ + +#include <sys/queue.h> + + +#include <netinet6/ipsec.h> /* for IPSEC */ + +#define in6pcb inpcb /* for KAME src sync over BSD*'s */ +#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ + +/* + * Common structure pcb for internet protocol implementation. + * Here are stored pointers to local and foreign host table + * entries, local and foreign socket numbers, and pointers + * up (to a socket structure) and down (to a protocol-specific) + * control block. + */ +LIST_HEAD(inpcbhead, inpcb); +LIST_HEAD(inpcbporthead, inpcbport); +typedef u_quad_t inp_gen_t; + +/* + * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. + * So, AF_INET6 null laddr is also used as AF_INET null laddr, + * by utilize following structure. (At last, same as INRIA) + */ +struct in_addr_4in6 { + u_int32_t ia46_pad32[3]; + struct in_addr ia46_addr4; +}; + +/* + * NB: the zone allocator is type-stable EXCEPT FOR THE FIRST TWO LONGS + * of the structure. Therefore, it is important that the members in + * that position not contain any information which is required to be + * stable. + */ +struct icmp6_filter; + +struct inpcb { + LIST_ENTRY(inpcb) inp_hash; /* hash list */ + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ + u_int32_t inp_flow; + + /* protocol dependent part, local and foreign addr */ + union { + /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { + /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + + caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + struct inpcbinfo *inp_pcbinfo; /* PCB list info */ + struct socket *inp_socket; /* back pointer to socket */ + /* list for this PCB's local port */ + int inp_flags; /* generic IP/datagram flags */ + + /* protocol dependent part; cached route */ + union { + /* placeholder for routing entry */ + struct route inp4_route; + struct route_in6 inp6_route; + } inp_dependroute; + + struct inpcbpolicy *inp_sp; /* for IPSEC */ + u_char inp_vflag; +#define INP_IPV4 0x1 +#define INP_IPV6 0x2 + u_char inp_ip_ttl; /* time to live proto */ + u_char inp_ip_p; /* protocol proto */ + + /* protocol dependent part; options */ + struct { + u_char inp4_ip_tos; /* type of service proto */ + struct mbuf *inp4_options; /* IP options */ + struct ip_moptions *inp4_moptions; /* IP multicast options */ + } inp_depend4; +#define inp_faddr inp_dependfaddr.inp46_foreign.ia46_addr4 +#define inp_laddr inp_dependladdr.inp46_local.ia46_addr4 +#define inp_route inp_dependroute.inp4_route +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions + struct { + /* IP options */ + struct mbuf *inp6_options; + /* IP6 options for outgoing packets */ + struct ip6_pktopts *inp6_outputopts; + /* IP multicast options */ + struct ip6_moptions *inp6_moptions; + /* ICMPv6 code type filter */ + struct icmp6_filter *inp6_icmp6filt; + /* IPV6_CHECKSUM setsockopt */ + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + u_int8_t inp6_hlim; + } inp_depend6; + LIST_ENTRY(inpcb) inp_portlist; + struct inpcbport *inp_phd; /* head of this list */ + inp_gen_t inp_gencnt; /* generation count of this instance */ +#define in6p_faddr inp_dependfaddr.inp6_foreign +#define in6p_laddr inp_dependladdr.inp6_local +#define in6p_route inp_dependroute.inp6_route +#define in6p_ip6_hlim inp_depend6.inp6_hlim +#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ +#define in6p_ip6_nxt inp_ip_p +#define in6p_flowinfo inp_flow +#define in6p_vflag inp_vflag +#define in6p_options inp_depend6.inp6_options +#define in6p_outputopts inp_depend6.inp6_outputopts +#define in6p_moptions inp_depend6.inp6_moptions +#define in6p_icmp6filt inp_depend6.inp6_icmp6filt +#define in6p_cksum inp_depend6.inp6_cksum +#define inp6_ifindex inp_depend6.inp6_ifindex +#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ +#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ +#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ +#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ +#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ +}; +/* + * The range of the generation count, as used in this implementation, + * is 9e19. We would have to create 300 billion connections per + * second for this number to roll over in a year. This seems sufficiently + * unlikely that we simply don't concern ourselves with that possibility. + */ + +/* + * Interface exported to userland by various protocols which use + * inpcbs. Hack alert -- only define if struct xsocket is in scope. + */ +#ifdef _SYS_SOCKETVAR_H_ +struct xinpcb { + size_t xi_len; /* length of this structure */ + struct inpcb xi_inp; + struct xsocket xi_socket; + u_quad_t xi_alignment_hack; +}; + +struct xinpgen { + size_t xig_len; /* length of this structure */ + u_int xig_count; /* number of PCBs at this time */ + inp_gen_t xig_gen; /* generation count at this time */ + so_gen_t xig_sogen; /* socket generation count at this time */ +}; +#endif /* _SYS_SOCKETVAR_H_ */ + +struct inpcbport { + LIST_ENTRY(inpcbport) phd_hash; + struct inpcbhead phd_pcblist; + u_short phd_port; +}; + +struct inpcbinfo { /* XXX documentation, prefixes */ + struct inpcbhead *hashbase; + u_long hashmask; + struct inpcbporthead *porthashbase; + u_long porthashmask; + struct inpcbhead *listhead; + u_short lastport; + u_short lastlow; + u_short lasthi; + struct vm_zone *ipi_zone; /* zone to allocate pcbs from */ + u_int ipi_count; /* number of pcbs in this list */ + u_quad_t ipi_gencnt; /* current generation count */ +}; + +#define INP_PCBHASH(faddr, lport, fport, mask) \ + (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) +#define INP_PCBPORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) + +/* flags in inp_flags: */ +#define INP_RECVOPTS 0x01 /* receive incoming IP options */ +#define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ +#define INP_RECVDSTADDR 0x04 /* receive IP dst address */ +#define INP_HDRINCL 0x08 /* user supplies entire IP header */ +#define INP_HIGHPORT 0x10 /* user wants "high" port binding */ +#define INP_LOWPORT 0x20 /* user wants "low" port binding */ +#define INP_ANONPORT 0x40 /* port chosen for user */ +#define INP_RECVIF 0x80 /* receive incoming interface */ +#define INP_MTUDISC 0x100 /* user can do MTU discovery */ +#define INP_FAITH 0x200 /* accept FAITH'ed connections */ +#define IN6P_PKTINFO 0x010000 +#define IN6P_HOPLIMIT 0x020000 +#define IN6P_NEXTHOP 0x040000 +#define IN6P_HOPOPTS 0x080000 +#define IN6P_DSTOPTS 0x100000 +#define IN6P_RTHDR 0x200000 +#define IN6P_BINDV6ONLY 0x400000 +#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ + INP_RECVIF|\ + IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_NEXTHOP|\ + IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR) + +#define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR) + + /* for KAME src sync over BSD*'s */ +#define IN6P_RECVOPTS INP_RECVOPTS +#define IN6P_RECVRETOPTS INP_RECVRETOPTS +#define IN6P_RECVDSTADDR INP_RECVDSTADDR +#define IN6P_HDRINCL INP_HDRINCL +#define IN6P_HIGHPORT INP_HIGHPORT +#define IN6P_LOWPORT INP_LOWPORT +#define IN6P_ANONPORT INP_ANONPORT +#define IN6P_RECVIF INP_RECVIF +#define IN6P_MTUDISC INP_MTUDISC +#define IN6P_FAITH INP_FAITH +#define IN6P_CONTROLOPTS INP_CONTROLOPTS + /* + * socket AF version is {newer than,or include} + * actual datagram AF version + */ + +#define INPLOOKUP_WILDCARD 1 +#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) +#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ + +#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family + +#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) + +#ifdef KERNEL +extern int ipport_lowfirstauto; +extern int ipport_lowlastauto; +extern int ipport_firstauto; +extern int ipport_lastauto; +extern int ipport_hifirstauto; +extern int ipport_hilastauto; + +void in_losing __P((struct inpcb *)); +int in_pcballoc __P((struct socket *, struct inpcbinfo *, struct proc *)); +int in_pcbbind __P((struct inpcb *, struct sockaddr *, struct proc *)); +int in_pcbconnect __P((struct inpcb *, struct sockaddr *, struct proc *)); +void in_pcbdetach __P((struct inpcb *)); +void in_pcbdisconnect __P((struct inpcb *)); +int in_pcbinshash __P((struct inpcb *)); +int in_pcbladdr __P((struct inpcb *, struct sockaddr *, + struct sockaddr_in **)); +struct inpcb * + in_pcblookup_local __P((struct inpcbinfo *, + struct in_addr, u_int, int)); +struct inpcb * + in_pcblookup_hash __P((struct inpcbinfo *, + struct in_addr, u_int, struct in_addr, u_int, + int, struct ifnet *)); +void in_pcbnotify __P((struct inpcbhead *, struct sockaddr *, + u_int, struct in_addr, u_int, int, void (*)(struct inpcb *, int))); +void in_pcbrehash __P((struct inpcb *)); +int in_setpeeraddr __P((struct socket *so, struct sockaddr **nam)); +int in_setsockaddr __P((struct socket *so, struct sockaddr **nam)); +void in_pcbremlists __P((struct inpcb *inp)); +int prison_xinpcb __P((struct proc *p, struct inpcb *inp)); +#endif /* KERNEL */ + +#endif /* !_NETINET_IN_PCB_H_ */ diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c new file mode 100644 index 0000000..b9a9e4a --- /dev/null +++ b/sys/netinet/in_proto.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 + * $FreeBSD$ + */ + +#include "opt_ipdivert.h" +#include "opt_ipx.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/queue.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/igmp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +/* + * TCP/IP protocol family: IP, ICMP, UDP, TCP. + */ + +#include "gif.h" +#if NGIF > 0 +#include <netinet/in_gif.h> +#endif + +#ifdef IPXIP +#include <netipx/ipx_ip.h> +#endif + +#ifdef NSIP +#include <netns/ns.h> +#include <netns/ns_if.h> +#endif + +extern struct domain inetdomain; +static struct pr_usrreqs nousrreqs; + +struct protosw inetsw[] = { +{ 0, &inetdomain, 0, 0, + 0, 0, 0, 0, + 0, + ip_init, 0, ip_slowtimo, ip_drain, + &nousrreqs +}, +{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, + udp_input, 0, udp_ctlinput, ip_ctloutput, + 0, + udp_init, 0, 0, 0, + &udp_usrreqs +}, +{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, + PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + tcp_input, 0, tcp_ctlinput, tcp_ctloutput, + 0, + tcp_init, 0, tcp_slowtimo, tcp_drain, + &tcp_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, + rip_input, 0, rip_ctlinput, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR, + icmp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR, + igmp_input, 0, 0, rip_ctloutput, + 0, + igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR, + rsvp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#if NGIF > 0 +{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, + in_gif_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +# ifdef INET6 +{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, + in_gif_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#endif +#else /*NGIF*/ +{ SOCK_RAW, &inetdomain, IPPROTO_IPIP, PR_ATOMIC|PR_ADDR, + ipip_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif /*NGIF*/ +#ifdef IPDIVERT +{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, + div_input, 0, 0, ip_ctloutput, + 0, + div_init, 0, 0, 0, + &div_usrreqs, +}, +#endif +#ifdef TPIP +{ SOCK_SEQPACKET,&inetdomain, IPPROTO_TP, PR_CONNREQUIRED|PR_WANTRCVD, + tpip_input, 0, tpip_ctlinput, tp_ctloutput, + tp_usrreq, + tp_init, 0, tp_slowtimo, tp_drain, +}, +#endif +/* EON (ISO CLNL over IP) */ +#ifdef EON +{ SOCK_RAW, &inetdomain, IPPROTO_EON, 0, + eoninput, 0, eonctlinput, 0, + 0, + eonprotoinit, 0, 0, 0, +}, +#endif +#ifdef IPXIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + ipxip_input, 0, ipxip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif +#ifdef NSIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + idpip_input, 0, nsip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif + /* raw wildcard */ +{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR, + rip_input, 0, 0, rip_ctloutput, + 0, + rip_init, 0, 0, 0, + &rip_usrreqs +}, +}; + +extern int in_inithead __P((void **, int)); + +struct domain inetdomain = + { AF_INET, "internet", 0, 0, 0, + inetsw, &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0, + in_inithead, 32, sizeof(struct sockaddr_in) + }; + +DOMAIN_SET(inet); + +SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, + "Internet Family"); + +SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); +SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); +SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); +#ifdef IPDIVERT +SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, div, CTLFLAG_RW, 0, "DIVERT"); +#endif + diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c new file mode 100644 index 0000000..52d062d --- /dev/null +++ b/sys/netinet/in_rmx.c @@ -0,0 +1,414 @@ +/* + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This code does two things necessary for the enhanced TCP metrics to + * function in a useful manner: + * 1) It marks all non-host routes as `cloning', thus ensuring that + * every actual reference to such a route actually gets turned + * into a reference to a host route to the specific destination + * requested. + * 2) When such routes lose all their references, it arranges for them + * to be deleted in some random collection of circumstances, so that + * a large quantity of stale routing data is not kept in kernel memory + * indefinitely. See in_rtqtimo() below for the exact mechanism. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_var.h> + +extern int in_inithead __P((void **head, int off)); + +#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ + +/* + * Do what we need to do when inserting a route. + */ +static struct radix_node * +in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node *treenodes) +{ + struct rtentry *rt = (struct rtentry *)treenodes; + struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + struct radix_node *ret; + + /* + * For IP, all unicast non-host routes are automatically cloning. + */ + if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + rt->rt_flags |= RTF_MULTICAST; + + if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) { + rt->rt_flags |= RTF_PRCLONING; + } + + /* + * A little bit of help for both IP output and input: + * For host routes, we make sure that RTF_BROADCAST + * is set for anything that looks like a broadcast address. + * This way, we can avoid an expensive call to in_broadcast() + * in ip_output() most of the time (because the route passed + * to ip_output() is almost always a host route). + * + * We also do the same for local addresses, with the thought + * that this might one day be used to speed up ip_input(). + * + * We also mark routes to multicast addresses as such, because + * it's easy to do and might be useful (but this is much more + * dubious since it's so easy to inspect the address). (This + * is done above.) + */ + if (rt->rt_flags & RTF_HOST) { + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + rt->rt_flags |= RTF_BROADCAST; + } else { +#define satosin(sa) ((struct sockaddr_in *)sa) + if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + rt->rt_flags |= RTF_LOCAL; +#undef satosin + } + } + + if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) + && rt->rt_ifp) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + ret = rn_addroute(v_arg, n_arg, head, treenodes); + if (ret == NULL && rt->rt_flags & RTF_HOST) { + struct rtentry *rt2; + /* + * We are trying to add a host route, but can't. + * Find out if it is because of an + * ARP entry and delete it if so. + */ + rt2 = rtalloc1((struct sockaddr *)sin, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt2) { + if (rt2->rt_flags & RTF_LLINFO && + rt2->rt_flags & RTF_HOST && + rt2->rt_gateway && + rt2->rt_gateway->sa_family == AF_LINK) { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt2), + rt2->rt_gateway, + rt_mask(rt2), rt2->rt_flags, 0); + ret = rn_addroute(v_arg, n_arg, head, + treenodes); + } + RTFREE(rt2); + } + } + return ret; +} + +/* + * This code is the inverse of in_clsroute: on first reference, if we + * were managing the route, stop doing so and set the expiration timer + * back off again. + */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + struct radix_node *rn = rn_match(v_arg, head); + struct rtentry *rt = (struct rtentry *)rn; + + if(rt && rt->rt_refcnt == 0) { /* this is first reference */ + if(rt->rt_flags & RTPRF_OURS) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + } + return rn; +} + +static int rtq_reallyold = 60*60; + /* one hour is ``really old'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, + &rtq_reallyold , 0, + "Default expiration time on dynamically learned routes"); + +static int rtq_minreallyold = 10; + /* never automatically crank down to less */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, + &rtq_minreallyold , 0, + "Minimum time to attempt to hold onto dynamically learned routes"); + +static int rtq_toomany = 128; + /* 128 cached routes is ``too many'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, + &rtq_toomany , 0, "Upper limit on dynamically learned routes"); + +/* + * On last reference drop, mark the route as belong to us so that it can be + * timed out. + */ +static void +in_clsroute(struct radix_node *rn, struct radix_node_head *head) +{ + struct rtentry *rt = (struct rtentry *)rn; + + if(!(rt->rt_flags & RTF_UP)) + return; /* prophylactic measures */ + + if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) + return; + + if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) + != RTF_WASCLONED) + return; + + /* + * As requested by David Greenman: + * If rtq_reallyold is 0, just delete the route without + * waiting for a timeout cycle to kill it. + */ + if(rtq_reallyold != 0) { + rt->rt_flags |= RTPRF_OURS; + rt->rt_rmx.rmx_expire = time_second + rtq_reallyold; + } else { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + } +} + +struct rtqk_arg { + struct radix_node_head *rnh; + int draining; + int killed; + int found; + int updating; + time_t nextstop; +}; + +/* + * Get rid of old routes. When draining, this deletes everything, even when + * the timeout is not expired yet. When updating, this makes sure that + * nothing has a timeout longer than the current value of rtq_reallyold. + */ +static int +in_rtqkill(struct radix_node *rn, void *rock) +{ + struct rtqk_arg *ap = rock; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if(rt->rt_flags & RTPRF_OURS) { + ap->found++; + + if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) { + if(rt->rt_refcnt > 0) + panic("rtqkill route really not free"); + + err = rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + if(err) { + log(LOG_WARNING, "in_rtqkill: error %d\n", err); + } else { + ap->killed++; + } + } else { + if(ap->updating + && (rt->rt_rmx.rmx_expire - time_second + > rtq_reallyold)) { + rt->rt_rmx.rmx_expire = time_second + + rtq_reallyold; + } + ap->nextstop = lmin(ap->nextstop, + rt->rt_rmx.rmx_expire); + } + } + + return 0; +} + +#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ +static int rtq_timeout = RTQ_TIMEOUT; + +static void +in_rtqtimo(void *rock) +{ + struct radix_node_head *rnh = rock; + struct rtqk_arg arg; + struct timeval atv; + static time_t last_adjusted_timeout = 0; + int s; + + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = time_second + rtq_timeout; + arg.draining = arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + + /* + * Attempt to be somewhat dynamic about this: + * If there are ``too many'' routes sitting around taking up space, + * then crank down the timeout, and see if we can't make some more + * go away. However, we make sure that we will never adjust more + * than once in rtq_timeout seconds, to keep from cranking down too + * hard. + */ + if((arg.found - arg.killed > rtq_toomany) + && (time_second - last_adjusted_timeout >= rtq_timeout) + && rtq_reallyold > rtq_minreallyold) { + rtq_reallyold = 2*rtq_reallyold / 3; + if(rtq_reallyold < rtq_minreallyold) { + rtq_reallyold = rtq_minreallyold; + } + + last_adjusted_timeout = time_second; +#ifdef DIAGNOSTIC + log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", + rtq_reallyold); +#endif + arg.found = arg.killed = 0; + arg.updating = 1; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + } + + atv.tv_usec = 0; + atv.tv_sec = arg.nextstop - time_second; + timeout(in_rtqtimo, rock, tvtohz(&atv)); +} + +void +in_rtqdrain(void) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + struct rtqk_arg arg; + int s; + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = 0; + arg.draining = 1; + arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); +} + +/* + * Initialize our routing tree. + */ +int +in_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + if(!rn_inithead(head, off)) + return 0; + + if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + return 1; /* only do this for the real routing table */ + + rnh = *head; + rnh->rnh_addaddr = in_addroute; + rnh->rnh_matchaddr = in_matroute; + rnh->rnh_close = in_clsroute; + in_rtqtimo(rnh); /* kick off timeout first time */ + return 1; +} + + +/* + * This zaps old routes when the interface goes down. + * Currently it doesn't delete static routes; there are + * arguments one could make for both behaviors. For the moment, + * we will adopt the Principle of Least Surprise and leave them + * alone (with the knowledge that this will not be enough for some + * people). The ones we really want to get rid of are things like ARP + * entries, since the user might down the interface, walk over to a completely + * different network, and plug back in. + */ +struct in_ifadown_arg { + struct radix_node_head *rnh; + struct ifaddr *ifa; +}; + +static int +in_ifadownkill(struct radix_node *rn, void *xap) +{ + struct in_ifadown_arg *ap = xap; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if (rt->rt_ifa == ap->ifa && !(rt->rt_flags & RTF_STATIC)) { + /* + * We need to disable the automatic prune that happens + * in this case in rtrequest() because it will blow + * away the pointers that rn_walktree() needs in order + * continue our descent. We will end up deleting all + * the routes that rtrequest() would have in any case, + * so that behavior is not needed there. + */ + rt->rt_flags &= ~RTF_PRCLONING; + err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); + if (err) { + log(LOG_WARNING, "in_ifadownkill: error %d\n", err); + } + } + return 0; +} + +int +in_ifadown(struct ifaddr *ifa) +{ + struct in_ifadown_arg arg; + struct radix_node_head *rnh; + + if (ifa->ifa_addr->sa_family != AF_INET) + return 1; + + arg.rnh = rnh = rt_tables[AF_INET]; + arg.ifa = ifa; + rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + ifa->ifa_flags &= ~IFA_ROUTE; + return 0; +} diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h new file mode 100644 index 0000000..5d18c56 --- /dev/null +++ b/sys/netinet/in_systm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_systm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_SYSTM_H_ +#define _NETINET_IN_SYSTM_H_ + +/* + * Miscellaneous internetwork + * definitions for kernel. + */ + +/* + * Network types. + * + * Internally the system keeps counters in the headers with the bytes + * swapped so that VAX instructions will work on them. It reverses + * the bytes before transmission at each protocol level. The n_ types + * represent the types with the bytes in ``high-ender'' order. + */ +typedef u_int16_t n_short; /* short as received from the net */ +typedef u_int32_t n_long; /* long as received from the net */ + +typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ + +#ifdef KERNEL +n_time iptime __P((void)); +#endif + +#endif diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h new file mode 100644 index 0000000..c04c778 --- /dev/null +++ b/sys/netinet/in_var.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_VAR_H_ +#define _NETINET_IN_VAR_H_ + +#include <sys/queue.h> + +/* + * Interface address, Internet version. One of these structures + * is allocated for each Internet address on an interface. + * The ifaddr structure contains the protocol-independent part + * of the structure and is assumed to be first. + */ +struct in_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + /* ia_{,sub}net{,mask} in host order */ + u_long ia_net; /* network number of interface */ + u_long ia_netmask; /* mask of net part */ + u_long ia_subnet; /* subnet number, including net */ + u_long ia_subnetmask; /* mask of subnet part */ + struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + TAILQ_ENTRY(in_ifaddr) ia_link; /* tailq macro glue */ + struct sockaddr_in ia_addr; /* reserve space for interface name */ + struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ +#define ia_broadaddr ia_dstaddr + struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ +}; + +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; +}; +/* + * Given a pointer to an in_ifaddr (ifaddr), + * return a pointer to the addr as a sockaddr_in. + */ +#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) +#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) + +#define IN_LNAOF(in, ifa) \ + ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) + + +#ifdef KERNEL +extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +extern struct ifqueue ipintrq; /* ip packet input queue */ +extern struct in_addr zeroin_addr; +extern u_char inetctlerrmap[]; + +/* + * Macro for finding the interface (ifnet structure) corresponding to one + * of our IP addresses. + */ +#define INADDR_TO_IFP(addr, ifp) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ +{ \ + register struct in_ifaddr *ia; \ +\ + for (ia = in_ifaddrhead.tqh_first; \ + ia != NULL && ((ia->ia_ifp->if_flags & IFF_POINTOPOINT)? \ + IA_DSTSIN(ia):IA_SIN(ia))->sin_addr.s_addr != (addr).s_addr; \ + ia = ia->ia_link.tqe_next) \ + continue; \ + if (ia == NULL) \ + for (ia = in_ifaddrhead.tqh_first; \ + ia != NULL; \ + ia = ia->ia_link.tqe_next) \ + if (ia->ia_ifp->if_flags & IFF_POINTOPOINT && \ + IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + break; \ + (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ +} + +/* + * Macro for finding the internet address structure (in_ifaddr) corresponding + * to a given interface (ifnet structure). + */ +#define IFP_TO_IA(ifp, ia) \ + /* struct ifnet *ifp; */ \ + /* struct in_ifaddr *ia; */ \ +{ \ + for ((ia) = in_ifaddrhead.tqh_first; \ + (ia) != NULL && (ia)->ia_ifp != (ifp); \ + (ia) = (ia)->ia_link.tqe_next) \ + continue; \ +} +#endif + +/* + * This information should be part of the ifnet structure but we don't wish + * to change that - as it might break a number of things + */ + +struct router_info { + struct ifnet *rti_ifp; + int rti_type; /* type of router which is querier on this interface */ + int rti_time; /* # of slow timeouts since last old query */ + struct router_info *rti_next; +}; + +/* + * Internet multicast address structure. There is one of these for each IP + * multicast group to which this host belongs on a given network interface. + * For every entry on the interface's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. They are also + * kept on a system-wide list to make it easier to keep our legacy IGMP code + * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + */ +struct in_multi { + LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ + struct in_addr inm_addr; /* IP multicast address, convenience */ + struct ifnet *inm_ifp; /* back pointer to ifnet */ + struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ + u_int inm_timer; /* IGMP membership report timer */ + u_int inm_state; /* state of the membership */ + struct router_info *inm_rti; /* router info*/ +}; + +#ifdef KERNEL + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_raw); +#endif + +extern LIST_HEAD(in_multihead, in_multi) in_multihead; + +/* + * Structure used by macros below to remember position when stepping through + * all of the in_multi records. + */ +struct in_multistep { + struct in_multi *i_inm; +}; + +/* + * Macro for looking up the in_multi record for a given IP multicast address + * on a given interface. If no matching record is found, "inm" is set null. + */ +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +do { \ + register struct ifmultiaddr *ifma; \ +\ + for (ifma = (ifp)->if_multiaddrs.lh_first; ifma; \ + ifma = ifma->ifma_link.le_next) { \ + if (ifma->ifma_addr->sa_family == AF_INET \ + && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ + (addr).s_addr) \ + break; \ + } \ + (inm) = ifma ? ifma->ifma_protospec : 0; \ +} while(0) + +/* + * Macro to step through all of the in_multi records, one at a time. + * The current position is remembered in "step", which the caller must + * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "inm" when there + * are no remaining records. + */ +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = (step).i_inm->inm_link.le_next; \ +} while(0) + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + (step).i_inm = in_multihead.lh_first; \ + IN_NEXT_MULTI((step), (inm)); \ +} while(0) + +struct route; +struct in_multi *in_addmulti __P((struct in_addr *, struct ifnet *)); +void in_delmulti __P((struct in_multi *)); +int in_control __P((struct socket *, u_long, caddr_t, struct ifnet *, + struct proc *)); +void in_rtqdrain __P((void)); +void ip_input __P((struct mbuf *)); +int in_ifadown __P((struct ifaddr *ifa)); +void in_ifscrub __P((struct ifnet *, struct in_ifaddr *)); +int ipflow_fastforward __P((struct mbuf *)); +void ipflow_create __P((const struct route *, struct mbuf *)); +void ipflow_slowtimo __P((void)); + +#endif /* KERNEL */ + +/* INET6 stuff */ +#include <netinet6/in6_var.h> + +#endif /* _NETINET_IN_VAR_H_ */ diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h new file mode 100644 index 0000000..0b6f1b6 --- /dev/null +++ b/sys/netinet/ip.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.2 (Berkeley) 6/1/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +/* + * Definitions for internet protocol version 4. + * Per RFC 791, September 1981. + */ +#define IPVERSION 4 + +/* + * Structure of an internet header, naked of options. + */ +struct ip { +#ifdef _IP_VHL + u_char ip_vhl; /* version << 4 | header length >> 2 */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif +#endif /* not _IP_VHL */ + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +}; + +#ifdef _IP_VHL +#define IP_MAKE_VHL(v, hl) ((v) << 4 | (hl)) +#define IP_VHL_HL(vhl) ((vhl) & 0x0f) +#define IP_VHL_V(vhl) ((vhl) >> 4) +#define IP_VHL_BORING 0x45 +#endif + +#define IP_MAXPACKET 65535 /* maximum packet size */ + +/* + * Definitions for IP type of service (ip_tos) + */ +#define IPTOS_LOWDELAY 0x10 +#define IPTOS_THROUGHPUT 0x08 +#define IPTOS_RELIABILITY 0x04 +#define IPTOS_MINCOST 0x02 + +/* + * Definitions for IP precedence (also in ip_tos) (hopefully unused) + */ +#define IPTOS_PREC_NETCONTROL 0xe0 +#define IPTOS_PREC_INTERNETCONTROL 0xc0 +#define IPTOS_PREC_CRITIC_ECP 0xa0 +#define IPTOS_PREC_FLASHOVERRIDE 0x80 +#define IPTOS_PREC_FLASH 0x60 +#define IPTOS_PREC_IMMEDIATE 0x40 +#define IPTOS_PREC_PRIORITY 0x20 +#define IPTOS_PREC_ROUTINE 0x00 + +/* + * Definitions for options. + */ +#define IPOPT_COPIED(o) ((o)&0x80) +#define IPOPT_CLASS(o) ((o)&0x60) +#define IPOPT_NUMBER(o) ((o)&0x1f) + +#define IPOPT_CONTROL 0x00 +#define IPOPT_RESERVED1 0x20 +#define IPOPT_DEBMEAS 0x40 +#define IPOPT_RESERVED2 0x60 + +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ + +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ +#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_SATID 136 /* satnet id */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RA 148 /* router alert */ + +/* + * Offsets to fields in options other than EOL and NOP. + */ +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_OFFSET 2 /* offset within option */ +#define IPOPT_MINOFF 4 /* min value of above */ + +/* + * Time stamp option structure. + */ +struct ip_timestamp { + u_char ipt_code; /* IPOPT_TS */ + u_char ipt_len; /* size of structure (variable) */ + u_char ipt_ptr; /* index of current entry */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ipt_flg:4, /* flags, see below */ + ipt_oflw:4; /* overflow counter */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ipt_oflw:4, /* overflow counter */ + ipt_flg:4; /* flags, see below */ +#endif + union ipt_timestamp { + n_long ipt_time[1]; + struct ipt_ta { + struct in_addr ipt_addr; + n_long ipt_time; + } ipt_ta[1]; + } ipt_timestamp; +}; + +/* flag bits for ipt_flg */ +#define IPOPT_TS_TSONLY 0 /* timestamps only */ +#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ +#define IPOPT_TS_PRESPEC 3 /* specified modules only */ + +/* bits for security (not byte swapped) */ +#define IPOPT_SECUR_UNCLASS 0x0000 +#define IPOPT_SECUR_CONFID 0xf135 +#define IPOPT_SECUR_EFTO 0x789a +#define IPOPT_SECUR_MMMM 0xbc4d +#define IPOPT_SECUR_RESTR 0xaf13 +#define IPOPT_SECUR_SECRET 0xd788 +#define IPOPT_SECUR_TOPSECRET 0x6bc5 + +/* + * Internet implementation parameters. + */ +#define MAXTTL 255 /* maximum time to live (seconds) */ +#define IPDEFTTL 64 /* default ttl, from RFC 1340 */ +#define IPFRAGTTL 60 /* time to live for frags, slowhz */ +#define IPTTLDEC 1 /* subtracted when forwarding */ + +#define IP_MSS 576 /* default maximum segment size */ + +#endif diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h new file mode 100644 index 0000000..a0fe4c1 --- /dev/null +++ b/sys/netinet/ip6.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 1995, 1996, 1997, 1998, and 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <netinet6/ip6.h> diff --git a/sys/netinet/ip_auth.c b/sys/netinet/ip_auth.c new file mode 100644 index 0000000..f0fd500 --- /dev/null +++ b/sys/netinet/ip_auth.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 1998 by Darren Reed & Guido van Rooij. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +/*static const char rcsid[] = "@(#)$Id: ip_auth.c,v 2.1.2.1 1999/09/28 11:44:04 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#endif +#if defined(KERNEL) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if _BSDI_VERSION >= 199802 +# include <sys/queue.h> +#endif +#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(bsdi) +# include <machine/cpu.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef KERNEL +# define KERNEL +# define NOT_KERNEL +#endif +#ifndef linux +# include <netinet/ip_var.h> +#endif +#ifdef NOT_KERNEL +# undef KERNEL +#endif +#ifdef __sgi +# ifdef IFF_DRVRLOCK /* IRIX6 */ +# include <sys/hashing.h> +# endif +#endif +#include <netinet/tcp.h> +#if defined(__sgi) && !defined(IFF_DRVRLOCK) /* IRIX < 6 */ +extern struct ifqueue ipintrq; /* ip packet input queue */ +#else +# ifndef linux +# if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# endif +# include <netinet/in_var.h> +# include <netinet/tcp_fsm.h> +# endif +#endif +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_auth.h" +#if !SOLARIS && !defined(linux) +# include <net/netisr.h> +# ifdef __FreeBSD__ +# include <machine/cpufunc.h> +# endif +#endif +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +#endif + + + +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_auth; +extern kmutex_t ipf_authmx; +# if SOLARIS +extern kcondvar_t ipfauthwait; +# endif +#endif +#ifdef linux +static struct wait_queue *ipfauthwait = NULL; +#endif + +int fr_authsize = FR_NUMAUTH; +int fr_authused = 0; +int fr_defaultauthage = 600; +fr_authstat_t fr_authstats; +static frauth_t fr_auth[FR_NUMAUTH]; +mb_t *fr_authpkts[FR_NUMAUTH]; +static int fr_authstart = 0, fr_authend = 0, fr_authnext = 0; +static frauthent_t *fae_list = NULL; +frentry_t *ipauth = NULL; + + +/* + * Check if a packet has authorization. If the packet is found to match an + * authorization result and that would result in a feedback loop (i.e. it + * will end up returning FR_AUTH) then return FR_BLOCK instead. + */ +u_32_t fr_checkauth(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + u_short id = ip->ip_id; + u_32_t pass; + int i; + + READ_ENTER(&ipf_auth); + for (i = fr_authstart; i != fr_authend; ) { + /* + * index becomes -2 only after an SIOCAUTHW. Check this in + * case the same packet gets sent again and it hasn't yet been + * auth'd. + */ + if ((fr_auth[i].fra_index == -2) && + (id == fr_auth[i].fra_info.fin_id) && + !bcmp((char *)fin,(char *)&fr_auth[i].fra_info,FI_CSIZE)) { + /* + * Avoid feedback loop. + */ + if (!(pass = fr_auth[i].fra_pass) || (pass & FR_AUTH)) + pass = FR_BLOCK; + RWLOCK_EXIT(&ipf_auth); + WRITE_ENTER(&ipf_auth); + fr_authstats.fas_hits++; + fr_auth[i].fra_index = -1; + fr_authused--; + if (i == fr_authstart) { + while (fr_auth[i].fra_index == -1) { + i++; + if (i == FR_NUMAUTH) + i = 0; + fr_authstart = i; + if (i == fr_authend) + break; + } + if (fr_authstart == fr_authend) { + fr_authnext = 0; + fr_authstart = fr_authend = 0; + } + } + RWLOCK_EXIT(&ipf_auth); + return pass; + } + i++; + if (i == FR_NUMAUTH) + i = 0; + } + fr_authstats.fas_miss++; + RWLOCK_EXIT(&ipf_auth); + return 0; +} + + +/* + * Check if we have room in the auth array to hold details for another packet. + * If we do, store it and wake up any user programs which are waiting to + * hear about these events. + */ +int fr_newauth(m, fin, ip +#if defined(_KERNEL) && SOLARIS +, qif) +qif_t *qif; +#else +) +#endif +mb_t *m; +fr_info_t *fin; +ip_t *ip; +{ + int i; + + WRITE_ENTER(&ipf_auth); + if (fr_authstart > fr_authend) { + fr_authstats.fas_nospace++; + RWLOCK_EXIT(&ipf_auth); + return 0; + } else { + if ((fr_authstart == 0) && (fr_authend == FR_NUMAUTH - 1)) { + fr_authstats.fas_nospace++; + RWLOCK_EXIT(&ipf_auth); + return 0; + } + } + + fr_authstats.fas_added++; + fr_authused++; + i = fr_authend++; + if (fr_authend == FR_NUMAUTH) + fr_authend = 0; + RWLOCK_EXIT(&ipf_auth); + fr_auth[i].fra_index = i; + fr_auth[i].fra_pass = 0; + fr_auth[i].fra_age = fr_defaultauthage; + bcopy((char *)fin, (char *)&fr_auth[i].fra_info, sizeof(*fin)); +#if !defined(sparc) && !defined(m68k) + /* + * No need to copyback here as we want to undo the changes, not keep + * them. + */ +# if SOLARIS && defined(_KERNEL) + if (ip == (ip_t *)m->b_rptr) +# endif + { + register u_short bo; + + bo = ip->ip_len; + ip->ip_len = htons(bo); +# if !SOLARIS /* 4.4BSD converts this ip_input.c, but I don't in solaris.c */ + bo = ip->ip_id; + ip->ip_id = htons(bo); +# endif + bo = ip->ip_off; + ip->ip_off = htons(bo); + } +#endif +#if SOLARIS && defined(_KERNEL) + m->b_rptr -= qif->qf_off; + fr_authpkts[i] = *(mblk_t **)fin->fin_mp; + fr_auth[i].fra_q = qif->qf_q; + cv_signal(&ipfauthwait); +#else + fr_authpkts[i] = m; +# if defined(linux) && defined(_KERNEL) + wake_up_interruptible(&ipfauthwait); +# else + WAKEUP(&fr_authnext); +# endif +#endif + return 1; +} + + +int fr_auth_ioctl(data, cmd, fr, frptr) +caddr_t data; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (FreeBSD_version >= 300003) +u_long cmd; +#else +int cmd; +#endif +frentry_t *fr, **frptr; +{ + mb_t *m; +#if defined(_KERNEL) +# if !SOLARIS + struct ifqueue *ifq; + int s; +# endif +#endif + frauth_t auth, *au = &auth; + frauthent_t *fae, **faep; + int i, error = 0; + + switch (cmd) + { + case SIOCINIFR : + case SIOCRMIFR : + case SIOCADIFR : + error = EINVAL; + break; + case SIOCINAFR : + case SIOCRMAFR : + case SIOCADAFR : + for (faep = &fae_list; (fae = *faep); ) + if (&fae->fae_fr == fr) + break; + else + faep = &fae->fae_next; + if (cmd == SIOCRMAFR) { + if (!fae) + error = ESRCH; + else { + WRITE_ENTER(&ipf_auth); + *faep = fae->fae_next; + *frptr = fr->fr_next; + RWLOCK_EXIT(&ipf_auth); + KFREE(fae); + } + } else { + KMALLOC(fae, frauthent_t *); + if (fae != NULL) { + IRCOPY((char *)data, (char *)&fae->fae_fr, + sizeof(fae->fae_fr)); + WRITE_ENTER(&ipf_auth); + fae->fae_age = fr_defaultauthage; + fae->fae_fr.fr_hits = 0; + fae->fae_fr.fr_next = *frptr; + *frptr = &fae->fae_fr; + fae->fae_next = *faep; + *faep = fae; + ipauth = &fae_list->fae_fr; + RWLOCK_EXIT(&ipf_auth); + } else + error = ENOMEM; + } + break; + case SIOCATHST: + READ_ENTER(&ipf_auth); + fr_authstats.fas_faelist = fae_list; + RWLOCK_EXIT(&ipf_auth); + IWCOPY((char *)&fr_authstats, data, sizeof(fr_authstats)); + break; + case SIOCAUTHW: +fr_authioctlloop: + READ_ENTER(&ipf_auth); + if ((fr_authnext != fr_authend) && fr_authpkts[fr_authnext]) { + IWCOPY((char *)&fr_auth[fr_authnext], data, + sizeof(fr_info_t)); + RWLOCK_EXIT(&ipf_auth); + WRITE_ENTER(&ipf_auth); + fr_authnext++; + if (fr_authnext == FR_NUMAUTH) + fr_authnext = 0; + RWLOCK_EXIT(&ipf_auth); + return 0; + } +#ifdef _KERNEL +# if SOLARIS + mutex_enter(&ipf_authmx); + if (!cv_wait_sig(&ipfauthwait, &ipf_authmx)) { + mutex_exit(&ipf_authmx); + return EINTR; + } + mutex_exit(&ipf_authmx); +# else +# ifdef linux + interruptible_sleep_on(&ipfauthwait); + if (current->signal & ~current->blocked) + error = -EINTR; +# else + error = SLEEP(&fr_authnext, "fr_authnext"); +# endif +# endif +#endif + RWLOCK_EXIT(&ipf_auth); + if (!error) + goto fr_authioctlloop; + break; + case SIOCAUTHR: + IRCOPY(data, (caddr_t)&auth, sizeof(auth)); + WRITE_ENTER(&ipf_auth); + i = au->fra_index; + if ((i < 0) || (i > FR_NUMAUTH) || + (fr_auth[i].fra_info.fin_id != au->fra_info.fin_id)) { + RWLOCK_EXIT(&ipf_auth); + return EINVAL; + } + m = fr_authpkts[i]; + fr_auth[i].fra_index = -2; + fr_auth[i].fra_pass = au->fra_pass; + fr_authpkts[i] = NULL; +#ifdef _KERNEL + RWLOCK_EXIT(&ipf_auth); + SPL_NET(s); +# ifndef linux + if (m && au->fra_info.fin_out) { +# if SOLARIS + error = fr_qout(fr_auth[i].fra_q, m); +# else /* SOLARIS */ +# if _BSDI_VERSION >= 199802 + error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, + NULL); +# else + error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL); +# endif +# endif /* SOLARIS */ + if (error) + fr_authstats.fas_sendfail++; + else + fr_authstats.fas_sendok++; + } else if (m) { +# if SOLARIS + error = fr_qin(fr_auth[i].fra_q, m); +# else /* SOLARIS */ + ifq = &ipintrq; + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + error = ENOBUFS; + } else { + IF_ENQUEUE(ifq, m); + schednetisr(NETISR_IP); + } +# endif /* SOLARIS */ + if (error) + fr_authstats.fas_quefail++; + else + fr_authstats.fas_queok++; + } else + error = EINVAL; +# endif +# if SOLARIS + if (error) + error = EINVAL; +# else + /* + * If we experience an error which will result in the packet + * not being processed, make sure we advance to the next one. + */ + if (error == ENOBUFS) { + fr_authused--; + fr_auth[i].fra_index = -1; + fr_auth[i].fra_pass = 0; + if (i == fr_authstart) { + while (fr_auth[i].fra_index == -1) { + i++; + if (i == FR_NUMAUTH) + i = 0; + fr_authstart = i; + if (i == fr_authend) + break; + } + if (fr_authstart == fr_authend) { + fr_authnext = 0; + fr_authstart = fr_authend = 0; + } + } + } +# endif + SPL_X(s); +#endif /* _KERNEL */ + break; + default : + error = EINVAL; + break; + } + return error; +} + + +#ifdef _KERNEL +/* + * Free all network buffer memory used to keep saved packets. + */ +void fr_authunload() +{ + register int i; + register frauthent_t *fae, **faep; + mb_t *m; + + WRITE_ENTER(&ipf_auth); + for (i = 0; i < FR_NUMAUTH; i++) { + if ((m = fr_authpkts[i])) { + FREE_MB_T(m); + fr_authpkts[i] = NULL; + fr_auth[i].fra_index = -1; + } + } + + + for (faep = &fae_list; (fae = *faep); ) { + *faep = fae->fae_next; + KFREE(fae); + } + ipauth = NULL; + RWLOCK_EXIT(&ipf_auth); +} + + +/* + * Slowly expire held auth records. Timeouts are set + * in expectation of this being called twice per second. + */ +void fr_authexpire() +{ + register int i; + register frauth_t *fra; + register frauthent_t *fae, **faep; + mb_t *m; +#if !SOLARIS + int s; +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_auth); + for (i = 0, fra = fr_auth; i < FR_NUMAUTH; i++, fra++) { + if ((!--fra->fra_age) && (m = fr_authpkts[i])) { + FREE_MB_T(m); + fr_authpkts[i] = NULL; + fr_auth[i].fra_index = -1; + fr_authstats.fas_expire++; + fr_authused--; + } + } + + for (faep = &fae_list; (fae = *faep); ) { + if (!--fae->fae_age) { + *faep = fae->fae_next; + KFREE(fae); + fr_authstats.fas_expire++; + } else + faep = &fae->fae_next; + } + ipauth = &fae_list->fae_fr; + RWLOCK_EXIT(&ipf_auth); + SPL_X(s); +} +#endif diff --git a/sys/netinet/ip_auth.h b/sys/netinet/ip_auth.h new file mode 100644 index 0000000..a7c640b --- /dev/null +++ b/sys/netinet/ip_auth.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 1997-1998 by Darren Reed & Guido Van Rooij. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_auth.h,v 2.1 1999/08/04 17:29:54 darrenr Exp $ + * + * $FreeBSD$ + */ +#ifndef __IP_AUTH_H__ +#define __IP_AUTH_H__ + +#define FR_NUMAUTH 32 + +typedef struct frauth { + int fra_age; + int fra_index; + u_32_t fra_pass; + fr_info_t fra_info; +#if SOLARIS + queue_t *fra_q; +#endif +} frauth_t; + +typedef struct frauthent { + struct frentry fae_fr; + struct frauthent *fae_next; + u_long fae_age; +} frauthent_t; + +typedef struct fr_authstat { + U_QUAD_T fas_hits; + U_QUAD_T fas_miss; + u_long fas_nospace; + u_long fas_added; + u_long fas_sendfail; + u_long fas_sendok; + u_long fas_queok; + u_long fas_quefail; + u_long fas_expire; + frauthent_t *fas_faelist; +} fr_authstat_t; + + +extern frentry_t *ipauth; +extern struct fr_authstat fr_authstats; +extern int fr_defaultauthage; +extern int fr_authstart; +extern int fr_authend; +extern int fr_authsize; +extern int fr_authused; +extern u_32_t fr_checkauth __P((ip_t *, fr_info_t *)); +extern void fr_authexpire __P((void)); +extern void fr_authunload __P((void)); +extern mb_t *fr_authpkts[]; +#if defined(_KERNEL) && SOLARIS +extern int fr_newauth __P((mb_t *, fr_info_t *, ip_t *, qif_t *)); +#else +extern int fr_newauth __P((mb_t *, fr_info_t *, ip_t *)); +#endif +#if defined(__NetBSD__) || defined(__OpenBSD__) +extern int fr_auth_ioctl __P((caddr_t, u_long, frentry_t *, frentry_t **)); +#else +extern int fr_auth_ioctl __P((caddr_t, int, frentry_t *, frentry_t **)); +#endif +#endif /* __IP_AUTH_H__ */ diff --git a/sys/netinet/ip_compat.h b/sys/netinet/ip_compat.h new file mode 100644 index 0000000..7982a5d --- /dev/null +++ b/sys/netinet/ip_compat.h @@ -0,0 +1,835 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_compat.h 1.8 1/14/96 + * $Id: ip_compat.h,v 2.1.2.1 1999/09/18 15:03:51 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_COMPAT_H__ +#define __IP_COMPAT_H__ + +#ifndef __P +# ifdef __STDC__ +# define __P(x) x +# else +# define __P(x) () +# endif +#endif +#ifndef __STDC__ +# undef const +# define const +#endif + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(_KERNEL) || defined(KERNEL) || defined(__KERNEL__) +# undef KERNEL +# undef _KERNEL +# undef __KERNEL__ +# define KERNEL +# define _KERNEL +# define __KERNEL__ +#endif + +#if defined(__SVR4) || defined(__svr4__) || defined(__sgi) +#define index strchr +# if !defined(KERNEL) +# define bzero(a,b) memset(a,0,b) +# define bcmp memcmp +# define bcopy(a,b,c) memmove(b,a,c) +# endif +#endif + +#ifndef offsetof +#define offsetof(t,m) (int)((&((t *)0L)->m)) +#endif + +#if defined(__sgi) || defined(bsdi) +struct ether_addr { + u_char ether_addr_octet[6]; +}; +#endif + +#if defined(__sgi) && !defined(IPFILTER_LKM) +# ifdef __STDC__ +# define IPL_EXTERN(ep) ipfilter##ep +# else +# define IPL_EXTERN(ep) ipfilter/**/ep +# endif +#else +# ifdef __STDC__ +# define IPL_EXTERN(ep) ipl##ep +# else +# define IPL_EXTERN(ep) ipl/**/ep +# endif +#endif + +#ifdef linux +# include <sys/sysmacros.h> +#endif +#if SOLARIS +# define MTYPE(m) ((m)->b_datap->db_type) +# include <sys/isa_defs.h> +# include <sys/ioccom.h> +# include <sys/sysmacros.h> +# include <sys/kmem.h> +/* + * because Solaris 2 defines these in two places :-/ + */ +# undef IPOPT_EOL +# undef IPOPT_NOP +# undef IPOPT_LSRR +# undef IPOPT_RR +# undef IPOPT_SSRR +# ifndef KERNEL +# define _KERNEL +# undef RES_INIT +# include <inet/common.h> +# include <inet/ip.h> +# include <inet/ip_ire.h> +# undef _KERNEL +# else /* _KERNEL */ +# include <inet/common.h> +# include <inet/ip.h> +# include <inet/ip_ire.h> +# endif /* _KERNEL */ +#else +# if !defined(__sgi) +typedef int minor_t; +#endif +#endif /* SOLARIS */ +#define IPMINLEN(i, h) ((i)->ip_len >= ((i)->ip_hl * 4 + sizeof(struct h))) + +#ifndef IP_OFFMASK +#define IP_OFFMASK 0x1fff +#endif + +#if BSD > 199306 +# define USE_QUAD_T +# define U_QUAD_T u_quad_t +# define QUAD_T quad_t +#else /* BSD > 199306 */ +# define U_QUAD_T u_long +# define QUAD_T long +#endif /* BSD > 199306 */ + +/* + * These operating systems already take care of the problem for us. + */ +#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD__) || \ + defined(__sgi) +typedef u_int32_t u_32_t; +#else +/* + * Really, any arch where sizeof(long) != sizeof(int). + */ +# if defined(__alpha__) || defined(__alpha) || defined(_LP64) +typedef unsigned int u_32_t; +# else +typedef unsigned long u_32_t; +# endif +#endif /* __NetBSD__ || __OpenBSD__ || __FreeBSD__ || __sgi */ + +#ifndef MAX +#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +/* + * Security Options for Intenet Protocol (IPSO) as defined in RFC 1108. + * + * Basic Option + * + * 00000001 - (Reserved 4) + * 00111101 - Top Secret + * 01011010 - Secret + * 10010110 - Confidential + * 01100110 - (Reserved 3) + * 11001100 - (Reserved 2) + * 10101011 - Unclassified + * 11110001 - (Reserved 1) + */ +#define IPSO_CLASS_RES4 0x01 +#define IPSO_CLASS_TOPS 0x3d +#define IPSO_CLASS_SECR 0x5a +#define IPSO_CLASS_CONF 0x96 +#define IPSO_CLASS_RES3 0x66 +#define IPSO_CLASS_RES2 0xcc +#define IPSO_CLASS_UNCL 0xab +#define IPSO_CLASS_RES1 0xf1 + +#define IPSO_AUTH_GENSER 0x80 +#define IPSO_AUTH_ESI 0x40 +#define IPSO_AUTH_SCI 0x20 +#define IPSO_AUTH_NSA 0x10 +#define IPSO_AUTH_DOE 0x08 +#define IPSO_AUTH_UN 0x06 +#define IPSO_AUTH_FTE 0x01 + +/* + * IP option #defines + */ +/*#define IPOPT_RR 7 */ +#define IPOPT_ZSU 10 /* ZSU */ +#define IPOPT_MTUP 11 /* MTUP */ +#define IPOPT_MTUR 12 /* MTUR */ +#define IPOPT_ENCODE 15 /* ENCODE */ +/*#define IPOPT_TS 68 */ +#define IPOPT_TR 82 /* TR */ +/*#define IPOPT_SECURITY 130 */ +/*#define IPOPT_LSRR 131 */ +#define IPOPT_E_SEC 133 /* E-SEC */ +#define IPOPT_CIPSO 134 /* CIPSO */ +/*#define IPOPT_SATID 136 */ +#ifndef IPOPT_SID +# define IPOPT_SID IPOPT_SATID +#endif +/*#define IPOPT_SSRR 137 */ +#define IPOPT_ADDEXT 147 /* ADDEXT */ +#define IPOPT_VISA 142 /* VISA */ +#define IPOPT_IMITD 144 /* IMITD */ +#define IPOPT_EIP 145 /* EIP */ +#define IPOPT_FINN 205 /* FINN */ + + +#if defined(__FreeBSD__) && defined(KERNEL) +# if __FreeBSD__ < 3 +# include <machine/spl.h> +# else +# if __FreeBSD__ == 3 +# if defined(IPFILTER_LKM) && !defined(ACTUALLY_LKM_NOT_KERNEL) +# define ACTUALLY_LKM_NOT_KERNEL +# endif +# endif +# endif +#endif /* __FreeBSD__ && KERNEL */ + +/* + * Build some macros and #defines to enable the same code to compile anywhere + * Well, that's the idea, anyway :-) + */ +#ifdef KERNEL +# if SOLARIS +# define ATOMIC_INC(x) { mutex_enter(&ipf_rw); (x)++; \ + mutex_exit(&ipf_rw); } +# define ATOMIC_DEC(x) { mutex_enter(&ipf_rw); (x)--; \ + mutex_exit(&ipf_rw); } +# define MUTEX_ENTER(x) mutex_enter(x) +# if 1 +# define KRWLOCK_T krwlock_t +# define READ_ENTER(x) rw_enter(x, RW_READER) +# define WRITE_ENTER(x) rw_enter(x, RW_WRITER) +# define RW_UPGRADE(x) { if (rw_tryupgrade(x) == 0) { \ + rw_exit(x); \ + rw_enter(x, RW_WRITER); } \ + } +# define MUTEX_DOWNGRADE(x) rw_downgrade(x) +# define RWLOCK_INIT(x, y, z) rw_init((x), (y), RW_DRIVER, (z)) +# define RWLOCK_EXIT(x) rw_exit(x) +# define RW_DESTROY(x) rw_destroy(x) +# else +# define KRWLOCK_T kmutex_t +# define READ_ENTER(x) mutex_enter(x) +# define WRITE_ENTER(x) mutex_enter(x) +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_INIT(x, y, z) mutex_init((x), (y), MUTEX_DRIVER, (z)) +# define RWLOCK_EXIT(x) mutex_exit(x) +# define RW_DESTROY(x) mutex_destroy(x) +# endif +# define MUTEX_EXIT(x) mutex_exit(x) +# define MTOD(m,t) (t)((m)->b_rptr) +# define IRCOPY(a,b,c) copyin((a), (b), (c)) +# define IWCOPY(a,b,c) copyout((a), (b), (c)) +# define FREE_MB_T(m) freemsg(m) +# define SPL_NET(x) ; +# define SPL_IMP(x) ; +# undef SPL_X +# define SPL_X(x) ; +# ifdef sparc +# define ntohs(x) (x) +# define ntohl(x) (x) +# define htons(x) (x) +# define htonl(x) (x) +# endif /* sparc */ +# define KMALLOC(a,b) (a) = (b)kmem_alloc(sizeof(*(a)), KM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)kmem_alloc((c), KM_NOSLEEP) +# define GET_MINOR(x) getminor(x) +typedef struct qif { + struct qif *qf_next; + ill_t *qf_ill; + kmutex_t qf_lock; + void *qf_iptr; + void *qf_optr; + queue_t *qf_in; + queue_t *qf_out; + struct qinit *qf_wqinfo; + struct qinit *qf_rqinfo; + struct qinit qf_wqinit; + struct qinit qf_rqinit; + mblk_t *qf_m; /* These three fields are for passing data up from */ + queue_t *qf_q; /* fr_qin and fr_qout to the packet processing. */ + size_t qf_off; + size_t qf_len; /* this field is used for in ipfr_fastroute */ + char qf_name[8]; + /* + * in case the ILL has disappeared... + */ + size_t qf_hl; /* header length */ +} qif_t; +extern ill_t *get_unit __P((char *)); +# define GETUNIT(n) get_unit((n)) +# else /* SOLARIS */ +# if defined(__sgi) +# define hz HZ +# include <sys/ksynch.h> +# define IPF_LOCK_PL plhi +# include <sys/sema.h> +#undef kmutex_t +typedef struct { + lock_t *l; + int pl; +} kmutex_t; +# define ATOMIC_INC(x) { MUTEX_ENTER(&ipf_rw); \ + (x)++; MUTEX_EXIT(&ipf_rw); } +# define ATOMIC_DEC(x) { MUTEX_ENTER(&ipf_rw); \ + (x)--; MUTEX_EXIT(&ipf_rw); } +# define MUTEX_ENTER(x) (x)->pl = LOCK((x)->l, IPF_LOCK_PL); +# define KRWLOCK_T kmutex_t +# define READ_ENTER(x) MUTEX_ENTER(x) +# define WRITE_ENTER(x) MUTEX_ENTER(x) +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) MUTEX_EXIT(x) +# define MUTEX_EXIT(x) UNLOCK((x)->l, (x)->pl); +# else /* __sgi */ +# define ATOMIC_INC(x) (x)++ +# define ATOMIC_DEC(x) (x)-- +# define MUTEX_ENTER(x) ; +# define READ_ENTER(x) ; +# define WRITE_ENTER(x) ; +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) ; +# define MUTEX_EXIT(x) ; +# endif /* __sgi */ +# ifndef linux +# define FREE_MB_T(m) m_freem(m) +# define MTOD(m,t) mtod(m,t) +# define IRCOPY(a,b,c) bcopy((a), (b), (c)) +# define IWCOPY(a,b,c) bcopy((a), (b), (c)) +# endif /* !linux */ +# endif /* SOLARIS */ + +# ifdef sun +# if !SOLARIS +# include <sys/kmem_alloc.h> +# define GETUNIT(n) ifunit((n), IFNAMSIZ) +# endif +# else +# ifndef linux +# define GETUNIT(n) ifunit((n)) +# endif +# endif /* sun */ + +# if defined(sun) && !defined(linux) || defined(__sgi) +# define UIOMOVE(a,b,c,d) uiomove((caddr_t)a,b,c,d) +# define SLEEP(id, n) sleep((id), PZERO+1) +# define WAKEUP(id) wakeup(id) +# define KFREE(x) kmem_free((char *)(x), sizeof(*(x))) +# define KFREES(x,s) kmem_free((char *)(x), (s)) +# if !SOLARIS +extern void m_copydata __P((struct mbuf *, int, int, caddr_t)); +extern void m_copyback __P((struct mbuf *, int, int, caddr_t)); +# endif +# ifdef __sgi +# include <sys/kmem.h> +# include <sys/ddi.h> +# define KMALLOC(a,b) (a) = (b)kmem_alloc(sizeof(*(a)), KM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)kmem_alloc((c), KM_NOSLEEP) +# define GET_MINOR(x) getminor(x) +# else +# if !SOLARIS +# define KMALLOC(a,b) (a) = (b)new_kmem_alloc(sizeof(*(a)), \ + KMEM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)new_kmem_alloc((c), KMEM_NOSLEEP) +# endif /* SOLARIS */ +# endif /* __sgi */ +# endif /* sun && !linux */ +# ifndef GET_MINOR +# define GET_MINOR(x) minor(x) +# endif +# if (BSD >= 199306) || defined(__FreeBSD__) +# include <vm/vm.h> +# if !defined(__FreeBSD__) || (defined (__FreeBSD__) && __FreeBSD__>=3) +# include <vm/vm_extern.h> +# include <sys/proc.h> +extern vm_map_t kmem_map; +# else /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD__>=3) */ +# include <vm/vm_kern.h> +# endif /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD__>=3) */ +# ifdef M_PFIL +# define KMALLOC(a, b) MALLOC((a), b, sizeof(*(a)), M_PFIL, M_NOWAIT) +# define KMALLOCS(a, b, c) MALLOC((a), b, (c), M_PFIL, M_NOWAIT) +# define KFREE(x) FREE((x), M_PFIL) +# define KFREES(x,s) FREE((x), M_PFIL) +# else +# define KMALLOC(a, b) MALLOC((a), b, sizeof(*(a)), M_TEMP, M_NOWAIT) +# define KMALLOCS(a, b, c) MALLOC((a), b, (c), M_TEMP, M_NOWAIT) +# define KFREE(x) FREE((x), M_TEMP) +# define KFREES(x,s) FREE((x), M_TEMP) +# endif /* M_PFIL */ +# define UIOMOVE(a,b,c,d) uiomove(a,b,d) +# define SLEEP(id, n) tsleep((id), PPAUSE|PCATCH, n, 0) +# define WAKEUP(id) wakeup(id) +# endif /* BSD */ +# if defined(NetBSD) && NetBSD <= 1991011 && NetBSD >= 199407 +# define SPL_NET(x) x = splsoftnet() +# define SPL_X(x) (void) splx(x) +# else +# if !SOLARIS && !defined(linux) +# define SPL_IMP(x) x = splimp() +# define SPL_NET(x) x = splnet() +# define SPL_X(x) (void) splx(x) +# endif +# endif /* NetBSD && NetBSD <= 1991011 && NetBSD >= 199407 */ +# define PANIC(x,y) if (x) panic y +#else /* KERNEL */ +# define SLEEP(x,y) ; +# define WAKEUP(x) ; +# define PANIC(x,y) ; +# define ATOMIC_INC(x) (x)++ +# define ATOMIC_DEC(x) (x)-- +# define MUTEX_ENTER(x) ; +# define READ_ENTER(x) ; +# define WRITE_ENTER(x) ; +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) ; +# define MUTEX_EXIT(x) ; +# define SPL_NET(x) ; +# define SPL_IMP(x) ; +# undef SPL_X +# define SPL_X(x) ; +# define KMALLOC(a,b) (a) = (b)malloc(sizeof(*a)) +# define KMALLOCS(a,b,c) (a) = (b)malloc(c) +# define KFREE(x) free(x) +# define KFREES(x,s) free(x) +# define GETUNIT(x) get_unit(x) +# define IRCOPY(a,b,c) bcopy((a), (b), (c)) +# define IWCOPY(a,b,c) bcopy((a), (b), (c)) +#endif /* KERNEL */ + +#if SOLARIS +typedef mblk_t mb_t; +# if SOLARIS2 >= 7 +# ifdef lint +# define ALIGN32(ptr) (ptr ? 0L : 0L) +# define ALIGN16(ptr) (ptr ? 0L : 0L) +# else +# define ALIGN32(ptr) (ptr) +# define ALIGN16(ptr) (ptr) +# endif +# endif +#else +# ifdef linux +# ifndef kernel +typedef struct mb { + struct mb *next; + u_int len; + u_char *data; +} mb_t; +# else +typedef struct sk_buff mb_t; +# endif +# else +typedef struct mbuf mb_t; +# endif +#endif /* SOLARIS */ + +#if defined(linux) || defined(__sgi) +/* + * These #ifdef's are here mainly for linux, but who knows, they may + * not be in other places or maybe one day linux will grow up and some + * of these will turn up there too. + */ +#ifndef ICMP_MINLEN +# define ICMP_MINLEN 8 +#endif +#ifndef ICMP_UNREACH +# define ICMP_UNREACH ICMP_DEST_UNREACH +#endif +#ifndef ICMP_SOURCEQUENCH +# define ICMP_SOURCEQUENCH ICMP_SOURCE_QUENCH +#endif +#ifndef ICMP_TIMXCEED +# define ICMP_TIMXCEED ICMP_TIME_EXCEEDED +#endif +#ifndef ICMP_PARAMPROB +# define ICMP_PARAMPROB ICMP_PARAMETERPROB +#endif +#ifndef ICMP_TSTAMP +# define ICMP_TSTAMP ICMP_TIMESTAMP +#endif +#ifndef ICMP_TSTAMPREPLY +# define ICMP_TSTAMPREPLY ICMP_TIMESTAMPREPLY +#endif +#ifndef ICMP_IREQ +# define ICMP_IREQ ICMP_INFO_REQUEST +#endif +#ifndef ICMP_IREQREPLY +# define ICMP_IREQREPLY ICMP_INFO_REPLY +#endif +#ifndef ICMP_MASKREQ +# define ICMP_MASKREQ ICMP_ADDRESS +#endif +#ifndef ICMP_MASKREPLY +# define ICMP_MASKREPLY ICMP_ADDRESSREPLY +#endif +#ifndef IPVERSION +# define IPVERSION 4 +#endif +#ifndef IPOPT_MINOFF +# define IPOPT_MINOFF 4 +#endif +#ifndef IPOPT_COPIED +# define IPOPT_COPIED(x) ((x)&0x80) +#endif +#ifndef IPOPT_EOL +# define IPOPT_EOL 0 +#endif +#ifndef IPOPT_NOP +# define IPOPT_NOP 1 +#endif +#ifndef IP_MF +# define IP_MF ((u_short)0x2000) +#endif +#ifndef ETHERTYPE_IP +# define ETHERTYPE_IP ((u_short)0x0800) +#endif +#ifndef TH_FIN +# define TH_FIN 0x01 +#endif +#ifndef TH_SYN +# define TH_SYN 0x02 +#endif +#ifndef TH_RST +# define TH_RST 0x04 +#endif +#ifndef TH_PUSH +# define TH_PUSH 0x08 +#endif +#ifndef TH_ACK +# define TH_ACK 0x10 +#endif +#ifndef TH_URG +# define TH_URG 0x20 +#endif +#ifndef IPOPT_EOL +# define IPOPT_EOL 0 +#endif +#ifndef IPOPT_NOP +# define IPOPT_NOP 1 +#endif +#ifndef IPOPT_RR +# define IPOPT_RR 7 +#endif +#ifndef IPOPT_TS +# define IPOPT_TS 68 +#endif +#ifndef IPOPT_SECURITY +# define IPOPT_SECURITY 130 +#endif +#ifndef IPOPT_LSRR +# define IPOPT_LSRR 131 +#endif +#ifndef IPOPT_SATID +# define IPOPT_SATID 136 +#endif +#ifndef IPOPT_SSRR +# define IPOPT_SSRR 137 +#endif +#ifndef IPOPT_SECUR_UNCLASS +# define IPOPT_SECUR_UNCLASS ((u_short)0x0000) +#endif +#ifndef IPOPT_SECUR_CONFID +# define IPOPT_SECUR_CONFID ((u_short)0xf135) +#endif +#ifndef IPOPT_SECUR_EFTO +# define IPOPT_SECUR_EFTO ((u_short)0x789a) +#endif +#ifndef IPOPT_SECUR_MMMM +# define IPOPT_SECUR_MMMM ((u_short)0xbc4d) +#endif +#ifndef IPOPT_SECUR_RESTR +# define IPOPT_SECUR_RESTR ((u_short)0xaf13) +#endif +#ifndef IPOPT_SECUR_SECRET +# define IPOPT_SECUR_SECRET ((u_short)0xd788) +#endif +#ifndef IPOPT_SECUR_TOPSECRET +# define IPOPT_SECUR_TOPSECRET ((u_short)0x6bc5) +#endif +#ifndef IPOPT_OLEN +# define IPOPT_OLEN 1 +#endif +#endif /* linux || __sgi */ + +#ifdef linux +#include <linux/in_systm.h> +/* + * TCP States + */ +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* + * file flags. + */ +#ifdef WRITE +#define FWRITE WRITE +#define FREAD READ +#else +#define FWRITE _IOC_WRITE +#define FREAD _IOC_READ +#endif +/* + * mbuf related problems. + */ +#define mtod(m,t) (t)((m)->data) +#define m_len len +#define m_next next + +#ifdef IP_DF +#undef IP_DF +#endif +#define IP_DF 0x4000 + +typedef struct { + __u16 th_sport; + __u16 th_dport; + __u32 th_seq; + __u32 th_ack; +# if defined(__i386__) || defined(__MIPSEL__) || defined(__alpha__) ||\ + defined(vax) + __u8 th_res:4; + __u8 th_off:4; +#else + __u8 th_off:4; + __u8 th_res:4; +#endif + __u8 th_flags; + __u16 th_win; + __u16 th_sum; + __u16 th_urp; +} tcphdr_t; + +typedef struct { + __u16 uh_sport; + __u16 uh_dport; + __u16 uh_ulen; + __u16 uh_sum; +} udphdr_t; + +typedef struct { +# if defined(__i386__) || defined(__MIPSEL__) || defined(__alpha__) ||\ + defined(vax) + __u8 ip_hl:4; + __u8 ip_v:4; +# else + __u8 ip_hl:4; + __u8 ip_v:4; +# endif + __u8 ip_tos; + __u16 ip_len; + __u16 ip_id; + __u16 ip_off; + __u8 ip_ttl; + __u8 ip_p; + __u16 ip_sum; + struct in_addr ip_src; + struct in_addr ip_dst; +} ip_t; + +/* + * Structure of an icmp header. + */ +typedef struct icmp { + __u8 icmp_type; /* type of message, see below */ + __u8 icmp_code; /* type sub code */ + __u16 icmp_cksum; /* ones complement cksum of struct */ + union { + __u8 ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + __u16 icd_id; + __u16 icd_seq; + } ih_idseq; + int ih_void; + } icmp_hun; +# define icmp_pptr icmp_hun.ih_pptr +# define icmp_gwaddr icmp_hun.ih_gwaddr +# define icmp_id icmp_hun.ih_idseq.icd_id +# define icmp_seq icmp_hun.ih_idseq.icd_seq +# define icmp_void icmp_hun.ih_void + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; + n_time its_ttime; + } id_ts; + struct id_ip { + ip_t idi_ip; + /* options and then 64 bits of data */ + } id_ip; + u_long id_mask; + char id_data[1]; + } icmp_dun; +# define icmp_otime icmp_dun.id_ts.its_otime +# define icmp_rtime icmp_dun.id_ts.its_rtime +# define icmp_ttime icmp_dun.id_ts.its_ttime +# define icmp_ip icmp_dun.id_ip.idi_ip +# define icmp_mask icmp_dun.id_mask +# define icmp_data icmp_dun.id_data +} icmphdr_t; + +# ifndef LINUX_IPOVLY +# define LINUX_IPOVLY +struct ipovly { + caddr_t ih_next, ih_prev; /* for protocol sequence q's */ + u_char ih_x1; /* (unused) */ + u_char ih_pr; /* protocol */ + short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; +# endif + +typedef struct { + __u8 ether_dhost[6]; + __u8 ether_shost[6]; + __u16 ether_type; +} ether_header_t; + +typedef struct uio { + int uio_resid; + int uio_rw; + caddr_t uio_buf; +} uio_t; + +# define UIO_READ 0 +# define UIO_WRITE 1 +# define UIOMOVE(a, b, c, d) uiomove(a,b,c,d) + +/* + * For masking struct ifnet onto struct device + */ +# define if_name name + +# ifdef KERNEL +# define GETUNIT(x) dev_get(x) +# define FREE_MB_T(m) kfree_skb(m, FREE_WRITE) +# define uniqtime do_gettimeofday +# undef INT_MAX +# undef UINT_MAX +# undef LONG_MAX +# undef ULONG_MAX +# include <linux/netdevice.h> +# define SPL_X(x) +# define SPL_NET(x) +# define SPL_IMP(x) + +# define bcmp(a,b,c) memcmp(a,b,c) +# define bcopy(a,b,c) memcpy(b,a,c) +# define bzero(a,c) memset(a,0,c) + +# define UNITNAME(n) dev_get((n)) + +# define KMALLOC(a,b) (a) = (b)kmalloc(sizeof(*(a)), GFP_ATOMIC) +# define KMALLOCS(a,b,c) (a) = (b)kmalloc((c), GFP_ATOMIC) +# define KFREE(x) kfree_s((x), sizeof(*(x))) +# define KFREES(x,s) kfree_s((x), (s)) +# define IRCOPY(a,b,c) { \ + error = verify_area(VERIFY_READ, (a) ,(c)); \ + if (!error) \ + memcpy_fromfs((b), (a), (c)); \ + } +# define IWCOPY(a,b,c) { \ + error = verify_area(VERIFY_WRITE, (b), (c)); \ + if (!error) \ + memcpy_tofs((b), (a), (c)); \ + } +# else +# define __KERNEL__ +# undef INT_MAX +# undef UINT_MAX +# undef LONG_MAX +# undef ULONG_MAX +# define s8 __s8 +# define u8 __u8 +# define s16 __s16 +# define u16 __u16 +# define s32 __s32 +# define u32 __u32 +# include <linux/netdevice.h> +# undef __KERNEL__ +# endif +# define ifnet device +#else +typedef struct tcphdr tcphdr_t; +typedef struct udphdr udphdr_t; +typedef struct icmp icmphdr_t; +typedef struct ip ip_t; +typedef struct ether_header ether_header_t; +#endif /* linux */ +typedef struct tcpiphdr tcpiphdr_t; + +#if defined(hpux) || defined(linux) +struct ether_addr { + char ether_addr_octet[6]; +}; +#endif + +/* + * XXX - This is one of those *awful* hacks which nobody likes + */ +#ifdef ultrix +#define A_A +#else +#define A_A & +#endif + +#ifndef ICMP_ROUTERADVERT +# define ICMP_ROUTERADVERT 9 +#endif +#ifndef ICMP_ROUTERSOLICIT +# define ICMP_ROUTERSOLICIT 10 +#endif +/* + * ICMP error replies have an IP header (20 bytes), 8 bytes of ICMP data, + * another IP header and then 64 bits of data, totalling 56. Of course, + * the last 64 bits is dependant on that being available. + */ +#define ICMPERR_ICMPHLEN 8 +#define ICMPERR_IPICMPHLEN (20 + 8) +#define ICMPERR_MINPKTLEN (20 + 8 + 20) +#define ICMPERR_MAXPKTLEN (20 + 8 + 20 + 8) + +#endif /* __IP_COMPAT_H__ */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c new file mode 100644 index 0000000..a2b76b3 --- /dev/null +++ b/sys/netinet/ip_divert.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_ipfw.h" +#include "opt_ipdivert.h" + +#ifndef INET +#error "IPDIVERT requires INET." +#endif + +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/socketvar.h> +#include <sys/systm.h> +#include <sys/proc.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +/* + * Divert sockets + */ + +/* + * Allocate enough space to hold a full IP packet + */ +#define DIVSNDQ (65536 + 100) +#define DIVRCVQ (65536 + 100) + +/* + * A 16 bit cookie is passed to and from the user process. + * The user process can send it back to help the caller know + * something about where the packet originally came from. + * + * In the case of ipfw, then the cookie is the rule that sent + * us here. On reinjection is is the rule after which processing + * should continue. Leaving it the same will make processing start + * at the rule number after that which sent it here. Setting it to + * 0 will restart processing at the beginning. + * + * For divert_packet(), ip_divert_cookie is an input value only. + * For div_output(), ip_divert_cookie is an output value only. + */ +u_int16_t ip_divert_cookie; + +/* Internal variables */ +static struct inpcbhead divcb; +static struct inpcbinfo divcbinfo; + +static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ +static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ + +/* Optimization: have this preinitialized */ +static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; + +/* Internal functions */ +static int div_output(struct socket *so, + struct mbuf *m, struct sockaddr *addr, struct mbuf *control); + +/* + * Initialize divert connection block queue. + */ +void +div_init(void) +{ + LIST_INIT(&divcb); + divcbinfo.listhead = &divcb; + /* + * XXX We don't use the hash list for divert IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); + divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); + divcbinfo.ipi_zone = zinit("divcb", sizeof(struct inpcb), + maxsockets, ZONE_INTERRUPT, 0); +} + +/* + * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets + * with that protocol number to enter the system from the outside. + */ +void +div_input(struct mbuf *m, int hlen) +{ + ipstat.ips_noproto++; + m_freem(m); +} + +/* + * Divert a packet by passing it up to the divert socket at port 'port'. + * + * Setup generic address and protocol structures for div_input routine, + * then pass them along with mbuf chain. + */ +void +divert_packet(struct mbuf *m, int incoming, int port) +{ + struct ip *ip; + struct inpcb *inp; + struct socket *sa; + u_int16_t nport; + + /* Sanity check */ + KASSERT(port != 0, ("%s: port=0", __FUNCTION__)); + + /* Record and reset divert cookie */ + divsrc.sin_port = ip_divert_cookie; + ip_divert_cookie = 0; + + /* Assure header */ + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == 0) { + return; + } + ip = mtod(m, struct ip *); + + /* + * Record receive interface address, if any. + * But only for incoming packets. + */ + divsrc.sin_addr.s_addr = 0; + if (incoming) { + struct ifaddr *ifa; + + /* Sanity check */ + KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __FUNCTION__)); + + /* Find IP address for receive interface */ + for (ifa = m->m_pkthdr.rcvif->if_addrhead.tqh_first; + ifa != NULL; ifa = ifa->ifa_link.tqe_next) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + divsrc.sin_addr = + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + break; + } + } + /* + * Record the incoming interface name whenever we have one. + */ + bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero)); + if (m->m_pkthdr.rcvif) { + /* + * Hide the actual interface name in there in the + * sin_zero array. XXX This needs to be moved to a + * different sockaddr type for divert, e.g. + * sockaddr_div with multiple fields like + * sockaddr_dl. Presently we have only 7 bytes + * but that will do for now as most interfaces + * are 4 or less + 2 or less bytes for unit. + * There is probably a faster way of doing this, + * possibly taking it from the sockaddr_dl on the iface. + * This solves the problem of a P2P link and a LAN interface + * having the same address, which can result in the wrong + * interface being assigned to the packet when fed back + * into the divert socket. Theoretically if the daemon saves + * and re-uses the sockaddr_in as suggested in the man pages, + * this iface name will come along for the ride. + * (see div_output for the other half of this.) + */ + snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero), + "%s%d", m->m_pkthdr.rcvif->if_name, + m->m_pkthdr.rcvif->if_unit); + } + + /* Put packet on socket queue, if any */ + sa = NULL; + nport = htons((u_int16_t)port); + for (inp = divcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_lport == nport) + sa = inp->inp_socket; + } + if (sa) { + if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(sa); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Deliver packet back into the IP processing machinery. + * + * If no address specified, or address is 0.0.0.0, send to ip_output(); + * otherwise, send to ip_input() and mark as having been received on + * the interface with that address. + */ +static int +div_output(so, m, addr, control) + struct socket *so; + register struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; +{ + register struct inpcb *const inp = sotoinpcb(so); + register struct ip *const ip = mtod(m, struct ip *); + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + int error = 0; + + if (control) + m_freem(control); /* XXX */ + + /* Loopback avoidance and state recovery */ + if (sin) { + int len = 0; + char *c = sin->sin_zero; + + ip_divert_cookie = sin->sin_port; + + /* + * Find receive interface with the given name or IP address. + * The name is user supplied data so don't trust it's size or + * that it is zero terminated. The name has priority. + * We are presently assuming that the sockaddr_in + * has not been replaced by a sockaddr_div, so we limit it + * to 16 bytes in total. the name is stuffed (if it exists) + * in the sin_zero[] field. + */ + while (*c++ && (len++ < sizeof(sin->sin_zero))); + if ((len > 0) && (len < sizeof(sin->sin_zero))) + m->m_pkthdr.rcvif = ifunit(sin->sin_zero); + } else { + ip_divert_cookie = 0; + } + + /* Reinject packet into the system as incoming or outgoing */ + if (!sin || sin->sin_addr.s_addr == 0) { + /* + * Don't allow both user specified and setsockopt options, + * and don't allow packet length sizes that will crash + */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + goto cantsend; + } + + /* Convert fields to host order for ip_output() */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + + /* Send packet to output processing */ + ipstat.ips_rawout++; /* XXX */ + error = ip_output(m, inp->inp_options, &inp->inp_route, + (so->so_options & SO_DONTROUTE) | + IP_ALLOWBROADCAST | IP_RAWOUTPUT, inp->inp_moptions); + } else { + struct ifaddr *ifa; + + /* If no luck with the name above. check by IP address. */ + if (m->m_pkthdr.rcvif == NULL) { + /* + * Make sure there are no distractions + * for ifa_ifwithaddr. Clear the port and the ifname. + * Maybe zap all 8 bytes at once using a 64bit write? + */ + bzero(sin->sin_zero, sizeof(sin->sin_zero)); + /* *((u_int64_t *)sin->sin_zero) = 0; */ /* XXX ?? */ + sin->sin_port = 0; + if (!(ifa = ifa_ifwithaddr((struct sockaddr *) sin))) { + error = EADDRNOTAVAIL; + goto cantsend; + } + m->m_pkthdr.rcvif = ifa->ifa_ifp; + } + + /* Send packet to input processing */ + ip_input(m); + } + + /* paranoid: Reset for next time (and other packets) */ + /* almost definitly already done in the ipfw filter but.. */ + ip_divert_cookie = 0; + return error; + +cantsend: + m_freem(m); + ip_divert_cookie = 0; + return error; +} + +static int +div_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("div_attach"); + if (p && (error = suser(p)) != 0) + return error; + + s = splnet(); + error = in_pcballoc(so, &divcbinfo, p); + splx(s); + if (error) + return error; + error = soreserve(so, div_sendspace, div_recvspace); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = proto; + inp->inp_flags |= INP_HDRINCL; + /* The socket is always "connected" because + we always know "where" to send the packet */ + so->so_state |= SS_ISCONNECTED; + return 0; +} + +static int +div_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("div_detach"); + in_pcbdetach(inp); + return 0; +} + +static int +div_abort(struct socket *so) +{ + soisdisconnected(so); + return div_detach(so); +} + +static int +div_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return div_abort(so); +} + +static int +div_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s; + int error; + + s = splnet(); + inp = sotoinpcb(so); + error = in_pcbbind(inp, nam, p); + splx(s); + return 0; +} + +static int +div_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + /* Packet must have a header (but that's about it) */ + if (m->m_len < sizeof (struct ip) || + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + m_freem(m); + return EINVAL; + } + + /* Send packet */ + return div_output(so, m, nam, control); +} + +struct pr_usrreqs div_usrreqs = { + div_abort, pru_accept_notsupp, div_attach, div_bind, + pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, + div_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c new file mode 100644 index 0000000..401e24d --- /dev/null +++ b/sys/netinet/ip_dummynet.c @@ -0,0 +1,644 @@ +/* + * Copyright (c) 1998 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +/* + * This module implements IP dummynet, a bandwidth limiter/delay emulator + * used in conjunction with the ipfw package. + * + * Changes: + * + * 980821: changed conventions in the queueing logic + * packets passed from dummynet to ip_in/out are prepended with + * a vestigial mbuf type MT_DUMMYNET which contains a pointer + * to the matching rule. + * ip_input/output will extract the parameters, free the vestigial mbuf, + * and do the processing. + * + * 980519: fixed behaviour when deleting rules. + * 980518: added splimp()/splx() to protect against races + * 980513: initial release + */ + +/* include files marked with XXX are probably not needed */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/queue.h> /* XXX */ +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/ip_var.h> + +#include "opt_bdg.h" +#ifdef BRIDGE +#include <netinet/if_ether.h> /* for struct arpcom */ +#include <net/bridge.h> +#endif + +static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ + +static int dn_debug = 0 ; /* verbose */ +static int dn_calls = 0 ; /* number of calls */ +static int dn_idle = 1; +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dn_debug, 0, ""); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, calls, CTLFLAG_RD, &dn_calls, 0, ""); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, idle, CTLFLAG_RD, &dn_idle, 0, ""); +#endif + +static int ip_dn_ctl(struct sockopt *sopt); + +static void rt_unref(struct rtentry *); +static void dummynet(void *); +static void dn_restart(void); +static void dn_move(struct dn_pipe *pipe, int immediate); +static void dummynet_flush(void); + +/* + * the following is needed when deleting a pipe, because rules can + * hold references to the pipe. + */ +extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain; + +/* + * invoked to reschedule the periodic task if necessary. + * Should only be called when dn_idle = 1 ; + */ +static void +dn_restart() +{ + struct dn_pipe *pipe; + + if (!dn_idle) + return; + + for (pipe = all_pipes ; pipe ; pipe = pipe->next ) { + /* if there any pipe that needs work, restart */ + if (pipe->r.head || pipe->p.head || pipe->numbytes < 0 ) { + dn_idle = 0; + timeout(dummynet, NULL, 1); + return ; + } + } +} + +static void +rt_unref(struct rtentry *rt) +{ + if (rt == NULL) + return ; + if (rt->rt_refcnt <= 0) + printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); + RTFREE(rt); +} + +/* + * move packets from R-queue to P-queue + */ +static void +dn_move(struct dn_pipe *pipe, int immediate) +{ + struct dn_pkt *pkt; + + /* + * consistency check, should catch new pipes which are + * not initialized properly. + */ + if ( pipe->p.head == NULL && + pipe->ticks_from_last_insert != pipe->delay) { + printf("Warning, empty pipe and delay %d (should be %d)\n", + pipe->ticks_from_last_insert, pipe->delay); + pipe->ticks_from_last_insert = pipe->delay; + } + /* this ought to go in dn_dequeue() */ + if (!immediate && pipe->ticks_from_last_insert < pipe->delay) + pipe->ticks_from_last_insert++; + if ((pkt = pipe->r.head) != NULL) { + /* + * Move at most numbytes bytes from src and move to dst. + * delay is set to ticks_from_last_insert, which + * is reset after the first insertion; + */ + while ( pkt ) { + int len = pkt->dn_m->m_pkthdr.len ; + + /* + * queue limitation: pass packets down if the len is + * such that the pkt would go out before the next tick. + */ + if (pipe->bandwidth) { + int len_scaled = len*8*hz ; + /* numbytes is in bit/sec, scaled 8*hz ... */ + if (pipe->numbytes < len_scaled) + break; + pipe->numbytes -= len_scaled; + } + pipe->r_len--; /* elements in queue */ + pipe->r_len_bytes -= len ; + + /* + * to add delay jitter, must act here. A lower value + * (bounded to 0) means lower delay. + */ + pkt->delay = pipe->ticks_from_last_insert; + pipe->ticks_from_last_insert = 0; + /* compensate the decrement done next in dn_dequeue */ + if (!immediate && pkt->delay >0 && pipe->p.head==NULL) + pkt->delay++; + if (pipe->p.head == NULL) + pipe->p.head = pkt; + else + (struct dn_pkt *)pipe->p.tail->dn_next = pkt; + pipe->p.tail = pkt; + pkt = (struct dn_pkt *)pkt->dn_next; + pipe->p.tail->dn_next = NULL; + } + pipe->r.head = pkt; + + /*** XXX just a sanity check */ + if ( ( pkt == NULL && pipe->r_len != 0) || + ( pkt != NULL && pipe->r_len == 0) ) + printf("-- Warning, pipe head %p len %d\n", + (void *)pkt, pipe->r_len); + } + + /* + * deliver packets downstream after the delay in the P-queue. + */ + + if (pipe->p.head == NULL) + return; + if (!immediate) + pipe->p.head->delay--; + while ( (pkt = pipe->p.head) && pkt->delay < 1) { + /* + * first unlink, then call procedures since ip_input() + * can result in a call to ip_output cnd viceversa, + * thus causing nested calls + */ + pipe->p.head = (struct dn_pkt *) pkt->dn_next ; + + /* + * the trick to avoid flow-id settings here is to prepend a + * vestigial mbuf to the packet, with the following values: + * m_type = MT_DUMMYNET + * m_next = the actual mbuf to be processed by ip_input/output + * m_data = the matching rule + * The vestigial element is the same memory area used by + * the dn_pkt, and IS FREED HERE because it can contain + * parameters passed to the called routine. The buffer IS NOT + * A REAL MBUF, just a block of memory acquired with malloc(). + */ + switch (pkt->dn_dir) { + case DN_TO_IP_OUT: { + struct route *ro = &(pkt->ro) ; + + (void)ip_output((struct mbuf *)pkt, (struct mbuf *)pkt->ifp, + ro, pkt->dn_dst, NULL); + rt_unref (ro->ro_rt) ; + } + break ; + case DN_TO_IP_IN : + ip_input((struct mbuf *)pkt) ; + break ; +#ifdef BRIDGE + case DN_TO_BDG_FWD : { + struct mbuf *m = (struct mbuf *)pkt ; + + bdg_forward(&m, pkt->ifp); + if (m) + m_freem(m); + } + break ; +#endif + default: + printf("dummynet: bad switch %d!\n", pkt->dn_dir); + m_freem(pkt->dn_m); + break ; + } + FREE(pkt, M_IPFW); + } +} +/* + * this is the periodic task that moves packets between the R- + * and the P- queue + */ +/*ARGSUSED*/ +void +dummynet(void * __unused unused) +{ + struct dn_pipe *p ; + int s ; + + dn_calls++ ; + for (p = all_pipes ; p ; p = p->next ) { + /* + * Increment the amount of data that can be sent. However, + * don't do that if the channel is idle + * (r.head == NULL && numbytes >= bandwidth). + * This bug fix is from tim shepard (shep@bbn.com) + */ + s = splimp(); + if (p->r.head != NULL || p->numbytes < p->bandwidth ) + p->numbytes += p->bandwidth ; + dn_move(p, 0); /* is it really 0 (also below) ? */ + splx(s); + } + + /* + * finally, if some queue has data, restart the timer. + */ + s = splimp(); + dn_idle = 1; + dn_restart(); + splx(s); +} + +/* + * dummynet hook for packets. + * input and output use the same code, so i use bit 16 in the pipe + * number to chose the direction: 1 for output packets, 0 for input. + * for input, only m is significant. For output, also the others. + */ +int +dummynet_io(int pipe_nr, int dir, + struct mbuf *m, struct ifnet *ifp, struct route *ro, + struct sockaddr_in *dst, + struct ip_fw_chain *rule) +{ + struct dn_pkt *pkt; + struct dn_pipe *pipe; + int len = m->m_pkthdr.len ; + + int s=splimp(); + + pipe_nr &= 0xffff ; + /* + * locate pipe. First time is expensive, next have direct access. + */ + + if ( (pipe = rule->rule->pipe_ptr) == NULL ) { + for (pipe=all_pipes; pipe && pipe->pipe_nr !=pipe_nr; pipe=pipe->next) + ; + if (pipe == NULL) { + splx(s); + if (dn_debug) + printf("warning, pkt for no pipe %d\n", pipe_nr); + m_freem(m); + return 0 ; + } else + rule->rule->pipe_ptr = pipe ; + } + + /* + * should i drop ? + * This section implements random packet drop. + */ + if ( (pipe->plr && random() < pipe->plr) || + (pipe->queue_size && pipe->r_len >= pipe->queue_size) || + (pipe->queue_size_bytes && + len + pipe->r_len_bytes > pipe->queue_size_bytes) || + (pkt = (struct dn_pkt *)malloc(sizeof (*pkt), + M_IPFW, M_NOWAIT) ) == NULL ) { + splx(s); + if (dn_debug) + printf("-- dummynet: drop from pipe %d, have %d pks, %d bytes\n", + pipe_nr, pipe->r_len, pipe->r_len_bytes); + pipe->r_drops++ ; + m_freem(m); + return 0 ; /* XXX error */ + } + bzero(pkt, sizeof(*pkt) ); + /* build and enqueue packet */ + pkt->hdr.mh_type = MT_DUMMYNET ; + (struct ip_fw_chain *)pkt->hdr.mh_data = rule ; + pkt->dn_next = NULL; + pkt->dn_m = m; + pkt->dn_dir = dir ; + pkt->delay = 0; + + pkt->ifp = ifp; + if (dir == DN_TO_IP_OUT) { + /* + * we need to copy *ro because for icmp pkts (and maybe others) + * the caller passed a pointer into the stack. + */ + pkt->ro = *ro; + if (ro->ro_rt) + ro->ro_rt->rt_refcnt++ ; /* XXX */ + /* + * and again, dst might be a pointer into *ro... + */ + if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ + dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; + + pkt->dn_dst = dst; /* XXX this can't be right! */ + } + if (pipe->r.head == NULL) + pipe->r.head = pkt; + else + (struct dn_pkt *)pipe->r.tail->dn_next = pkt; + pipe->r.tail = pkt; + pipe->r_len++; + pipe->r_len_bytes += len ; + + /* + * here we could implement RED if we like to + */ + + if (pipe->r.head == pkt) { /* process immediately */ + dn_move(pipe, 1); + } + if (dn_idle) + dn_restart(); + splx(s); + return 0; +} + +/* + * dispose all packets queued on a pipe + */ +static void +purge_pipe(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt, *n ; + struct rtentry *tmp_rt ; + + for (pkt = pipe->r.head ; pkt ; ) { + rt_unref (tmp_rt = pkt->ro.ro_rt ) ; + m_freem(pkt->dn_m); + n = pkt ; + pkt = (struct dn_pkt *)pkt->dn_next ; + free(n, M_IPFW) ; + } + for (pkt = pipe->p.head ; pkt ; ) { + rt_unref (tmp_rt = pkt->ro.ro_rt ) ; + m_freem(pkt->dn_m); + n = pkt ; + pkt = (struct dn_pkt *)pkt->dn_next ; + free(n, M_IPFW) ; + } +} + +/* + * delete all pipes returning memory + */ +static void +dummynet_flush() +{ + struct dn_pipe *q, *p = all_pipes ; + int s = splnet() ; + + all_pipes = NULL ; + splx(s) ; + /* + * purge all queued pkts and delete all pipes + */ + for ( ; p ; ) { + purge_pipe(p); + q = p ; + p = p->next ; + free(q, M_IPFW); + } +} + +extern struct ip_fw_chain *ip_fw_default_rule ; +/* + * when a firewall rule is deleted, scan all pipes and remove the flow-id + * from packets matching this rule. + */ +void +dn_rule_delete(void *r) +{ + struct dn_pipe *p ; + int matches = 0 ; + + for ( p = all_pipes ; p ; p = p->next ) { + struct dn_pkt *x ; + for (x = p->r.head ; x ; x = (struct dn_pkt *)x->dn_next ) + if (x->hdr.mh_data == r) { + matches++ ; + x->hdr.mh_data = (void *)ip_fw_default_rule ; + } + for (x = p->p.head ; x ; x = (struct dn_pkt *)x->dn_next ) + if (x->hdr.mh_data == r) { + matches++ ; + x->hdr.mh_data = (void *)ip_fw_default_rule ; + } + } + printf("dn_rule_delete, r %p, default %p%s, %d matches\n", + (void *)r, (void *)ip_fw_default_rule, + r == ip_fw_default_rule ? " AARGH!":"", matches); +} + +/* + * handler for the various dummynet socket options + * (get, flush, config, del) + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + int error = 0 ; + size_t size ; + char *buf, *bp ; + struct dn_pipe *p, tmp_pipe ; + + struct dn_pipe *x, *a, *b ; + + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) + return (EPERM); + + switch (sopt->sopt_name) { + default : + panic("ip_dn_ctl -- unknown option"); + + case IP_DUMMYNET_GET : + for (p = all_pipes, size = 0 ; p ; p = p->next ) + size += sizeof( *p ) ; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == 0) { + error = ENOBUFS ; + break ; + } + for (p = all_pipes, bp = buf ; p ; p = p->next ) { + struct dn_pipe *q = (struct dn_pipe *)bp ; + + bcopy(p, bp, sizeof( *p ) ); + /* + * return bw and delay in bits/s and ms, respectively + */ + q->delay = (q->delay * 1000) / hz ; + bp += sizeof( *p ) ; + } + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + break ; + case IP_DUMMYNET_FLUSH : + dummynet_flush() ; + break ; + case IP_DUMMYNET_CONFIGURE : + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + /* + * The config program passes parameters as follows: + * bandwidth = bits/second (0 = no limits); + * delay = ms + * must be translated in ticks. + * queue_size = slots (0 = no limit) + * queue_size_bytes = bytes (0 = no limit) + * only one can be set, must be bound-checked + */ + p->delay = ( p->delay * hz ) / 1000 ; + if (p->queue_size == 0 && p->queue_size_bytes == 0) + p->queue_size = 50 ; + if (p->queue_size != 0 ) /* buffers are prevailing */ + p->queue_size_bytes = 0 ; + if (p->queue_size > 100) + p->queue_size = 50 ; + if (p->queue_size_bytes > 1024*1024) + p->queue_size_bytes = 1024*1024 ; +#if 0 + printf("ip_dn: config pipe %d %d bit/s %d ms %d bufs\n", + p->pipe_nr, + p->bandwidth * 8 * hz , + p->delay * 1000 / hz , p->queue_size); +#endif + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + if (b && b->pipe_nr == p->pipe_nr) { + /* XXX should spl and flush old pipe... */ + b->bandwidth = p->bandwidth ; + b->delay = p->delay ; + b->ticks_from_last_insert = p->delay ; + b->queue_size = p->queue_size ; + b->queue_size_bytes = p->queue_size_bytes ; + b->plr = p->plr ; + } else { + int s ; + x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; + if (x == NULL) { + printf("ip_dummynet.c: sorry no memory\n"); + error = ENOSPC ; + break ; + } + bzero(x, sizeof(*x) ); + x->bandwidth = p->bandwidth ; + x->delay = p->delay ; + x->ticks_from_last_insert = p->delay ; + x->pipe_nr = p->pipe_nr ; + x->queue_size = p->queue_size ; + x->queue_size_bytes = p->queue_size_bytes ; + x->plr = p->plr ; + + s = splnet() ; + x->next = b ; + if (a == NULL) + all_pipes = x ; + else + a->next = x ; + splx(s); + } + break ; + + case IP_DUMMYNET_DEL : + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */ + int s = splnet() ; + struct ip_fw_chain *chain = ip_fw_chain.lh_first; + + if (a == NULL) + all_pipes = b->next ; + else + a->next = b->next ; + /* + * remove references to this pipe from the ip_fw rules. + */ + for (; chain; chain = chain->chain.le_next) { + register struct ip_fw *const f = chain->rule; + if (f->pipe_ptr == b) + f->pipe_ptr = NULL ; + } + splx(s); + purge_pipe(b); /* remove pkts from here */ + free(b, M_IPFW); + } + break ; + } + return error ; +} + +static void +ip_dn_init(void) +{ + printf("DUMMYNET initialized (990811)\n"); + all_pipes = NULL ; + ip_dn_ctl_ptr = ip_dn_ctl; +} + +static ip_dn_ctl_t *old_dn_ctl_ptr; + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + int s; + + switch (type) { + case MOD_LOAD: + s = splnet(); + old_dn_ctl_ptr = ip_dn_ctl_ptr; + ip_dn_init(); + splx(s); + break; + case MOD_UNLOAD: + s = splnet(); + ip_dn_ctl_ptr = old_dn_ctl_ptr; + splx(s); + dummynet_flush(); + break; + default: + break; + } + return 0; +} + +static moduledata_t dummynet_mod = { + "dummynet", + dummynet_modevent, + NULL +}; +DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h new file mode 100644 index 0000000..55d9614 --- /dev/null +++ b/sys/netinet/ip_dummynet.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 1998 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of dummynet data structures. + * Dummynet handles a list of pipes, each one identified by a unique + * number (hopefully the list is short so we use a linked list). + * + * Each list contains a set of parameters identifying the pipe, and + * a set of packets queued on the pipe itself. + * + * I could have used queue macros, but the management i have + * is pretty simple and this makes the code more portable. + */ + +/* + * struct dn_pkt identifies a packet in the dummynet queue. The + * first part is really an m_hdr for implementation purposes, and some + * fields are saved there. When passing the packet back to the ip_input/ + * ip_output(), the struct is prepended to the mbuf chain with type + * MT_DUMMYNET, and contains the pointer to the matching rule. + */ +struct dn_pkt { + struct m_hdr hdr ; +#define dn_next hdr.mh_nextpkt /* next element in queue */ +#define dn_m hdr.mh_next /* packet to be forwarded */ +#define dn_dst hdr.mh_len /* dst, for ip_output */ +#define dn_dir hdr.mh_flags /* IP_FW_F_IN or IP_FW_F_OUT */ + int delay; /* stays queued until delay=0 */ + struct ifnet *ifp; /* interface, for ip_output */ + struct route ro; /* route, for ip_output. MUST COPY */ + +#ifdef DUMMYNET_DEBUG + struct timeval beg, mid; /* testing only */ + int act_delay; /* testing only */ + int in_delay; /* testing only */ +#endif +}; + +struct dn_queue { + struct dn_pkt *head, *tail; +} ; + +/* + * descriptor of a pipe. The flags field will be used to speed up the + * forwarding code paths, in case some of the parameters are not + * used. + */ +struct dn_pipe { /* a pipe */ + struct dn_pipe *next ; + + u_short pipe_nr ; /* number */ + u_short flags ; /* to speed up things */ +#define DN_HAVE_BW 1 +#define DN_HAVE_QUEUE 2 +#define DN_HAVE_DELAY 4 + int bandwidth; /* really, bytes/tick. */ + int queue_size ; + int queue_size_bytes ; + int delay ; /* really, ticks */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct dn_queue r; + int r_len; /* elements in r_queue */ + int r_len_bytes; /* bytes in r_queue */ + int r_drops; /* drops from r_queue */ + struct dn_queue p ; + int ticks_from_last_insert; + long numbytes; /* which can send or receive */ +}; + +/* + * The following is used to define a new mbuf type that is + * prepended to the packet when it comes out of a pipe. The definition + * ought to go in /sys/sys/mbuf.h but here it is less intrusive. + */ + +#define MT_DUMMYNET MT_CONTROL +/* + * what to do of a packet when it comes out of a pipe + */ +#define DN_TO_IP_OUT 1 +#define DN_TO_IP_IN 2 +#define DN_TO_BDG_FWD 3 + +#ifdef KERNEL + +MALLOC_DECLARE(M_IPFW); + +typedef int ip_dn_ctl_t __P((struct sockopt *)) ; +extern ip_dn_ctl_t *ip_dn_ctl_ptr; + +void dn_rule_delete(void *r); /* used in ip_fw.c */ +int dummynet_io(int pipe, int dir, + struct mbuf *m, struct ifnet *ifp, struct route *ro, + struct sockaddr_in * dst, + struct ip_fw_chain *rule); +#endif /* KERNEL */ + +#endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_fil.c b/sys/netinet/ip_fil.c new file mode 100644 index 0000000..9d95eb3 --- /dev/null +++ b/sys/netinet/ip_fil.c @@ -0,0 +1,1471 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_fil.c 2.41 6/5/96 (C) 1993-1995 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: ip_fil.c,v 2.4.2.7 1999/10/15 13:49:43 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif +#include <sys/param.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if defined(__FreeBSD__) && !defined(__FreeBSD_version) +# if !defined(_KERNEL) || defined(IPFILTER_LKM) +# include <osreldate.h> +# endif +#endif +#ifndef _KERNEL +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +# include <ctype.h> +# include <fcntl.h> +#endif +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/file.h> +#if __FreeBSD_version >= 220000 && defined(_KERNEL) +# include <sys/fcntl.h> +# include <sys/filio.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/time.h> +#ifdef _KERNEL +# include <sys/systm.h> +#endif +#include <sys/uio.h> +#if !SOLARIS +# if (NetBSD > 199609) || (OpenBSD > 199603) || (__FreeBSD_version >= 300000) +# include <sys/dirent.h> +# else +# include <sys/dir.h> +# endif +# include <sys/mbuf.h> +#else +# include <sys/filio.h> +#endif +#include <sys/protosw.h> +#include <sys/socket.h> + +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#endif +#ifdef __sgi +#include <sys/debug.h> +# ifdef IFF_DRVRLOCK /* IRIX6 */ +#include <sys/hashing.h> +# endif +#endif +#include <net/route.h> +#include <netinet/in.h> +#if !(defined(__sgi) && !defined(IFF_DRVRLOCK)) /* IRIX < 6 */ +# include <netinet/in_var.h> +#endif +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/tcpip.h> +#include <netinet/ip_icmp.h> +#ifndef _KERNEL +# include <unistd.h> +# include <syslog.h> +#endif +#include "netinet/ip_compat.h" +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +#if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#if !SOLARIS && defined(_KERNEL) && !defined(__sgi) +# include <sys/kernel.h> +extern int ip_optcopy __P((struct ip *, struct ip *)); +#endif + + +extern struct protosw inetsw[]; + +#ifndef _KERNEL +# include "ipt.h" +static struct ifnet **ifneta = NULL; +static int nifs = 0; +#else +# if (BSD < 199306) || defined(__sgi) +extern int tcp_ttl; +# endif +#endif + +int ipl_inited = 0; +int ipl_unreach = ICMP_UNREACH_FILTER; +u_long ipl_frouteok[2] = {0, 0}; + +static void frzerostats __P((caddr_t)); +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +static int frrequest __P((int, u_long, caddr_t, int)); +#else +static int frrequest __P((int, int, caddr_t, int)); +#endif +#ifdef _KERNEL +static int (*fr_savep) __P((ip_t *, int, void *, int, struct mbuf **)); +static int send_ip __P((struct mbuf *, ip_t *)); +# ifdef __sgi +extern kmutex_t ipf_rw; +# endif +#else +int ipllog __P((void)); +void init_ifp __P((void)); +# ifdef __sgi +static int no_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *)); +static int write_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *)); +# else +static int no_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *)); +static int write_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *)); +# endif +#endif +#if defined(IPFILTER_LKM) +int fr_running = 1; +#else +int fr_running = 0; +#endif + +#if (__FreeBSD_version >= 300000) && defined(_KERNEL) +struct callout_handle ipfr_slowtimer_ch; +#endif + +#if (_BSDI_VERSION >= 199510) && defined(_KERNEL) +# include <sys/device.h> +# include <sys/conf.h> + +struct cfdriver iplcd = { + NULL, "ipl", NULL, NULL, DV_DULL, 0 +}; + +struct devsw iplsw = { + &iplcd, + iplopen, iplclose, iplread, nowrite, iplioctl, noselect, nommap, + nostrat, nodump, nopsize, 0, + nostop +}; +#endif /* _BSDI_VERSION >= 199510 && _KERNEL */ + +#if defined(__NetBSD__) || defined(__OpenBSD__) || (_BSDI_VERSION >= 199701) +# include <sys/conf.h> +# if defined(NETBSD_PF) +# include <net/pfil.h> +/* + * We provide the fr_checkp name just to minimize changes later. + */ +int (*fr_checkp) __P((ip_t *ip, int hlen, void *ifp, int out, mb_t **mp)); +# endif /* NETBSD_PF */ +#endif /* __NetBSD__ */ + +#ifdef _KERNEL +# if defined(IPFILTER_LKM) && !defined(__sgi) +int iplidentify(s) +char *s; +{ + if (strcmp(s, "ipl") == 0) + return 1; + return 0; +} +# endif /* IPFILTER_LKM */ + + +/* + * Try to detect the case when compiling for NetBSD with pseudo-device + */ +# if defined(__NetBSD__) && defined(PFIL_HOOKS) +void +ipfilterattach(count) +int count; +{ + if (iplattach() != 0) + printf("IP Filter failed to attach\n"); +} +# endif + + +int iplattach() +{ + char *defpass; + int s; +# ifdef __sgi + int error; +# endif + + SPL_NET(s); + if (ipl_inited || (fr_checkp == fr_check)) { + printf("IP Filter: already initialized\n"); + SPL_X(s); + return EBUSY; + } + +# ifdef IPFILTER_LOG + ipflog_init(); +# endif + if (nat_init() == -1) + return -1; + if (fr_stateinit() == -1) + return -1; + if (appr_init() == -1) + return -1; + +# ifdef NETBSD_PF + pfil_add_hook((void *)fr_check, PFIL_IN|PFIL_OUT); +# endif + +# ifdef __sgi + error = ipfilter_sgi_attach(); + if (error) { + SPL_X(s); + return error; + } +# endif + + ipl_inited = 1; + bzero((char *)frcache, sizeof(frcache)); + fr_savep = fr_checkp; + fr_checkp = fr_check; + + SPL_X(s); + if (fr_pass & FR_PASS) + defpass = "pass"; + else if (fr_pass & FR_BLOCK) + defpass = "block"; + else + defpass = "no-match -> block"; + + printf("IP Filter: initialized. Default = %s all, Logging = %s\n", + defpass, +# ifdef IPFILTER_LOG + "enabled"); +# else + "disabled"); +# endif + printf("%s\n", ipfilter_version); +#ifdef _KERNEL +# if (__FreeBSD_version >= 300000) && defined(_KERNEL) + ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2); +# else + timeout(ipfr_slowtimer, NULL, hz/2); +# endif +#endif + return 0; +} + + +/* + * Disable the filter by removing the hooks from the IP input/output + * stream. + */ +int ipldetach() +{ + int s, i = FR_INQUE|FR_OUTQUE; + +#ifdef _KERNEL +# if (__FreeBSD_version >= 300000) + untimeout(ipfr_slowtimer, NULL, ipfr_slowtimer_ch); +# else +# ifdef __sgi + untimeout(ipfr_slowtimer); +# else + untimeout(ipfr_slowtimer, NULL); +# endif +# endif +#endif + SPL_NET(s); + if (!ipl_inited) + { + printf("IP Filter: not initialized\n"); + SPL_X(s); + return 0; + } + + printf("IP Filter: unloaded\n"); + + fr_checkp = fr_savep; + i = frflush(IPL_LOGIPF, i); + ipl_inited = 0; + +# ifdef NETBSD_PF + pfil_remove_hook((void *)fr_check, PFIL_IN|PFIL_OUT); +# endif + +# ifdef __sgi + ipfilter_sgi_detach(); +# endif + + ipfr_unload(); + ip_natunload(); + fr_stateunload(); + fr_authunload(); + + SPL_X(s); + return 0; +} +#endif /* _KERNEL */ + + +static void frzerostats(data) +caddr_t data; +{ + friostat_t fio; + + bcopy((char *)frstats, (char *)fio.f_st, + sizeof(struct filterstats) * 2); + fio.f_fin[0] = ipfilter[0][0]; + fio.f_fin[1] = ipfilter[0][1]; + fio.f_fout[0] = ipfilter[1][0]; + fio.f_fout[1] = ipfilter[1][1]; + fio.f_acctin[0] = ipacct[0][0]; + fio.f_acctin[1] = ipacct[0][1]; + fio.f_acctout[0] = ipacct[1][0]; + fio.f_acctout[1] = ipacct[1][1]; + fio.f_active = fr_active; + fio.f_froute[0] = ipl_frouteok[0]; + fio.f_froute[1] = ipl_frouteok[1]; + IWCOPY((caddr_t)&fio, data, sizeof(fio)); + bzero((char *)frstats, sizeof(*frstats) * 2); +} + + +/* + * Filter ioctl interface. + */ +#ifdef __sgi +int IPL_EXTERN(ioctl)(dev_t dev, int cmd, caddr_t data, int mode +# ifdef _KERNEL + , cred_t *cp, int *rp +# endif +) +#else +int IPL_EXTERN(ioctl)(dev, cmd, data, mode +#if ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || (NetBSD >= 199511) || \ + (__FreeBSD_version >= 220000) || defined(__OpenBSD__)) && defined(_KERNEL) +, p) +struct proc *p; +#else +) +#endif +dev_t dev; +#if defined(__NetBSD__) || defined(__OpenBSD__) || \ + (_BSDI_VERSION >= 199701) || (__FreeBSD_version >= 300000) +u_long cmd; +#else +int cmd; +#endif +caddr_t data; +int mode; +#endif /* __sgi */ +{ +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + int error = 0, unit = 0, tmp; + +#if (BSD >= 199306) && defined(_KERNEL) + if ((securelevel >= 2) && (mode & FWRITE)) + return EPERM; +#endif +#ifdef _KERNEL + unit = GET_MINOR(dev); + if ((IPL_LOGMAX < unit) || (unit < 0)) + return ENXIO; +#else + unit = dev; +#endif + + SPL_NET(s); + + if (unit == IPL_LOGNAT) { + error = nat_ioctl(data, cmd, mode); + SPL_X(s); + return error; + } + if (unit == IPL_LOGSTATE) { + error = fr_state_ioctl(data, cmd, mode); + SPL_X(s); + return error; + } + switch (cmd) { + case FIONREAD : +#ifdef IPFILTER_LOG + IWCOPY((caddr_t)&iplused[IPL_LOGIPF], (caddr_t)data, + sizeof(iplused[IPL_LOGIPF])); +#endif + break; +#if !defined(IPFILTER_LKM) && defined(_KERNEL) + case SIOCFRENB : + { + u_int enable; + + if (!(mode & FWRITE)) + error = EPERM; + else { + IRCOPY(data, (caddr_t)&enable, sizeof(enable)); + if (enable) { + error = iplattach(); + if (error == 0) + fr_running = 1; + } else { + error = ipldetach(); + if (error == 0) + fr_running = 0; + } + } + break; + } +#endif + case SIOCSETFF : + if (!(mode & FWRITE)) + error = EPERM; + else + IRCOPY(data, (caddr_t)&fr_flags, sizeof(fr_flags)); + break; + case SIOCGETFF : + IWCOPY((caddr_t)&fr_flags, data, sizeof(fr_flags)); + break; + case SIOCINAFR : + case SIOCRMAFR : + case SIOCADAFR : + case SIOCZRLST : + if (!(mode & FWRITE)) + error = EPERM; + else + error = frrequest(unit, cmd, data, fr_active); + break; + case SIOCINIFR : + case SIOCRMIFR : + case SIOCADIFR : + if (!(mode & FWRITE)) + error = EPERM; + else + error = frrequest(unit, cmd, data, 1 - fr_active); + break; + case SIOCSWAPA : + if (!(mode & FWRITE)) + error = EPERM; + else { + bzero((char *)frcache, sizeof(frcache[0]) * 2); + *(u_int *)data = fr_active; + fr_active = 1 - fr_active; + } + break; + case SIOCGETFS : + { + struct friostat fio; + + bcopy((char *)frstats, (char *)fio.f_st, + sizeof(struct filterstats) * 2); + fio.f_fin[0] = ipfilter[0][0]; + fio.f_fin[1] = ipfilter[0][1]; + fio.f_fout[0] = ipfilter[1][0]; + fio.f_fout[1] = ipfilter[1][1]; + fio.f_acctin[0] = ipacct[0][0]; + fio.f_acctin[1] = ipacct[0][1]; + fio.f_acctout[0] = ipacct[1][0]; + fio.f_acctout[1] = ipacct[1][1]; + fio.f_auth = ipauth; + fio.f_active = fr_active; + fio.f_froute[0] = ipl_frouteok[0]; + fio.f_froute[1] = ipl_frouteok[1]; + fio.f_running = fr_running; + fio.f_groups[0][0] = ipfgroups[0][0]; + fio.f_groups[0][1] = ipfgroups[0][1]; + fio.f_groups[1][0] = ipfgroups[1][0]; + fio.f_groups[1][1] = ipfgroups[1][1]; + fio.f_groups[2][0] = ipfgroups[2][0]; + fio.f_groups[2][1] = ipfgroups[2][1]; +#ifdef IPFILTER_LOG + fio.f_logging = 1; +#else + fio.f_logging = 0; +#endif + fio.f_defpass = fr_pass; + strncpy(fio.f_version, ipfilter_version, + sizeof(fio.f_version)); + IWCOPY((caddr_t)&fio, data, sizeof(fio)); + break; + } + case SIOCFRZST : + if (!(mode & FWRITE)) + error = EPERM; + else + frzerostats(data); + break; + case SIOCIPFFL : + if (!(mode & FWRITE)) + error = EPERM; + else { + IRCOPY(data, (caddr_t)&tmp, sizeof(tmp)); + tmp = frflush(unit, tmp); + IWCOPY((caddr_t)&tmp, data, sizeof(tmp)); + } + break; +#ifdef IPFILTER_LOG + case SIOCIPFFB : + if (!(mode & FWRITE)) + error = EPERM; + else + *(int *)data = ipflog_clear(unit); + break; +#endif /* IPFILTER_LOG */ + case SIOCGFRST : + IWCOPY((caddr_t)ipfr_fragstats(), data, sizeof(ipfrstat_t)); + break; + case SIOCAUTHW : + case SIOCAUTHR : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + case SIOCATHST : + error = fr_auth_ioctl(data, cmd, NULL, NULL); + break; + case SIOCFRSYN : + if (!(mode & FWRITE)) + error = EPERM; + else { +#if defined(_KERNEL) && defined(__sgi) + ipfsync(); +#endif + frsync(); + } + break; + default : + error = EINVAL; + break; + } + SPL_X(s); + return error; +} + + +void frsync() +{ +#ifdef _KERNEL + register frentry_t *f; + register struct ifnet *ifp; + +# if defined(__OpenBSD__) || ((NetBSD >= 199511) && (NetBSD < 1991011)) || \ + (defined(__FreeBSD_version) && (__FreeBSD_version >= 300000)) +# if (NetBSD >= 199905) || defined(__OpenBSD__) + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) +# else + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) +# endif +# else + for (ifp = ifnet; ifp; ifp = ifp->if_next) +# endif + ip_natsync(ifp); + + WRITE_ENTER(&ipf_mutex); + for (f = ipacct[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == (void *)-1) + f->fr_ifa = GETUNIT(f->fr_ifname); + for (f = ipacct[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == (void *)-1) + f->fr_ifa = GETUNIT(f->fr_ifname); + for (f = ipfilter[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == (void *)-1) + f->fr_ifa = GETUNIT(f->fr_ifname); + for (f = ipfilter[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == (void *)-1) + f->fr_ifa = GETUNIT(f->fr_ifname); + RWLOCK_EXIT(&ipf_mutex); +#endif +} + + +void fr_forgetifp(ifp) +void *ifp; +{ + register frentry_t *f; + + WRITE_ENTER(&ipf_mutex); + for (f = ipacct[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipacct[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + RWLOCK_EXIT(&ipf_mutex); + ip_natsync(ifp); +} + + +static int frrequest(unit, req, data, set) +int unit; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +u_long req; +#else +int req; +#endif +int set; +caddr_t data; +{ + register frentry_t *fp, *f, **fprev; + register frentry_t **ftail; + frentry_t frd; + frdest_t *fdp; + frgroup_t *fg = NULL; + int error = 0, in; + u_int group; + + fp = &frd; + IRCOPY(data, (caddr_t)fp, sizeof(*fp)); + fp->fr_ref = 0; + + /* + * Check that the group number does exist and that if a head group + * has been specified, doesn't exist. + */ + if ((req != SIOCZRLST) && fp->fr_grhead && + fr_findgroup((u_int)fp->fr_grhead, fp->fr_flags, unit, set, NULL)) + return EEXIST; + if ((req != SIOCZRLST) && fp->fr_group && + !fr_findgroup((u_int)fp->fr_group, fp->fr_flags, unit, set, NULL)) + return ESRCH; + + in = (fp->fr_flags & FR_INQUE) ? 0 : 1; + + if (unit == IPL_LOGAUTH) + ftail = fprev = &ipauth; + else if (fp->fr_flags & FR_ACCOUNT) + ftail = fprev = &ipacct[in][set]; + else if (fp->fr_flags & (FR_OUTQUE|FR_INQUE)) + ftail = fprev = &ipfilter[in][set]; + else + return ESRCH; + + if ((group = fp->fr_group)) { + if (!(fg = fr_findgroup(group, fp->fr_flags, unit, set, NULL))) + return ESRCH; + ftail = fprev = fg->fg_start; + } + + bzero((char *)frcache, sizeof(frcache[0]) * 2); + + if (*fp->fr_ifname) { + fp->fr_ifa = GETUNIT(fp->fr_ifname); + if (!fp->fr_ifa) + fp->fr_ifa = (void *)-1; + } +#if BSD >= 199306 + if (*fp->fr_oifname) { + fp->fr_oifa = GETUNIT(fp->fr_oifname); + if (!fp->fr_oifa) + fp->fr_oifa = (void *)-1; + } +#endif + + fdp = &fp->fr_dif; + fp->fr_flags &= ~FR_DUP; + if (*fdp->fd_ifname) { + fdp->fd_ifp = GETUNIT(fdp->fd_ifname); + if (!fdp->fd_ifp) + fdp->fd_ifp = (struct ifnet *)-1; + else + fp->fr_flags |= FR_DUP; + } + + fdp = &fp->fr_tif; + if (*fdp->fd_ifname) { + fdp->fd_ifp = GETUNIT(fdp->fd_ifname); + if (!fdp->fd_ifp) + fdp->fd_ifp = (struct ifnet *)-1; + } + + /* + * Look for a matching filter rule, but don't include the next or + * interface pointer in the comparison (fr_next, fr_ifa). + */ + for (; (f = *ftail); ftail = &f->fr_next) + if (bcmp((char *)&f->fr_ip, (char *)&fp->fr_ip, + FR_CMPSIZ) == 0) + break; + + /* + * If zero'ing statistics, copy current to caller and zero. + */ + if (req == SIOCZRLST) { + if (!f) + return ESRCH; + IWCOPY((caddr_t)f, data, sizeof(*f)); + f->fr_hits = 0; + f->fr_bytes = 0; + return 0; + } + + if (!f) { + ftail = fprev; + if (req != SIOCINAFR && req != SIOCINIFR) + while ((f = *ftail)) + ftail = &f->fr_next; + else if (fp->fr_hits) + while (--fp->fr_hits && (f = *ftail)) + ftail = &f->fr_next; + f = NULL; + } + + if (req == SIOCDELFR || req == SIOCRMIFR) { + if (!f) + error = ESRCH; + else { + if (f->fr_ref > 1) + return EBUSY; + if (fg && fg->fg_head) + fg->fg_head->fr_ref--; + if (unit == IPL_LOGAUTH) + return fr_auth_ioctl(data, req, f, ftail); + if (f->fr_grhead) + fr_delgroup((u_int)f->fr_grhead, fp->fr_flags, + unit, set); + fixskip(fprev, f, -1); + *ftail = f->fr_next; + KFREE(f); + } + } else { + if (f) + error = EEXIST; + else { + if (unit == IPL_LOGAUTH) + return fr_auth_ioctl(data, req, f, ftail); + KMALLOC(f, frentry_t *); + if (f != NULL) { + if (fg && fg->fg_head) + fg->fg_head->fr_ref++; + bcopy((char *)fp, (char *)f, sizeof(*f)); + f->fr_ref = 1; + f->fr_hits = 0; + f->fr_next = *ftail; + *ftail = f; + if (req == SIOCINIFR || req == SIOCINAFR) + fixskip(fprev, f, 1); + f->fr_grp = NULL; + if ((group = f->fr_grhead)) + fg = fr_addgroup(group, f, unit, set); + } else + error = ENOMEM; + } + } + return (error); +} + + +#ifdef _KERNEL +/* + * routines below for saving IP headers to buffer + */ +# ifdef __sgi +# ifdef _KERNEL +int IPL_EXTERN(open)(dev_t *pdev, int flags, int devtype, cred_t *cp) +# else +int IPL_EXTERN(open)(dev_t dev, int flags) +# endif +# else +int IPL_EXTERN(open)(dev, flags +# if ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || (NetBSD >= 199511) || \ + (__FreeBSD_version >= 220000) || defined(__OpenBSD__)) && defined(_KERNEL) +, devtype, p) +int devtype; +struct proc *p; +# else +) +# endif +dev_t dev; +int flags; +# endif /* __sgi */ +{ +# if defined(__sgi) && defined(_KERNEL) + u_int min = geteminor(*pdev); +# else + u_int min = GET_MINOR(dev); +# endif + + if (IPL_LOGMAX < min) + min = ENXIO; + else + min = 0; + return min; +} + + +# ifdef __sgi +int IPL_EXTERN(close)(dev_t dev, int flags, int devtype, cred_t *cp) +#else +int IPL_EXTERN(close)(dev, flags +# if ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || (NetBSD >= 199511) || \ + (__FreeBSD_version >= 220000) || defined(__OpenBSD__)) && defined(_KERNEL) +, devtype, p) +int devtype; +struct proc *p; +# else +) +# endif +dev_t dev; +int flags; +# endif /* __sgi */ +{ + u_int min = GET_MINOR(dev); + + if (IPL_LOGMAX < min) + min = ENXIO; + else + min = 0; + return min; +} + +/* + * iplread/ipllog + * both of these must operate with at least splnet() lest they be + * called during packet processing and cause an inconsistancy to appear in + * the filter lists. + */ +# ifdef __sgi +int IPL_EXTERN(read)(dev_t dev, uio_t *uio, cred_t *crp) +# else +# if BSD >= 199306 +int IPL_EXTERN(read)(dev, uio, ioflag) +int ioflag; +# else +int IPL_EXTERN(read)(dev, uio) +# endif +dev_t dev; +register struct uio *uio; +# endif /* __sgi */ +{ +# ifdef IPFILTER_LOG + return ipflog_read(GET_MINOR(dev), uio); +# else + return ENXIO; +# endif +} + + +/* + * send_reset - this could conceivably be a call to tcp_respond(), but that + * requires a large amount of setting up and isn't any more efficient. + */ +int send_reset(fin, oip) +fr_info_t *fin; +struct ip *oip; +{ + struct tcphdr *tcp, *tcp2; + struct tcpiphdr *tp; + struct mbuf *m; + int tlen = 0; + ip_t *ip; + + tcp = (struct tcphdr *)fin->fin_dp; + if (tcp->th_flags & TH_RST) + return -1; /* feedback loop */ +# if (BSD < 199306) || defined(__sgi) + m = m_get(M_DONTWAIT, MT_HEADER); +# else + m = m_gethdr(M_DONTWAIT, MT_HEADER); +# endif + if (m == NULL) + return ENOBUFS; + if (m == NULL) + return -1; + + if (tcp->th_flags & TH_SYN) + tlen = 1; + m->m_len = sizeof(*tcp2) + sizeof(*ip); +# if BSD >= 199306 + m->m_data += max_linkhdr; + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = (struct ifnet *)0; +# endif + bzero(mtod(m, char *), sizeof(struct tcpiphdr)); + ip = mtod(m, struct ip *); + tp = mtod(m, struct tcpiphdr *); + tcp2 = (struct tcphdr *)((char *)ip + sizeof(*ip)); + + ip->ip_src.s_addr = oip->ip_dst.s_addr; + ip->ip_dst.s_addr = oip->ip_src.s_addr; + tcp2->th_dport = tcp->th_sport; + tcp2->th_sport = tcp->th_dport; + tcp2->th_ack = ntohl(tcp->th_seq); + tcp2->th_ack += tlen; + tcp2->th_ack = htonl(tcp2->th_ack); + tcp2->th_off = sizeof(*tcp2) >> 2; + tcp2->th_flags = TH_RST|TH_ACK; + tp->ti_pr = oip->ip_p; + tp->ti_len = htons(sizeof(struct tcphdr)); + tcp2->th_sum = in_cksum(m, sizeof(*ip) + sizeof(*tcp2)); + + ip->ip_tos = oip->ip_tos; + ip->ip_p = oip->ip_p; + ip->ip_len = sizeof(*ip) + sizeof(*tcp2); + + return send_ip(m, ip); +} + + +static int send_ip(m, ip) +struct mbuf *m; +ip_t *ip; +{ +# if (defined(__FreeBSD_version) && (__FreeBSD_version >= 220000)) || \ + (defined(_BSDI_VERSION) && (_BSDI_VERSION >= 199802)) + struct route ro; +# endif + +# if (BSD < 199306) || defined(__sgi) + ip->ip_ttl = tcp_ttl; +# else + ip->ip_ttl = ip_defttl; +# endif + +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 220000) + { + int err; + + bzero((char *)&ro, sizeof(ro)); + err = ip_output(m, (struct mbuf *)0, &ro, 0, 0); + if (ro.ro_rt) + RTFREE(ro.ro_rt); + return err; + } +# else + /* + * extra 0 in case of multicast + */ +# if _BSDI_VERSION >= 199802 + return ip_output(m, (struct mbuf *)0, &ro, 0, 0, NULL); +# else + return ip_output(m, (struct mbuf *)0, 0, 0, 0); +# endif +# endif +} + + +int send_icmp_err(oip, type, code, ifp, dst) +ip_t *oip; +int type, code; +void *ifp; +struct in_addr dst; +{ + struct icmp *icmp; + struct mbuf *m; + ip_t *nip; + +# if (BSD < 199306) || defined(__sgi) + m = m_get(M_DONTWAIT, MT_HEADER); +# else + m = m_gethdr(M_DONTWAIT, MT_HEADER); +# endif + if (m == NULL) + return ENOBUFS; + m->m_len = sizeof(*nip) + sizeof(*icmp) + 8; +# if BSD >= 199306 + m->m_data += max_linkhdr; + m->m_pkthdr.len = sizeof(*nip) + sizeof(*icmp) + 8; + m->m_pkthdr.rcvif = (struct ifnet *)0; +# endif + + bzero(mtod(m, char *), (size_t)sizeof(*nip) + sizeof(*icmp) + 8); + nip = mtod(m, ip_t *); + icmp = (struct icmp *)(nip + 1); + + nip->ip_v = IPVERSION; + nip->ip_hl = (sizeof(*nip) >> 2); + nip->ip_p = IPPROTO_ICMP; + nip->ip_id = oip->ip_id; + nip->ip_sum = 0; + nip->ip_ttl = 60; + nip->ip_tos = oip->ip_tos; + nip->ip_len = sizeof(*nip) + sizeof(*icmp) + 8; + if (dst.s_addr == 0) { + if (fr_ifpaddr(ifp, &dst) == -1) + return -1; + dst.s_addr = htonl(dst.s_addr); + } + nip->ip_src = dst; + nip->ip_dst = oip->ip_src; + + icmp->icmp_type = type; + icmp->icmp_code = code; + icmp->icmp_cksum = 0; + bcopy((char *)oip, (char *)&icmp->icmp_ip, sizeof(*oip)); + bcopy((char *)oip + (oip->ip_hl << 2), + (char *)&icmp->icmp_ip + sizeof(*oip), 8); /* 64 bits */ +# ifndef sparc + { + register u_short __iplen, __ipoff; + ip_t *ip = &icmp->icmp_ip; + + __iplen = ip->ip_len; + __ipoff = ip->ip_off; + ip->ip_len = htons(__iplen); + ip->ip_off = htons(__ipoff); + } +# endif + icmp->icmp_cksum = ipf_cksum((u_short *)icmp, sizeof(*icmp) + 8); + return send_ip(m, nip); +} + + +# if !defined(IPFILTER_LKM) && (__FreeBSD_version < 300000) && !defined(__sgi) +# if (BSD < 199306) +int iplinit __P((void)); + +int +# else +void iplinit __P((void)); + +void +# endif +iplinit() +{ + if (iplattach() != 0) + printf("IP Filter failed to attach\n"); + ip_init(); +} +# endif /* ! __NetBSD__ */ + + +size_t mbufchainlen(m0) +register struct mbuf *m0; +{ + register size_t len = 0; + + for (; m0; m0 = m0->m_next) + len += m0->m_len; + return len; +} + + +int ipfr_fastroute(m0, fin, fdp) +struct mbuf *m0; +fr_info_t *fin; +frdest_t *fdp; +{ + register struct ip *ip, *mhip; + register struct mbuf *m = m0; + register struct route *ro; + int len, off, error = 0, hlen; + struct sockaddr_in *dst; + struct route iproute; + struct ifnet *ifp; + frentry_t *fr; + + hlen = fin->fin_hlen; + ip = mtod(m0, struct ip *); + /* + * Route packet. + */ + ro = &iproute; + bzero((caddr_t)ro, sizeof (*ro)); + dst = (struct sockaddr_in *)&ro->ro_dst; + dst->sin_family = AF_INET; + + fr = fin->fin_fr; + ifp = fdp->fd_ifp; + /* + * In case we're here due to "to <if>" being used with "keep state", + * check that we're going in the correct direction. + */ + if ((fr != NULL) && (ifp != NULL) && (fin->fin_rev != 0) && + (fdp == &fr->fr_tif)) + return -1; +# ifdef __bsdi__ + dst->sin_len = sizeof(*dst); +# endif +# if (BSD >= 199306) && !defined(__NetBSD__) && !defined(__bsdi__) && \ + !defined(__OpenBSD__) +# ifdef RTF_CLONING + rtalloc_ign(ro, RTF_CLONING); +# else + rtalloc_ign(ro, RTF_PRCLONING); +# endif +# else + rtalloc(ro); +# endif + if (!ifp) { + if (!(fin->fin_fr->fr_flags & FR_FASTROUTE)) { + error = -2; + goto bad; + } + if (ro->ro_rt == 0 || (ifp = ro->ro_rt->rt_ifp) == 0) { + if (in_localaddr(ip->ip_dst)) + error = EHOSTUNREACH; + else + error = ENETUNREACH; + goto bad; + } + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)&ro->ro_rt->rt_gateway; + } + if (ro->ro_rt) + ro->ro_rt->rt_use++; + + /* + * For input packets which are being "fastrouted", they won't + * go back through output filtering and miss their chance to get + * NAT'd and counted. + */ + fin->fin_ifp = ifp; + if (fin->fin_out == 0) { + fin->fin_out = 1; + if ((fin->fin_fr = ipacct[1][fr_active]) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INC(frstats[1].fr_acct); + } + fin->fin_fr = NULL; + (void) fr_checkstate(ip, fin); + (void) ip_natout(ip, fin); + } else + ip->ip_sum = 0; + /* + * If small enough for interface, can just send directly. + */ + if (ip->ip_len <= ifp->if_mtu) { +# ifndef sparc + ip->ip_id = htons(ip->ip_id); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); +# endif + if (!ip->ip_sum) + ip->ip_sum = in_cksum(m, hlen); +# if BSD >= 199306 + error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, + ro->ro_rt); +# else + error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst); +# endif + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_act; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < ip->ip_len; off += len) { +# ifdef MGETHDR + MGETHDR(m, M_DONTWAIT, MT_HEADER); +# else + MGET(m, M_DONTWAIT, MT_HEADER); +# endif + if (m == 0) { + error = ENOBUFS; + goto bad; + } +# if BSD >= 199306 + m->m_data += max_linkhdr; +# else + m->m_off = MMAXOFF - hlen; +# endif + mhip = mtod(m, struct ip *); + bcopy((char *)ip, (char *)mhip, sizeof(*ip)); + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_hl = mhlen >> 2; + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); + if (ip->ip_off & IP_MF) + mhip->ip_off |= IP_MF; + if (off + len >= ip->ip_len) + len = ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + error = ENOBUFS; /* ??? */ + goto sendorfree; + } +# ifndef sparc + mhip->ip_off = htons((u_short)mhip->ip_off); +# endif + mhip->ip_sum = 0; + mhip->ip_sum = in_cksum(m, mhlen); + *mnext = m; + mnext = &m->m_act; + } + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m_adj(m0, hlen + firstlen - ip->ip_len); + ip->ip_len = htons((u_short)(hlen + firstlen)); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m0, hlen); +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_act; + m->m_act = 0; + if (error == 0) +# if BSD >= 199306 + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); +# else + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst); +# endif + else + m_freem(m); + } + } +done: + if (!error) + ipl_frouteok[0]++; + else + ipl_frouteok[1]++; + + if (ro->ro_rt) + RTFREE(ro->ro_rt); + return 0; +bad: + m_freem(m); + goto done; +} +#else /* #ifdef _KERNEL */ + + +# ifdef __sgi +static int no_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s)) +# else +static int no_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s, struct rtentry *rt)) +# endif +{ + return 0; +} + + +# ifdef __STDC__ +# ifdef __sgi +static int write_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s)) +# else +static int write_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s, struct rtentry *rt)) +# endif +{ + ip_t *ip = (ip_t *)m; +# else +static int write_output(ifp, ip) +struct ifnet *ifp; +ip_t *ip; +{ +# endif + char fname[32]; + int fd; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + snprintf(fname, sizeof(fname), "/tmp/%s", ifp->if_xname); +# else + snprintf(fname, sizeof(fname), "/tmp/%s%d", ifp->if_name, ifp->if_unit); +# endif + fd = open(fname, O_WRONLY|O_APPEND); + if (fd == -1) { + perror("open"); + return -1; + } + write(fd, (char *)ip, ntohs(ip->ip_len)); + close(fd); + return 0; +} + + +struct ifnet *get_unit(name) +char *name; +{ + struct ifnet *ifp, **ifa; +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + if (!strcmp(name, ifp->if_xname)) + return ifp; + } +# else + char ifname[32], *s; + + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + (void) snprintf(ifname, sizeof(ifname), "%s%d", + ifp->if_name, ifp->if_unit); + if (!strcmp(name, ifname)) + return ifp; + } +# endif + + if (!ifneta) { + ifneta = (struct ifnet **)malloc(sizeof(ifp) * 2); + ifneta[1] = NULL; + ifneta[0] = (struct ifnet *)calloc(1, sizeof(*ifp)); + nifs = 1; + } else { + nifs++; + ifneta = (struct ifnet **)realloc(ifneta, + (nifs + 1) * sizeof(*ifa)); + ifneta[nifs] = NULL; + ifneta[nifs - 1] = (struct ifnet *)malloc(sizeof(*ifp)); + } + ifp = ifneta[nifs - 1]; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + strncpy(ifp->if_xname, name, sizeof(ifp->if_xname)); +# else + for (s = name; *s && !isdigit(*s); s++) + ; + if (*s && isdigit(*s)) { + ifp->if_unit = atoi(s); + ifp->if_name = (char *)malloc(s - name + 1); + strncpy(ifp->if_name, name, s - name); + ifp->if_name[s - name] = '\0'; + } else { + ifp->if_name = strdup(name); + ifp->if_unit = -1; + } +# endif + ifp->if_output = no_output; + return ifp; +} + + + +void init_ifp() +{ + struct ifnet *ifp, **ifa; + char fname[32]; + int fd; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + ifp->if_output = write_output; + snprintf(fname, sizeof(fname), "/tmp/%s", ifp->if_xname); + fd = open(fname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC, 0600); + if (fd == -1) + perror("open"); + else + close(fd); + } +# else + + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + ifp->if_output = write_output; + snprintf(fname, sizeof(fname), "/tmp/%s%d", ifp->if_name, ifp->if_unit); + fd = open(fname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC, 0600); + if (fd == -1) + perror("open"); + else + close(fd); + } +# endif +} + + +int ipfr_fastroute(ip, fin, fdp) +ip_t *ip; +fr_info_t *fin; +frdest_t *fdp; +{ + struct ifnet *ifp = fdp->fd_ifp; + + if (!ifp) + return 0; /* no routing table out here */ + + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; +#ifdef __sgi + (*ifp->if_output)(ifp, (void *)ip, NULL); +#else + (*ifp->if_output)(ifp, (void *)ip, NULL, 0); +#endif + return 0; +} + + +int ipllog __P((void)) +{ + verbose("l"); + return 0; +} + + +int send_reset(ip, ifp) +ip_t *ip; +struct ifnet *ifp; +{ + verbose("- TCP RST sent\n"); + return 0; +} + + +int icmp_error(ip, ifp) +ip_t *ip; +struct ifnet *ifp; +{ + verbose("- TCP RST sent\n"); + return 0; +} +#endif /* _KERNEL */ diff --git a/sys/netinet/ip_fil.h b/sys/netinet/ip_fil.h new file mode 100644 index 0000000..aef96ca --- /dev/null +++ b/sys/netinet/ip_fil.h @@ -0,0 +1,566 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_fil.h 1.35 6/5/96 + * $Id: ip_fil.h,v 2.3.2.4 1999/10/15 13:42:37 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_FIL_H__ +#define __IP_FIL_H__ + +/* + * Pathnames for various IP Filter control devices. Used by LKM + * and userland, so defined here. + */ +#define IPNAT_NAME "/dev/ipnat" +#define IPSTATE_NAME "/dev/ipstate" +#define IPAUTH_NAME "/dev/ipauth" + +#ifndef SOLARIS +# define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#ifndef __P +# ifdef __STDC__ +# define __P(x) x +# else +# define __P(x) () +# endif +#endif + +#if defined(__STDC__) || defined(__GNUC__) +# define SIOCADAFR _IOW('r', 60, struct frentry) +# define SIOCRMAFR _IOW('r', 61, struct frentry) +# define SIOCSETFF _IOW('r', 62, u_int) +# define SIOCGETFF _IOR('r', 63, u_int) +# define SIOCGETFS _IOR('r', 64, struct friostat) +# define SIOCIPFFL _IOWR('r', 65, int) +# define SIOCIPFFB _IOR('r', 66, int) +# define SIOCADIFR _IOW('r', 67, struct frentry) +# define SIOCRMIFR _IOW('r', 68, struct frentry) +# define SIOCSWAPA _IOR('r', 69, u_int) +# define SIOCINAFR _IOW('r', 70, struct frentry) +# define SIOCINIFR _IOW('r', 71, struct frentry) +# define SIOCFRENB _IOW('r', 72, u_int) +# define SIOCFRSYN _IOW('r', 73, u_int) +# define SIOCFRZST _IOWR('r', 74, struct friostat) +# define SIOCZRLST _IOWR('r', 75, struct frentry) +# define SIOCAUTHW _IOWR('r', 76, struct fr_info) +# define SIOCAUTHR _IOWR('r', 77, struct fr_info) +# define SIOCATHST _IOWR('r', 78, struct fr_authstat) +#else +# define SIOCADAFR _IOW(r, 60, struct frentry) +# define SIOCRMAFR _IOW(r, 61, struct frentry) +# define SIOCSETFF _IOW(r, 62, u_int) +# define SIOCGETFF _IOR(r, 63, u_int) +# define SIOCGETFS _IOR(r, 64, struct friostat) +# define SIOCIPFFL _IOWR(r, 65, int) +# define SIOCIPFFB _IOR(r, 66, int) +# define SIOCADIFR _IOW(r, 67, struct frentry) +# define SIOCRMIFR _IOW(r, 68, struct frentry) +# define SIOCSWAPA _IOR(r, 69, u_int) +# define SIOCINAFR _IOW(r, 70, struct frentry) +# define SIOCINIFR _IOW(r, 71, struct frentry) +# define SIOCFRENB _IOW(r, 72, u_int) +# define SIOCFRSYN _IOW(r, 73, u_int) +# define SIOCFRZST _IOWR(r, 74, struct friostat) +# define SIOCZRLST _IOWR(r, 75, struct frentry) +# define SIOCAUTHW _IOWR(r, 76, struct fr_info) +# define SIOCAUTHR _IOWR(r, 77, struct fr_info) +# define SIOCATHST _IOWR(r, 78, struct fr_authstat) +#endif +#define SIOCADDFR SIOCADAFR +#define SIOCDELFR SIOCRMAFR +#define SIOCINSFR SIOCINAFR + +typedef struct fr_ip { + u_char fi_v:4; /* IP version */ + u_char fi_fl:4; /* packet flags */ + u_char fi_tos; /* IP packet TOS */ + u_char fi_ttl; /* IP packet TTL */ + u_char fi_p; /* IP packet protocol */ + struct in_addr fi_src; /* source address from packet */ + struct in_addr fi_dst; /* destination address from packet */ + u_32_t fi_optmsk; /* bitmask composed from IP options */ + u_short fi_secmsk; /* bitmask composed from IP security options */ + u_short fi_auth; /* authentication code from IP sec. options */ +} fr_ip_t; + +#define FI_OPTIONS (FF_OPTIONS >> 24) +#define FI_TCPUDP (FF_TCPUDP >> 24) /* TCP/UCP implied comparison*/ +#define FI_FRAG (FF_FRAG >> 24) +#define FI_SHORT (FF_SHORT >> 24) +#define FI_CMP (FI_OPTIONS|FI_TCPUDP|FI_SHORT) + +/* + * These are both used by the state and NAT code to indicate that one port or + * the other should be treated as a wildcard. + */ +#define FI_W_SPORT 0x00000100 +#define FI_W_DPORT 0x00000200 + +typedef struct fr_info { + void *fin_ifp; /* interface packet is `on' */ + struct fr_ip fin_fi; /* IP Packet summary */ + u_short fin_data[2]; /* TCP/UDP ports, ICMP code/type */ + u_char fin_out; /* in or out ? 1 == out, 0 == in */ + u_char fin_rev; /* state only: 1 = reverse */ + u_short fin_hlen; /* length of IP header in bytes */ + u_char fin_tcpf; /* TCP header flags (SYN, ACK, etc) */ + /* From here on is packet specific */ + u_char fin_icode; /* ICMP error to return */ + u_short fin_rule; /* rule # last matched */ + u_short fin_group; /* group number, -1 for none */ + struct frentry *fin_fr; /* last matching rule */ + char *fin_dp; /* start of data past IP header */ + u_short fin_dlen; /* length of data portion of packet */ + u_short fin_id; /* IP packet id field */ + void *fin_mp; /* pointer to pointer to mbuf */ +#if SOLARIS && defined(_KERNEL) + void *fin_qfm; /* pointer to mblk where pkt starts */ + void *fin_qif; +#endif +} fr_info_t; + +/* + * Size for compares on fr_info structures + */ +#define FI_CSIZE offsetof(fr_info_t, fin_icode) + +/* + * Size for copying cache fr_info structure + */ +#define FI_COPYSIZE offsetof(fr_info_t, fin_dp) + +typedef struct frdest { + void *fd_ifp; + struct in_addr fd_ip; + char fd_ifname[IFNAMSIZ]; +} frdest_t; + +typedef struct frentry { + struct frentry *fr_next; + u_short fr_group; /* group to which this rule belongs */ + u_short fr_grhead; /* group # which this rule starts */ + struct frentry *fr_grp; + int fr_ref; /* reference count - for grouping */ + void *fr_ifa; +#if BSD >= 199306 + void *fr_oifa; +#endif + /* + * These are only incremented when a packet matches this rule and + * it is the last match + */ + U_QUAD_T fr_hits; + U_QUAD_T fr_bytes; + /* + * Fields after this may not change whilst in the kernel. + */ + struct fr_ip fr_ip; + struct fr_ip fr_mip; /* mask structure */ + + u_char fr_tcpfm; /* tcp flags mask */ + u_char fr_tcpf; /* tcp flags */ + + u_short fr_icmpm; /* data for ICMP packets (mask) */ + u_short fr_icmp; + + u_char fr_scmp; /* data for port comparisons */ + u_char fr_dcmp; + u_short fr_dport; + u_short fr_sport; + u_short fr_stop; /* top port for <> and >< */ + u_short fr_dtop; /* top port for <> and >< */ + u_32_t fr_flags; /* per-rule flags && options (see below) */ + u_short fr_skip; /* # of rules to skip */ + u_short fr_loglevel; /* syslog log facility + priority */ + int (*fr_func) __P((int, ip_t *, fr_info_t *)); /* call this function */ + char fr_icode; /* return ICMP code */ + char fr_ifname[IFNAMSIZ]; +#if BSD >= 199306 + char fr_oifname[IFNAMSIZ]; +#endif + struct frdest fr_tif; /* "to" interface */ + struct frdest fr_dif; /* duplicate packet interfaces */ +} frentry_t; + +#define fr_proto fr_ip.fi_p +#define fr_ttl fr_ip.fi_ttl +#define fr_tos fr_ip.fi_tos +#define fr_dst fr_ip.fi_dst +#define fr_src fr_ip.fi_src +#define fr_dmsk fr_mip.fi_dst +#define fr_smsk fr_mip.fi_src + +#ifndef offsetof +#define offsetof(t,m) (int)((&((t *)0L)->m)) +#endif +#define FR_CMPSIZ (sizeof(struct frentry) - offsetof(frentry_t, fr_ip)) + +/* + * fr_flags + */ +#define FR_BLOCK 0x00001 /* do not allow packet to pass */ +#define FR_PASS 0x00002 /* allow packet to pass */ +#define FR_OUTQUE 0x00004 /* outgoing packets */ +#define FR_INQUE 0x00008 /* ingoing packets */ +#define FR_LOG 0x00010 /* Log */ +#define FR_LOGB 0x00011 /* Log-fail */ +#define FR_LOGP 0x00012 /* Log-pass */ +#define FR_LOGBODY 0x00020 /* Log the body */ +#define FR_LOGFIRST 0x00040 /* Log the first byte if state held */ +#define FR_RETRST 0x00080 /* Return TCP RST packet - reset connection */ +#define FR_RETICMP 0x00100 /* Return ICMP unreachable packet */ +#define FR_FAKEICMP 0x00180 /* Return ICMP unreachable with fake source */ +#define FR_NOMATCH 0x00200 /* no match occured */ +#define FR_ACCOUNT 0x00400 /* count packet bytes */ +#define FR_KEEPFRAG 0x00800 /* keep fragment information */ +#define FR_KEEPSTATE 0x01000 /* keep `connection' state information */ +#define FR_INACTIVE 0x02000 +#define FR_QUICK 0x04000 /* match & stop processing list */ +#define FR_FASTROUTE 0x08000 /* bypass normal routing */ +#define FR_CALLNOW 0x10000 /* call another function (fr_func) if matches */ +#define FR_DUP 0x20000 /* duplicate packet */ +#define FR_LOGORBLOCK 0x40000 /* block the packet if it can't be logged */ +#define FR_NOTSRCIP 0x80000 /* not the src IP# */ +#define FR_NOTDSTIP 0x100000 /* not the dst IP# */ +#define FR_AUTH 0x200000 /* use authentication */ +#define FR_PREAUTH 0x400000 /* require preauthentication */ +#define FR_DONTCACHE 0x800000 /* don't cache the result */ + +#define FR_LOGMASK (FR_LOG|FR_LOGP|FR_LOGB) +#define FR_RETMASK (FR_RETICMP|FR_RETRST|FR_FAKEICMP) + +/* + * These correspond to #define's for FI_* and are stored in fr_flags + */ +#define FF_OPTIONS 0x01000000 +#define FF_TCPUDP 0x02000000 +#define FF_FRAG 0x04000000 +#define FF_SHORT 0x08000000 +/* + * recognized flags for SIOCGETFF and SIOCSETFF, and get put in fr_flags + */ +#define FF_LOGPASS 0x10000000 +#define FF_LOGBLOCK 0x20000000 +#define FF_LOGNOMATCH 0x40000000 +#define FF_LOGGING (FF_LOGPASS|FF_LOGBLOCK|FF_LOGNOMATCH) +#define FF_BLOCKNONIP 0x80000000 /* Solaris2 Only */ + +#define FR_NONE 0 +#define FR_EQUAL 1 +#define FR_NEQUAL 2 +#define FR_LESST 3 +#define FR_GREATERT 4 +#define FR_LESSTE 5 +#define FR_GREATERTE 6 +#define FR_OUTRANGE 7 +#define FR_INRANGE 8 + +typedef struct filterstats { + u_long fr_pass; /* packets allowed */ + u_long fr_block; /* packets denied */ + u_long fr_nom; /* packets which don't match any rule */ + u_long fr_ppkl; /* packets allowed and logged */ + u_long fr_bpkl; /* packets denied and logged */ + u_long fr_npkl; /* packets unmatched and logged */ + u_long fr_pkl; /* packets logged */ + u_long fr_skip; /* packets to be logged but buffer full */ + u_long fr_ret; /* packets for which a return is sent */ + u_long fr_acct; /* packets for which counting was performed */ + u_long fr_bnfr; /* bad attempts to allocate fragment state */ + u_long fr_nfr; /* new fragment state kept */ + u_long fr_cfr; /* add new fragment state but complete pkt */ + u_long fr_bads; /* bad attempts to allocate packet state */ + u_long fr_ads; /* new packet state kept */ + u_long fr_chit; /* cached hit */ + u_long fr_tcpbad; /* TCP checksum check failures */ + u_long fr_pull[2]; /* good and bad pullup attempts */ +#if SOLARIS + u_long fr_notdata; /* PROTO/PCPROTO that have no data */ + u_long fr_nodata; /* mblks that have no data */ + u_long fr_bad; /* bad IP packets to the filter */ + u_long fr_notip; /* packets passed through no on ip queue */ + u_long fr_drop; /* packets dropped - no info for them! */ +#endif +} filterstats_t; + +/* + * For SIOCGETFS + */ +typedef struct friostat { + struct filterstats f_st[2]; + struct frentry *f_fin[2]; + struct frentry *f_fout[2]; + struct frentry *f_acctin[2]; + struct frentry *f_acctout[2]; + struct frentry *f_auth; + struct frgroup *f_groups[3][2]; + u_long f_froute[2]; + int f_active; /* 1 or 0 - active rule set */ + int f_defpass; /* default pass - from fr_pass */ + int f_running; /* 1 if running, else 0 */ + int f_logging; /* 1 if enabled, else 0 */ + char f_version[32]; /* version string */ +} friostat_t; + +typedef struct optlist { + u_short ol_val; + int ol_bit; +} optlist_t; + + +/* + * Group list structure. + */ +typedef struct frgroup { + u_short fg_num; + struct frgroup *fg_next; + struct frentry *fg_head; + struct frentry **fg_start; +} frgroup_t; + + +/* + * Log structure. Each packet header logged is prepended by one of these. + * Following this in the log records read from the device will be an ipflog + * structure which is then followed by any packet data. + */ +typedef struct iplog { + u_32_t ipl_magic; + u_int ipl_count; + u_long ipl_sec; + u_long ipl_usec; + size_t ipl_dsize; + struct iplog *ipl_next; +} iplog_t; + +#define IPL_MAGIC 0x49504c4d /* 'IPLM' */ + +typedef struct ipflog { +#if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199603)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + u_char fl_ifname[IFNAMSIZ]; +#else + u_int fl_unit; + u_char fl_ifname[4]; +#endif + u_char fl_plen; /* extra data after hlen */ + u_char fl_hlen; /* length of IP headers saved */ + u_short fl_rule; /* assume never more than 64k rules, total */ + u_short fl_group; + u_short fl_loglevel; /* syslog log level */ + u_32_t fl_flags; + u_32_t fl_lflags; +} ipflog_t; + + +#ifndef ICMP_UNREACH_FILTER +# define ICMP_UNREACH_FILTER 13 +#endif + +#ifndef IPF_LOGGING +# define IPF_LOGGING 0 +#endif +#ifndef IPF_DEFAULT_PASS +# define IPF_DEFAULT_PASS FR_PASS +#endif + +#define IPMINLEN(i, h) ((i)->ip_len >= ((i)->ip_hl * 4 + sizeof(struct h))) +#define IPLLOGSIZE 8192 + +/* + * Device filenames for reading log information. Use ipf on Solaris2 because + * ipl is already a name used by something else. + */ +#ifndef IPL_NAME +# if SOLARIS +# define IPL_NAME "/dev/ipf" +# else +# define IPL_NAME "/dev/ipl" +# endif +#endif +#define IPL_NAT IPNAT_NAME +#define IPL_STATE IPSTATE_NAME +#define IPL_AUTH IPAUTH_NAME + +#define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ +#define IPL_LOGNAT 1 +#define IPL_LOGSTATE 2 +#define IPL_LOGAUTH 3 +#define IPL_LOGMAX 3 + +#if !defined(CDEV_MAJOR) && defined (__FreeBSD_version) && \ + (__FreeBSD_version >= 220000) +# define CDEV_MAJOR 79 +#endif + +/* + * Post NetBSD 1.2 has the PFIL interface for packet filters. This turns + * on those hooks. We don't need any special mods in non-IP Filter code + * with this! + */ +#if (defined(NetBSD) && (NetBSD > 199609) && (NetBSD <= 1991011)) || \ + (defined(NetBSD1_2) && NetBSD1_2 > 1) +# if (NetBSD >= 199905) +# define PFIL_HOOKS +# endif +# ifdef PFIL_HOOKS +# define NETBSD_PF +# endif +#endif + + +#ifndef _KERNEL +extern int fr_check __P((ip_t *, int, void *, int, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, int, mb_t **)); +extern int send_reset __P((ip_t *, struct ifnet *)); +extern int icmp_error __P((ip_t *, struct ifnet *)); +extern int ipf_log __P((void)); +extern int ipfr_fastroute __P((ip_t *, fr_info_t *, frdest_t *)); +extern struct ifnet *get_unit __P((char *)); +# if defined(__NetBSD__) || defined(__OpenBSD__) || \ + (_BSDI_VERSION >= 199701) || (__FreeBSD_version >= 300000) +extern int iplioctl __P((dev_t, u_long, caddr_t, int)); +# else +extern int iplioctl __P((dev_t, int, caddr_t, int)); +# endif +extern int iplopen __P((dev_t, int)); +extern int iplclose __P((dev_t, int)); +#else /* #ifndef _KERNEL */ +# if defined(__NetBSD__) && defined(PFIL_HOOKS) +extern void ipfilterattach __P((int)); +# endif +extern int iplattach __P((void)); +extern int ipl_enable __P((void)); +extern int ipl_disable __P((void)); +extern void ipflog_init __P((void)); +extern int ipflog_clear __P((minor_t)); +extern int ipflog_read __P((minor_t, struct uio *)); +extern int ipflog __P((u_int, ip_t *, fr_info_t *, mb_t *)); +extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int)); +# if SOLARIS +extern int fr_check __P((ip_t *, int, void *, int, qif_t *, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, + int, qif_t *, mb_t **)); +extern int icmp_error __P((ip_t *, int, int, qif_t *, struct in_addr)); +# if SOLARIS2 >= 7 +extern int iplioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +# else +extern int iplioctl __P((dev_t, int, int *, int, cred_t *, int *)); +# endif +extern int iplopen __P((dev_t *, int, int, cred_t *)); +extern int iplclose __P((dev_t, int, int, cred_t *)); +extern int ipfsync __P((void)); +extern int send_reset __P((fr_info_t *, ip_t *, qif_t *)); +extern int ipfr_fastroute __P((qif_t *, ip_t *, mblk_t *, mblk_t **, + fr_info_t *, frdest_t *)); +extern void copyin_mblk __P((mblk_t *, size_t, size_t, char *)); +extern void copyout_mblk __P((mblk_t *, size_t, size_t, char *)); +extern int fr_qin __P((queue_t *, mblk_t *)); +extern int fr_qout __P((queue_t *, mblk_t *)); +# ifdef IPFILTER_LOG +extern int iplread __P((dev_t, struct uio *, cred_t *)); +# endif +# else /* SOLARIS */ +extern int fr_check __P((ip_t *, int, void *, int, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, int, mb_t **)); +# ifdef linux +extern int send_reset __P((tcpiphdr_t *, struct ifnet *)); +# else +extern int send_reset __P((fr_info_t *, struct ip *)); +extern int send_icmp_err __P((ip_t *, int, int, void *, struct in_addr)); +# endif +extern int ipfr_fastroute __P((mb_t *, fr_info_t *, frdest_t *)); +extern size_t mbufchainlen __P((mb_t *)); +# ifdef __sgi +# include <sys/cred.h> +extern int iplioctl __P((dev_t, int, caddr_t, int, cred_t *, int *)); +extern int iplopen __P((dev_t *, int, int, cred_t *)); +extern int iplclose __P((dev_t, int, int, cred_t *)); +extern int iplread __P((dev_t, struct uio *, cred_t *)); +extern int ipfsync __P((void)); +extern int ipfilter_sgi_attach __P((void)); +extern void ipfilter_sgi_detach __P((void)); +extern void ipfilter_sgi_intfsync __P((void)); +# else +# ifdef IPFILTER_LKM +extern int iplidentify __P((char *)); +# endif +# if (_BSDI_VERSION >= 199510) || (__FreeBSD_version >= 220000) || \ + (NetBSD >= 199511) || defined(__OpenBSD__) +# if defined(__NetBSD__) || (_BSDI_VERSION >= 199701) || \ + defined(__OpenBSD__) || (__FreeBSD_version >= 300000) +extern int iplioctl __P((dev_t, u_long, caddr_t, int, struct proc *)); +# else +extern int iplioctl __P((dev_t, int, caddr_t, int, struct proc *)); +# endif +extern int iplopen __P((dev_t, int, int, struct proc *)); +extern int iplclose __P((dev_t, int, int, struct proc *)); +# else +# ifndef linux +extern int iplopen __P((dev_t, int)); +extern int iplclose __P((dev_t, int)); +extern int iplioctl __P((dev_t, int, caddr_t, int)); +# else +extern int iplioctl(struct inode *, struct file *, u_int, u_long); +extern int iplopen __P((struct inode *, struct file *)); +extern void iplclose __P((struct inode *, struct file *)); +# endif /* !linux */ +# endif /* (_BSDI_VERSION >= 199510) */ +# if BSD >= 199306 +extern int iplread __P((dev_t, struct uio *, int)); +# else +# ifndef linux +extern int iplread __P((dev_t, struct uio *)); +# else +extern int iplread(struct inode *, struct file *, char *, int); +# endif /* !linux */ +# endif /* BSD >= 199306 */ +# endif /* __ sgi */ +# endif /* SOLARIS */ +#endif /* #ifndef _KERNEL */ + +extern void fixskip __P((frentry_t **, frentry_t *, int)); +extern int countbits __P((u_32_t)); +extern int ipldetach __P((void)); +extern u_short fr_tcpsum __P((mb_t *, ip_t *, tcphdr_t *)); +extern int fr_scanlist __P((u_32_t, ip_t *, fr_info_t *, void *)); +extern u_short ipf_cksum __P((u_short *, int)); +extern int fr_copytolog __P((int, char *, int)); +extern void fr_forgetifp __P((void *)); +extern int frflush __P((minor_t, int)); +extern void frsync __P((void)); +extern frgroup_t *fr_addgroup __P((u_int, frentry_t *, minor_t, int)); +extern frgroup_t *fr_findgroup __P((u_int, u_32_t, minor_t, int, frgroup_t ***)); +extern void fr_delgroup __P((u_int, u_32_t, minor_t, int)); +extern void fr_makefrip __P((int, ip_t *, fr_info_t *)); +extern int fr_ifpaddr __P((void *, struct in_addr *)); +extern char *memstr __P((char *, char *, int, int)); +extern int ipl_unreach; +extern int ipl_inited; +extern u_long ipl_frouteok[2]; +extern int fr_pass; +extern int fr_flags; +extern int fr_active; +extern fr_info_t frcache[2]; +extern char ipfilter_version[]; +#ifdef IPFILTER_LOG +extern iplog_t **iplh[IPL_LOGMAX+1], *iplt[IPL_LOGMAX+1]; +extern size_t iplused[IPL_LOGMAX + 1]; +#endif +extern struct frentry *ipfilter[2][2], *ipacct[2][2]; +extern struct frgroup *ipfgroups[3][2]; +extern struct filterstats frstats[]; + +#endif /* __IP_FIL_H__ */ diff --git a/sys/netinet/ip_flow.c b/sys/netinet/ip_flow.c new file mode 100644 index 0000000..6c9119b --- /dev/null +++ b/sys/netinet/ip_flow.c @@ -0,0 +1,327 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/kernel.h> + +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/ip_flow.h> + +#define IPFLOW_TIMER (5 * PR_SLOWHZ) +#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ +#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) +static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; +static int ipflow_inuse; +#define IPFLOW_MAX 256 + +static int ipflow_active = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, + &ipflow_active, 0, "Enable flow-based IP forwarding"); + +static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); + +static unsigned +ipflow_hash( + struct in_addr dst, + struct in_addr src, + unsigned tos) +{ + unsigned hash = tos; + int idx; + for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) + hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); + return hash & (IPFLOW_HASHSIZE-1); +} + +static struct ipflow * +ipflow_lookup( + const struct ip *ip) +{ + unsigned hash; + struct ipflow *ipf; + + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + + ipf = LIST_FIRST(&ipflows[hash]); + while (ipf != NULL) { + if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr + && ip->ip_src.s_addr == ipf->ipf_src.s_addr + && ip->ip_tos == ipf->ipf_tos) + break; + ipf = LIST_NEXT(ipf, ipf_next); + } + return ipf; +} + +int +ipflow_fastforward( + struct mbuf *m) +{ + struct ip *ip; + struct ipflow *ipf; + struct rtentry *rt; + int error; + + /* + * Are we forwarding packets? Big enough for an IP packet? + */ + if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip)) + return 0; + /* + * IP header with no option and valid version and length + */ + ip = mtod(m, struct ip *); + if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) + || ntohs(ip->ip_len) > m->m_pkthdr.len) + return 0; + /* + * Find a flow. + */ + if ((ipf = ipflow_lookup(ip)) == NULL) + return 0; + + /* + * Route and interface still up? + */ + rt = ipf->ipf_ro.ro_rt; + if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) + return 0; + + /* + * Packet size OK? TTL? + */ + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) + return 0; + + /* + * Everything checks out and so we can forward this packet. + * Modify the TTL and incrementally change the checksum. + */ + ip->ip_ttl -= IPTTLDEC; + if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) { + ip->ip_sum += htons(IPTTLDEC << 8) + 1; + } else { + ip->ip_sum += htons(IPTTLDEC << 8); + } + + /* + * Send the packet on its way. All we can get back is ENOBUFS + */ + ipf->ipf_uses++; + ipf->ipf_timer = IPFLOW_TIMER; + if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, &ipf->ipf_ro.ro_dst, rt)) != 0) { + if (error == ENOBUFS) + ipf->ipf_dropped++; + else + ipf->ipf_errors++; + } + return 1; +} + +static void +ipflow_addstats( + struct ipflow *ipf) +{ + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; +} + +static void +ipflow_free( + struct ipflow *ipf) +{ + int s; + /* + * Remove the flow from the hash table (at elevated IPL). + * Once it's off the list, we can deal with it at normal + * network IPL. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipflow_inuse--; + FREE(ipf, M_IPFLOW); +} + +static struct ipflow * +ipflow_reap( + void) +{ + struct ipflow *ipf, *maybe_ipf = NULL; + int idx; + int s; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + /* + * If this no longer points to a valid route + * reclaim it. + */ + if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) + goto done; + /* + * choose the one that's been least recently used + * or has had the least uses in the last 1.5 + * intervals. + */ + if (maybe_ipf == NULL + || ipf->ipf_timer < maybe_ipf->ipf_timer + || (ipf->ipf_timer == maybe_ipf->ipf_timer + && ipf->ipf_last_uses + ipf->ipf_uses < + maybe_ipf->ipf_last_uses + + maybe_ipf->ipf_uses)) + maybe_ipf = ipf; + ipf = LIST_NEXT(ipf, ipf_next); + } + } + ipf = maybe_ipf; + done: + /* + * Remove the entry from the flow table. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + return ipf; +} + +void +ipflow_slowtimo( + void) +{ + struct ipflow *ipf; + int idx; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); + if (--ipf->ipf_timer == 0) { + ipflow_free(ipf); + } else { + ipf->ipf_last_uses = ipf->ipf_uses; + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; + ipf->ipf_uses = 0; + } + ipf = next_ipf; + } + } +} + +void +ipflow_create( + const struct route *ro, + struct mbuf *m) +{ + const struct ip *const ip = mtod(m, struct ip *); + struct ipflow *ipf; + unsigned hash; + int s; + + /* + * Don't create cache entries for ICMP messages. + */ + if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) + return; + /* + * See if an existing flow struct exists. If so remove it from it's + * list and free the old route. If not, try to malloc a new one + * (if we aren't at our limit). + */ + ipf = ipflow_lookup(ip); + if (ipf == NULL) { + if (ipflow_inuse == IPFLOW_MAX) { + ipf = ipflow_reap(); + } else { + ipf = (struct ipflow *) malloc(sizeof(*ipf), M_IPFLOW, + M_NOWAIT); + if (ipf == NULL) + return; + ipflow_inuse++; + } + bzero((caddr_t) ipf, sizeof(*ipf)); + } else { + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipf->ipf_uses = ipf->ipf_last_uses = 0; + ipf->ipf_errors = ipf->ipf_dropped = 0; + } + + /* + * Fill in the updated information. + */ + ipf->ipf_ro = *ro; + ro->ro_rt->rt_refcnt++; + ipf->ipf_dst = ip->ip_dst; + ipf->ipf_src = ip->ip_src; + ipf->ipf_tos = ip->ip_tos; + ipf->ipf_timer = IPFLOW_TIMER; + /* + * Insert into the approriate bucket of the flow table. + */ + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + s = splimp(); + LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); + splx(s); +} diff --git a/sys/netinet/ip_flow.h b/sys/netinet/ip_flow.h new file mode 100644 index 0000000..4675996 --- /dev/null +++ b/sys/netinet/ip_flow.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_FLOW_H +#define _NETINET_IP_FLOW_H + +struct ipflow { + LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ + struct in_addr ipf_dst; /* destination address */ + struct in_addr ipf_src; /* source address */ + + u_int8_t ipf_tos; /* type-of-service */ + struct route ipf_ro; /* associated route entry */ + u_long ipf_uses; /* number of uses in this period */ + + int ipf_timer; /* remaining lifetime of this entry */ + u_long ipf_dropped; /* ENOBUFS returned by if_output */ + u_long ipf_errors; /* other errors returned by if_output */ + u_long ipf_last_uses; /* number of uses in last period */ +}; + +#endif diff --git a/sys/netinet/ip_frag.c b/sys/netinet/ip_frag.c new file mode 100644 index 0000000..af07c3d --- /dev/null +++ b/sys/netinet/ip_frag.c @@ -0,0 +1,525 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_frag.c 1.11 3/24/96 (C) 1993-1995 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: ip_frag.c,v 2.4.2.3 1999/09/18 15:03:54 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#if defined(KERNEL) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# if defined(_KERNEL) && !defined(__sgi) +# include <sys/kernel.h> +# endif +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(KERNEL) || defined(_KERNEL)) +# ifndef IPFILTER_LKM +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +extern struct callout_handle ipfr_slowtimer_ch; +# endif +#endif + + +static ipfr_t *ipfr_heads[IPFT_SIZE]; +static ipfr_t *ipfr_nattab[IPFT_SIZE]; +static ipfrstat_t ipfr_stats; +static int ipfr_inuse = 0; + +int fr_ipfrttl = 120; /* 60 seconds */ + +#ifdef _KERNEL +# if SOLARIS2 >= 7 +extern timeout_id_t ipfr_timer_id; +# else +extern int ipfr_timer_id; +# endif +#endif +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_frag, ipf_natfrag, ipf_nat, ipf_mutex; +# if SOLARIS +extern KRWLOCK_T ipf_solaris; +# else +KRWLOCK_T ipf_solaris; +# endif +extern kmutex_t ipf_rw; +#endif + + +static ipfr_t *ipfr_new __P((ip_t *, fr_info_t *, u_int, ipfr_t **)); +static ipfr_t *ipfr_lookup __P((ip_t *, fr_info_t *, ipfr_t **)); +static void ipfr_delete __P((ipfr_t *)); + + +ipfrstat_t *ipfr_fragstats() +{ + ipfr_stats.ifs_table = ipfr_heads; + ipfr_stats.ifs_nattab = ipfr_nattab; + ipfr_stats.ifs_inuse = ipfr_inuse; + return &ipfr_stats; +} + + +/* + * add a new entry to the fragment cache, registering it as having come + * through this box, with the result of the filter operation. + */ +static ipfr_t *ipfr_new(ip, fin, pass, table) +ip_t *ip; +fr_info_t *fin; +u_int pass; +ipfr_t *table[]; +{ + ipfr_t **fp, *fra, frag; + u_int idx; + + frag.ipfr_p = ip->ip_p; + idx = ip->ip_p; + frag.ipfr_id = ip->ip_id; + idx += ip->ip_id; + frag.ipfr_tos = ip->ip_tos; + frag.ipfr_src.s_addr = ip->ip_src.s_addr; + idx += ip->ip_src.s_addr; + frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; + idx += ip->ip_dst.s_addr; + idx *= 127; + idx %= IPFT_SIZE; + + /* + * first, make sure it isn't already there... + */ + for (fp = &table[idx]; (fra = *fp); fp = &fra->ipfr_next) + if (!bcmp((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, + IPFR_CMPSZ)) { + ATOMIC_INC(ipfr_stats.ifs_exists); + return NULL; + } + + /* + * allocate some memory, if possible, if not, just record that we + * failed to do so. + */ + KMALLOC(fra, ipfr_t *); + if (fra == NULL) { + ATOMIC_INC(ipfr_stats.ifs_nomem); + return NULL; + } + + if ((fra->ipfr_rule = fin->fin_fr) != NULL) { + ATOMIC_INC(fin->fin_fr->fr_ref); + } + + + /* + * Instert the fragment into the fragment table, copy the struct used + * in the search using bcopy rather than reassign each field. + * Set the ttl to the default and mask out logging from "pass" + */ + if ((fra->ipfr_next = table[idx])) + table[idx]->ipfr_prev = fra; + fra->ipfr_prev = NULL; + fra->ipfr_data = NULL; + table[idx] = fra; + bcopy((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, IPFR_CMPSZ); + fra->ipfr_ttl = fr_ipfrttl; + /* + * Compute the offset of the expected start of the next packet. + */ + fra->ipfr_off = (ip->ip_off & IP_OFFMASK) + (fin->fin_dlen >> 3); + ATOMIC_INC(ipfr_stats.ifs_new); + ATOMIC_INC(ipfr_inuse); + return fra; +} + + +int ipfr_newfrag(ip, fin, pass) +ip_t *ip; +fr_info_t *fin; +u_int pass; +{ + ipfr_t *ipf; + + WRITE_ENTER(&ipf_frag); + ipf = ipfr_new(ip, fin, pass, ipfr_heads); + RWLOCK_EXIT(&ipf_frag); + return ipf ? 0 : -1; +} + + +int ipfr_nat_newfrag(ip, fin, pass, nat) +ip_t *ip; +fr_info_t *fin; +u_int pass; +nat_t *nat; +{ + ipfr_t *ipf; + + WRITE_ENTER(&ipf_natfrag); + ipf = ipfr_new(ip, fin, pass, ipfr_nattab); + if (ipf != NULL) { + ipf->ipfr_data = nat; + nat->nat_data = ipf; + } + RWLOCK_EXIT(&ipf_natfrag); + return ipf ? 0 : -1; +} + + +/* + * check the fragment cache to see if there is already a record of this packet + * with its filter result known. + */ +static ipfr_t *ipfr_lookup(ip, fin, table) +ip_t *ip; +fr_info_t *fin; +ipfr_t *table[]; +{ + ipfr_t *f, frag; + u_int idx; + + /* + * For fragments, we record protocol, packet id, TOS and both IP#'s + * (these should all be the same for all fragments of a packet). + * + * build up a hash value to index the table with. + */ + frag.ipfr_p = ip->ip_p; + idx = ip->ip_p; + frag.ipfr_id = ip->ip_id; + idx += ip->ip_id; + frag.ipfr_tos = ip->ip_tos; + frag.ipfr_src.s_addr = ip->ip_src.s_addr; + idx += ip->ip_src.s_addr; + frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; + idx += ip->ip_dst.s_addr; + idx *= 127; + idx %= IPFT_SIZE; + + /* + * check the table, careful to only compare the right amount of data + */ + for (f = table[idx]; f; f = f->ipfr_next) + if (!bcmp((char *)&frag.ipfr_src, (char *)&f->ipfr_src, + IPFR_CMPSZ)) { + u_short atoff, off; + + if (f != table[idx]) { + /* + * move fragment info. to the top of the list + * to speed up searches. + */ + if ((f->ipfr_prev->ipfr_next = f->ipfr_next)) + f->ipfr_next->ipfr_prev = f->ipfr_prev; + f->ipfr_next = table[idx]; + table[idx]->ipfr_prev = f; + f->ipfr_prev = NULL; + table[idx] = f; + } + off = ip->ip_off; + atoff = off + (fin->fin_dlen >> 3); + /* + * If we've follwed the fragments, and this is the + * last (in order), shrink expiration time. + */ + if ((off & IP_OFFMASK) == f->ipfr_off) { + if (!(off & IP_MF)) + f->ipfr_ttl = 1; + else + f->ipfr_off = atoff; + } + ATOMIC_INC(ipfr_stats.ifs_hits); + return f; + } + return NULL; +} + + +/* + * functional interface for NAT lookups of the NAT fragment cache + */ +nat_t *ipfr_nat_knownfrag(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + nat_t *nat; + ipfr_t *ipf; + + READ_ENTER(&ipf_natfrag); + ipf = ipfr_lookup(ip, fin, ipfr_nattab); + if (ipf != NULL) { + nat = ipf->ipfr_data; + /* + * This is the last fragment for this packet. + */ + if ((ipf->ipfr_ttl == 1) && (nat != NULL)) { + nat->nat_data = NULL; + ipf->ipfr_data = NULL; + } + } else + nat = NULL; + RWLOCK_EXIT(&ipf_natfrag); + return nat; +} + + +/* + * functional interface for normal lookups of the fragment cache + */ +frentry_t *ipfr_knownfrag(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + frentry_t *fr = NULL; + ipfr_t *fra; + + READ_ENTER(&ipf_frag); + fra = ipfr_lookup(ip, fin, ipfr_heads); + if (fra != NULL) + fr = fra->ipfr_rule; + RWLOCK_EXIT(&ipf_frag); + return fr; +} + + +/* + * forget any references to this external object. + */ +void ipfr_forget(nat) +void *nat; +{ + ipfr_t *fr; + int idx; + + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fr = ipfr_heads[idx]; fr; fr = fr->ipfr_next) + if (fr->ipfr_data == nat) + fr->ipfr_data = NULL; + + RWLOCK_EXIT(&ipf_natfrag); +} + + +static void ipfr_delete(fra) +ipfr_t *fra; +{ + frentry_t *fr; + + fr = fra->ipfr_rule; + if (fr != NULL) { + ATOMIC_DEC(fr->fr_ref); + if (fr->fr_ref == 0) + KFREE(fr); + } + if (fra->ipfr_prev) + fra->ipfr_prev->ipfr_next = fra->ipfr_next; + if (fra->ipfr_next) + fra->ipfr_next->ipfr_prev = fra->ipfr_prev; + KFREE(fra); +} + + +/* + * Free memory in use by fragment state info. kept. + */ +void ipfr_unload() +{ + ipfr_t **fp, *fra; + nat_t *nat; + int idx; + + WRITE_ENTER(&ipf_frag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_heads[idx]; (fra = *fp); ) { + *fp = fra->ipfr_next; + ipfr_delete(fra); + } + RWLOCK_EXIT(&ipf_frag); + + WRITE_ENTER(&ipf_nat); + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { + *fp = fra->ipfr_next; + nat = fra->ipfr_data; + if (nat != NULL) { + if (nat->nat_data == fra) + nat->nat_data = NULL; + } + ipfr_delete(fra); + } + RWLOCK_EXIT(&ipf_natfrag); + RWLOCK_EXIT(&ipf_nat); +} + + +#ifdef _KERNEL +/* + * Slowly expire held state for fragments. Timeouts are set * in expectation + * of this being called twice per second. + */ +# if (BSD >= 199306) || SOLARIS || defined(__sgi) +# if defined(SOLARIS2) && (SOLARIS2 < 7) +void ipfr_slowtimer() +# else +void ipfr_slowtimer __P((void *ptr)) +# endif +# else +int ipfr_slowtimer() +# endif +{ + ipfr_t **fp, *fra; + nat_t *nat; + int idx; +#if defined(_KERNEL) +# if !SOLARIS + int s; +# else + extern int fr_running; + + if (fr_running <= 0) + return; +# endif +#endif + + READ_ENTER(&ipf_solaris); +#ifdef __sgi + ipfilter_sgi_intfsync(); +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_frag); + + /* + * Go through the entire table, looking for entries to expire, + * decreasing the ttl by one for each entry. If it reaches 0, + * remove it from the chain and free it. + */ + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_heads[idx]; (fra = *fp); ) { + --fra->ipfr_ttl; + if (fra->ipfr_ttl == 0) { + *fp = fra->ipfr_next; + ipfr_delete(fra); + ATOMIC_INC(ipfr_stats.ifs_expire); + ATOMIC_DEC(ipfr_inuse); + } else + fp = &fra->ipfr_next; + } + RWLOCK_EXIT(&ipf_frag); + + /* + * Same again for the NAT table, except that if the structure also + * still points to a NAT structure, and the NAT structure points back + * at the one to be free'd, NULL the reference from the NAT struct. + * NOTE: We need to grab both mutex's early, and in this order so as + * to prevent a deadlock if both try to expire at the same time. + */ + WRITE_ENTER(&ipf_nat); + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { + --fra->ipfr_ttl; + if (fra->ipfr_ttl == 0) { + ATOMIC_INC(ipfr_stats.ifs_expire); + ATOMIC_DEC(ipfr_inuse); + nat = fra->ipfr_data; + if (nat != NULL) { + if (nat->nat_data == fra) + nat->nat_data = NULL; + } + *fp = fra->ipfr_next; + ipfr_delete(fra); + } else + fp = &fra->ipfr_next; + } + RWLOCK_EXIT(&ipf_natfrag); + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); + fr_timeoutstate(); + ip_natexpire(); + fr_authexpire(); +# if SOLARIS + ipfr_timer_id = timeout(ipfr_slowtimer, NULL, drv_usectohz(500000)); +# else +# ifndef linux +# if (__FreeBSD_version >= 300000) + ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2); +# else + timeout(ipfr_slowtimer, NULL, hz/2); +# endif +# endif +# if (BSD < 199306) && !defined(__sgi) + return 0; +# endif +# endif + RWLOCK_EXIT(&ipf_solaris); +} +#endif /* defined(_KERNEL) */ diff --git a/sys/netinet/ip_frag.h b/sys/netinet/ip_frag.h new file mode 100644 index 0000000..0494e9c --- /dev/null +++ b/sys/netinet/ip_frag.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 1993-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_frag.h 1.5 3/24/96 + * $Id: ip_frag.h,v 2.2 1999/08/06 06:26:38 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_FRAG_H__ +#define __IP_FRAG_H__ + +#define IPFT_SIZE 257 + +typedef struct ipfr { + struct ipfr *ipfr_next, *ipfr_prev; + void *ipfr_data; + struct in_addr ipfr_src; + struct in_addr ipfr_dst; + u_short ipfr_id; + u_char ipfr_p; + u_char ipfr_tos; + u_short ipfr_off; + u_short ipfr_ttl; + frentry_t *ipfr_rule; +} ipfr_t; + + +typedef struct ipfrstat { + u_long ifs_exists; /* add & already exists */ + u_long ifs_nomem; + u_long ifs_new; + u_long ifs_hits; + u_long ifs_expire; + u_long ifs_inuse; + struct ipfr **ifs_table; + struct ipfr **ifs_nattab; +} ipfrstat_t; + +#define IPFR_CMPSZ (4 + 4 + 2 + 1 + 1) + +extern int fr_ipfrttl; +extern ipfrstat_t *ipfr_fragstats __P((void)); +extern int ipfr_newfrag __P((ip_t *, fr_info_t *, u_int)); +extern int ipfr_nat_newfrag __P((ip_t *, fr_info_t *, u_int, struct nat *)); +extern nat_t *ipfr_nat_knownfrag __P((ip_t *, fr_info_t *)); +extern frentry_t *ipfr_knownfrag __P((ip_t *, fr_info_t *)); +extern void ipfr_forget __P((void *)); +extern void ipfr_unload __P((void)); + +#if (BSD >= 199306) || SOLARIS || defined(__sgi) +# if defined(SOLARIS2) && (SOLARIS2 < 7) +extern void ipfr_slowtimer __P((void)); +# else +extern void ipfr_slowtimer __P((void *)); +# endif +#else +extern int ipfr_slowtimer __P((void)); +#endif + +#endif /* __IP_FIL_H__ */ diff --git a/sys/netinet/ip_ftp_pxy.c b/sys/netinet/ip_ftp_pxy.c new file mode 100644 index 0000000..0600617 --- /dev/null +++ b/sys/netinet/ip_ftp_pxy.c @@ -0,0 +1,460 @@ +/* + * Simple FTP transparent proxy for in-kernel use. For use with the NAT + * code. + * + * $Id$ + * $FreeBSD$ + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define isdigit(x) ((x) >= '0' && (x) <= '9') + +#define IPF_FTP_PROXY + +#define IPF_MINPORTLEN 18 +#define IPF_MAXPORTLEN 30 +#define IPF_MIN227LEN 39 +#define IPF_MAX227LEN 51 + + +int ippr_ftp_init __P((void)); +int ippr_ftp_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_ftp_in __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_ftp_portmsg __P((fr_info_t *, ip_t *, nat_t *)); +int ippr_ftp_pasvmsg __P((fr_info_t *, ip_t *, nat_t *)); + +u_short ipf_ftp_atoi __P((char **)); + +static frentry_t natfr; + + +/* + * Initialize local structures. + */ +int ippr_ftp_init() +{ + bzero((char *)&natfr, sizeof(natfr)); + natfr.fr_ref = 1; + natfr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +/* + * ipf_ftp_atoi - implement a version of atoi which processes numbers in + * pairs separated by commas (which are expected to be in the range 0 - 255), + * returning a 16 bit number combining either side of the , as the MSB and + * LSB. + */ +u_short ipf_ftp_atoi(ptr) +char **ptr; +{ + register char *s = *ptr, c; + register u_char i = 0, j = 0; + + while ((c = *s++) && isdigit(c)) { + i *= 10; + i += c - '0'; + } + if (c != ',') { + *ptr = NULL; + return 0; + } + while ((c = *s++) && isdigit(c)) { + j *= 10; + j += c - '0'; + } + *ptr = s; + return (i << 8) | j; +} + + +int ippr_ftp_portmsg(fin, ip, nat) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +{ + char portbuf[IPF_MAXPORTLEN + 1], newbuf[IPF_MAXPORTLEN + 1], *s; + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + size_t nlen = 0, dlen, olen; + u_short a5, a6, sp, dp; + u_int a1, a2, a3, a4; + struct in_addr swip; + int off, inc = 0; + fr_info_t fi; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + tcp = (tcphdr_t *)fin->fin_dp; + bzero(portbuf, sizeof(portbuf)); + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen > 0) + copyout_mblk(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#else + m = *(mb_t **)fin->fin_mp; + + dlen = mbufchainlen(m) - off; + if (dlen > 0) + m_copydata(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#endif + if (dlen == 0) + return 0; + portbuf[sizeof(portbuf) - 1] = '\0'; + *newbuf = '\0'; + if (!strncmp(portbuf, "PORT ", 5)) { + if (dlen < IPF_MINPORTLEN) + return 0; + } else + return 0; + + /* + * Skip the PORT command + space + */ + s = portbuf + 5; + /* + * Pick out the address components, two at a time. + */ + a1 = ipf_ftp_atoi(&s); + if (!s) + return 0; + a2 = ipf_ftp_atoi(&s); + if (!s) + return 0; + + /* + * check that IP address in the PORT/PASV reply is the same as the + * sender of the command - prevents using PORT for port scanning. + */ + a1 <<= 16; + a1 |= a2; + if (a1 != ntohl(nat->nat_inip.s_addr)) + return 0; + + a5 = ipf_ftp_atoi(&s); + if (!s) + return 0; + if (*s == ')') + s++; + + /* + * check for CR-LF at the end. + */ + if (*s == '\n') + s--; + if ((*s == '\r') && (*(s + 1) == '\n')) { + s += 2; + a6 = a5 & 0xff; + } else + return 0; + a5 >>= 8; + /* + * Calculate new address parts for PORT command + */ + a1 = ntohl(ip->ip_src.s_addr); + a2 = (a1 >> 16) & 0xff; + a3 = (a1 >> 8) & 0xff; + a4 = a1 & 0xff; + a1 >>= 24; + olen = s - portbuf; + (void) snprintf(newbuf, sizeof(newbuf), "%s %u,%u,%u,%u,%u,%u\r\n", + "PORT", a1, a2, a3, a4, a5, a6); + + nlen = strlen(newbuf); + inc = nlen - olen; +#if SOLARIS + for (m1 = m; m1->b_cont; m1 = m1->b_cont) + ; + if ((inc > 0) && (m1->b_datap->db_lim - m1->b_wptr < inc)) { + mblk_t *nm; + + /* alloc enough to keep same trailer space for lower driver */ + nm = allocb(nlen, BPRI_MED); + PANIC((!nm),("ippr_ftp_out: allocb failed")); + + nm->b_band = m1->b_band; + nm->b_wptr += nlen; + + m1->b_wptr -= olen; + PANIC((m1->b_wptr < m1->b_rptr), + ("ippr_ftp_out: cannot handle fragmented data block")); + + linkb(m1, nm); + } else { + if (m1->b_datap->db_struiolim == m1->b_wptr) + m1->b_datap->db_struiolim += inc; + m1->b_datap->db_struioflag &= ~STRUIO_IP; + m1->b_wptr += inc; + } + copyin_mblk(m, off, nlen, newbuf); +#else + if (inc < 0) + m_adj(m, inc); + /* the mbuf chain will be extended if necessary by m_copyback() */ + m_copyback(m, off, nlen, newbuf); +#endif + if (inc != 0) { +#if SOLARIS || defined(__sgi) + register u_32_t sum1, sum2; + + sum1 = ip->ip_len; + sum2 = ip->ip_len + inc; + + /* Because ~1 == -2, We really need ~1 == -1 */ + if (sum1 > sum2) + sum2--; + sum2 -= sum1; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + + fix_outcksum(&ip->ip_sum, sum2); +#endif + ip->ip_len += inc; + } + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = htons(a5 << 8 | a6); + /* + * The server may not make the connection back from port 20, but + * it is the most likely so use it here to check for a conflicting + * mapping. + */ + dp = htons(fin->fin_data[1] - 1); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp); + if (ipn == NULL) { + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = sp; + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + fi.fin_data[0] = ntohs(sp); + fi.fin_data[1] = 0; + fi.fin_dp = (char *)tcp2; + swip = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_DPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + ip->ip_src = swip; + } + return inc; +} + + +int ippr_ftp_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + return ippr_ftp_portmsg(fin, ip, nat); +} + + +int ippr_ftp_pasvmsg(fin, ip, nat) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +{ + char portbuf[IPF_MAX227LEN + 1], newbuf[IPF_MAX227LEN + 1], *s; + int off, olen, dlen, nlen = 0, inc = 0; + tcphdr_t tcph, *tcp2 = &tcph; + struct in_addr swip, swip2; + u_short a5, a6, dp, sp; + u_int a1, a2, a3, a4; + tcphdr_t *tcp; + fr_info_t fi; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + m = *(mb_t **)fin->fin_mp; + bzero(portbuf, sizeof(portbuf)); + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen > 0) + copyout_mblk(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#else + dlen = mbufchainlen(m) - off; + if (dlen > 0) + m_copydata(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#endif + if (dlen == 0) + return 0; + portbuf[sizeof(portbuf) - 1] = '\0'; + *newbuf = '\0'; + + if (!strncmp(portbuf, "227 ", 4)) { + if (dlen < IPF_MIN227LEN) + return 0; + else if (strncmp(portbuf, "227 Entering Passive Mode", 25)) + return 0; + } else + return 0; + /* + * Skip the PORT command + space + */ + s = portbuf + 25; + while (*s && !isdigit(*s)) + s++; + /* + * Pick out the address components, two at a time. + */ + a1 = ipf_ftp_atoi(&s); + if (!s) + return 0; + a2 = ipf_ftp_atoi(&s); + if (!s) + return 0; + + /* + * check that IP address in the PORT/PASV reply is the same as the + * sender of the command - prevents using PORT for port scanning. + */ + a1 <<= 16; + a1 |= a2; + if (a1 != ntohl(nat->nat_oip.s_addr)) + return 0; + + a5 = ipf_ftp_atoi(&s); + if (!s) + return 0; + + if (*s == ')') + s++; + if (*s == '\n') + s--; + /* + * check for CR-LF at the end. + */ + if ((*s == '\r') && (*(s + 1) == '\n')) { + s += 2; + a6 = a5 & 0xff; + } else + return 0; + a5 >>= 8; + /* + * Calculate new address parts for 227 reply + */ + a1 = ntohl(ip->ip_src.s_addr); + a2 = (a1 >> 16) & 0xff; + a3 = (a1 >> 8) & 0xff; + a4 = a1 & 0xff; + a1 >>= 24; + olen = s - portbuf; + (void) sprintf(newbuf, "%s %u,%u,%u,%u,%u,%u\r\n", + "227 Entering Passive Mode", a1, a2, a3, a4, a5, a6); + + nlen = strlen(newbuf); + inc = nlen - olen; +#if SOLARIS + for (m1 = m; m1->b_cont; m1 = m1->b_cont) + ; + if ((inc > 0) && (m1->b_datap->db_lim - m1->b_wptr < inc)) { + mblk_t *nm; + + /* alloc enough to keep same trailer space for lower driver */ + nm = allocb(nlen, BPRI_MED); + PANIC((!nm),("ippr_ftp_out: allocb failed")); + + nm->b_band = m1->b_band; + nm->b_wptr += nlen; + + m1->b_wptr -= olen; + PANIC((m1->b_wptr < m1->b_rptr), + ("ippr_ftp_out: cannot handle fragmented data block")); + + linkb(m1, nm); + } else { + m1->b_wptr += inc; + } + copyin_mblk(m, off, nlen, newbuf); +#else + if (inc < 0) + m_adj(m, inc); + /* the mbuf chain will be extended if necessary by m_copyback() */ + m_copyback(m, off, nlen, newbuf); +#endif + if (inc != 0) { +#if SOLARIS || defined(__sgi) + register u_32_t sum1, sum2; + + sum1 = ip->ip_len; + sum2 = ip->ip_len + inc; + + /* Because ~1 == -2, We really need ~1 == -1 */ + if (sum1 > sum2) + sum2--; + sum2 -= sum1; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + + fix_outcksum(&ip->ip_sum, sum2); +#endif + ip->ip_len += inc; + } + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = 0; + dp = htons(fin->fin_data[1] - 1); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp); + if (ipn == NULL) { + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = 0; /* XXX - fake it for nat_new */ + fi.fin_data[0] = a5 << 8 | a6; + tcp2->th_dport = htons(fi.fin_data[0]); + fi.fin_data[1] = 0; + fi.fin_dp = (char *)tcp2; + swip = ip->ip_src; + swip2 = ip->ip_dst; + ip->ip_dst = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_SPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_SPORT); + } + ip->ip_src = swip; + ip->ip_dst = swip2; + } + return inc; +} + + +int ippr_ftp_in(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + + return ippr_ftp_pasvmsg(fin, ip, nat); +} diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c new file mode 100644 index 0000000..b897421 --- /dev/null +++ b/sys/netinet/ip_fw.c @@ -0,0 +1,1541 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * Copyright (c) 1996 Alex Nash + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +/* + * Implement IP packet firewall + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/ucred.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netinet/if_ether.h> /* XXX ethertype_ip */ + +static int fw_debug = 1; +#ifdef IPFIREWALL_VERBOSE +static int fw_verbose = 1; +#else +static int fw_verbose = 0; +#endif +static int fw_one_pass = 1 ; +#ifdef IPFIREWALL_VERBOSE_LIMIT +static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#else +static int fw_verbose_limit = 0; +#endif + +static u_int64_t counter; /* counter for ipfw_report(NULL...) */ + +#define IPFW_DEFAULT_RULE ((u_int)(u_short)~0) + +LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain; + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +#ifdef SYSCTL_NODE +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, + &fw_debug, 0, "Enable printing of debug ip_fw statements"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW, + &fw_one_pass, 0, + "Only do a single pass through ipfw rules when using divert(4)"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW, + &fw_verbose, 0, "Log matches to ipfw rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, + &fw_verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); +#endif + +#define dprintf(a) do { \ + if (fw_debug) \ + printf a; \ + } while (0) +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 + +static int add_entry __P((struct ip_fw_head *chainptr, struct ip_fw *frwl)); +static int del_entry __P((struct ip_fw_head *chainptr, u_short number)); +static int zero_entry __P((struct ip_fw *)); +static int resetlog_entry __P((struct ip_fw *)); +static int check_ipfw_struct __P((struct ip_fw *m)); +static __inline int + iface_match __P((struct ifnet *ifp, union ip_fw_if *ifu, + int byname)); +static int ipopts_match __P((struct ip *ip, struct ip_fw *f)); +static __inline int + port_match __P((u_short *portptr, int nports, u_short port, + int range_flag)); +static int tcpflg_match __P((struct tcphdr *tcp, struct ip_fw *f)); +static int icmptype_match __P((struct icmp * icmp, struct ip_fw * f)); +static void ipfw_report __P((struct ip_fw *f, struct ip *ip, + struct ifnet *rif, struct ifnet *oif)); + +static void flush_rule_ptrs(void); + +static int ip_fw_chk __P((struct ip **pip, int hlen, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m, + struct ip_fw_chain **flow_id, + struct sockaddr_in **next_hop)); +static int ip_fw_ctl __P((struct sockopt *sopt)); + +static char err_prefix[] = "ip_fw_ctl:"; + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ +static __inline int +port_match(u_short *portptr, int nports, u_short port, int range_flag) +{ + if (!nports) + return 1; + if (range_flag) { + if (portptr[0] <= port && port <= portptr[1]) { + return 1; + } + nports -= 2; + portptr += 2; + } + while (nports-- > 0) { + if (*portptr++ == port) { + return 1; + } + } + return 0; +} + +static int +tcpflg_match(struct tcphdr *tcp, struct ip_fw *f) +{ + u_char flg_set, flg_clr; + + if ((f->fw_tcpf & IP_FW_TCPF_ESTAB) && + (tcp->th_flags & (IP_FW_TCPF_RST | IP_FW_TCPF_ACK))) + return 1; + + flg_set = tcp->th_flags & f->fw_tcpf; + flg_clr = tcp->th_flags & f->fw_tcpnf; + + if (flg_set != f->fw_tcpf) + return 0; + if (flg_clr) + return 0; + + return 1; +} + +static int +icmptype_match(struct icmp *icmp, struct ip_fw *f) +{ + int type; + + if (!(f->fw_flg & IP_FW_F_ICMPBIT)) + return(1); + + type = icmp->icmp_type; + + /* check for matching type in the bitmap */ + if (type < IP_FW_ICMPTYPES_MAX && + (f->fw_uar.fw_icmptypes[type / (sizeof(unsigned) * NBBY)] & + (1U << (type % (sizeof(unsigned) * NBBY))))) + return(1); + + return(0); /* no match */ +} + +static int +is_icmp_query(struct ip *ip) +{ + const struct icmp *icmp; + int icmp_type; + + icmp = (struct icmp *)((u_int32_t *)ip + ip->ip_hl); + icmp_type = icmp->icmp_type; + + if (icmp_type == ICMP_ECHO || icmp_type == ICMP_ROUTERSOLICIT || + icmp_type == ICMP_TSTAMP || icmp_type == ICMP_IREQ || + icmp_type == ICMP_MASKREQ) + return(1); + + return(0); +} + +static int +ipopts_match(struct ip *ip, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + opts = f->fw_ipopt; + nopts = nopts_sve = f->fw_ipnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + return 0; /*XXX*/ + } + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + opts &= ~IP_FW_IPOPT_LSRR; + nopts &= ~IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + opts &= ~IP_FW_IPOPT_SSRR; + nopts &= ~IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + opts &= ~IP_FW_IPOPT_RR; + nopts &= ~IP_FW_IPOPT_RR; + break; + case IPOPT_TS: + opts &= ~IP_FW_IPOPT_TS; + nopts &= ~IP_FW_IPOPT_TS; + break; + } + if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static __inline int +iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname) +{ + /* Check by name or by IP address */ + if (byname) { + /* Check unit number (-1 is wildcard) */ + if (ifu->fu_via_if.unit != -1 + && ifp->if_unit != ifu->fu_via_if.unit) + return(0); + /* Check name */ + if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN)) + return(0); + return(1); + } else if (ifu->fu_via_ip.s_addr != 0) { /* Zero == wildcard */ + struct ifaddr *ia; + + for (ia = ifp->if_addrhead.tqh_first; + ia != NULL; ia = ia->ifa_link.tqe_next) { + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (ifu->fu_via_ip.s_addr != ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) + continue; + return(1); + } + return(0); + } + return(1); +} + +static void +ipfw_report(struct ip_fw *f, struct ip *ip, + struct ifnet *rif, struct ifnet *oif) +{ + if (ip) { + struct tcphdr *const tcp = (struct tcphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct udphdr *const udp = (struct udphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct icmp *const icmp = (struct icmp *) ((u_int32_t *) ip + ip->ip_hl); + u_int64_t count; + char *action; + char action2[32], proto[47], name[18], fragment[17]; + int len; + + count = f ? f->fw_pcnt : ++counter; + if ((f == NULL && fw_verbose_limit != 0 && count > fw_verbose_limit) || + (f && f->fw_logamount != 0 && count > f->fw_loghighest)) + return; + + /* Print command name */ + snprintf(SNPARGS(name, 0), "ipfw: %d", f ? f->fw_number : -1); + + action = action2; + if (!f) + action = "Refuse"; + else { + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_DENY: + action = "Deny"; + break; + case IP_FW_F_REJECT: + if (f->fw_reject_code == IP_FW_REJECT_RST) + action = "Reset"; + else + action = "Unreach"; + break; + case IP_FW_F_ACCEPT: + action = "Accept"; + break; + case IP_FW_F_COUNT: + action = "Count"; + break; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + f->fw_divert_port); + break; + case IP_FW_F_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + f->fw_divert_port); + break; +#endif + case IP_FW_F_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + f->fw_skipto_rule); + break; +#ifdef DUMMYNET + case IP_FW_F_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + f->fw_skipto_rule); + break; +#endif +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: + if (f->fw_fwd_ip.sin_port) + snprintf(SNPARGS(action2, 0), + "Forward to %s:%d", + inet_ntoa(f->fw_fwd_ip.sin_addr), + f->fw_fwd_ip.sin_port); + else + snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(f->fw_fwd_ip.sin_addr)); + break; +#endif + default: + action = "UNKNOWN"; + break; + } + } + + len = 0; + switch (ip->ip_p) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", + inet_ntoa(ip->ip_src)); + if ((ip->ip_off & IP_OFFMASK) == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(tcp->th_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_dst)); + if ((ip->ip_off & IP_OFFMASK) == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(tcp->th_dport)); + break; + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", + inet_ntoa(ip->ip_src)); + if ((ip->ip_off & IP_OFFMASK) == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(udp->uh_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_dst)); + if ((ip->ip_off & IP_OFFMASK) == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(udp->uh_dport)); + break; + case IPPROTO_ICMP: + if ((ip->ip_off & IP_OFFMASK) == 0) + len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + snprintf(SNPARGS(proto, len), "%s %s", inet_ntoa(ip->ip_src), + inet_ntoa(ip->ip_dst)); + break; + default: + snprintf(SNPARGS(proto, 0), "P:%d %s %s", ip->ip_p, + inet_ntoa(ip->ip_src), inet_ntoa(ip->ip_dst)); + break; + } + + if ((ip->ip_off & IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), " Fragment = %d", + ip->ip_off & IP_OFFMASK); + else + fragment[0] = '\0'; + if (oif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s out via %s%d%s\n", + name, action, proto, oif->if_name, oif->if_unit, fragment); + else if (rif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s in via %s%d%s\n", name, + action, proto, rif->if_name, rif->if_unit, fragment); + else + log(LOG_SECURITY | LOG_INFO, "%s %s %s%s\n", name, action, + proto, fragment); + if ((f ? f->fw_logamount != 0 : 1) && + count == (f ? f->fw_loghighest : fw_verbose_limit)) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + f ? f->fw_logamount : fw_verbose_limit, + f ? f->fw_number : -1); + } +} + +/* + * given an ip_fw_chain *, lookup_next_rule will return a pointer + * of the same type to the next one. This can be either the jump + * target (for skipto instructions) or the next one in the chain (in + * all other cases including a missing jump target). + * Backward jumps are not allowed, so start looking from the next + * rule... + */ +static struct ip_fw_chain * lookup_next_rule(struct ip_fw_chain *me); + +static struct ip_fw_chain * +lookup_next_rule(struct ip_fw_chain *me) +{ + struct ip_fw_chain *chain ; + int rule = me->rule->fw_skipto_rule ; /* guess... */ + + if ( (me->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_SKIPTO ) + for (chain = me->chain.le_next; chain ; chain = chain->chain.le_next ) + if (chain->rule->fw_number >= rule) + return chain ; + return me->chain.le_next ; /* failure or not a skipto */ +} + +/* + * Parameters: + * + * pip Pointer to packet header (struct ip **) + * bridge_ipfw extension: pip = NULL means a complete ethernet packet + * including ethernet header in the mbuf. Other fields + * are ignored/invalid. + * + * hlen Packet header length + * oif Outgoing interface, or NULL if packet is incoming + * *cookie Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee + * *m The packet; we set to NULL when/if we nuke it. + * *flow_id pointer to the last matching rule (in/out) + * *next_hop socket we are forwarding to (in/out). + * + * Return value: + * + * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port, with these caveats: + * + * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead + * of diverting it (ie, 'ipfw tee'). + * + * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower + * 16 bits as a dummynet pipe number instead of diverting + */ + +static int +ip_fw_chk(struct ip **pip, int hlen, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m, + struct ip_fw_chain **flow_id, + struct sockaddr_in **next_hop) +{ + struct ip_fw_chain *chain; + struct ip_fw *rule = NULL; + struct ip *ip = NULL ; + struct ifnet *const rif = (*m)->m_pkthdr.rcvif; + u_short offset = 0 ; + u_short src_port, dst_port; + u_int16_t skipto; + + /* Grab and reset cookie */ + skipto = *cookie; + *cookie = 0; + + if (pip) { /* normal ip packet */ + ip = *pip; + offset = (ip->ip_off & IP_OFFMASK); + } else { /* bridged or non-ip packet */ + struct ether_header *eh = mtod(*m, struct ether_header *); + switch (ntohs(eh->ether_type)) { + case ETHERTYPE_IP : + if ((*m)->m_len<sizeof(struct ether_header) + sizeof(struct ip)) + goto non_ip ; + ip = (struct ip *)(eh + 1 ); + if (ip->ip_v != IPVERSION) + goto non_ip ; + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) /* minimum header length */ + goto non_ip ; + if ((*m)->m_len < 14 + hlen + 14) { + printf("-- m_len %d, need more...\n", (*m)->m_len); + goto non_ip ; + } + offset = (ip->ip_off & IP_OFFMASK); + break ; + default : +non_ip: ip = NULL ; + break ; + } + } + + if (*flow_id) { + + /* Accept if passed first test */ + if (fw_one_pass) + return 0; + + /* + * Packet has already been tagged. Look for the next rule + * to restart processing. + */ + chain = LIST_NEXT(*flow_id, chain); + + if ((chain = (*flow_id)->rule->next_rule_ptr) == NULL) + chain = (*flow_id)->rule->next_rule_ptr = + lookup_next_rule(*flow_id); + if (chain == NULL) + goto dropit; + } else { + /* + * Go down the chain, looking for enlightment. + * If we've been asked to start at a given rule, do so + */ + chain = LIST_FIRST(&ip_fw_chain); + if (skipto != 0) { + if (skipto >= IPFW_DEFAULT_RULE) + goto dropit; + while (chain && chain->rule->fw_number <= skipto) + chain = LIST_NEXT(chain, chain); + if (chain == NULL) + goto dropit; + } + } + for (; chain; chain = LIST_NEXT(chain, chain)) { + register struct ip_fw * f ; +again: + f = chain->rule; + + if (oif) { + /* Check direction outbound */ + if (!(f->fw_flg & IP_FW_F_OUT)) + continue; + } else { + /* Check direction inbound */ + if (!(f->fw_flg & IP_FW_F_IN)) + continue; + } + if (ip == NULL ) { + /* + * do relevant checks for non-ip packets: + * after this, only goto got_match or continue + */ + struct ether_header *eh = mtod(*m, struct ether_header *); + + /* + * make default rule always match or we have a panic + */ + if (f->fw_number == IPFW_DEFAULT_RULE) + goto got_match ; + /* + * temporary hack: + * udp from 0.0.0.0 means this rule applies. + * 1 src port is match ether type + * 2 src ports (interval) is match ether type + * 3 src ports is match ether address + */ + if ( f->fw_src.s_addr != 0 || f->fw_prot != IPPROTO_UDP + || f->fw_smsk.s_addr != 0xffffffff ) + continue; + switch (IP_FW_GETNSRCP(f)) { + case 1: /* match one type */ + if ( /* ( (f->fw_flg & IP_FW_F_INVSRC) != 0) ^ */ + ( f->fw_uar.fw_pts[0] == ntohs(eh->ether_type) ) ) { + goto got_match ; + } + break ; + default: + break ; + } + continue ; + } + + /* Fragments */ + if ((f->fw_flg & IP_FW_F_FRAG) && offset == 0 ) + continue; + + /* If src-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ ((ip->ip_src.s_addr + & f->fw_smsk.s_addr) != f->fw_src.s_addr)) + continue; + + /* If dest-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVDST) != 0) ^ ((ip->ip_dst.s_addr + & f->fw_dmsk.s_addr) != f->fw_dst.s_addr)) + continue; + + /* Interface check */ + if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + struct ifnet *const iface = oif ? oif : rif; + + /* Backwards compatibility hack for "via" */ + if (!iface || !iface_match(iface, + &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME)) + continue; + } else { + /* Check receive interface */ + if ((f->fw_flg & IP_FW_F_IIFACE) + && (!rif || !iface_match(rif, + &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME))) + continue; + /* Check outgoing interface */ + if ((f->fw_flg & IP_FW_F_OIFACE) + && (!oif || !iface_match(oif, + &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME))) + continue; + } + + /* Check IP options */ + if (f->fw_ipopt != f->fw_ipnopt && !ipopts_match(ip, f)) + continue; + + /* Check protocol; if wildcard, and no [ug]id, match */ + if (f->fw_prot == IPPROTO_IP) { + if (!(f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID))) + goto rnd_then_got_match; + } else + /* If different, don't match */ + if (ip->ip_p != f->fw_prot) + continue; + +/* + * here, pip==NULL for bridged pkts -- they include the ethernet + * header so i have to adjust lengths accordingly + */ +#define PULLUP_TO(l) do { \ + int len = (pip ? (l) : (l) + 14 ); \ + if ((*m)->m_len < (len) ) { \ + if ( (*m = m_pullup(*m, (len))) == 0) \ + goto bogusfrag; \ + ip = mtod(*m, struct ip *); \ + if (pip) \ + *pip = ip ; \ + else \ + ip = (struct ip *)((char *)ip + 14);\ + offset = (ip->ip_off & IP_OFFMASK); \ + } \ + } while (0) + + /* Protocol specific checks for uid only */ + if (f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)) { + switch (ip->ip_p) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + struct inpcb *P; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) + continue; + + PULLUP_TO(hlen + 14); + tcp =(struct tcphdr *)((u_int32_t *)ip + ip->ip_hl); + + if (oif) + P = in_pcblookup_hash(&tcbinfo, ip->ip_dst, + tcp->th_dport, ip->ip_src, tcp->th_sport, 0, + oif); + else + P = in_pcblookup_hash(&tcbinfo, ip->ip_src, + tcp->th_sport, ip->ip_dst, tcp->th_dport, 0, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (P->inp_socket->so_cred->cr_uid != + f->fw_uid) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + case IPPROTO_UDP: + { + struct udphdr *udp; + struct inpcb *P; + + if (offset != 0) + continue; + + PULLUP_TO(hlen + 4); + udp =(struct udphdr *)((u_int32_t *)ip + ip->ip_hl); + + if (oif) + P = in_pcblookup_hash(&udbinfo, ip->ip_dst, + udp->uh_dport, ip->ip_src, udp->uh_sport, 1, + oif); + else + P = in_pcblookup_hash(&udbinfo, ip->ip_src, + udp->uh_sport, ip->ip_dst, udp->uh_dport, 1, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (P->inp_socket->so_cred->cr_uid != + f->fw_uid) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + default: + continue; + } + } + + /* Protocol specific checks */ + switch (ip->ip_p) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) { + /* + * TCP flags and ports aren't available in this + * packet -- if this rule specified either one, + * we consider the rule a non-match. + */ + if (f->fw_nports != 0 || + f->fw_tcpf != f->fw_tcpnf) + continue; + + break; + } + PULLUP_TO(hlen + 14); + tcp = (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + if (f->fw_tcpf != f->fw_tcpnf && !tcpflg_match(tcp, f)) + continue; + src_port = ntohs(tcp->th_sport); + dst_port = ntohs(tcp->th_dport); + goto check_ports; + } + + case IPPROTO_UDP: + { + struct udphdr *udp; + + if (offset != 0) { + /* + * Port specification is unavailable -- if this + * rule specifies a port, we consider the rule + * a non-match. + */ + if (f->fw_nports != 0) + continue; + + break; + } + PULLUP_TO(hlen + 4); + udp = (struct udphdr *) ((u_int32_t *)ip + ip->ip_hl); + src_port = ntohs(udp->uh_sport); + dst_port = ntohs(udp->uh_dport); +check_ports: + if (!port_match(&f->fw_uar.fw_pts[0], + IP_FW_GETNSRCP(f), src_port, + f->fw_flg & IP_FW_F_SRNG)) + continue; + if (!port_match(&f->fw_uar.fw_pts[IP_FW_GETNSRCP(f)], + IP_FW_GETNDSTP(f), dst_port, + f->fw_flg & IP_FW_F_DRNG)) + continue; + break; + } + + case IPPROTO_ICMP: + { + struct icmp *icmp; + + if (offset != 0) /* Type isn't valid */ + break; + PULLUP_TO(hlen + 2); + icmp = (struct icmp *) ((u_int32_t *)ip + ip->ip_hl); + if (!icmptype_match(icmp, f)) + continue; + break; + } +#undef PULLUP_TO + + default: + break; + +bogusfrag: + if (fw_verbose) + ipfw_report(NULL, ip, rif, oif); + goto dropit; + + } + +rnd_then_got_match: + if ( ((struct ip_fw_ext *)f)->dont_match_prob && + random() < ((struct ip_fw_ext *)f)->dont_match_prob ) + continue ; +got_match: + *flow_id = chain ; /* XXX set flow id */ + /* Update statistics */ + f->fw_pcnt += 1; + if (ip) { + f->fw_bcnt += ip->ip_len; + } + f->timestamp = time_second; + + /* Log to console if desired */ + if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose) + ipfw_report(f, ip, rif, oif); + + /* Take appropriate action */ + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_ACCEPT: + return(0); + case IP_FW_F_COUNT: + continue; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + *cookie = f->fw_number; + return(f->fw_divert_port); + case IP_FW_F_TEE: + *cookie = f->fw_number; + return(f->fw_divert_port | IP_FW_PORT_TEE_FLAG); +#endif + case IP_FW_F_SKIPTO: /* XXX check */ + if ( f->next_rule_ptr ) + chain = f->next_rule_ptr ; + else + chain = lookup_next_rule(chain) ; + if (! chain) goto dropit; + goto again ; +#ifdef DUMMYNET + case IP_FW_F_PIPE: + return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG); +#endif +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: + /* Change the next-hop address for this packet. + * Initially we'll only worry about directly + * reachable next-hop's, but ultimately + * we will work out for next-hops that aren't + * direct the route we would take for it. We + * [cs]ould leave this latter problem to + * ip_output.c. We hope to high [name the abode of + * your favourite deity] that ip_output doesn't modify + * the new value of next_hop (which is dst there) + */ + if (next_hop != NULL) /* Make sure, first... */ + *next_hop = &(f->fw_fwd_ip); + return(0); /* Allow the packet */ +#endif + } + + /* Deny/reject this packet using this rule */ + rule = f; + break; + + } + +#ifdef DIAGNOSTIC + /* Rule IPFW_DEFAULT_RULE should always be there and should always match */ + if (!chain) + panic("ip_fw: chain"); +#endif + + /* + * At this point, we're going to drop the packet. + * Send a reject notice if all of the following are true: + * + * - The packet matched a reject rule + * - The packet is not an ICMP packet, or is an ICMP query packet + * - The packet is not a multicast or broadcast packet + */ + if ((rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT + && ip + && (ip->ip_p != IPPROTO_ICMP || is_icmp_query(ip)) + && !((*m)->m_flags & (M_BCAST|M_MCAST)) + && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + switch (rule->fw_reject_code) { + case IP_FW_REJECT_RST: + { + struct tcphdr *const tcp = + (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip; + + if (offset != 0 || (tcp->th_flags & TH_RST)) + break; + ti.ti_i = *((struct ipovly *) ip); + ti.ti_t = *tcp; + bcopy(&ti, ip, sizeof(ti)); + NTOHL(tip->ti_seq); + NTOHL(tip->ti_ack); + tip->ti_len = ip->ip_len - hlen - (tip->ti_off << 2); + if (tcp->th_flags & TH_ACK) { + tcp_respond(NULL, tip, *m, + (tcp_seq)0, ntohl(tcp->th_ack), TH_RST); + } else { + if (tcp->th_flags & TH_SYN) + tip->ti_len++; + tcp_respond(NULL, tip, *m, tip->ti_seq + + tip->ti_len, (tcp_seq)0, TH_RST|TH_ACK); + } + *m = NULL; + break; + } + default: /* Send an ICMP unreachable using code */ + icmp_error(*m, ICMP_UNREACH, + rule->fw_reject_code, 0L, 0); + *m = NULL; + break; + } + } + +dropit: + /* + * Finally, drop the packet. + */ + if (*m) { + m_freem(*m); + *m = NULL; + } + return(0); +} + +/* + * when a rule is added/deleted, zero the direct pointers within + * all firewall rules. These will be reconstructed on the fly + * as packets are matched. + * Must be called at splnet(). + */ +static void +flush_rule_ptrs() +{ + struct ip_fw_chain *fcp ; + + for (fcp = ip_fw_chain.lh_first; fcp; fcp = fcp->chain.le_next) { + fcp->rule->next_rule_ptr = NULL ; + } +} + +static int +add_entry(struct ip_fw_head *chainptr, struct ip_fw *frwl) +{ + struct ip_fw *ftmp = 0; + struct ip_fw_ext *ftmp_ext = 0 ; + struct ip_fw_chain *fwc = 0, *fcp, *fcpl = 0; + u_short nbr = 0; + int s; + + fwc = malloc(sizeof *fwc, M_IPFW, M_DONTWAIT); + ftmp_ext = malloc(sizeof *ftmp_ext, M_IPFW, M_DONTWAIT); + ftmp = &ftmp_ext->rule ; + if (!fwc || !ftmp) { + dprintf(("%s malloc said no\n", err_prefix)); + if (fwc) free(fwc, M_IPFW); + if (ftmp) free(ftmp, M_IPFW); + return (ENOSPC); + } + + bzero(ftmp_ext, sizeof(*ftmp_ext)); /* play safe! */ + bcopy(frwl, ftmp, sizeof(*ftmp)); + if (ftmp->fw_flg & IP_FW_F_RND_MATCH) + ftmp_ext->dont_match_prob = (intptr_t)ftmp->pipe_ptr; + + ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0'; + ftmp->fw_pcnt = 0L; + ftmp->fw_bcnt = 0L; + ftmp->next_rule_ptr = NULL ; + ftmp->pipe_ptr = NULL ; + fwc->rule = ftmp; + + s = splnet(); + + if (chainptr->lh_first == 0) { + LIST_INSERT_HEAD(chainptr, fwc, chain); + splx(s); + return(0); + } + + /* If entry number is 0, find highest numbered rule and add 100 */ + if (ftmp->fw_number == 0) { + for (fcp = LIST_FIRST(chainptr); fcp; fcp = LIST_NEXT(fcp, chain)) { + if (fcp->rule->fw_number != (u_short)-1) + nbr = fcp->rule->fw_number; + else + break; + } + if (nbr < IPFW_DEFAULT_RULE - 100) + nbr += 100; + ftmp->fw_number = nbr; + } + + /* Got a valid number; now insert it, keeping the list ordered */ + for (fcp = LIST_FIRST(chainptr); fcp; fcp = LIST_NEXT(fcp, chain)) { + if (fcp->rule->fw_number > ftmp->fw_number) { + if (fcpl) { + LIST_INSERT_AFTER(fcpl, fwc, chain); + } else { + LIST_INSERT_HEAD(chainptr, fwc, chain); + } + break; + } else { + fcpl = fcp; + } + } + flush_rule_ptrs(); + + splx(s); + return (0); +} + +static int +del_entry(struct ip_fw_head *chainptr, u_short number) +{ + struct ip_fw_chain *fcp; + + fcp = LIST_FIRST(chainptr); + if (number != (u_short)-1) { + for (; fcp; fcp = LIST_NEXT(fcp, chain)) { + if (fcp->rule->fw_number == number) { + int s; + + /* prevent access to rules while removing them */ + s = splnet(); + while (fcp && fcp->rule->fw_number == number) { + struct ip_fw_chain *next; + + next = LIST_NEXT(fcp, chain); + LIST_REMOVE(fcp, chain); +#ifdef DUMMYNET + dn_rule_delete(fcp) ; +#endif + flush_rule_ptrs(); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + fcp = next; + } + splx(s); + return 0; + } + } + } + + return (EINVAL); +} + +static int +zero_entry(struct ip_fw *frwl) +{ + struct ip_fw_chain *fcp; + int s, cleared; + + if (frwl == 0) { + s = splnet(); + for (fcp = LIST_FIRST(&ip_fw_chain); fcp; fcp = LIST_NEXT(fcp, chain)) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->fw_loghighest = fcp->rule->fw_logamount; + fcp->rule->timestamp = 0; + } + splx(s); + } + else { + cleared = 0; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + for (fcp = LIST_FIRST(&ip_fw_chain); fcp; fcp = LIST_NEXT(fcp, chain)) + if (frwl->fw_number == fcp->rule->fw_number) { + s = splnet(); + while (fcp && frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->fw_loghighest = + fcp->rule->fw_logamount; + fcp->rule->timestamp = 0; + fcp = LIST_NEXT(fcp, chain); + } + splx(s); + cleared = 1; + break; + } + if (!cleared) /* we didn't find any matching rules */ + return (EINVAL); + } + + if (fw_verbose) { + if (frwl) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Entry %d cleared.\n", frwl->fw_number); + else + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Accounting cleared.\n"); + } + + return (0); +} + +static int +resetlog_entry(struct ip_fw *frwl) +{ + struct ip_fw_chain *fcp; + int s, cleared; + + if (frwl == 0) { + s = splnet(); + counter = 0; + for (fcp = LIST_FIRST(&ip_fw_chain); fcp; fcp = LIST_NEXT(fcp, chain)) + fcp->rule->fw_loghighest = fcp->rule->fw_pcnt + + fcp->rule->fw_logamount; + splx(s); + } + else { + cleared = 0; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + for (fcp = LIST_FIRST(&ip_fw_chain); fcp; fcp = LIST_NEXT(fcp, chain)) + if (frwl->fw_number == fcp->rule->fw_number) { + s = splnet(); + while (fcp && frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_loghighest = + fcp->rule->fw_pcnt + + fcp->rule->fw_logamount; + fcp = LIST_NEXT(fcp, chain); + } + splx(s); + cleared = 1; + break; + } + if (!cleared) /* we didn't find any matching rules */ + return (EINVAL); + } + + if (fw_verbose) { + if (frwl) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Entry %d logging count reset.\n", + frwl->fw_number); + else + log(LOG_SECURITY | LOG_NOTICE, " + ipfw: All logging counts cleared.\n"); + } + + return (0); +} + +static int +check_ipfw_struct(struct ip_fw *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) { + dprintf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + return (EINVAL); + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT))) { + dprintf(("%s neither in nor out\n", err_prefix)); + return (EINVAL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IP_FW_F_IIFNAME) + && !*frwl->fw_in_if.fu_via_if.name) + || ((frwl->fw_flg & IP_FW_F_OIFNAME) + && !*frwl->fw_out_if.fu_via_if.name)) { + dprintf(("%s empty interface name\n", err_prefix)); + return (EINVAL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IP_FW_F_IN) + && (frwl->fw_flg & IP_FW_F_OIFACE)) { + dprintf(("%s outgoing interface check on incoming\n", + err_prefix)); + return (EINVAL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IP_FW_F_SRNG) && IP_FW_GETNSRCP(frwl) < 2) { + dprintf(("%s src range set but n_src_p=%d\n", + err_prefix, IP_FW_GETNSRCP(frwl))); + return (EINVAL); + } + if ((frwl->fw_flg & IP_FW_F_DRNG) && IP_FW_GETNDSTP(frwl) < 2) { + dprintf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + if (IP_FW_GETNSRCP(frwl) + IP_FW_GETNDSTP(frwl) > IP_FW_MAX_PORTS) { + dprintf(("%s too many ports (%d+%d)\n", + err_prefix, IP_FW_GETNSRCP(frwl), IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IP_FW_GETNSRCP(frwl) || IP_FW_GETNDSTP(frwl))) { + dprintf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + return (EINVAL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. + */ + if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) || + (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) { + dprintf(("%s rule never matches\n", err_prefix)); + return (EINVAL); + } + + if ((frwl->fw_flg & IP_FW_F_FRAG) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (frwl->fw_nports) { + dprintf(("%s cannot mix 'frag' and ports\n", err_prefix)); + return (EINVAL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + dprintf(("%s cannot mix 'frag' and TCP flags\n", err_prefix)); + return (EINVAL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IP_FW_F_COMMAND) + { + case IP_FW_F_REJECT: + if (frwl->fw_reject_code >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code == IP_FW_REJECT_RST)) { + dprintf(("%s unknown reject code\n", err_prefix)); + return (EINVAL); + } + break; +#if defined(IPDIVERT) || defined(DUMMYNET) +#ifdef IPDIVERT + case IP_FW_F_DIVERT: /* Diverting to port zero is invalid */ + case IP_FW_F_TEE: +#endif +#ifdef DUMMYNET + case IP_FW_F_PIPE: /* piping through 0 is invalid */ +#endif + if (frwl->fw_divert_port == 0) { + dprintf(("%s can't divert to port 0\n", err_prefix)); + return (EINVAL); + } + break; +#endif /* IPDIVERT || DUMMYNET */ + case IP_FW_F_DENY: + case IP_FW_F_ACCEPT: + case IP_FW_F_COUNT: + case IP_FW_F_SKIPTO: +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: +#endif + case IP_FW_F_UID: + case IP_FW_F_GID: + break; + default: + dprintf(("%s invalid command\n", err_prefix)); + return (EINVAL); + } + + return 0; +} + +static int +ip_fw_ctl(struct sockopt *sopt) +{ + int error, s; + size_t size; + struct ip_fw_chain *fcp; + struct ip_fw frwl, *bp , *buf; + + /* + * Disallow sets in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_dir == SOPT_SET && securelevel >= 3 && + sopt->sopt_name != IP_FW_RESETLOG) + return (EPERM); + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + for (fcp = LIST_FIRST(&ip_fw_chain), size = 0; fcp; + fcp = LIST_NEXT(fcp, chain)) + size += sizeof *fcp->rule; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == 0) { + error = ENOBUFS; + break; + } + + for (fcp = LIST_FIRST(&ip_fw_chain), bp = buf; fcp; + fcp = LIST_NEXT(fcp, chain)) { + bcopy(fcp->rule, bp, sizeof *fcp->rule); + bp->pipe_ptr = (void *)(intptr_t) + ((struct ip_fw_ext *)fcp->rule)->dont_match_prob; + bp++; + } + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + break; + + case IP_FW_FLUSH: + for (fcp = ip_fw_chain.lh_first; + fcp != 0 && fcp->rule->fw_number != IPFW_DEFAULT_RULE; + fcp = ip_fw_chain.lh_first) { + s = splnet(); + LIST_REMOVE(fcp, chain); +#ifdef DUMMYNET + dn_rule_delete(fcp); +#endif + FREE(fcp->rule, M_IPFW); + FREE(fcp, M_IPFW); + splx(s); + } + break; + + case IP_FW_ZERO: + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, &frwl, sizeof frwl, + sizeof frwl); + if (error || (error = zero_entry(&frwl))) + break; + } else { + error = zero_entry(0); + } + break; + + case IP_FW_ADD: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error || (error = check_ipfw_struct(&frwl))) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't add rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = add_entry(&ip_fw_chain, &frwl); + } + break; + + case IP_FW_DEL: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't delete rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = del_entry(&ip_fw_chain, frwl.fw_number); + } + break; + + case IP_FW_RESETLOG: + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, &frwl, sizeof frwl, + sizeof frwl); + if (error || (error = resetlog_entry(&frwl))) + break; + } else { + error = resetlog_entry(0); + } + break; + + default: + printf("ip_fw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL ; + } + + return (error); +} + +struct ip_fw_chain *ip_fw_default_rule ; + +void +ip_fw_init(void) +{ + struct ip_fw default_rule; + + ip_fw_chk_ptr = ip_fw_chk; + ip_fw_ctl_ptr = ip_fw_ctl; + LIST_INIT(&ip_fw_chain); + + bzero(&default_rule, sizeof default_rule); + default_rule.fw_prot = IPPROTO_IP; + default_rule.fw_number = IPFW_DEFAULT_RULE; +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + default_rule.fw_flg |= IP_FW_F_ACCEPT; +#else + default_rule.fw_flg |= IP_FW_F_DENY; +#endif + default_rule.fw_flg |= IP_FW_F_IN | IP_FW_F_OUT; + if (check_ipfw_struct(&default_rule) != 0 || + add_entry(&ip_fw_chain, &default_rule)) + panic("ip_fw_init"); + + ip_fw_default_rule = ip_fw_chain.lh_first ; + printf("IP packet filtering initialized, " +#ifdef IPDIVERT + "divert enabled, "); +#else + "divert disabled, "); +#endif +#ifdef IPFIREWALL_FORWARD + printf("rule-based forwarding enabled, "); +#else + printf("rule-based forwarding disabled, "); +#endif +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + printf("default to accept, "); +#endif +#ifndef IPFIREWALL_VERBOSE + printf("logging disabled\n"); +#else + if (fw_verbose_limit == 0) + printf("unlimited logging\n"); + else + printf("logging limited to %d packets/entry by default\n", + fw_verbose_limit); +#endif +} + +static ip_fw_chk_t *old_chk_ptr; +static ip_fw_ctl_t *old_ctl_ptr; + +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int s; + + switch (type) { + case MOD_LOAD: + s = splnet(); + + old_chk_ptr = ip_fw_chk_ptr; + old_ctl_ptr = ip_fw_ctl_ptr; + + ip_fw_init(); + splx(s); + return 0; + case MOD_UNLOAD: + s = splnet(); + + ip_fw_chk_ptr = old_chk_ptr; + ip_fw_ctl_ptr = old_ctl_ptr; + + while (LIST_FIRST(&ip_fw_chain) != NULL) { + struct ip_fw_chain *fcp = LIST_FIRST(&ip_fw_chain); + LIST_REMOVE(LIST_FIRST(&ip_fw_chain), chain); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + } + + splx(s); + return 0; + default: + break; + } + return 0; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; +DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h new file mode 100644 index 0000000..eb4a999 --- /dev/null +++ b/sys/netinet/ip_fw.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_H +#define _IP_FW_H + +#include <sys/queue.h> + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_fw_if { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define FW_IFNLEN 10 /* need room ! was IFNAMSIZ */ + char name[FW_IFNLEN]; + short unit; /* -1 means match any unit */ + } fu_via_if; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. + * Warning: setsockopt() will fail if sizeof(struct ip_fw) > MLEN (108) + */ + +struct ip_fw { + u_int64_t fw_pcnt,fw_bcnt; /* Packet and byte counters */ + struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ + struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ + u_short fw_number; /* Rule number */ + u_int fw_flg; /* Flags word */ +#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */ + union { + u_short fw_pts[IP_FW_MAX_PORTS]; /* Array of port numbers to match */ +#define IP_FW_ICMPTYPES_MAX 128 +#define IP_FW_ICMPTYPES_DIM (IP_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /* ICMP types bitmap */ + } fw_uar; + u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ + u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_fw_if fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_pipe_nr; /* pipe number (option DUMMYNET) */ + u_short fu_skipto_rule; /* SKIPTO command rule number */ + u_short fu_reject_code; /* REJECT response code */ + struct sockaddr_in fu_fwd_ip; + } fw_un; + u_char fw_prot; /* IP protocol */ + /* + * N'of src ports and # of dst ports in ports array (dst ports + * follow src ports; max of 10 ports in all; count of 0 means + * match all ports) + */ + u_char fw_nports; + void *pipe_ptr; /* Pipe ptr in case of dummynet pipe */ + void *next_rule_ptr ; /* next rule in case of match */ + uid_t fw_uid; /* uid to match */ + gid_t fw_gid; /* gid to match */ + int fw_logamount; /* amount to log */ + u_int64_t fw_loghighest; /* highest number packet to log */ +}; + +/* + * extended ipfw structure... some fields in the original struct + * can be used to pass parameters up/down, namely pointers + * void *pipe_ptr + * void *next_rule_ptr + * some others can be used to pass parameters down, namely counters etc. + * u_int64_t fw_pcnt,fw_bcnt; + * long timestamp; + */ + +struct ip_fw_ext { /* extended structure */ + struct ip_fw rule; /* must be at offset 0 */ + long dont_match_prob; /* 0x7fffffff means 1.0, always fail */ + u_int param1; /* unused at the moment */ +}; + +#define IP_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f) +#define IP_FW_SETNSRCP(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4) +#define IP_FW_SETNDSTP(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) + +#define fw_divert_port fw_un.fu_divert_port +#define fw_skipto_rule fw_un.fu_skipto_rule +#define fw_reject_code fw_un.fu_reject_code +#define fw_pipe_nr fw_un.fu_pipe_nr +#define fw_fwd_ip fw_un.fu_fwd_ip + +struct ip_fw_chain { + LIST_ENTRY(ip_fw_chain) chain; + struct ip_fw *rule; +}; + +/* + * Values for "flags" field . + */ +#define IP_FW_F_COMMAND 0x000000ff /* Mask for type of chain entry: */ +#define IP_FW_F_DENY 0x00000000 /* This is a deny rule */ +#define IP_FW_F_REJECT 0x00000001 /* Deny and send a response packet */ +#define IP_FW_F_ACCEPT 0x00000002 /* This is an accept rule */ +#define IP_FW_F_COUNT 0x00000003 /* This is a count rule */ +#define IP_FW_F_DIVERT 0x00000004 /* This is a divert rule */ +#define IP_FW_F_TEE 0x00000005 /* This is a tee rule */ +#define IP_FW_F_SKIPTO 0x00000006 /* This is a skipto rule */ +#define IP_FW_F_FWD 0x00000007 /* This is a "change forwarding address" rule */ +#define IP_FW_F_PIPE 0x00000008 /* This is a dummynet rule */ + +#define IP_FW_F_IN 0x00000100 /* Check inbound packets */ +#define IP_FW_F_OUT 0x00000200 /* Check outbound packets */ +#define IP_FW_F_IIFACE 0x00000400 /* Apply inbound interface test */ +#define IP_FW_F_OIFACE 0x00000800 /* Apply outbound interface test */ + +#define IP_FW_F_PRN 0x00001000 /* Print if this rule matches */ + +#define IP_FW_F_SRNG 0x00002000 /* The first two src ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_DRNG 0x00004000 /* The first two dst ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_FRAG 0x00008000 /* Fragment */ + +#define IP_FW_F_IIFNAME 0x00010000 /* In interface by name/unit (not IP) */ +#define IP_FW_F_OIFNAME 0x00020000 /* Out interface by name/unit (not IP) */ + +#define IP_FW_F_INVSRC 0x00040000 /* Invert sense of src check */ +#define IP_FW_F_INVDST 0x00080000 /* Invert sense of dst check */ + +#define IP_FW_F_ICMPBIT 0x00100000 /* ICMP type bitmap is valid */ + +#define IP_FW_F_UID 0x00200000 /* filter by uid */ + +#define IP_FW_F_GID 0x00400000 /* filter by gid */ + +#define IP_FW_F_RND_MATCH 0x00800000 /* probabilistic rule match */ + +#define IP_FW_F_MASK 0x00FFFFFF /* All possible flag bits mask */ + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_FW_F_VIAHACK (IP_FW_F_IN|IP_FW_F_OUT|IP_FW_F_IIFACE|IP_FW_F_OIFACE) + +/* + * Definitions for REJECT response codes. + * Values less than 256 correspond to ICMP unreachable codes. + */ +#define IP_FW_REJECT_RST 0x0100 /* TCP packets: send RST */ + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP flags. + */ +#define IP_FW_TCPF_FIN TH_FIN +#define IP_FW_TCPF_SYN TH_SYN +#define IP_FW_TCPF_RST TH_RST +#define IP_FW_TCPF_PSH TH_PUSH +#define IP_FW_TCPF_ACK TH_ACK +#define IP_FW_TCPF_URG TH_URG +#define IP_FW_TCPF_ESTAB 0x40 + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef KERNEL + +#define IP_FW_PORT_DYNT_FLAG 0x10000 +#define IP_FW_PORT_TEE_FLAG 0x20000 + +/* + * Function definitions. + */ +void ip_fw_init __P((void)); + +/* Firewall hooks */ +struct ip; +struct sockopt; +typedef int ip_fw_chk_t __P((struct ip **, int, struct ifnet *, u_int16_t *, + struct mbuf **, struct ip_fw_chain **, struct sockaddr_in **)); +typedef int ip_fw_ctl_t __P((struct sockopt *)); +extern ip_fw_chk_t *ip_fw_chk_ptr; +extern ip_fw_ctl_t *ip_fw_ctl_ptr; + +#endif /* KERNEL */ + +#endif /* _IP_FW_H */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c new file mode 100644 index 0000000..145699b --- /dev/null +++ b/sys/netinet/ip_icmp.c @@ -0,0 +1,821 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> +#include <netinet/icmp_var.h> + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ + +static struct icmpstat icmpstat; +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD, + &icmpstat, icmpstat, ""); + +static int icmpmaskrepl = 0; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, + &icmpmaskrepl, 0, ""); + +static int drop_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, + &drop_redirect, 0, ""); + +static int log_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, + &log_redirect, 0, ""); + +#ifdef ICMP_BANDLIM + +/* + * ICMP error-response bandwidth limiting sysctl. If not enabled, sysctl + * variable content is -1 and read-only. + */ + +static int icmplim = 100; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, + &icmplim, 0, ""); +#else + +static int icmplim = -1; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD, + &icmplim, 0, ""); + +#endif + +/* + * ICMP broadcast echo sysctl + */ + +static int icmpbmcastecho = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, + &icmpbmcastecho, 0, ""); + + +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +static void icmp_reflect __P((struct mbuf *)); +static void icmp_send __P((struct mbuf *, struct mbuf *)); +static int ip_next_mtu __P((int, int)); + +extern struct protosw inetsw[]; + +/* + * Generate an error packet of type error + * in response to bad packet ip. + */ +void +icmp_error(n, type, code, dest, destifp) + struct mbuf *n; + int type, code; + n_long dest; + struct ifnet *destifp; +{ + register struct ip *oip = mtod(n, struct ip *), *nip; + register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + register struct icmp *icp; + register struct mbuf *m; + unsigned icmplen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_error(%p, %x, %d)\n", oip, type, code); +#endif + if (type != ICMP_REDIRECT) + icmpstat.icps_error++; + /* + * Don't send error if not the first fragment of message. + * Don't error if the old packet protocol was ICMP + * error message, only known informational types. + */ + if (oip->ip_off &~ (IP_MF|IP_DF)) + goto freeit; + if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && + n->m_len >= oiplen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { + icmpstat.icps_oldicmp++; + goto freeit; + } + /* Don't send error in response to a multicast or broadcast packet */ + if (n->m_flags & (M_BCAST|M_MCAST)) + goto freeit; + /* + * First, formulate icmp message + */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + goto freeit; + icmplen = oiplen + min(8, oip->ip_len); + m->m_len = icmplen + ICMP_MINLEN; + MH_ALIGN(m, m->m_len); + icp = mtod(m, struct icmp *); + if ((u_int)type > ICMP_MAXTYPE) + panic("icmp_error"); + icmpstat.icps_outhist[type]++; + icp->icmp_type = type; + if (type == ICMP_REDIRECT) + icp->icmp_gwaddr.s_addr = dest; + else { + icp->icmp_void = 0; + /* + * The following assignments assume an overlay with the + * zeroed icmp_void field. + */ + if (type == ICMP_PARAMPROB) { + icp->icmp_pptr = code; + code = 0; + } else if (type == ICMP_UNREACH && + code == ICMP_UNREACH_NEEDFRAG && destifp) { + icp->icmp_nextmtu = htons(destifp->if_mtu); + } + } + + icp->icmp_code = code; + bcopy((caddr_t)oip, (caddr_t)&icp->icmp_ip, icmplen); + nip = &icp->icmp_ip; + nip->ip_len = htons((u_short)(nip->ip_len + oiplen)); + + /* + * Now, copy old ip header (without options) + * in front of icmp message. + */ + if (m->m_data - sizeof(struct ip) < m->m_pktdat) + panic("icmp len"); + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; + nip = mtod(m, struct ip *); + bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); + nip->ip_len = m->m_len; + nip->ip_vhl = IP_VHL_BORING; + nip->ip_p = IPPROTO_ICMP; + nip->ip_tos = 0; + icmp_reflect(m); + +freeit: + m_freem(n); +} + +static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; + +/* + * Process a received ICMP message. + */ +void +icmp_input(m, hlen) + register struct mbuf *m; + int hlen; +{ + register struct icmp *icp; + register struct ip *ip = mtod(m, struct ip *); + int icmplen = ip->ip_len; + register int i; + struct in_ifaddr *ia; + void (*ctlfunc) __P((int, struct sockaddr *, void *)); + int code; + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. + */ +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_src)); + printf("icmp_input from %s to %s, len %d\n", + buf, inet_ntoa(ip->ip_dst), icmplen); + } +#endif + if (icmplen < ICMP_MINLEN) { + icmpstat.icps_tooshort++; + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == 0) { + icmpstat.icps_tooshort++; + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + icmpstat.icps_checksum++; + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + + /* + * Message type specific processing. + */ + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + icmpstat.icps_inhist[icp->icmp_type]++; + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + case ICMP_UNREACH_SRCFAIL: + code += PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_TOSNET: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_TOSHOST: + code = PRC_UNREACH_HOST; + break; + + case ICMP_UNREACH_FILTER_PROHIB: + case ICMP_UNREACH_HOST_PRECEDENCE: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = PRC_UNREACH_PORT; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. + */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + goto freeit; + } + NTOHS(icp->icmp_ip.ip_len); + /* Discard ICMP's in response to multicast packets */ + if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) + goto badcode; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; +#if 1 + /* + * MTU discovery: + * If we got a needfrag and there is a host route to the + * original destination, and the MTU is not locked, then + * set the MTU in the route to the suggested new value + * (if given) and then notify as usual. The ULPs will + * notice that the MTU has changed and adapt accordingly. + * If no new MTU was suggested, then we guess a new one + * less than the current value. If the new MTU is + * unreasonably small (arbitrarily set at 296), then + * we reset the MTU to the interface value and enable the + * lock bit, indicating that we are no longer doing MTU + * discovery. + */ + if (code == PRC_MSGSIZE) { + struct rtentry *rt; + int mtu; + + rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt && (rt->rt_flags & RTF_HOST) + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + mtu = ntohs(icp->icmp_nextmtu); + if (!mtu) + mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, + 1); +#ifdef DEBUG_MTUDISC + printf("MTU for %s reduced to %d\n", + inet_ntoa(icmpsrc.sin_addr), mtu); +#endif + if (mtu < 296) { + /* rt->rt_rmx.rmx_mtu = + rt->rt_ifp->if_mtu; */ + rt->rt_rmx.rmx_locks |= RTV_MTU; + } else if (rt->rt_rmx.rmx_mtu > mtu) { + rt->rt_rmx.rmx_mtu = mtu; + } + } + if (rt) + RTFREE(rt); + } + +#endif + ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; + if (ctlfunc) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + (void *)&icp->icmp_ip); + break; + + badcode: + icmpstat.icps_badcode++; + break; + + case ICMP_ECHO: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcastecho++; + break; + } + icp->icmp_type = ICMP_ECHOREPLY; + goto reflect; + + case ICMP_TSTAMP: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcasttstamp++; + break; + } + if (icmplen < ICMP_TSLEN) { + icmpstat.icps_badlen++; + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + goto reflect; + + case ICMP_MASKREQ: +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. + */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == 0) + break; + if (ia->ia_ifp == 0) + break; + icp->icmp_type = ICMP_MASKREPLY; + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + icmpstat.icps_reflect++; + icmpstat.icps_outhist[icp->icmp_type]++; + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (log_redirect) { + u_long src, dst, gw; + + src = ntohl(ip->ip_src.s_addr); + dst = ntohl(icp->icmp_ip.ip_dst.s_addr); + gw = ntohl(icp->icmp_gwaddr.s_addr); + printf("icmp redirect from %d.%d.%d.%d: " + "%d.%d.%d.%d => %d.%d.%d.%d\n", + (int)(src >> 24), (int)((src >> 16) & 0xff), + (int)((src >> 8) & 0xff), (int)(src & 0xff), + (int)(dst >> 24), (int)((dst >> 16) & 0xff), + (int)((dst >> 8) & 0xff), (int)(dst & 0xff), + (int)(gw >> 24), (int)((gw >> 16) & 0xff), + (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); + } + if (drop_redirect) + break; + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). + */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + + printf("redirect dst %s to %s\n", + buf, inet_ntoa(icp->icmp_gwaddr)); + } +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, (struct rtentry **)0); + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. + */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m, hlen); + return; + +freeit: + m_freem(m); +} + +/* + * Reflect the ip packet back to the source + */ +static void +icmp_reflect(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct in_ifaddr *ia; + struct in_addr t; + struct mbuf *opts = 0; + int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); + + if (!in_canforward(ip->ip_src) && + ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != + (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { + m_freem(m); /* Bad return address */ + goto done; /* Ip_output() will check for broadcast */ + } + t = ip->ip_dst; + ip->ip_dst = ip->ip_src; + /* + * If the incoming packet was addressed directly to us, + * use dst as the src for the reply. Otherwise (broadcast + * or anonymous), use the address which corresponds + * to the incoming interface. + */ + for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + if (ia->ia_ifp && (ia->ia_ifp->if_flags & IFF_BROADCAST) && + t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) + break; + } + icmpdst.sin_addr = t; + if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + /* + * The following happens if the packet was not addressed to us, + * and was received on an interface with no IP address. + */ + if (ia == (struct in_ifaddr *)0) + ia = in_ifaddrhead.tqh_first; + t = IA_SIN(ia)->sin_addr; + ip->ip_src = t; + ip->ip_ttl = MAXTTL; + + if (optlen > 0) { + register u_char *cp; + int opt, cnt; + u_int len; + + /* + * Retrieve any source routing from the incoming packet; + * add on any record-route or timestamp options. + */ + cp = (u_char *) (ip + 1); + if ((opts = ip_srcroute()) == 0 && + (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { + opts->m_len = sizeof(struct in_addr); + mtod(opts, struct in_addr *)->s_addr = 0; + } + if (opts) { +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_reflect optlen %d rt %d => ", + optlen, opts->m_len); +#endif + for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + len = 1; + else { + len = cp[IPOPT_OLEN]; + if (len <= 0 || len > cnt) + break; + } + /* + * Should check for overflow, but it "can't happen" + */ + if (opt == IPOPT_RR || opt == IPOPT_TS || + opt == IPOPT_SECURITY) { + bcopy((caddr_t)cp, + mtod(opts, caddr_t) + opts->m_len, len); + opts->m_len += len; + } + } + /* Terminate & pad, if necessary */ + cnt = opts->m_len % 4; + if (cnt) { + for (; cnt < 4; cnt++) { + *(mtod(opts, caddr_t) + opts->m_len) = + IPOPT_EOL; + opts->m_len++; + } + } +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("%d\n", opts->m_len); +#endif + } + /* + * Now strip out original options by copying rest of first + * mbuf's data back, and adjust the IP length. + */ + ip->ip_len -= optlen; + ip->ip_vhl = IP_VHL_BORING; + m->m_len -= optlen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= optlen; + optlen += sizeof(struct ip); + bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), + (unsigned)(m->m_len - sizeof(struct ip))); + } + m->m_flags &= ~(M_BCAST|M_MCAST); + icmp_send(m, opts); +done: + if (opts) + (void)m_free(opts); +} + +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum. + */ +static void +icmp_send(m, opts) + register struct mbuf *m; + struct mbuf *opts; +{ + register struct ip *ip = mtod(m, struct ip *); + register int hlen; + register struct icmp *icp; + struct route ro; + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + m->m_data += hlen; + m->m_len -= hlen; + icp = mtod(m, struct icmp *); + icp->icmp_cksum = 0; + icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + m->m_data -= hlen; + m->m_len += hlen; + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_dst)); + printf("icmp_send dst %s src %s\n", + buf, inet_ntoa(ip->ip_src)); + } +#endif + bzero(&ro, sizeof ro); + (void) ip_output(m, opts, &ro, 0, NULL); + if (ro.ro_rt) + RTFREE(ro.ro_rt); +} + +n_time +iptime() +{ + struct timeval atv; + u_long t; + + microtime(&atv); + t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; + return (htonl(t)); +} + +#if 1 +/* + * Return the next larger or smaller MTU plateau (table from RFC 1191) + * given current value MTU. If DIR is less than zero, a larger plateau + * is returned; otherwise, a smaller value is returned. + */ +static int +ip_next_mtu(mtu, dir) + int mtu; + int dir; +{ + static int mtutab[] = { + 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, + 68, 0 + }; + int i; + + for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { + if (mtu >= mtutab[i]) + break; + } + + if (dir < 0) { + if (i == 0) { + return 0; + } else { + return mtutab[i - 1]; + } + } else { + if (mtutab[i] == 0) { + return 0; + } else if(mtu > mtutab[i]) { + return mtutab[i]; + } else { + return mtutab[i + 1]; + } + } +} +#endif + +#ifdef ICMP_BANDLIM + +/* + * badport_bandlim() - check for ICMP bandwidth limit + * + * Return 0 if it is ok to send an ICMP error response, -1 if we have + * hit our bandwidth limit and it is not ok. + * + * If icmplim is <= 0, the feature is disabled and 0 is returned. + * + * For now we separate the TCP and UDP subsystems w/ different 'which' + * values. We may eventually remove this separation (and simplify the + * code further). + * + * Note that the printing of the error message is delayed so we can + * properly print the icmp error rate that the system was trying to do + * (i.e. 22000/100 pps, etc...). This can cause long delays in printing + * the 'final' error, but it doesn't make sense to solve the printing + * delay with more complex code. + */ + +int +badport_bandlim(int which) +{ + static int lticks[2]; + static int lpackets[2]; + int dticks; + + /* + * Return ok status if feature disabled or argument out of + * ranage. + */ + + if (icmplim <= 0 || which >= 2 || which < 0) + return(0); + dticks = ticks - lticks[which]; + + /* + * reset stats when cumulative dt exceeds one second. + */ + + if ((unsigned int)dticks > hz) { + if (lpackets[which] > icmplim) { + printf("icmp-response bandwidth limit %d/%d pps\n", + lpackets[which], + icmplim + ); + } + lticks[which] = ticks; + lpackets[which] = 0; + } + + /* + * bump packet count + */ + + if (++lpackets[which] > icmplim) { + return(-1); + } + return(0); +} + +#endif + + diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h new file mode 100644 index 0000000..c8fd55f --- /dev/null +++ b/sys/netinet/ip_icmp.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +/* + * Interface Control Message Protocol Definitions. + * Per RFC 792, September 1981. + */ + +/* + * Internal of an ICMP Router Advertisement + */ +struct icmp_ra_addr { + u_int32_t ira_addr; + u_int32_t ira_preference; +}; + +/* + * Structure of an icmp header. + */ +struct icmp { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ + union { + u_char ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + n_short icd_id; + n_short icd_seq; + } ih_idseq; + int ih_void; + + /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ + struct ih_pmtu { + n_short ipm_void; + n_short ipm_nextmtu; + } ih_pmtu; + + struct ih_rtradv { + u_char irt_num_addrs; + u_char irt_wpa; + u_int16_t irt_lifetime; + } ih_rtradv; + } icmp_hun; +#define icmp_pptr icmp_hun.ih_pptr +#define icmp_gwaddr icmp_hun.ih_gwaddr +#define icmp_id icmp_hun.ih_idseq.icd_id +#define icmp_seq icmp_hun.ih_idseq.icd_seq +#define icmp_void icmp_hun.ih_void +#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void +#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu +#define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs +#define icmp_wpa icmp_hun.ih_rtradv.irt_wpa +#define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; + n_time its_ttime; + } id_ts; + struct id_ip { + struct ip idi_ip; + /* options and then 64 bits of data */ + } id_ip; + struct icmp_ra_addr id_radv; + u_int32_t id_mask; + char id_data[1]; + } icmp_dun; +#define icmp_otime icmp_dun.id_ts.its_otime +#define icmp_rtime icmp_dun.id_ts.its_rtime +#define icmp_ttime icmp_dun.id_ts.its_ttime +#define icmp_ip icmp_dun.id_ip.idi_ip +#define icmp_radv icmp_dun.id_radv +#define icmp_mask icmp_dun.id_mask +#define icmp_data icmp_dun.id_data +}; + +/* + * Lower bounds on packet lengths for various types. + * For the error advice packets must first insure that the + * packet is large enough to contain the returned ip header. + * Only then can we do the check to see if 64 bits of packet + * data have been returned, since we need to check the returned + * ip header length. + */ +#define ICMP_MINLEN 8 /* abs minimum */ +#define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */ +#define ICMP_MASKLEN 12 /* address mask */ +#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ +#ifndef _IP_VHL +#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) + /* N.B.: must separately check that ip_hl >= 5 */ +#else +#define ICMP_ADVLEN(p) (8 + (IP_VHL_HL((p)->icmp_ip.ip_vhl) << 2) + 8) + /* N.B.: must separately check that header length >= 5 */ +#endif + +/* + * Definition of type and code field values. + */ +#define ICMP_ECHOREPLY 0 /* echo reply */ +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ +#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ +#define ICMP_REDIRECT 5 /* shorter route, codes: */ +#define ICMP_REDIRECT_NET 0 /* for network */ +#define ICMP_REDIRECT_HOST 1 /* for host */ +#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ +#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_ECHO 8 /* echo service */ +#define ICMP_ROUTERADVERT 9 /* router advertisement */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TIMXCEED 11 /* time exceeded, code: */ +#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ +#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */ +#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. absent */ +#define ICMP_PARAMPROB_LENGTH 2 /* bad length */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_TSTAMPREPLY 14 /* timestamp reply */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_IREQREPLY 16 /* information reply */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_MASKREPLY 18 /* address mask reply */ + +#define ICMP_MAXTYPE 18 + +#define ICMP_INFOTYPE(type) \ + ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ + (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ + (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ + (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ + (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef KERNEL +void icmp_error __P((struct mbuf *, int, int, n_long, struct ifnet *)); +void icmp_input __P((struct mbuf *, int)); +#endif + +#endif diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c new file mode 100644 index 0000000..a202a8b --- /dev/null +++ b/sys/netinet/ip_input.c @@ -0,0 +1,1653 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_bootp.h" +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" +#include "opt_ipstealth.h" + +#include <stddef.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/netisr.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <machine/in_cksum.h> + +#include <sys/socketvar.h> + +#include <netinet/ip_fw.h> + +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +int rsvp_on = 0; +static int ip_rsvp_on; +struct socket *ip_rsvpd; + +int ipforwarding = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, + &ipforwarding, 0, "Enable IP forwarding between interfaces"); + +static int ipsendredirects = 1; /* XXX */ +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, + &ipsendredirects, 0, "Enable sending IP redirects"); + +int ip_defttl = IPDEFTTL; +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &ip_defttl, 0, "Maximum TTL on IP packets"); + +static int ip_dosourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, + &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); + +static int ip_acceptsourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, + CTLFLAG_RW, &ip_acceptsourceroute, 0, + "Enable accepting source routed IP packets"); +#ifdef DIAGNOSTIC +static int ipprintfs = 0; +#endif + +extern struct domain inetdomain; +extern struct protosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +static int ipqmaxlen = IFQ_MAXLEN; +struct in_ifaddrhead in_ifaddrhead; /* first inet address */ +struct ifqueue ipintrq; +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, + &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, + &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); + +struct ipstat ipstat; +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, + &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); + +/* Packet reassembly stuff */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +static struct ipq ipq[IPREASS_NHASH]; +static int nipq = 0; /* total # of reass queues */ +static int maxnipq; + +#ifdef IPCTL_DEFMTU +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, + &ip_mtu, 0, "Default MTU"); +#endif + +#ifdef IPSTEALTH +static int ipstealth = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + &ipstealth, 0, ""); +#endif + + +/* Firewall hooks */ +ip_fw_chk_t *ip_fw_chk_ptr; +ip_fw_ctl_t *ip_fw_ctl_ptr; + +#ifdef DUMMYNET +ip_dn_ctl_t *ip_dn_ctl_ptr; +#endif + +#if defined(IPFILTER_LKM) || defined(IPFILTER) +int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **)) = NULL; +#endif + + +/* + * We need to save the IP options in case a protocol wants to respond + * to an incoming packet over the same route if the packet got here + * using IP source routing. This allows connection establishment and + * maintenance when the remote end is on a network that is not known + * to us. + */ +static int ip_nhops = 0; +static struct ip_srcrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +} ip_srcrt; + +struct sockaddr_in *ip_fw_fwd_addr; + +static void save_rte __P((u_char *, struct in_addr)); +static int ip_dooptions __P((struct mbuf *)); +static void ip_forward __P((struct mbuf *, int)); +static void ip_freef __P((struct ipq *)); +#ifdef IPDIVERT +static struct ip *ip_reass __P((struct mbuf *, + struct ipq *, struct ipq *, u_int32_t *, u_int16_t *)); +#else +static struct ip *ip_reass __P((struct mbuf *, struct ipq *, struct ipq *)); +#endif +static struct in_ifaddr *ip_rtaddr __P((struct in_addr)); +static void ipintr __P((void)); + +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. + */ +void +ip_init() +{ + register struct protosw *pr; + register int i; + + TAILQ_INIT(&in_ifaddrhead); + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == 0) + panic("ip_init"); + for (i = 0; i < IPPROTO_MAX; i++) + ip_protox[i] = pr - inetsw; + for (pr = inetdomain.dom_protosw; + pr < inetdomain.dom_protoswNPROTOSW; pr++) + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) + ip_protox[pr->pr_protocol] = pr - inetsw; + + for (i = 0; i < IPREASS_NHASH; i++) + ipq[i].next = ipq[i].prev = &ipq[i]; + + maxnipq = nmbclusters/4; + + ip_id = time_second & 0xffff; + ipintrq.ifq_maxlen = ipqmaxlen; +} + +static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; +static struct route ipforward_rt; + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ip_input(struct mbuf *m) +{ + struct ip *ip; + struct ipq *fp; + struct in_ifaddr *ia; + int i, hlen, mff; + u_short sum; + u_int16_t divert_cookie; /* firewall cookie */ +#ifdef IPDIVERT + u_int32_t divert_info = 0; /* packet divert/tee info */ +#endif + struct ip_fw_chain *rule = NULL; + +#ifdef IPDIVERT + /* Get and reset firewall cookie */ + divert_cookie = ip_divert_cookie; + ip_divert_cookie = 0; +#else + divert_cookie = 0; +#endif + +#if defined(IPFIREWALL) && defined(DUMMYNET) + /* + * dummynet packet are prepended a vestigial mbuf with + * m_type = MT_DUMMYNET and m_data pointing to the matching + * rule. + */ + if (m->m_type == MT_DUMMYNET) { + rule = (struct ip_fw_chain *)(m->m_data) ; + m = m->m_next ; + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + goto iphack ; + } else + rule = NULL ; +#endif + +#ifdef DIAGNOSTIC + if (m == NULL || (m->m_flags & M_PKTHDR) == 0) + panic("ip_input no HDR"); +#endif + ipstat.ips_total++; + + if (m->m_pkthdr.len < sizeof(struct ip)) + goto tooshort; + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + return; + } + ip = mtod(m, struct ip *); + + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + return; + } + ip = mtod(m, struct ip *); + } + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* + * Convert fields to host representation. + */ + NTOHS(ip->ip_len); + if (ip->ip_len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + NTOHS(ip->ip_id); + NTOHS(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { +tooshort: + ipstat.ips_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + /* + * IpHack's section. + * Right now when no processing on packet has done + * and it is still fresh out of network we do our black + * deals with it. + * - Firewall: deny/allow/divert + * - Xlate: translate packet's addr/port (NAT). + * - Pipe: pass pkt through dummynet. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ + +#if defined(IPFIREWALL) && defined(DUMMYNET) +iphack: +#endif +#if defined(IPFILTER) || defined(IPFILTER_LKM) + /* + * Check if we want to allow this packet to be processed. + * Consider it to be bad if not. + */ + if (fr_checkp) { + struct mbuf *m1 = m; + + if ((*fr_checkp)(ip, hlen, m->m_pkthdr.rcvif, 0, &m1) || !m1) + return; + ip = mtod(m = m1, struct ip *); + } +#endif + if (ip_fw_chk_ptr) { +#ifdef IPFIREWALL_FORWARD + /* + * If we've been forwarded from the output side, then + * skip the firewall a second time + */ + if (ip_fw_fwd_addr) + goto ours; +#endif /* IPFIREWALL_FORWARD */ + /* + * See the comment in ip_output for the return values + * produced by the firewall. + */ + i = (*ip_fw_chk_ptr)(&ip, + hlen, NULL, &divert_cookie, &m, &rule, &ip_fw_fwd_addr); + if (m == NULL) /* Packet discarded by firewall */ + return; + if (i == 0 && ip_fw_fwd_addr == NULL) /* common case */ + goto pass; +#ifdef DUMMYNET + if ((i & IP_FW_PORT_DYNT_FLAG) != 0) { + /* Send packet to the appropriate pipe */ + dummynet_io(i&0xffff,DN_TO_IP_IN,m,NULL,NULL,0, rule); + return; + } +#endif +#ifdef IPDIVERT + if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { + /* Divert or tee packet */ + divert_info = i; + goto ours; + } +#endif +#ifdef IPFIREWALL_FORWARD + if (i == 0 && ip_fw_fwd_addr != NULL) + goto pass; +#endif + /* + * if we get here, the packet must be dropped + */ + m_freem(m); + return; + } +pass: + + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ + if (hlen > sizeof (struct ip) && ip_dooptions(m)) { +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + } + + /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no + * matter if it is destined to another node, or whether it is + * a multicast one, RSVP wants it! and prevents it from being forwarded + * anywhere else. Also checks if the rsvp daemon is running before + * grabbing the packet. + */ + if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + * If we don't have any addresses, assume any unicast packet + * we receive might be for us (and let the upper layers deal + * with it). + */ + if (TAILQ_EMPTY(&in_ifaddrhead) && + (m->m_flags & (M_MCAST|M_BCAST)) == 0) + goto ours; + + for (ia = TAILQ_FIRST(&in_ifaddrhead); ia; + ia = TAILQ_NEXT(ia, ia_link)) { +#define satosin(sa) ((struct sockaddr_in *)(sa)) + +#ifdef BOOTP_COMPAT + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) + goto ours; +#endif +#ifdef IPFIREWALL_FORWARD + /* + * If the addr to forward to is one of ours, we pretend to + * be the destination for this packet. + */ + if (ip_fw_fwd_addr == NULL) { + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) + goto ours; + } else if (IA_SIN(ia)->sin_addr.s_addr == + ip_fw_fwd_addr->sin_addr.s_addr) + goto ours; +#else + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) + goto ours; +#endif + if (ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + ip->ip_dst.s_addr) + goto ours; + if (ip->ip_dst.s_addr == ia->ia_netbroadcast.s_addr) + goto ours; + } + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + if (ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + * + * (The IP ident field is put in the same byte order + * as expected when ip_mforward() is called from + * ip_output().) + */ + ip->ip_id = htons(ip->ip_id); + if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + ip->ip_id = ntohs(ip->ip_id); + + /* + * The process-level routing demon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + ipstat.ips_forward++; + } + /* + * See if we belong to the destination multicast group on the + * arrival interface. + */ + IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + if (inm == NULL) { + ipstat.ips_notmember++; + m_freem(m); + return; + } + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + + /* + * Not for us; forward if possible and desirable. + */ + if (ipforwarding == 0) { + ipstat.ips_cantforward++; + m_freem(m); + } else + ip_forward(m, 0); +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + +ours: + + /* + * If offset or IP_MF are set, must reassemble. + * Otherwise, nothing need be done. + * (We could look in the reassembly queue to see + * if the packet was previously fragmented, + * but it's not worth the time; just let them time out.) + */ + if (ip->ip_off & (IP_MF | IP_OFFMASK | IP_RF)) { + if (m->m_flags & M_EXT) { /* XXX */ + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_toosmall++; +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + } + ip = mtod(m, struct ip *); + } + sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); + /* + * Look for queue of fragments + * of this datagram. + */ + for (fp = ipq[sum].next; fp != &ipq[sum]; fp = fp->next) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) + goto found; + + fp = 0; + + /* check if there's a place for the new queue */ + if (nipq > maxnipq) { + /* + * drop something from the tail of the current queue + * before proceeding further + */ + if (ipq[sum].prev == &ipq[sum]) { /* gak */ + for (i = 0; i < IPREASS_NHASH; i++) { + if (ipq[i].prev != &ipq[i]) { + ip_freef(ipq[i].prev); + break; + } + } + } else + ip_freef(ipq[sum].prev); + } +found: + /* + * Adjust ip_len to not reflect header, + * set ip_mff if more fragments are expected, + * convert offset of this to bytes. + */ + ip->ip_len -= hlen; + mff = (ip->ip_off & IP_MF) != 0; + if (mff) { + /* + * Make sure that fragments have a data length + * that's a non-zero multiple of 8 bytes. + */ + if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { + ipstat.ips_toosmall++; /* XXX */ + goto bad; + } + m->m_flags |= M_FRAG; + } + ip->ip_off <<= 3; + + /* + * If datagram marked as having more fragments + * or if this is not the first fragment, + * attempt reassembly; if it succeeds, proceed. + */ + if (mff || ip->ip_off) { + ipstat.ips_fragments++; + m->m_pkthdr.header = ip; +#ifdef IPDIVERT + ip = ip_reass(m, + fp, &ipq[sum], &divert_info, &divert_cookie); +#else + ip = ip_reass(m, fp, &ipq[sum]); +#endif + if (ip == 0) { +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + } + /* Get the length of the reassembled packets header */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + ipstat.ips_reassembled++; + m = dtom(ip); +#ifdef IPDIVERT + /* Restore original checksum before diverting packet */ + if (divert_info != 0) { + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + HTONS(ip->ip_id); + ip->ip_sum = 0; + ip->ip_sum = in_cksum_hdr(ip); + NTOHS(ip->ip_id); + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); + ip->ip_len -= hlen; + } +#endif + } else + if (fp) + ip_freef(fp); + } else + ip->ip_len -= hlen; + +#ifdef IPDIVERT + /* + * Divert or tee packet to the divert protocol if required. + * + * If divert_info is zero then cookie should be too, so we shouldn't + * need to clear them here. Assume divert_packet() does so also. + */ + if (divert_info != 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* Restore packet header fields to original values */ + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + HTONS(ip->ip_id); + + /* Deliver packet to divert input routine */ + ip_divert_cookie = divert_cookie; + divert_packet(m, 1, divert_info & 0xffff); + ipstat.ips_delivered++; + + /* If 'tee', continue with original packet */ + if (clone == NULL) + return; + m = clone; + ip = mtod(m, struct ip *); + } +#endif + + /* + * Switch out to protocol's input routine. + */ + ipstat.ips_delivered++; + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; /* tcp needed it */ +#endif + return; +bad: +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + m_freem(m); +} + +/* + * IP software interrupt routine - to go away sometime soon + */ +static void +ipintr(void) +{ + int s; + struct mbuf *m; + + while(1) { + s = splimp(); + IF_DEQUEUE(&ipintrq, m); + splx(s); + if (m == 0) + return; + ip_input(m); + } +} + +NETISR_SET(NETISR_IP, ipintr); + +/* + * Take incoming datagram fragment and try to reassemble it into + * whole datagram. If a chain for reassembly of this datagram already + * exists, then it is given as fp; otherwise have to make a chain. + * + * When IPDIVERT enabled, keep additional state with each packet that + * tells us if we need to divert or tee the packet we're building. + */ + +static struct ip * +#ifdef IPDIVERT +ip_reass(m, fp, where, divinfo, divcookie) +#else +ip_reass(m, fp, where) +#endif + register struct mbuf *m; + register struct ipq *fp; + struct ipq *where; +#ifdef IPDIVERT + u_int32_t *divinfo; + u_int16_t *divcookie; +#endif +{ + struct ip *ip = mtod(m, struct ip *); + register struct mbuf *p = 0, *q, *nq; + struct mbuf *t; + int hlen = IP_VHL_HL(ip->ip_vhl) << 2; + int i, next; + + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == 0) { + if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) + goto dropfrag; + fp = mtod(t, struct ipq *); + insque(fp, where); + nipq++; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_src = ip->ip_src; + fp->ipq_dst = ip->ip_dst; + fp->ipq_frags = m; + m->m_nextpkt = NULL; +#ifdef IPDIVERT + fp->ipq_div_info = 0; + fp->ipq_div_cookie = 0; +#endif + goto inserted; + } + +#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) + + /* + * Find a segment which begins after this one does. + */ + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) + if (GETIP(q)->ip_off > ip->ip_off) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us, otherwise + * stick new segment in the proper place. + */ + if (p) { + i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; + if (i > 0) { + if (i >= ip->ip_len) + goto dropfrag; + m_adj(dtom(ip), i); + ip->ip_off += i; + ip->ip_len -= i; + } + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } else { + m->m_nextpkt = fp->ipq_frags; + fp->ipq_frags = m; + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; + q = nq) { + i = (ip->ip_off + ip->ip_len) - + GETIP(q)->ip_off; + if (i < GETIP(q)->ip_len) { + GETIP(q)->ip_len -= i; + GETIP(q)->ip_off += i; + m_adj(q, i); + break; + } + nq = q->m_nextpkt; + m->m_nextpkt = nq; + m_freem(q); + } + +inserted: + +#ifdef IPDIVERT + /* + * Transfer firewall instructions to the fragment structure. + * Any fragment diverting causes the whole packet to divert. + */ + fp->ipq_div_info = *divinfo; + fp->ipq_div_cookie = *divcookie; + *divinfo = 0; + *divcookie = 0; +#endif + + /* + * Check for complete reassembly. + */ + next = 0; + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { + if (GETIP(q)->ip_off != next) + return (0); + next += GETIP(q)->ip_len; + } + /* Make sure the last packet didn't have the IP_MF flag */ + if (p->m_flags & M_FRAG) + return (0); + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ + q = fp->ipq_frags; + ip = GETIP(q); + if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { + ipstat.ips_toolong++; + ip_freef(fp); + return (0); + } + + /* + * Concatenate fragments. + */ + m = q; + t = m->m_next; + m->m_next = 0; + m_cat(m, t); + nq = q->m_nextpkt; + q->m_nextpkt = 0; + for (q = nq; q != NULL; q = nq) { + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + m_cat(m, q); + } + +#ifdef IPDIVERT + /* + * Extract firewall instructions from the fragment structure. + */ + *divinfo = fp->ipq_div_info; + *divcookie = fp->ipq_div_cookie; +#endif + + /* + * Create header for new ip packet by + * modifying header of first packet; + * dequeue and discard fragment reassembly header. + * Make header visible. + */ + ip->ip_len = next; + ip->ip_src = fp->ipq_src; + ip->ip_dst = fp->ipq_dst; + remque(fp); + nipq--; + (void) m_free(dtom(fp)); + m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); + m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ + register int plen = 0; + for (t = m; m; m = m->m_next) + plen += m->m_len; + t->m_pkthdr.len = plen; + } + return (ip); + +dropfrag: +#ifdef IPDIVERT + *divinfo = 0; + *divcookie = 0; +#endif + ipstat.ips_fragdropped++; + m_freem(m); + return (0); + +#undef GETIP +} + +/* + * Free a fragment reassembly header and all + * associated datagrams. + */ +static void +ip_freef(fp) + struct ipq *fp; +{ + register struct mbuf *q; + + while (fp->ipq_frags) { + q = fp->ipq_frags; + fp->ipq_frags = q->m_nextpkt; + m_freem(q); + } + remque(fp); + (void) m_free(dtom(fp)); + nipq--; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. + */ +void +ip_slowtimo() +{ + register struct ipq *fp; + int s = splnet(); + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + fp = ipq[i].next; + if (fp == 0) + continue; + while (fp != &ipq[i]) { + --fp->ipq_ttl; + fp = fp->next; + if (fp->prev->ipq_ttl == 0) { + ipstat.ips_fragtimeout++; + ip_freef(fp->prev); + } + } + } + ipflow_slowtimo(); + splx(s); +} + +/* + * Drain off all datagram fragments. + */ +void +ip_drain() +{ + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + while (ipq[i].next != &ipq[i]) { + ipstat.ips_fragdropped++; + ip_freef(ipq[i].next); + } + } + in_rtqdrain(); +} + +/* + * Do option processing on a datagram, + * possibly discarding it if bad options are encountered, + * or forwarding it if source-routed. + * Returns 1 if packet has been forwarded/freed, + * 0 if the packet should be processed further. + */ +static int +ip_dooptions(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register u_char *cp; + register struct ip_timestamp *ipt; + register struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + n_time ntime; + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. + * Find interface with current destination address. + * If none on this machine then drop if strictly routed, + * or do nothing if loosely routed. + * Record interface address and bring up next address + * component. If strictly routed make sure next + * address is on directly accessible net. + */ + case IPOPT_LSRR: + case IPOPT_SSRR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + ia = (struct in_ifaddr *) + ifa_ifwithaddr((struct sockaddr *)&ipaddr); + if (ia == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + if (!ip_dosourceroute) + goto nosourcerouting; + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. + */ + if (!ip_acceptsourceroute) + goto nosourcerouting; + save_rte(cp, ip->ip_src); + break; + } + + if (!ip_dosourceroute) { + if (ipforwarding) { + char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + /* + * Acting as a router, so generate ICMP + */ +nosourcerouting: + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_WARNING, + "attempted source route from %s to %s\n", + inet_ntoa(ip->ip_src), buf); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } else { + /* + * Not acting as a router, so silently drop. + */ + ipstat.ips_cantforward++; + m_freem(m); + return (1); + } + } + + /* + * locate outgoing interface + */ + (void)memcpy(&ipaddr.sin_addr, cp + off, + sizeof(ipaddr.sin_addr)); + + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } else + ia = ip_rtaddr(ipaddr.sin_addr); + if (ia == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) + break; + (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, + sizeof(ipaddr.sin_addr)); + /* + * locate outgoing interface; if we're the destination, + * use the incoming interface (should be same). + */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && + (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: + code = cp - (u_char *)ip; + ipt = (struct ip_timestamp *)cp; + if (ipt->ipt_len < 5) + goto bad; + if (ipt->ipt_ptr > ipt->ipt_len - sizeof(int32_t)) { + if (++ipt->ipt_oflw == 0) + goto bad; + break; + } + sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1); + switch (ipt->ipt_flg) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (ipt->ipt_ptr - 1 + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == 0) + continue; + (void)memcpy(sin, &IA_SIN(ia)->sin_addr, + sizeof(struct in_addr)); + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (ipt->ipt_ptr - 1 + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + (void)memcpy(&ipaddr.sin_addr, sin, + sizeof(struct in_addr)); + if (ifa_ifwithaddr((SA)&ipaddr) == 0) + continue; + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + default: + goto bad; + } + ntime = iptime(); + (void)memcpy(cp + ipt->ipt_ptr - 1, &ntime, + sizeof(n_time)); + ipt->ipt_ptr += sizeof(n_time); + } + } + if (forward && ipforwarding) { + ip_forward(m, 1); + return (1); + } + return (0); +bad: + ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; /* XXX icmp_error adds in hdr length */ + icmp_error(m, type, code, 0, 0); + ipstat.ips_badoptions++; + return (1); +} + +/* + * Given address of next destination (final or next hop), + * return internet address info of interface to be used to get there. + */ +static struct in_ifaddr * +ip_rtaddr(dst) + struct in_addr dst; +{ + register struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; + + if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + + rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + } + if (ipforward_rt.ro_rt == 0) + return ((struct in_ifaddr *)0); + return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa); +} + +/* + * Save incoming source route for use in replies, + * to be picked up later by ip_srcroute if the receiver is interested. + */ +void +save_rte(option, dst) + u_char *option; + struct in_addr dst; +{ + unsigned olen; + + olen = option[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("save_rte: olen %d\n", olen); +#endif + if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) + return; + bcopy(option, ip_srcrt.srcopt, olen); + ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + ip_srcrt.dst = dst; +} + +/* + * Retrieve incoming source route for use in replies, + * in the same form used by setsockopt. + * The first hop is placed before the options, will be removed later. + */ +struct mbuf * +ip_srcroute() +{ + register struct in_addr *p, *q; + register struct mbuf *m; + + if (ip_nhops == 0) + return ((struct mbuf *)0); + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == 0) + return ((struct mbuf *)0); + +#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) + + /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ + m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + + OPTSIZ; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); +#endif + + /* + * First save first hop for return route + */ + p = &ip_srcrt.route[ip_nhops - 1]; + *(mtod(m, struct in_addr *)) = *p--; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); +#endif + + /* + * Copy option fields and padding (nop) to mbuf. + */ + ip_srcrt.nop = IPOPT_NOP; + ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; + (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), + &ip_srcrt.nop, OPTSIZ); + q = (struct in_addr *)(mtod(m, caddr_t) + + sizeof(struct in_addr) + OPTSIZ); +#undef OPTSIZ + /* + * Record return path as an IP source route, + * reversing the path (pointers are now aligned). + */ + while (p >= ip_srcrt.route) { +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx", (u_long)ntohl(q->s_addr)); +#endif + *q++ = *p--; + } + /* + * Last hop goes to final destination. + */ + *q = ip_srcrt.dst; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx\n", (u_long)ntohl(q->s_addr)); +#endif + return (m); +} + +/* + * Strip out IP options, at higher + * level protocol in the kernel. + * Second argument is buffer to which options + * will be moved, and return value is their length. + * XXX should be deleted; last arg currently ignored. + */ +void +ip_stripoptions(m, mopt) + register struct mbuf *m; + struct mbuf *mopt; +{ + register int i; + struct ip *ip = mtod(m, struct ip *); + register caddr_t opts; + int olen; + + olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + opts = (caddr_t)(ip + 1); + i = m->m_len - (sizeof (struct ip) + olen); + bcopy(opts + olen, opts, (unsigned)i); + m->m_len -= olen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= olen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); +} + +u_char inetctlerrmap[PRC_NCMDS] = { + 0, 0, 0, 0, + 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, + EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, EHOSTUNREACH, 0, 0, + 0, 0, 0, 0, + ENOPROTOOPT +}; + +/* + * Forward a packet. If some error occurs return the sender + * an icmp packet. Note we can't always generate a meaningful + * icmp message because icmp doesn't have a large enough repertoire + * of codes and types. + * + * If not forwarding, just drop the packet. This could be confusing + * if ipforwarding was zero but some routing protocol was advancing + * us as a gateway to somewhere. However, we must let the routing + * protocol deal with that. + * + * The srcrt parameter indicates whether the packet is being forwarded + * via a source route. + */ +static void +ip_forward(m, srcrt) + struct mbuf *m; + int srcrt; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct sockaddr_in *sin; + register struct rtentry *rt; + int error, type = 0, code = 0; + struct mbuf *mcopy; + n_long dest; + struct ifnet *destifp; + + dest = 0; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("forward: src %lx dst %lx ttl %x\n", + (u_long)ip->ip_src.s_addr, (u_long)ip->ip_dst.s_addr, + ip->ip_ttl); +#endif + + + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + HTONS(ip->ip_id); +#ifdef IPSTEALTH + if (!ipstealth) { +#endif + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, + dest, 0); + return; + } + ip->ip_ttl -= IPTTLDEC; +#ifdef IPSTEALTH + } +#endif + + sin = (struct sockaddr_in *)&ipforward_rt.ro_dst; + if ((rt = ipforward_rt.ro_rt) == 0 || + ip->ip_dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + + rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + if (ipforward_rt.ro_rt == 0) { + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); + return; + } + rt = ipforward_rt.ro_rt; + } + + /* + * Save at most 64 bytes of the packet in case + * we need to generate an ICMP message to the src. + */ + mcopy = m_copy(m, 0, imin((int)ip->ip_len, 64)); + + /* + * If forwarding packet using same interface that it came in on, + * perhaps should send a redirect to sender to shortcut a hop. + * Only send redirect if source is sending directly to us, + * and if packet was not source routed (or has any options). + * Also, don't send redirect if forwarding using a default route + * or a route modified by a redirect. + */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (rt->rt_ifp == m->m_pkthdr.rcvif && + (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && + satosin(rt_key(rt))->sin_addr.s_addr != 0 && + ipsendredirects && !srcrt) { +#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + u_long src = ntohl(ip->ip_src.s_addr); + + if (RTA(rt) && + (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = ip->ip_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_long)dest); +#endif + } + } + + error = ip_output(m, (struct mbuf *)0, &ipforward_rt, + IP_FORWARDING, 0); + if (error) + ipstat.ips_cantforward++; + else { + ipstat.ips_forward++; + if (type) + ipstat.ips_redirectsent++; + else { + if (mcopy) { + ipflow_create(&ipforward_rt, mcopy); + m_freem(mcopy); + } + return; + } + } + if (mcopy == NULL) + return; + destifp = NULL; + + switch (error) { + + case 0: /* forwarded, but need redirect */ + /* type, code set above */ + break; + + case ENETUNREACH: /* shouldn't happen, checked above */ + case EHOSTUNREACH: + case ENETDOWN: + case EHOSTDOWN: + default: + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + break; + + case EMSGSIZE: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; + ipstat.ips_cantfrag++; + break; + + case ENOBUFS: + type = ICMP_SOURCEQUENCH; + code = 0; + break; + } + icmp_error(mcopy, type, code, dest, destifp); +} + +void +ip_savecontrol(inp, mp, ip, m) + register struct inpcb *inp; + register struct mbuf **mp; + register struct ip *ip; + register struct mbuf *m; +{ + if (inp->inp_socket->so_options & SO_TIMESTAMP) { + struct timeval tv; + + microtime(&tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* XXX + * Moving these out of udp_input() made them even more broken + * than they already were. + */ + /* options were tossed already */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = sbcreatecontrol((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + if (inp->inp_flags & INP_RECVIF) { + struct ifnet *ifp; + struct sdlbuf { + struct sockaddr_dl sdl; + u_char pad[32]; + } sdlbuf; + struct sockaddr_dl *sdp; + struct sockaddr_dl *sdl2 = &sdlbuf.sdl; + + if (((ifp = m->m_pkthdr.rcvif)) + && ( ifp->if_index && (ifp->if_index <= if_index))) { + sdp = (struct sockaddr_dl *)(ifnet_addrs + [ifp->if_index - 1]->ifa_addr); + /* + * Change our mind and don't try copy. + */ + if ((sdp->sdl_family != AF_LINK) + || (sdp->sdl_len > sizeof(sdlbuf))) { + goto makedummy; + } + bcopy(sdp, sdl2, sdp->sdl_len); + } else { +makedummy: + sdl2->sdl_len + = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl2->sdl_family = AF_LINK; + sdl2->sdl_index = 0; + sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; + } + *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +} + +int +ip_rsvp_init(struct socket *so) +{ + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + if (ip_rsvpd != NULL) + return EADDRINUSE; + + ip_rsvpd = so; + /* + * This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!ip_rsvp_on) { + ip_rsvp_on = 1; + rsvp_on++; + } + + return 0; +} + +int +ip_rsvp_done(void) +{ + ip_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (ip_rsvp_on) { + ip_rsvp_on = 0; + rsvp_on--; + } + return 0; +} diff --git a/sys/netinet/ip_log.c b/sys/netinet/ip_log.c new file mode 100644 index 0000000..73a411a --- /dev/null +++ b/sys/netinet/ip_log.c @@ -0,0 +1,496 @@ +/* + * Copyright (C) 1997-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_log.c,v 2.1.2.2 1999/09/21 11:55:44 darrenr Exp $ + * $FreeBSD$ + */ +#include <sys/param.h> +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) +# include "opt_ipfilter_log.h" +#endif +#ifdef __FreeBSD__ +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include "opt_ipfilter.h" +# endif +# endif +#endif +#ifdef IPFILTER_LOG +# ifndef SOLARIS +# define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +# endif +# ifndef _KERNEL +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +# include <ctype.h> +# endif +# include <sys/errno.h> +# include <sys/types.h> +# include <sys/file.h> +# if __FreeBSD_version >= 220000 && defined(_KERNEL) +# include <sys/fcntl.h> +# include <sys/filio.h> +# else +# include <sys/ioctl.h> +# endif +# include <sys/time.h> +# if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +# endif +# include <sys/uio.h> +# if !SOLARIS +# if (NetBSD > 199609) || (OpenBSD > 199603) || (__FreeBSD_version >= 300000) +# include <sys/dirent.h> +# else +# include <sys/dir.h> +# endif +# ifndef linux +# include <sys/mbuf.h> +# endif +# else +# include <sys/filio.h> +# include <sys/cred.h> +# include <sys/ddi.h> +# include <sys/sunddi.h> +# include <sys/ksynch.h> +# include <sys/kmem.h> +# include <sys/mkdev.h> +# include <sys/dditypes.h> +# include <sys/cmn_err.h> +# endif +# ifndef linux +# include <sys/protosw.h> +# endif +# include <sys/socket.h> + +# include <net/if.h> +# ifdef sun +# include <net/af.h> +# endif +# if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# endif +# include <net/route.h> +# include <netinet/in.h> +# ifdef __sgi +# include <sys/ddi.h> +# ifdef IFF_DRVRLOCK /* IRIX6 */ +# include <sys/hashing.h> +# endif +# endif +# if !defined(linux) && !(defined(__sgi) && !defined(IFF_DRVRLOCK)) /*IRIX<6*/ +# include <netinet/in_var.h> +# endif +# include <netinet/in_systm.h> +# include <netinet/ip.h> +# include <netinet/tcp.h> +# include <netinet/udp.h> +# include <netinet/ip_icmp.h> +# ifndef linux +# include <netinet/ip_var.h> +# endif +# ifndef _KERNEL +# include <syslog.h> +# endif +# include "netinet/ip_compat.h" +# include <netinet/tcpip.h> +# include "netinet/ip_fil.h" +# include "netinet/ip_proxy.h" +# include "netinet/ip_nat.h" +# include "netinet/ip_frag.h" +# include "netinet/ip_state.h" +# include "netinet/ip_auth.h" +# if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# endif + +# ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +# endif + + +# if SOLARIS || defined(__sgi) +extern kmutex_t ipl_mutex; +# if SOLARIS +extern kcondvar_t iplwait; +# endif +# endif + +iplog_t **iplh[IPL_LOGMAX+1], *iplt[IPL_LOGMAX+1], *ipll[IPL_LOGMAX+1]; +size_t iplused[IPL_LOGMAX+1]; +static fr_info_t iplcrc[IPL_LOGMAX+1]; +# ifdef linux +static struct wait_queue *iplwait[IPL_LOGMAX+1]; +# endif + + +/* + * Initialise log buffers & pointers. Also iniialised the CRC to a local + * secret for use in calculating the "last log checksum". + */ +void ipflog_init() +{ + int i; + + for (i = IPL_LOGMAX; i >= 0; i--) { + iplt[i] = NULL; + ipll[i] = NULL; + iplh[i] = &iplt[i]; + iplused[i] = 0; + bzero((char *)&iplcrc[i], sizeof(iplcrc[i])); + } +} + + +/* + * ipflog + * Create a log record for a packet given that it has been triggered by a + * rule (or the default setting). Calculate the transport protocol header + * size using predetermined size of a couple of popular protocols and thus + * how much data to copy into the log, including part of the data body if + * requested. + */ +int ipflog(flags, ip, fin, m) +u_int flags; +ip_t *ip; +fr_info_t *fin; +mb_t *m; +{ + ipflog_t ipfl; + register size_t mlen, hlen; + size_t sizes[2]; + void *ptrs[2]; + int types[2]; +# if SOLARIS + ill_t *ifp = fin->fin_ifp; +# else + struct ifnet *ifp = fin->fin_ifp; +# endif + + /* + * calculate header size. + */ + hlen = fin->fin_hlen; + if ((ip->ip_off & IP_OFFMASK) == 0) { + if (ip->ip_p == IPPROTO_TCP) + hlen += MIN(sizeof(tcphdr_t), fin->fin_dlen); + else if (ip->ip_p == IPPROTO_UDP) + hlen += MIN(sizeof(udphdr_t), fin->fin_dlen); + else if (ip->ip_p == IPPROTO_ICMP) { + struct icmp *icmp; + + icmp = (struct icmp *)((char *)ip + hlen); + + /* + * For ICMP, if the packet is an error packet, also + * include the information about the packet which + * caused the error. + */ + switch (icmp->icmp_type) + { + case ICMP_UNREACH : + case ICMP_SOURCEQUENCH : + case ICMP_REDIRECT : + case ICMP_TIMXCEED : + case ICMP_PARAMPROB : + hlen += MIN(sizeof(struct icmp) + 8, + fin->fin_dlen); + break; + default : + hlen += MIN(sizeof(struct icmp), + fin->fin_dlen); + break; + } + } + } + /* + * Get the interface number and name to which this packet is + * currently associated. + */ +# if SOLARIS + ipfl.fl_unit = (u_char)ifp->ill_ppa; + bcopy(ifp->ill_name, ipfl.fl_ifname, MIN(ifp->ill_name_length, 4)); + mlen = (flags & FR_LOGBODY) ? MIN(msgdsize(m) - hlen, 128) : 0; +# else +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199603)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + strncpy(ipfl.fl_ifname, ifp->if_xname, IFNAMSIZ); +# else +# ifndef linux + ipfl.fl_unit = (u_char)ifp->if_unit; +# endif + if ((ipfl.fl_ifname[0] = ifp->if_name[0])) + if ((ipfl.fl_ifname[1] = ifp->if_name[1])) + if ((ipfl.fl_ifname[2] = ifp->if_name[2])) + ipfl.fl_ifname[3] = ifp->if_name[3]; +# endif + mlen = (flags & FR_LOGBODY) ? MIN(ip->ip_len - hlen, 128) : 0; +# endif + ipfl.fl_plen = (u_char)mlen; + ipfl.fl_hlen = (u_char)hlen; + ipfl.fl_rule = fin->fin_rule; + ipfl.fl_group = fin->fin_group; + if (fin->fin_fr != NULL) + ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; + else + ipfl.fl_loglevel = 0xffff; + ipfl.fl_flags = flags; + ptrs[0] = (void *)&ipfl; + sizes[0] = sizeof(ipfl); + types[0] = 0; +# if SOLARIS + /* + * Are we copied from the mblk or an aligned array ? + */ + if (ip == (ip_t *)m->b_rptr) { + ptrs[1] = m; + sizes[1] = hlen + mlen; + types[1] = 1; + } else { + ptrs[1] = ip; + sizes[1] = hlen + mlen; + types[1] = 0; + } +# else + ptrs[1] = m; + sizes[1] = hlen + mlen; + types[1] = 1; +# endif + return ipllog(IPL_LOGIPF, fin, ptrs, sizes, types, 2); +} + + +/* + * ipllog + */ +int ipllog(dev, fin, items, itemsz, types, cnt) +int dev; +fr_info_t *fin; +void **items; +size_t *itemsz; +int *types, cnt; +{ + caddr_t buf, s; + iplog_t *ipl; + size_t len; + int i; + + /* + * Check to see if this log record has a CRC which matches the last + * record logged. If it does, just up the count on the previous one + * rather than create a new one. + */ + MUTEX_ENTER(&ipl_mutex); + if (fin != NULL) { + if ((ipll[dev] != NULL) && + bcmp((char *)fin, (char *)&iplcrc[dev], FI_CSIZE) == 0) { + ipll[dev]->ipl_count++; + MUTEX_EXIT(&ipl_mutex); + return 1; + } + bcopy((char *)fin, (char *)&iplcrc[dev], FI_CSIZE); + } else + bzero((char *)&iplcrc[dev], FI_CSIZE); + MUTEX_EXIT(&ipl_mutex); + + /* + * Get the total amount of data to be logged. + */ + for (i = 0, len = sizeof(iplog_t); i < cnt; i++) + len += itemsz[i]; + + /* + * check that we have space to record this information and can + * allocate that much. + */ + KMALLOCS(buf, caddr_t, len); + if (!buf) + return 0; + MUTEX_ENTER(&ipl_mutex); + if ((iplused[dev] + len) > IPLLOGSIZE) { + MUTEX_EXIT(&ipl_mutex); + KFREES(buf, len); + return 0; + } + iplused[dev] += len; + MUTEX_EXIT(&ipl_mutex); + + /* + * advance the log pointer to the next empty record and deduct the + * amount of space we're going to use. + */ + ipl = (iplog_t *)buf; + ipl->ipl_magic = IPL_MAGIC; + ipl->ipl_count = 1; + ipl->ipl_next = NULL; + ipl->ipl_dsize = len; +# if SOLARIS || defined(sun) || defined(linux) + uniqtime((struct timeval *)&ipl->ipl_sec); +# else +# if BSD >= 199306 || defined(__FreeBSD__) || defined(__sgi) + microtime((struct timeval *)&ipl->ipl_sec); +# endif +# endif + + /* + * Loop through all the items to be logged, copying each one to the + * buffer. Use bcopy for normal data or the mb_t copyout routine. + */ + for (i = 0, s = buf + sizeof(*ipl); i < cnt; i++) { + if (types[i] == 0) + bcopy(items[i], s, itemsz[i]); + else if (types[i] == 1) { +# if SOLARIS + copyout_mblk(items[i], 0, itemsz[i], s); +# else + m_copydata(items[i], 0, itemsz[i], s); +# endif + } + s += itemsz[i]; + } + MUTEX_ENTER(&ipl_mutex); + ipll[dev] = ipl; + *iplh[dev] = ipl; + iplh[dev] = &ipl->ipl_next; +# if SOLARIS + cv_signal(&iplwait); + mutex_exit(&ipl_mutex); +# else + MUTEX_EXIT(&ipl_mutex); +# ifdef linux + wake_up_interruptible(&iplwait[dev]); +# else + wakeup(&iplh[dev]); +# endif +# endif + return 1; +} + + +int ipflog_read(unit, uio) +minor_t unit; +struct uio *uio; +{ + size_t dlen, copied; + int error = 0; + iplog_t *ipl; +# if defined(_KERNEL) && !SOLARIS + int s; +# endif + + /* + * Sanity checks. Make sure the minor # is valid and we're copying + * a valid chunk of data. + */ + if (IPL_LOGMAX < unit) + return ENXIO; + if (!uio->uio_resid) + return 0; + if ((uio->uio_resid < sizeof(iplog_t)) || + (uio->uio_resid > IPLLOGSIZE)) + return EINVAL; + + /* + * Lock the log so we can snapshot the variables. Wait for a signal + * if the log is empty. + */ + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + + while (!iplused[unit] || !iplt[unit]) { +# if SOLARIS && defined(_KERNEL) + if (!cv_wait_sig(&iplwait, &ipl_mutex)) { + MUTEX_EXIT(&ipl_mutex); + return EINTR; + } +# else +# ifdef linux + interruptible_sleep_on(&iplwait[unit]); + if (current->signal & ~current->blocked) + return -EINTR; +# else + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); + error = SLEEP(&iplh[unit], "ipl sleep"); + if (error) + return error; + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); +# endif /* linux */ +# endif /* SOLARIS */ + } + +# if BSD >= 199306 || defined(__FreeBSD__) + uio->uio_rw = UIO_READ; +# endif + + for (copied = 0; (ipl = iplt[unit]); copied += dlen) { + dlen = ipl->ipl_dsize; + if (dlen > uio->uio_resid) + break; + /* + * Don't hold the mutex over the uiomove call. + */ + iplt[unit] = ipl->ipl_next; + iplused[unit] -= dlen; + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); + error = UIOMOVE((caddr_t)ipl, dlen, UIO_READ, uio); + if (error) { + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + ipl->ipl_next = iplt[unit]; + iplt[unit] = ipl; + iplused[unit] += dlen; + break; + } + KFREES((caddr_t)ipl, dlen); + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + } + if (!iplt[unit]) { + iplused[unit] = 0; + iplh[unit] = &iplt[unit]; + ipll[unit] = NULL; + } + + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); +# ifdef linux + if (!error) + return (int)copied; + return -error; +# else + return error; +# endif +} + + +int ipflog_clear(unit) +minor_t unit; +{ + iplog_t *ipl; + int used; + + MUTEX_ENTER(&ipl_mutex); + while ((ipl = iplt[unit])) { + iplt[unit] = ipl->ipl_next; + KFREES((caddr_t)ipl, ipl->ipl_dsize); + } + iplh[unit] = &iplt[unit]; + ipll[unit] = NULL; + used = iplused[unit]; + iplused[unit] = 0; + bzero((char *)&iplcrc[unit], FI_CSIZE); + MUTEX_EXIT(&ipl_mutex); + return used; +} +#endif /* IPFILTER_LOG */ diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c new file mode 100644 index 0000000..20aa381 --- /dev/null +++ b/sys/netinet/ip_mroute.c @@ -0,0 +1,2281 @@ +/* + * IP multicast forwarding procedures + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Mark J. Steiglitz, Stanford, May, 1991 + * Modified by Van Jacobson, LBL, January 1993 + * Modified by Ajit Thyagarajan, PARC, August 1993 + * Modified by Bill Fenner, PARC, April 1995 + * + * MROUTING Revision: 3.5 + * $FreeBSD$ + */ + +#include "opt_mrouting.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/sockio.h> +#include <sys/syslog.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_var.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <netinet/udp.h> + +#ifndef NTOHL +#if BYTE_ORDER != BIG_ENDIAN +#define NTOHL(d) ((d) = ntohl((d))) +#define NTOHS(d) ((d) = ntohs((u_short)(d))) +#define HTONL(d) ((d) = htonl((d))) +#define HTONS(d) ((d) = htons((u_short)(d))) +#else +#define NTOHL(d) +#define NTOHS(d) +#define HTONL(d) +#define HTONS(d) +#endif +#endif + +#ifndef MROUTING +extern u_long _ip_mcast_src __P((int vifi)); +extern int _ip_mforward __P((struct ip *ip, struct ifnet *ifp, + struct mbuf *m, struct ip_moptions *imo)); +extern int _ip_mrouter_done __P((void)); +extern int _ip_mrouter_get __P((struct socket *so, struct sockopt *sopt)); +extern int _ip_mrouter_set __P((struct socket *so, struct sockopt *sopt)); +extern int _mrt_ioctl __P((int req, caddr_t data, struct proc *p)); + +/* + * Dummy routines and globals used when multicast routing is not compiled in. + */ + +struct socket *ip_mrouter = NULL; +u_int rsvpdebug = 0; + +int +_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set; + + +int +_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get; + +int +_ip_mrouter_done() +{ + return(0); +} + +int (*ip_mrouter_done)(void) = _ip_mrouter_done; + +int +_ip_mforward(ip, ifp, m, imo) + struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + return(0); +} + +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = _ip_mforward; + +int +_mrt_ioctl(int req, caddr_t data, struct proc *p) +{ + return EOPNOTSUPP; +} + +int (*mrt_ioctl)(int, caddr_t, struct proc *) = _mrt_ioctl; + +void +rsvp_input(m, iphlen) /* XXX must fixup manually */ + struct mbuf *m; + int iphlen; +{ + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, iphlen); + return; + } + /* Drop the packet */ + m_freem(m); +} + +void ipip_input(struct mbuf *m, int iphlen) { /* XXX must fixup manually */ + rip_input(m, iphlen); +} + +int (*legal_vif_num)(int) = 0; + +/* + * This should never be called, since IP_MULTICAST_VIF should fail, but + * just in case it does get called, the code a little lower in ip_output + * will assign the packet a local address. + */ +u_long +_ip_mcast_src(int vifi) { return INADDR_ANY; } +u_long (*ip_mcast_src)(int) = _ip_mcast_src; + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + return; +} + +#else /* MROUTING */ + +#define M_HASCL(m) ((m)->m_flags & M_EXT) + +#define INSIZ sizeof(struct in_addr) +#define same(a1, a2) \ + (bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0) + +static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); + +/* + * Globals. All but ip_mrouter and ip_mrtproto could be static, + * except for netstat or debugging purposes. + */ +#ifndef MROUTE_LKM +struct socket *ip_mrouter = NULL; +static struct mrtstat mrtstat; +#else /* MROUTE_LKM */ +extern void X_ipip_input __P((struct mbuf *m, int iphlen)); +extern struct mrtstat mrtstat; +static int ip_mrtproto; +#endif + +#define NO_RTE_FOUND 0x1 +#define RTE_FOUND 0x2 + +static struct mfc *mfctable[MFCTBLSIZ]; +static u_char nexpire[MFCTBLSIZ]; +static struct vif viftable[MAXVIFS]; +static u_int mrtdebug = 0; /* debug level */ +#define DEBUG_MFC 0x02 +#define DEBUG_FORWARD 0x04 +#define DEBUG_EXPIRE 0x08 +#define DEBUG_XMIT 0x10 +static u_int tbfdebug = 0; /* tbf debug level */ +static u_int rsvpdebug = 0; /* rsvp debug level */ + +static struct callout_handle expire_upcalls_ch; + +#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ +#define UPCALL_EXPIRE 6 /* number of timeouts */ + +/* + * Define the token bucket filter structures + * tbftable -> each vif has one of these for storing info + */ + +static struct tbf tbftable[MAXVIFS]; +#define TBF_REPROCESS (hz / 100) /* 100x / second */ + +/* + * 'Interfaces' associated with decapsulator (so we can tell + * packets that went through it from ones that get reflected + * by a broken gateway). These interfaces are never linked into + * the system ifnet list & no routes point to them. I.e., packets + * can't be sent this way. They only exist as a placeholder for + * multicast source verification. + */ +static struct ifnet multicast_decap_if[MAXVIFS]; + +#define ENCAP_TTL 64 +#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ + +/* prototype IP hdr for encapsulated packets */ +static struct ip multicast_encap_iphdr = { +#if BYTE_ORDER == LITTLE_ENDIAN + sizeof(struct ip) >> 2, IPVERSION, +#else + IPVERSION, sizeof(struct ip) >> 2, +#endif + 0, /* tos */ + sizeof(struct ip), /* total length */ + 0, /* id */ + 0, /* frag offset */ + ENCAP_TTL, ENCAP_PROTO, + 0, /* checksum */ +}; + +/* + * Private variables. + */ +static vifi_t numvifs = 0; +static int have_encap_tunnel = 0; + +/* + * one-back cache used by ipip_input to locate a tunnel's vif + * given a datagram's src ip address. + */ +static u_long last_encap_src; +static struct vif *last_encap_vif; + +static u_long X_ip_mcast_src __P((int vifi)); +static int X_ip_mforward __P((struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo)); +static int X_ip_mrouter_done __P((void)); +static int X_ip_mrouter_get __P((struct socket *so, struct sockopt *m)); +static int X_ip_mrouter_set __P((struct socket *so, struct sockopt *m)); +static int X_legal_vif_num __P((int vif)); +static int X_mrt_ioctl __P((int cmd, caddr_t data)); + +static int get_sg_cnt(struct sioc_sg_req *); +static int get_vif_cnt(struct sioc_vif_req *); +static int ip_mrouter_init(struct socket *, int); +static int add_vif(struct vifctl *); +static int del_vif(vifi_t); +static int add_mfc(struct mfcctl *); +static int del_mfc(struct mfcctl *); +static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); +static int set_assert(int); +static void expire_upcalls(void *); +static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, + vifi_t); +static void phyint_send(struct ip *, struct vif *, struct mbuf *); +static void encap_send(struct ip *, struct vif *, struct mbuf *); +static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); +static void tbf_queue(struct vif *, struct mbuf *); +static void tbf_process_q(struct vif *); +static void tbf_reprocess_q(void *); +static int tbf_dq_sel(struct vif *, struct ip *); +static void tbf_send_packet(struct vif *, struct mbuf *); +static void tbf_update_tokens(struct vif *); +static int priority(struct vif *, struct ip *); +void multiencap_decap(struct mbuf *); + +/* + * whether or not special PIM assert processing is enabled. + */ +static int pim_assert; +/* + * Rate limit for assert notification messages, in usec + */ +#define ASSERT_MSG_TIME 3000000 + +/* + * Hash function for a source, group entry + */ +#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ + ((g) >> 20) ^ ((g) >> 10) ^ (g)) + +/* + * Find a route for a given origin IP address and Multicast group address + * Type of service parameter to be added in the future!!! + */ + +#define MFCFIND(o, g, rt) { \ + register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \ + rt = NULL; \ + ++mrtstat.mrts_mfc_lookups; \ + while (_rt) { \ + if ((_rt->mfc_origin.s_addr == o) && \ + (_rt->mfc_mcastgrp.s_addr == g) && \ + (_rt->mfc_stall == NULL)) { \ + rt = _rt; \ + break; \ + } \ + _rt = _rt->mfc_next; \ + } \ + if (rt == NULL) { \ + ++mrtstat.mrts_mfc_misses; \ + } \ +} + + +/* + * Macros to compute elapsed time efficiently + * Borrowed from Van Jacobson's scheduling code + */ +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a).tv_usec - (b).tv_usec; \ + if ((xxs = (a).tv_sec - (b).tv_sec)) { \ + switch (xxs) { \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + default: \ + delta += (1000000 * xxs); \ + } \ + } \ +} + +#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ + (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) + +#ifdef UPCALL_TIMING +u_long upcall_data[51]; +static void collate(struct timeval *); +#endif /* UPCALL_TIMING */ + + +/* + * Handle MRT setsockopt commands to modify the multicast routing tables. + */ +static int +X_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + vifi_t vifi; + struct vifctl vifc; + struct mfcctl mfc; + + if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) + return (EPERM); + + error = 0; + switch (sopt->sopt_name) { + case MRT_INIT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + error = ip_mrouter_init(so, optval); + break; + + case MRT_DONE: + error = ip_mrouter_done(); + break; + + case MRT_ADD_VIF: + error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); + if (error) + break; + error = add_vif(&vifc); + break; + + case MRT_DEL_VIF: + error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); + if (error) + break; + error = del_vif(vifi); + break; + + case MRT_ADD_MFC: + case MRT_DEL_MFC: + error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc); + if (error) + break; + if (sopt->sopt_name == MRT_ADD_MFC) + error = add_mfc(&mfc); + else + error = del_mfc(&mfc); + break; + + case MRT_ASSERT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + set_assert(optval); + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set; +#endif + +/* + * Handle MRT getsockopt commands + */ +static int +X_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error; + static int version = 0x0305; /* !!! why is this here? XXX */ + + switch (sopt->sopt_name) { + case MRT_VERSION: + error = sooptcopyout(sopt, &version, sizeof version); + break; + + case MRT_ASSERT: + error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; +#endif + +/* + * Handle ioctl commands to obtain information from the cache + */ +static int +X_mrt_ioctl(cmd, data) + int cmd; + caddr_t data; +{ + int error = 0; + + switch (cmd) { + case (SIOCGETVIFCNT): + return (get_vif_cnt((struct sioc_vif_req *)data)); + break; + case (SIOCGETSGCNT): + return (get_sg_cnt((struct sioc_sg_req *)data)); + break; + default: + return (EINVAL); + break; + } + return error; +} + +#ifndef MROUTE_LKM +int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +#endif + +/* + * returns the packet, byte, rpf-failure count for the source group provided + */ +static int +get_sg_cnt(req) + register struct sioc_sg_req *req; +{ + register struct mfc *rt; + int s; + + s = splnet(); + MFCFIND(req->src.s_addr, req->grp.s_addr, rt); + splx(s); + if (rt != NULL) { + req->pktcnt = rt->mfc_pkt_cnt; + req->bytecnt = rt->mfc_byte_cnt; + req->wrong_if = rt->mfc_wrong_if; + } else + req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; + + return 0; +} + +/* + * returns the input and output packet and byte counts on the vif provided + */ +static int +get_vif_cnt(req) + register struct sioc_vif_req *req; +{ + register vifi_t vifi = req->vifi; + + if (vifi >= numvifs) return EINVAL; + + req->icount = viftable[vifi].v_pkt_in; + req->ocount = viftable[vifi].v_pkt_out; + req->ibytes = viftable[vifi].v_bytes_in; + req->obytes = viftable[vifi].v_bytes_out; + + return 0; +} + +/* + * Enable multicast routing + */ +static int +ip_mrouter_init(so, version) + struct socket *so; + int version; +{ + if (mrtdebug) + log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; + + if (version != 1) + return ENOPROTOOPT; + + if (ip_mrouter != NULL) return EADDRINUSE; + + ip_mrouter = so; + + bzero((caddr_t)mfctable, sizeof(mfctable)); + bzero((caddr_t)nexpire, sizeof(nexpire)); + + pim_assert = 0; + + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_init\n"); + + return 0; +} + +/* + * Disable multicast routing + */ +static int +X_ip_mrouter_done() +{ + vifi_t vifi; + int i; + struct ifnet *ifp; + struct ifreq ifr; + struct mfc *rt; + struct rtdetq *rte; + int s; + + s = splnet(); + + /* + * For each phyint in use, disable promiscuous reception of all IP + * multicasts. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_lcl_addr.s_addr != 0 && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr + = INADDR_ANY; + ifp = viftable[vifi].v_ifp; + if_allmulti(ifp, 0); + } + } + bzero((caddr_t)tbftable, sizeof(tbftable)); + bzero((caddr_t)viftable, sizeof(viftable)); + numvifs = 0; + pim_assert = 0; + + untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch); + + /* + * Free all multicast forwarding cache entries. + */ + for (i = 0; i < MFCTBLSIZ; i++) { + for (rt = mfctable[i]; rt != NULL; ) { + struct mfc *nr = rt->mfc_next; + + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + free(rt, M_MRTABLE); + rt = nr; + } + } + + bzero((caddr_t)mfctable, sizeof(mfctable)); + + /* + * Reset de-encapsulation cache + */ + last_encap_src = 0; + last_encap_vif = NULL; + have_encap_tunnel = 0; + + ip_mrouter = NULL; + + splx(s); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_done\n"); + + return 0; +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_done)(void) = X_ip_mrouter_done; +#endif + +/* + * Set PIM assert processing global + */ +static int +set_assert(i) + int i; +{ + if ((i != 1) && (i != 0)) + return EINVAL; + + pim_assert = i; + + return 0; +} + +/* + * Add a vif to the vif table + */ +static int +add_vif(vifcp) + register struct vifctl *vifcp; +{ + register struct vif *vifp = viftable + vifcp->vifc_vifi; + static struct sockaddr_in sin = {sizeof sin, AF_INET}; + struct ifaddr *ifa; + struct ifnet *ifp; + int error, s; + struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; + + if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; + if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; + + /* Find the interface with an address in AF_INET family */ + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == 0) return EADDRNOTAVAIL; + ifp = ifa->ifa_ifp; + + if (vifcp->vifc_flags & VIFF_TUNNEL) { + if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { + /* + * An encapsulating tunnel is wanted. Tell ipip_input() to + * start paying attention to encapsulated packets. + */ + if (have_encap_tunnel == 0) { + have_encap_tunnel = 1; + for (s = 0; s < MAXVIFS; ++s) { + multicast_decap_if[s].if_name = "mdecap"; + multicast_decap_if[s].if_unit = s; + } + } + /* + * Set interface to fake encapsulator interface + */ + ifp = &multicast_decap_if[vifcp->vifc_vifi]; + /* + * Prepare cached route entry + */ + bzero(&vifp->v_route, sizeof(vifp->v_route)); + } else { + log(LOG_ERR, "source routed tunnels not supported\n"); + return EOPNOTSUPP; + } + } else { + /* Make sure the interface supports multicast */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return EOPNOTSUPP; + + /* Enable promiscuous reception of all IP multicasts from the if */ + s = splnet(); + error = if_allmulti(ifp, 1); + splx(s); + if (error) + return error; + } + + s = splnet(); + /* define parameters for the tbf structure */ + vifp->v_tbf = v_tbf; + GET_TIME(vifp->v_tbf->tbf_last_pkt_t); + vifp->v_tbf->tbf_n_tok = 0; + vifp->v_tbf->tbf_q_len = 0; + vifp->v_tbf->tbf_max_q_len = MAXQSIZE; + vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + vifp->v_ifp = ifp; + /* scaling up here allows division by 1024 in critical code */ + vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; + vifp->v_rsvp_on = 0; + vifp->v_rsvpd = NULL; + /* initialize per vif pkt counters */ + vifp->v_pkt_in = 0; + vifp->v_pkt_out = 0; + vifp->v_bytes_in = 0; + vifp->v_bytes_out = 0; + splx(s); + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; + + if (mrtdebug) + log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", + vifcp->vifc_vifi, + (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), + (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", + (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), + vifcp->vifc_threshold, + vifcp->vifc_rate_limit); + + return 0; +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif(vifi) + vifi_t vifi; +{ + register struct vif *vifp = &viftable[vifi]; + register struct mbuf *m; + struct ifnet *ifp; + struct ifreq ifr; + int s; + + if (vifi >= numvifs) return EINVAL; + if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; + + s = splnet(); + + if (!(vifp->v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; + ifp = vifp->v_ifp; + if_allmulti(ifp, 0); + } + + if (vifp == last_encap_vif) { + last_encap_vif = 0; + last_encap_src = 0; + } + + /* + * Free packets queued at the interface + */ + while (vifp->v_tbf->tbf_q) { + m = vifp->v_tbf->tbf_q; + vifp->v_tbf->tbf_q = m->m_act; + m_freem(m); + } + + bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); + bzero((caddr_t)vifp, sizeof (*vifp)); + + if (mrtdebug) + log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); + + /* Adjust numvifs down */ + for (vifi = numvifs; vifi > 0; vifi--) + if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; + numvifs = vifi; + + splx(s); + + return 0; +} + +/* + * Add an mfc entry + */ +static int +add_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct mfc *rt; + u_long hash; + struct rtdetq *rte; + register u_short nstl; + int s; + int i; + + MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); + + /* If an entry already exists, just update the fields */ + if (rt) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + s = splnet(); + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + splx(s); + return 0; + } + + /* + * Find the entry for which the upcall was made and update + */ + s = splnet(); + hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); + for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) { + + if (nstl++) + log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", + "multiple kernel entries", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + + rt->mfc_expire = 0; /* Don't clean this guy up */ + nexpire[hash]--; + + /* free packets Qed at the end of this entry */ + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + ip_mdq(rte->m, rte->ifp, rt, -1); + m_freem(rte->m); +#ifdef UPCALL_TIMING + collate(&(rte->t)); +#endif /* UPCALL_TIMING */ + free(rte, M_MRTABLE); + rte = n; + } + rt->mfc_stall = NULL; + } + } + + /* + * It is possible that an entry is being inserted without an upcall + */ + if (nstl == 0) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", + hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + if (rt->mfc_expire) + nexpire[hash]--; + rt->mfc_expire = 0; + } + } + if (rt == NULL) { + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + rt->mfc_expire = 0; + rt->mfc_stall = NULL; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + } + } + splx(s); + return 0; +} + +#ifdef UPCALL_TIMING +/* + * collect delay statistics on the upcalls + */ +static void collate(t) +register struct timeval *t; +{ + register u_long d; + register struct timeval tp; + register u_long delta; + + GET_TIME(tp); + + if (TV_LT(*t, tp)) + { + TV_DELTA(tp, *t, delta); + + d = delta >> 10; + if (d > 50) + d = 50; + + ++upcall_data[d]; + } +} +#endif /* UPCALL_TIMING */ + +/* + * Delete an mfc entry + */ +static int +del_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct in_addr origin; + struct in_addr mcastgrp; + struct mfc *rt; + struct mfc **nptr; + u_long hash; + int s; + + origin = mfccp->mfcc_origin; + mcastgrp = mfccp->mfcc_mcastgrp; + hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", + (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); + + s = splnet(); + + nptr = &mfctable[hash]; + while ((rt = *nptr) != NULL) { + if (origin.s_addr == rt->mfc_origin.s_addr && + mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && + rt->mfc_stall == NULL) + break; + + nptr = &rt->mfc_next; + } + if (rt == NULL) { + splx(s); + return EADDRNOTAVAIL; + } + + *nptr = rt->mfc_next; + free(rt, M_MRTABLE); + + splx(s); + + return 0; +} + +/* + * Send a message to mrouted on the multicast routing socket + */ +static int +socket_send(s, mm, src) + struct socket *s; + struct mbuf *mm; + struct sockaddr_in *src; +{ + if (s) { + if (sbappendaddr(&s->so_rcv, + (struct sockaddr *)src, + mm, (struct mbuf *)0) != 0) { + sorwakeup(s); + return 0; + } + } + m_freem(mm); + return -1; +} + +/* + * IP multicast forwarding function. This function assumes that the packet + * pointed to by "ip" has arrived on (or is about to be sent to) the interface + * pointed to by "ifp", and the packet is to be relayed to other networks + * that have members of the packet's destination IP multicast group. + * + * The packet is returned unscathed to the caller, unless it is + * erroneous, in which case a non-zero return value tells the caller to + * discard it. + */ + +#define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ +#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ + +static int +X_ip_mforward(ip, ifp, m, imo) + register struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + register struct mfc *rt; + register u_char *ipoptions; + static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + static int srctun = 0; + register struct mbuf *mm; + int s; + vifi_t vifi; + struct vif *vifp; + + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", + (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), + (void *)ifp); + + if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || + (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel. + */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported. + */ + if ((srctun++ % 1000) == 0) + log(LOG_ERR, + "ip_mforward: received source-routed packet from %lx\n", + (u_long)ntohl(ip->ip_src.s_addr)); + + return 1; + } + + if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + if (ip->ip_ttl < 255) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + vifp = viftable + vifi; + printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), vifi, + (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", + vifp->v_ifp->if_name, vifp->v_ifp->if_unit); + } + return (ip_mdq(m, ifp, NULL, vifi)); + } + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr)); + if(!imo) + printf("In fact, no options were specified at all\n"); + } + + /* + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. + */ + if (ip->ip_ttl <= 1 || + ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) + return 0; + + /* + * Determine forwarding vifs from the forwarding cache table + */ + s = splnet(); + MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + splx(s); + return (ip_mdq(m, ifp, rt, -1)); + } else { + /* + * If we don't have a route for packet's origin, + * Make a copy of the packet & + * send message to routing daemon + */ + + register struct mbuf *mb0; + register struct rtdetq *rte; + register u_long hash; + int hlen = ip->ip_hl << 2; +#ifdef UPCALL_TIMING + struct timeval tp; + + GET_TIME(tp); +#endif + + mrtstat.mrts_no_route++; + if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) + log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", + (u_long)ntohl(ip->ip_src.s_addr), + (u_long)ntohl(ip->ip_dst.s_addr)); + + /* + * Allocate mbufs early so that we don't do extra work if we are + * just going to fail anyway. Make sure to pullup the header so + * that other people can't step on it. + */ + rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); + if (rte == NULL) { + splx(s); + return ENOBUFS; + } + mb0 = m_copy(m, 0, M_COPYALL); + if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_pullup(mb0, hlen); + if (mb0 == NULL) { + free(rte, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* is there an upcall waiting for this packet? */ + hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); + for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { + if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && + (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) + break; + } + + if (rt == NULL) { + int i; + struct igmpmsg *im; + + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return ENOBUFS; + } + /* Make a copy of the header to send to the user level process */ + mm = m_copy(mb0, 0, hlen); + if (mm == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* + * Send message to routing daemon to install + * a route into the kernel table + */ + k_igmpsrc.sin_addr = ip->ip_src; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + + mrtstat.mrts_upcalls++; + + if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); + ++mrtstat.mrts_upq_sockfull; + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + nexpire[hash]++; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = 0; + rt->mfc_parent = -1; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + rt->mfc_stall = rte; + + } else { + /* determine if q has overflowed */ + int npkts = 0; + struct rtdetq **p; + + for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) + npkts++; + + if (npkts > MAX_UPQ) { + mrtstat.mrts_upq_ovflw++; + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return 0; + } + + /* Add this entry to the end of the queue */ + *p = rte; + } + + rte->m = mb0; + rte->ifp = ifp; +#ifdef UPCALL_TIMING + rte->t = tp; +#endif + rte->next = NULL; + + splx(s); + + return 0; + } +} + +#ifndef MROUTE_LKM +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = X_ip_mforward; +#endif + +/* + * Clean up the cache entry if upcall is not serviced + */ +static void +expire_upcalls(void *unused) +{ + struct rtdetq *rte; + struct mfc *mfc, **nptr; + int i; + int s; + + s = splnet(); + for (i = 0; i < MFCTBLSIZ; i++) { + if (nexpire[i] == 0) + continue; + nptr = &mfctable[i]; + for (mfc = *nptr; mfc != NULL; mfc = *nptr) { + /* + * Skip real cache entries + * Make sure it wasn't marked to not expire (shouldn't happen) + * If it expires now + */ + if (mfc->mfc_stall != NULL && + mfc->mfc_expire != 0 && + --mfc->mfc_expire == 0) { + if (mrtdebug & DEBUG_EXPIRE) + log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", + (u_long)ntohl(mfc->mfc_origin.s_addr), + (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); + /* + * drop all the packets + * free the mbuf with the pkt, if, timing info + */ + for (rte = mfc->mfc_stall; rte; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + ++mrtstat.mrts_cache_cleanups; + nexpire[i]--; + + *nptr = mfc->mfc_next; + free(mfc, M_MRTABLE); + } else { + nptr = &mfc->mfc_next; + } + } + } + splx(s); + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); +} + +/* + * Packet forwarding routine once entry in the cache is made + */ +static int +ip_mdq(m, ifp, rt, xmt_vif) + register struct mbuf *m; + register struct ifnet *ifp; + register struct mfc *rt; + register vifi_t xmt_vif; +{ + register struct ip *ip = mtod(m, struct ip *); + register vifi_t vifi; + register struct vif *vifp; + register int plen = ip->ip_len; + +/* + * Macro to send packet on vif. Since RSVP packets don't get counted on + * input, they shouldn't get counted on output, so statistics keeping is + * seperate. + */ +#define MC_SEND(ip,vifp,m) { \ + if ((vifp)->v_flags & VIFF_TUNNEL) \ + encap_send((ip), (vifp), (m)); \ + else \ + phyint_send((ip), (vifp), (m)); \ +} + + /* + * If xmt_vif is not -1, send on only the requested vif. + * + * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) + */ + if (xmt_vif < numvifs) { + MC_SEND(ip, viftable + xmt_vif, m); + return 1; + } + + /* + * Don't forward if it didn't arrive from the parent vif for its origin. + */ + vifi = rt->mfc_parent; + if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { + /* came in the wrong interface */ + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", + (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); + ++mrtstat.mrts_wrong_if; + ++rt->mfc_wrong_if; + /* + * If we are doing PIM assert processing, and we are forwarding + * packets on this interface, and it is a broadcast medium + * interface (and not a tunnel), send a message to the routing daemon. + */ + if (pim_assert && rt->mfc_ttls[vifi] && + (ifp->if_flags & IFF_BROADCAST) && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + struct sockaddr_in k_igmpsrc; + struct mbuf *mm; + struct igmpmsg *im; + int hlen = ip->ip_hl << 2; + struct timeval now; + register u_long delta; + + GET_TIME(now); + + TV_DELTA(rt->mfc_last_assert, now, delta); + + if (delta > ASSERT_MSG_TIME) { + mm = m_copy(m, 0, hlen); + if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + mm = m_pullup(mm, hlen); + if (mm == NULL) { + return ENOBUFS; + } + + rt->mfc_last_assert = now; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WRONGVIF; + im->im_mbz = 0; + im->im_vif = vifi; + + k_igmpsrc.sin_addr = im->im_src; + + socket_send(ip_mrouter, mm, &k_igmpsrc); + } + } + return 0; + } + + /* If I sourced this packet, it counts as output, else it was input. */ + if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { + viftable[vifi].v_pkt_out++; + viftable[vifi].v_bytes_out += plen; + } else { + viftable[vifi].v_pkt_in++; + viftable[vifi].v_bytes_in += plen; + } + rt->mfc_pkt_cnt++; + rt->mfc_byte_cnt += plen; + + /* + * For each vif, decide if a copy of the packet should be forwarded. + * Forward if: + * - the ttl exceeds the vif's threshold + * - there are group members downstream on interface + */ + for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) + if ((rt->mfc_ttls[vifi] > 0) && + (ip->ip_ttl > rt->mfc_ttls[vifi])) { + vifp->v_pkt_out++; + vifp->v_bytes_out += plen; + MC_SEND(ip, vifp, m); + } + + return 0; +} + +/* + * check if a vif number is legal/ok. This is used by ip_output, to export + * numvifs there, + */ +static int +X_legal_vif_num(vif) + int vif; +{ + if (vif >= 0 && vif < numvifs) + return(1); + else + return(0); +} + +#ifndef MROUTE_LKM +int (*legal_vif_num)(int) = X_legal_vif_num; +#endif + +/* + * Return the local address used by this vif + */ +static u_long +X_ip_mcast_src(vifi) + int vifi; +{ + if (vifi >= 0 && vifi < numvifs) + return viftable[vifi].v_lcl_addr.s_addr; + else + return INADDR_ANY; +} + +#ifndef MROUTE_LKM +u_long (*ip_mcast_src)(int) = X_ip_mcast_src; +#endif + +static void +phyint_send(ip, vifp, m) + struct ip *ip; + struct vif *vifp; + struct mbuf *m; +{ + register struct mbuf *mb_copy; + register int hlen = ip->ip_hl << 2; + + /* + * Make a new reference to the packet; make sure that + * the IP header is actually copied, not just referenced, + * so that ip_output() only scribbles on the copy. + */ + mb_copy = m_copy(m, 0, M_COPYALL); + if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_pullup(mb_copy, hlen); + if (mb_copy == NULL) + return; + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); +} + +static void +encap_send(ip, vifp, m) + register struct ip *ip; + register struct vif *vifp; + register struct mbuf *m; +{ + register struct mbuf *mb_copy; + register struct ip *ip_copy; + register int i, len = ip->ip_len; + + /* + * copy the old packet & pullup its IP header into the + * new mbuf so we can modify it. Try to fill the new + * mbuf since if we don't the ethernet driver will. + */ + MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); + if (mb_copy == NULL) + return; + mb_copy->m_data += max_linkhdr; + mb_copy->m_len = sizeof(multicast_encap_iphdr); + + if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { + m_freem(mb_copy); + return; + } + i = MHLEN - M_LEADINGSPACE(mb_copy); + if (i > len) + i = len; + mb_copy = m_pullup(mb_copy, i); + if (mb_copy == NULL) + return; + mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); + + /* + * fill in the encapsulating IP header. + */ + ip_copy = mtod(mb_copy, struct ip *); + *ip_copy = multicast_encap_iphdr; + ip_copy->ip_id = htons(ip_id++); + ip_copy->ip_len += len; + ip_copy->ip_src = vifp->v_lcl_addr; + ip_copy->ip_dst = vifp->v_rmt_addr; + + /* + * turn the encapsulated IP header back into a valid one. + */ + ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); + --ip->ip_ttl; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + mb_copy->m_data += sizeof(multicast_encap_iphdr); + ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); + mb_copy->m_data -= sizeof(multicast_encap_iphdr); + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); +} + +/* + * De-encapsulate a packet and feed it back through ip input (this + * routine is called whenever IP gets a packet with proto type + * ENCAP_PROTO and a local destination address). + */ +void +#ifdef MROUTE_LKM +X_ipip_input(m, iphlen) +#else +ipip_input(m, iphlen) +#endif + register struct mbuf *m; + int iphlen; +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + register struct ip *ip = mtod(m, struct ip *); + register int hlen = ip->ip_hl << 2; + register int s; + register struct ifqueue *ifq; + register struct vif *vifp; + + if (!have_encap_tunnel) { + rip_input(m, iphlen); + return; + } + /* + * dump the packet if it's not to a multicast destination or if + * we don't have an encapsulating tunnel with the source. + * Note: This code assumes that the remote site IP address + * uniquely identifies the tunnel (i.e., that this site has + * at most one tunnel with the remote site). + */ + if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { + ++mrtstat.mrts_bad_tunnel; + m_freem(m); + return; + } + if (ip->ip_src.s_addr != last_encap_src) { + register struct vif *vife; + + vifp = viftable; + vife = vifp + numvifs; + last_encap_src = ip->ip_src.s_addr; + last_encap_vif = 0; + for ( ; vifp < vife; ++vifp) + if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { + if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) + == VIFF_TUNNEL) + last_encap_vif = vifp; + break; + } + } + if ((vifp = last_encap_vif) == 0) { + last_encap_src = 0; + mrtstat.mrts_cant_tunnel++; /*XXX*/ + m_freem(m); + if (mrtdebug) + log(LOG_DEBUG, "ip_mforward: no tunnel with %lx\n", + (u_long)ntohl(ip->ip_src.s_addr)); + return; + } + ifp = vifp->v_ifp; + + if (hlen > IP_HDR_LEN) + ip_stripoptions(m, (struct mbuf *) 0); + m->m_data += IP_HDR_LEN; + m->m_len -= IP_HDR_LEN; + m->m_pkthdr.len -= IP_HDR_LEN; + m->m_pkthdr.rcvif = ifp; + + ifq = &ipintrq; + s = splimp(); + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + } else { + IF_ENQUEUE(ifq, m); + /* + * normally we would need a "schednetisr(NETISR_IP)" + * here but we were called by ip_input and it is going + * to loop back & try to dequeue the packet we just + * queued as soon as we return so we avoid the + * unnecessary software interrrupt. + */ + } + splx(s); +} + +/* + * Token bucket filter module + */ + +static void +tbf_control(vifp, m, ip, p_len) + register struct vif *vifp; + register struct mbuf *m; + register struct ip *ip; + register u_long p_len; +{ + register struct tbf *t = vifp->v_tbf; + + if (p_len > MAX_BKT_SIZE) { + /* drop if packet is too large */ + mrtstat.mrts_pkt2large++; + m_freem(m); + return; + } + + tbf_update_tokens(vifp); + + /* if there are enough tokens, + * and the queue is empty, + * send this packet out + */ + + if (t->tbf_q_len == 0) { + /* queue empty, send packet if enough tokens */ + if (p_len <= t->tbf_n_tok) { + t->tbf_n_tok -= p_len; + tbf_send_packet(vifp, m); + } else { + /* queue packet and timeout till later */ + tbf_queue(vifp, m); + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); + } + } else if (t->tbf_q_len < t->tbf_max_q_len) { + /* finite queue length, so queue pkts and process queue */ + tbf_queue(vifp, m); + tbf_process_q(vifp); + } else { + /* queue length too much, try to dq and queue and process */ + if (!tbf_dq_sel(vifp, ip)) { + mrtstat.mrts_q_overflow++; + m_freem(m); + return; + } else { + tbf_queue(vifp, m); + tbf_process_q(vifp); + } + } + return; +} + +/* + * adds a packet to the queue at the interface + */ +static void +tbf_queue(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + if (t->tbf_t == NULL) { + /* Queue was empty */ + t->tbf_q = m; + } else { + /* Insert at tail */ + t->tbf_t->m_act = m; + } + + /* Set new tail pointer */ + t->tbf_t = m; + +#ifdef DIAGNOSTIC + /* Make sure we didn't get fed a bogus mbuf */ + if (m->m_act) + panic("tbf_queue: m_act"); +#endif + m->m_act = NULL; + + t->tbf_q_len++; + + splx(s); +} + + +/* + * processes the queue at the interface + */ +static void +tbf_process_q(vifp) + register struct vif *vifp; +{ + register struct mbuf *m; + register int len; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + /* loop through the queue at the interface and send as many packets + * as possible + */ + while (t->tbf_q_len > 0) { + m = t->tbf_q; + + len = mtod(m, struct ip *)->ip_len; + + /* determine if the packet can be sent */ + if (len <= t->tbf_n_tok) { + /* if so, + * reduce no of tokens, dequeue the packet, + * send the packet. + */ + t->tbf_n_tok -= len; + + t->tbf_q = m->m_act; + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + + m->m_act = NULL; + tbf_send_packet(vifp, m); + + } else break; + } + splx(s); +} + +static void +tbf_reprocess_q(xvifp) + void *xvifp; +{ + register struct vif *vifp = xvifp; + if (ip_mrouter == NULL) + return; + + tbf_update_tokens(vifp); + + tbf_process_q(vifp); + + if (vifp->v_tbf->tbf_q_len) + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); +} + +/* function that will selectively discard a member of the queue + * based on the precedence value and the priority + */ +static int +tbf_dq_sel(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int s = splnet(); + register u_int p; + register struct mbuf *m, *last; + register struct mbuf **np; + register struct tbf *t = vifp->v_tbf; + + p = priority(vifp, ip); + + np = &t->tbf_q; + last = NULL; + while ((m = *np) != NULL) { + if (p > priority(vifp, mtod(m, struct ip *))) { + *np = m->m_act; + /* If we're removing the last packet, fix the tail pointer */ + if (m == t->tbf_t) + t->tbf_t = last; + m_freem(m); + /* it's impossible for the queue to be empty, but + * we check anyway. */ + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + splx(s); + mrtstat.mrts_drop_sel++; + return(1); + } + np = &m->m_act; + last = m; + } + splx(s); + return(0); +} + +static void +tbf_send_packet(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + struct ip_moptions imo; + int error; + static struct route ro; + int s = splnet(); + + if (vifp->v_flags & VIFF_TUNNEL) { + /* If tunnel options */ + ip_output(m, (struct mbuf *)0, &vifp->v_route, + IP_FORWARDING, (struct ip_moptions *)0); + } else { + imo.imo_multicast_ifp = vifp->v_ifp; + imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo.imo_multicast_loop = 1; + imo.imo_multicast_vif = -1; + + /* + * Re-entrancy should not be a problem here, because + * the packets that we send out and are looped back at us + * should get rejected because they appear to come from + * the loopback interface, thus preventing looping. + */ + error = ip_output(m, (struct mbuf *)0, &ro, + IP_FORWARDING, &imo); + + if (mrtdebug & DEBUG_XMIT) + log(LOG_DEBUG, "phyint_send on vif %d err %d\n", + vifp - viftable, error); + } + splx(s); +} + +/* determine the current time and then + * the elapsed time (between the last time and time now) + * in milliseconds & update the no. of tokens in the bucket + */ +static void +tbf_update_tokens(vifp) + register struct vif *vifp; +{ + struct timeval tp; + register u_long tm; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + GET_TIME(tp); + + TV_DELTA(tp, t->tbf_last_pkt_t, tm); + + /* + * This formula is actually + * "time in seconds" * "bytes/second". + * + * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) + * + * The (1000/1024) was introduced in add_vif to optimize + * this divide into a shift. + */ + t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; + t->tbf_last_pkt_t = tp; + + if (t->tbf_n_tok > MAX_BKT_SIZE) + t->tbf_n_tok = MAX_BKT_SIZE; + + splx(s); +} + +static int +priority(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int prio; + + /* temporary hack; may add general packet classifier some day */ + + /* + * The UDP port space is divided up into four priority ranges: + * [0, 16384) : unclassified - lowest priority + * [16384, 32768) : audio - highest priority + * [32768, 49152) : whiteboard - medium priority + * [49152, 65536) : video - low priority + */ + if (ip->ip_p == IPPROTO_UDP) { + struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); + switch (ntohs(udp->uh_dport) & 0xc000) { + case 0x4000: + prio = 70; + break; + case 0x8000: + prio = 60; + break; + case 0xc000: + prio = 55; + break; + default: + prio = 50; + break; + } + if (tbfdebug > 1) + log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); + } else { + prio = 50; + } + return prio; +} + +/* + * End of token bucket filter modifications + */ + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + /* Check mbuf. */ + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + if (rsvpdebug) + printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + /* Check if socket is available. */ + if (viftable[i].v_rsvpd != NULL) { + splx(s); + return EADDRINUSE; + } + + viftable[i].v_rsvpd = so; + /* This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 1; + rsvp_on++; + } + + splx(s); + return 0; +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + if (rsvpdebug) + printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", + viftable[i].v_rsvpd, so); + + viftable[i].v_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 0; + rsvp_on--; + } + + splx(s); + return 0; +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + int vifi; + register int s; + + /* Don't bother if it is not the right type of socket. */ + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return; + + s = splnet(); + + /* The socket may be attached to more than one vif...this + * is perfectly legal. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_rsvpd == so) { + viftable[vifi].v_rsvpd = NULL; + /* This may seem silly, but we need to be sure we don't + * over-decrement the RSVP counter, in case something slips up. + */ + if (viftable[vifi].v_rsvp_on) { + viftable[vifi].v_rsvp_on = 0; + rsvp_on--; + } + } + } + + splx(s); + return; +} + +void +rsvp_input(m, iphlen) + struct mbuf *m; + int iphlen; +{ + int vifi; + register struct ip *ip = mtod(m, struct ip *); + static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; + register int s; + struct ifnet *ifp; + + if (rsvpdebug) + printf("rsvp_input: rsvp_on %d\n",rsvp_on); + + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + /* If the old-style non-vif-associated socket is set, then use + * it and ignore the new ones. + */ + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, iphlen); + return; + } + + s = splnet(); + + if (rsvpdebug) + printf("rsvp_input: check vifs\n"); + +#ifdef DIAGNOSTIC + if (!(m->m_flags & M_PKTHDR)) + panic("rsvp_input no hdr"); +#endif + + ifp = m->m_pkthdr.rcvif; + /* Find which vif the packet arrived on. */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_ifp == ifp) + break; + } + + if (vifi == numvifs) { + /* Can't find vif packet arrived on. Drop packet. */ + if (rsvpdebug) + printf("rsvp_input: Can't find vif for packet...dropping it.\n"); + m_freem(m); + splx(s); + return; + } + + if (rsvpdebug) + printf("rsvp_input: check socket\n"); + + if (viftable[vifi].v_rsvpd == NULL) { + /* drop packet, since there is no specific socket for this + * interface */ + if (rsvpdebug) + printf("rsvp_input: No socket defined for vif %d\n",vifi); + m_freem(m); + splx(s); + return; + } + rsvp_src.sin_addr = ip->ip_src; + + if (rsvpdebug && m) + printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", + m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); + + if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { + if (rsvpdebug) + printf("rsvp_input: Failed to append to socket\n"); + } else { + if (rsvpdebug) + printf("rsvp_input: send packet up\n"); + } + + splx(s); +} + +#ifdef MROUTE_LKM +#include <sys/conf.h> +#include <sys/exec.h> +#include <sys/sysent.h> +#include <sys/lkm.h> + +MOD_MISC("ip_mroute_mod") + +static int +ip_mroute_mod_handle(struct lkm_table *lkmtp, int cmd) +{ + int i; + struct lkm_misc *args = lkmtp->private.lkm_misc; + int err = 0; + + switch(cmd) { + static int (*old_ip_mrouter_cmd)(); + static int (*old_ip_mrouter_done)(); + static int (*old_ip_mforward)(); + static int (*old_mrt_ioctl)(); + static void (*old_proto4_input)(); + static int (*old_legal_vif_num)(); + extern struct protosw inetsw[]; + + case LKM_E_LOAD: + if(lkmexists(lkmtp) || ip_mrtproto) + return(EEXIST); + old_ip_mrouter_cmd = ip_mrouter_cmd; + ip_mrouter_cmd = X_ip_mrouter_cmd; + old_ip_mrouter_done = ip_mrouter_done; + ip_mrouter_done = X_ip_mrouter_done; + old_ip_mforward = ip_mforward; + ip_mforward = X_ip_mforward; + old_mrt_ioctl = mrt_ioctl; + mrt_ioctl = X_mrt_ioctl; + old_proto4_input = inetsw[ip_protox[ENCAP_PROTO]].pr_input; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = X_ipip_input; + old_legal_vif_num = legal_vif_num; + legal_vif_num = X_legal_vif_num; + ip_mrtproto = IGMP_DVMRP; + + printf("\nIP multicast routing loaded\n"); + break; + + case LKM_E_UNLOAD: + if (ip_mrouter) + return EINVAL; + + ip_mrouter_cmd = old_ip_mrouter_cmd; + ip_mrouter_done = old_ip_mrouter_done; + ip_mforward = old_ip_mforward; + mrt_ioctl = old_mrt_ioctl; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = old_proto4_input; + legal_vif_num = old_legal_vif_num; + ip_mrtproto = 0; + break; + + default: + err = EINVAL; + break; + } + + return(err); +} + +int +ip_mroute_mod(struct lkm_table *lkmtp, int cmd, int ver) { + DISPATCH(lkmtp, cmd, ver, ip_mroute_mod_handle, ip_mroute_mod_handle, + nosys); +} + +#endif /* MROUTE_LKM */ +#endif /* MROUTING */ diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h new file mode 100644 index 0000000..abcceda --- /dev/null +++ b/sys/netinet/ip_mroute.h @@ -0,0 +1,267 @@ +/* + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_MROUTE_H_ +#define _NETINET_IP_MROUTE_H_ + +/* + * Definitions for IP multicast forwarding. + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Ajit Thyagarajan, PARC, August 1993. + * Modified by Ajit Thyagarajan, PARC, August 1994. + * + * MROUTING Revision: 3.3.1.3 + */ + + +/* + * Multicast Routing set/getsockopt commands. + */ +#define MRT_INIT 100 /* initialize forwarder */ +#define MRT_DONE 101 /* shut down forwarder */ +#define MRT_ADD_VIF 102 /* create virtual interface */ +#define MRT_DEL_VIF 103 /* delete virtual interface */ +#define MRT_ADD_MFC 104 /* insert forwarding cache entry */ +#define MRT_DEL_MFC 105 /* delete forwarding cache entry */ +#define MRT_VERSION 106 /* get kernel version number */ +#define MRT_ASSERT 107 /* enable PIM assert processing */ + + +#define GET_TIME(t) microtime(&t) + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ +#define ALL_VIFS (vifi_t)-1 + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + + +/* + * Argument structure for MRT_ADD_VIF. + * (MRT_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + u_int vifc_rate_limit; /* max rate */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */ +#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */ + +/* + * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC + * (mfcc_tos to be added at a future point) + */ +struct mfcctl { + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ +}; + +/* + * The kernel's multicast routing statistics. + */ +struct mrtstat { + u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ + u_long mrts_mfc_misses; /* # forw. cache hash table misses */ + u_long mrts_upcalls; /* # calls to mrouted */ + u_long mrts_no_route; /* no route for packet's origin */ + u_long mrts_bad_tunnel; /* malformed tunnel options */ + u_long mrts_cant_tunnel; /* no room for tunnel options */ + u_long mrts_wrong_if; /* arrived on wrong interface */ + u_long mrts_upq_ovflw; /* upcall Q overflow */ + u_long mrts_cache_cleanups; /* # entries with no upcalls */ + u_long mrts_drop_sel; /* pkts dropped selectively */ + u_long mrts_q_overflow; /* pkts dropped - Q overflow */ + u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ +}; + +/* + * Argument structure used by mrouted to get src-grp pkt counts + */ +struct sioc_sg_req { + struct in_addr src; + struct in_addr grp; + u_long pktcnt; + u_long bytecnt; + u_long wrong_if; +}; + +/* + * Argument structure used by mrouted to get vif pkt counts + */ +struct sioc_vif_req { + vifi_t vifi; /* vif number */ + u_long icount; /* Input packet count on vif */ + u_long ocount; /* Output packet count on vif */ + u_long ibytes; /* Input byte count on vif */ + u_long obytes; /* Output byte count on vif */ +}; + + +/* + * The kernel's virtual-interface structure. + */ +struct vif { + u_char v_flags; /* VIFF_ flags defined above */ + u_char v_threshold; /* min ttl required to forward on vif*/ + u_int v_rate_limit; /* max rate */ + struct tbf *v_tbf; /* token bucket structure at intf. */ + struct in_addr v_lcl_addr; /* local interface address */ + struct in_addr v_rmt_addr; /* remote address (tunnels only) */ + struct ifnet *v_ifp; /* pointer to interface */ + u_long v_pkt_in; /* # pkts in on interface */ + u_long v_pkt_out; /* # pkts out on interface */ + u_long v_bytes_in; /* # bytes in on interface */ + u_long v_bytes_out; /* # bytes out on interface */ + struct route v_route; /* cached route if this is a tunnel */ + u_int v_rsvp_on; /* RSVP listening on this vif */ + struct socket *v_rsvpd; /* RSVP daemon socket */ +}; + +/* + * The kernel's multicast forwarding cache entry structure + * (A field for the type of service (mfc_tos) is to be added + * at a future point) + */ +struct mfc { + struct in_addr mfc_origin; /* IP origin of mcasts */ + struct in_addr mfc_mcastgrp; /* multicast group associated*/ + vifi_t mfc_parent; /* incoming vif */ + u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + u_long mfc_pkt_cnt; /* pkt count for src-grp */ + u_long mfc_byte_cnt; /* byte count for src-grp */ + u_long mfc_wrong_if; /* wrong if for src-grp */ + int mfc_expire; /* time to clean entry up */ + struct timeval mfc_last_assert; /* last time I sent an assert*/ + struct rtdetq *mfc_stall; /* q of packets awaiting mfc */ + struct mfc *mfc_next; /* next mfc entry */ +}; + +/* + * Struct used to communicate from kernel to multicast router + * note the convenient similarity to an IP packet + */ +struct igmpmsg { + u_long unused1; + u_long unused2; + u_char im_msgtype; /* what type of message */ +#define IGMPMSG_NOCACHE 1 +#define IGMPMSG_WRONGVIF 2 + u_char im_mbz; /* must be zero */ + u_char im_vif; /* vif rec'd on */ + u_char unused3; + struct in_addr im_src, im_dst; +}; + +/* + * Argument structure used for pkt info. while upcall is made + */ +struct rtdetq { + struct mbuf *m; /* A copy of the packet */ + struct ifnet *ifp; /* Interface pkt came in on */ + vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */ +#ifdef UPCALL_TIMING + struct timeval t; /* Timestamp */ +#endif /* UPCALL_TIMING */ + struct rtdetq *next; /* Next in list of packets */ +}; + +#define MFCTBLSIZ 256 +#if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */ +#define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1)) +#else +#define MFCHASHMOD(h) ((h) % MFCTBLSIZ) +#endif + +#define MAX_UPQ 4 /* max. no of pkts in upcall Q */ + +/* + * Token Bucket filter code + */ +#define MAX_BKT_SIZE 10000 /* 10K bytes size */ +#define MAXQSIZE 10 /* max # of pkts in queue */ + +/* + * the token bucket filter at each vif + */ +struct tbf +{ + struct timeval tbf_last_pkt_t; /* arr. time of last pkt */ + u_long tbf_n_tok; /* no of tokens in bucket */ + u_long tbf_q_len; /* length of queue at this vif */ + u_long tbf_max_q_len; /* max. queue length */ + struct mbuf *tbf_q; /* Packet queue */ + struct mbuf *tbf_t; /* tail-insertion pointer */ +}; + +#ifdef KERNEL + +struct sockopt; + +extern int (*ip_mrouter_set) __P((struct socket *, struct sockopt *)); +extern int (*ip_mrouter_get) __P((struct socket *, struct sockopt *)); +extern int (*ip_mrouter_done) __P((void)); +#ifdef MROUTING +extern int (*mrt_ioctl) __P((int, caddr_t)); +#else +extern int (*mrt_ioctl) __P((int, caddr_t, struct proc *)); +#endif + +#endif /* KERNEL */ + +#endif /* _NETINET_IP_MROUTE_H_ */ diff --git a/sys/netinet/ip_nat.c b/sys/netinet/ip_nat.c new file mode 100644 index 0000000..e85bc57 --- /dev/null +++ b/sys/netinet/ip_nat.c @@ -0,0 +1,1741 @@ +/* + * Copyright (C) 1995-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * Added redirect stuff and a LOT of bug fixes. (mcn@EnGarde.com) + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_nat.c 1.11 6/5/96 (C) 1995 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: ip_nat.c,v 2.2.2.5 1999/10/05 12:58:33 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(__FreeBSD__) && defined(KERNEL) && !defined(_KERNEL) +#define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#if defined(KERNEL) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/fcntl.h> +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if __FreeBSD_version >= 300000 +# include <sys/queue.h> +#endif +#include <net/if.h> +#if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#endif +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> + +#ifdef __sgi +# ifdef IFF_DRVRLOCK /* IRIX6 */ +#include <sys/hashing.h> +#include <netinet/in_var.h> +# endif +#endif + +#ifdef RFC1825 +# include <vpn/md5.h> +# include <vpn/ipsec.h> +extern struct ifnet vpnif; +#endif + +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#undef SOCKADDR_IN +#define SOCKADDR_IN struct sockaddr_in + +nat_t **nat_table[2] = { NULL, NULL }, + *nat_instances = NULL; +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +ipnat_t *nat_list = NULL; +#else +static ipnat_t *nat_list = NULL; +#endif +u_int ipf_nattable_sz = NAT_TABLE_SZ; +u_int ipf_natrules_sz = NAT_SIZE; +u_int ipf_rdrrules_sz = RDR_SIZE; +u_32_t nat_masks = 0; +u_32_t rdr_masks = 0; +ipnat_t **nat_rules = NULL; +ipnat_t **rdr_rules = NULL; + +u_long fr_defnatage = DEF_NAT_AGE, + fr_defnaticmpage = 6; /* 3 seconds */ +static natstat_t nat_stats; +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern kmutex_t ipf_rw; +extern KRWLOCK_T ipf_nat; +#endif + +static int nat_flushtable __P((void)); +static int nat_clearlist __P((void)); +static void nat_delete __P((struct nat *)); +static void nat_delrdr __P((struct ipnat *)); +static void nat_delnat __P((struct ipnat *)); + + +int nat_init() +{ + KMALLOCS(nat_table[0], nat_t **, sizeof(nat_t *) * ipf_nattable_sz); + if (nat_table[0] != NULL) + bzero((char *)nat_table[0], ipf_nattable_sz * sizeof(nat_t *)); + else + return -1; + + KMALLOCS(nat_table[1], nat_t **, sizeof(nat_t *) * ipf_nattable_sz); + if (nat_table[1] != NULL) + bzero((char *)nat_table[1], ipf_nattable_sz * sizeof(nat_t *)); + else + return -1; + + KMALLOCS(nat_rules, ipnat_t **, sizeof(ipnat_t *) * ipf_natrules_sz); + if (nat_rules != NULL) + bzero((char *)nat_rules, ipf_natrules_sz * sizeof(ipnat_t *)); + else + return -1; + + KMALLOCS(rdr_rules, ipnat_t **, sizeof(ipnat_t *) * ipf_rdrrules_sz); + if (rdr_rules != NULL) + bzero((char *)rdr_rules, ipf_rdrrules_sz * sizeof(ipnat_t *)); + else + return -1; + return 0; +} + + +void nat_delrdr(n) +ipnat_t *n; +{ + ipnat_t **n1; + u_32_t iph; + u_int hv; + + iph = n->in_outip & n->in_outmsk; + hv = NAT_HASH_FN(iph, ipf_rdrrules_sz); + for (n1 = &rdr_rules[hv]; *n1 && (*n1 != n); n1 = &(*n1)->in_rnext) + ; + if (*n1) + *n1 = n->in_rnext; +} + + +static void nat_delnat(n) +ipnat_t *n; +{ + ipnat_t **n1; + u_32_t iph; + u_int hv; + + iph = n->in_inip & n->in_inmsk; + hv = NAT_HASH_FN(iph, ipf_natrules_sz); + for (n1 = &nat_rules[hv]; *n1 && (*n1 != n); n1 = &(*n1)->in_mnext) + ; + if (*n1) + *n1 = n->in_mnext; +} + + +void fix_outcksum(sp, n) +u_short *sp; +u_32_t n; +{ + register u_short sumshort; + register u_32_t sum1; + + if (!n) + return; + sum1 = (~ntohs(*sp)) & 0xffff; + sum1 += (n); + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + /* Again */ + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + sumshort = ~(u_short)sum1; + *(sp) = htons(sumshort); +} + + +void fix_incksum(sp, n) +u_short *sp; +u_32_t n; +{ + register u_short sumshort; + register u_32_t sum1; + + if (!n) + return; +#ifdef sparc + sum1 = (~(*sp)) & 0xffff; +#else + sum1 = (~ntohs(*sp)) & 0xffff; +#endif + sum1 += ~(n) & 0xffff; + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + /* Again */ + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + sumshort = ~(u_short)sum1; + *(sp) = htons(sumshort); +} + + +/* + * How the NAT is organised and works. + * + * Inside (interface y) NAT Outside (interface x) + * -------------------- -+- ------------------------------------- + * Packet going | out, processsed by ip_natout() for x + * ------------> | ------------> + * src=10.1.1.1 | src=192.1.1.1 + * | + * | in, processed by ip_natin() for x + * <------------ | <------------ + * dst=10.1.1.1 | dst=192.1.1.1 + * -------------------- -+- ------------------------------------- + * ip_natout() - changes ip_src and if required, sport + * - creates a new mapping, if required. + * ip_natin() - changes ip_dst and if required, dport + * + * In the NAT table, internal source is recorded as "in" and externally + * seen as "out". + */ + +/* + * Handle ioctls which manipulate the NAT. + */ +int nat_ioctl(data, cmd, mode) +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +u_long cmd; +#else +int cmd; +#endif +caddr_t data; +int mode; +{ + register ipnat_t *nat, *nt, *n = NULL, **np = NULL; + int error = 0, ret, k; + ipnat_t natd; + u_32_t i, j; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + +#if (BSD >= 199306) && defined(_KERNEL) + if ((securelevel >= 2) && (mode & FWRITE)) + return EPERM; +#endif + + nat = NULL; /* XXX gcc -Wuninitialized */ + KMALLOC(nt, ipnat_t *); + if ((cmd == SIOCADNAT) || (cmd == SIOCRMNAT)) + IRCOPY(data, (char *)&natd, sizeof(natd)); + + /* + * For add/delete, look to see if the NAT entry is already present + */ + SPL_NET(s); + WRITE_ENTER(&ipf_nat); + if ((cmd == SIOCADNAT) || (cmd == SIOCRMNAT)) { + nat = &natd; + nat->in_flags &= IPN_USERFLAGS; + if ((nat->in_redir & NAT_MAPBLK) == 0) { + nat->in_inip &= nat->in_inmsk; + if ((nat->in_flags & IPN_RANGE) == 0) + nat->in_outip &= nat->in_outmsk; + } + for (np = &nat_list; (n = *np); np = &n->in_next) + if (!bcmp((char *)&nat->in_flags, (char *)&n->in_flags, + IPN_CMPSIZ)) + break; + } + + switch (cmd) + { + case SIOCADNAT : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + if (n) { + error = EEXIST; + break; + } + if (nt == NULL) { + error = ENOMEM; + break; + } + n = nt; + nt = NULL; + bcopy((char *)nat, (char *)n, sizeof(*n)); + n->in_ifp = (void *)GETUNIT(n->in_ifname); + if (!n->in_ifp) + n->in_ifp = (void *)-1; + if (n->in_plabel[0] != '\0') { + n->in_apr = appr_match(n->in_p, n->in_plabel); + if (!n->in_apr) { + error = ENOENT; + break; + } + } + n->in_next = NULL; + *np = n; + + if (n->in_redir & NAT_REDIRECT) { + u_int hv; + + k = countbits(n->in_outmsk); + if ((k >= 0) && (k != 32)) + rdr_masks |= 1 << k; + j = (n->in_outip & n->in_outmsk); + hv = NAT_HASH_FN(j, ipf_rdrrules_sz); + np = rdr_rules + hv; + while (*np != NULL) + np = &(*np)->in_rnext; + n->in_rnext = NULL; + *np = n; + } + if (n->in_redir & (NAT_MAP|NAT_MAPBLK)) { + u_int hv; + + k = countbits(n->in_inmsk); + if ((k >= 0) && (k != 32)) + nat_masks |= 1 << k; + j = (n->in_inip & n->in_inmsk); + hv = NAT_HASH_FN(j, ipf_natrules_sz); + np = nat_rules + hv; + while (*np != NULL) + np = &(*np)->in_mnext; + n->in_mnext = NULL; + *np = n; + } + + n->in_use = 0; + if (n->in_redir & NAT_MAPBLK) + n->in_space = USABLE_PORTS * ~ntohl(n->in_outmsk); + else if (n->in_flags & IPN_AUTOPORTMAP) + n->in_space = USABLE_PORTS * ~ntohl(n->in_inmsk); + else if (n->in_flags & IPN_RANGE) + n->in_space = ntohl(n->in_outmsk) - ntohl(n->in_outip); + else + n->in_space = ~ntohl(n->in_outmsk); + /* + * Calculate the number of valid IP addresses in the output + * mapping range. In all cases, the range is inclusive of + * the start and ending IP addresses. + * If to a CIDR address, lose 2: broadcast + network address + * (so subtract 1) + * If to a range, add one. + * If to a single IP address, set to 1. + */ + if (n->in_space) { + if ((n->in_flags & IPN_RANGE) != 0) + n->in_space += 1; + else + n->in_space -= 1; + } else + n->in_space = 1; + if ((n->in_outmsk != 0xffffffff) && (n->in_outmsk != 0) && + ((n->in_flags & IPN_RANGE) == 0)) + n->in_nip = ntohl(n->in_outip) + 1; + else + n->in_nip = ntohl(n->in_outip); + if (n->in_redir & NAT_MAP) { + n->in_pnext = ntohs(n->in_pmin); + /* + * Multiply by the number of ports made available. + */ + if (ntohs(n->in_pmax) >= ntohs(n->in_pmin)) { + n->in_space *= (ntohs(n->in_pmax) - + ntohs(n->in_pmin) + 1); + /* + * Because two different sources can map to + * different destinations but use the same + * local IP#/port #. + * If the result is smaller than in_space, then + * we may have wrapped around 32bits. + */ + i = n->in_inmsk; + if ((i != 0) && (i != 0xffffffff)) { + j = n->in_space * (~ntohl(i) + 1); + if (j >= n->in_space) + n->in_space = j; + else + n->in_space = 0xffffffff; + } + } + /* + * If no protocol is specified, multiple by 256. + */ + if ((n->in_flags & IPN_TCPUDP) == 0) { + j = n->in_space * 256; + if (j >= n->in_space) + n->in_space = j; + else + n->in_space = 0xffffffff; + } + } + /* Otherwise, these fields are preset */ + n = NULL; + nat_stats.ns_rules++; + break; + case SIOCRMNAT : + if (!(mode & FWRITE)) { + error = EPERM; + n = NULL; + break; + } + if (!n) { + error = ESRCH; + break; + } + if (n->in_redir & NAT_REDIRECT) + nat_delrdr(n); + if (n->in_redir & (NAT_MAPBLK|NAT_MAP)) + nat_delnat(n); + if (nat_list == NULL) { + nat_masks = 0; + rdr_masks = 0; + } + *np = n->in_next; + if (!n->in_use) { + if (n->in_apr) + appr_free(n->in_apr); + KFREE(n); + nat_stats.ns_rules--; + } else { + n->in_flags |= IPN_DELETE; + n->in_next = NULL; + } + n = NULL; + break; + case SIOCGNATS : + MUTEX_DOWNGRADE(&ipf_nat); + nat_stats.ns_table[0] = nat_table[0]; + nat_stats.ns_table[1] = nat_table[1]; + nat_stats.ns_list = nat_list; + nat_stats.ns_nattab_sz = ipf_nattable_sz; + nat_stats.ns_rultab_sz = ipf_natrules_sz; + nat_stats.ns_rdrtab_sz = ipf_rdrrules_sz; + nat_stats.ns_instances = nat_instances; + nat_stats.ns_apslist = ap_sess_list; + IWCOPY((char *)&nat_stats, (char *)data, sizeof(nat_stats)); + break; + case SIOCGNATL : + { + natlookup_t nl; + + MUTEX_DOWNGRADE(&ipf_nat); + IRCOPY((char *)data, (char *)&nl, sizeof(nl)); + + if (nat_lookupredir(&nl)) { + IWCOPY((char *)&nl, (char *)data, sizeof(nl)); + } else + error = ESRCH; + break; + } + case SIOCFLNAT : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + ret = nat_flushtable(); + MUTEX_DOWNGRADE(&ipf_nat); + IWCOPY((caddr_t)&ret, data, sizeof(ret)); + break; + case SIOCCNATL : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + ret = nat_clearlist(); + MUTEX_DOWNGRADE(&ipf_nat); + IWCOPY((caddr_t)&ret, data, sizeof(ret)); + break; + case FIONREAD : +#ifdef IPFILTER_LOG + MUTEX_DOWNGRADE(&ipf_nat); + IWCOPY((caddr_t)&iplused[IPL_LOGNAT], (caddr_t)data, + sizeof(iplused[IPL_LOGNAT])); +#endif + break; + default : + error = EINVAL; + break; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ + SPL_X(s); + if (nt) + KFREE(nt); + return error; +} + + +/* + * Delete a nat entry from the various lists and table. + */ +static void nat_delete(natd) +struct nat *natd; +{ + register struct nat **natp, *nat; + struct ipnat *ipn; + + for (natp = natd->nat_hstart[0]; (nat = *natp); + natp = &nat->nat_hnext[0]) + if (nat == natd) { + *natp = nat->nat_hnext[0]; + break; + } + + for (natp = natd->nat_hstart[1]; (nat = *natp); + natp = &nat->nat_hnext[1]) + if (nat == natd) { + *natp = nat->nat_hnext[1]; + break; + } + + if (natd->nat_fr != NULL) { + ATOMIC_DEC(natd->nat_fr->fr_ref); + } + /* + * If there is an active reference from the nat entry to its parent + * rule, decrement the rule's reference count and free it too if no + * longer being used. + */ + ipn = natd->nat_ptr; + if (ipn != NULL) { + ipn->in_space++; + ipn->in_use--; + if (!ipn->in_use && (ipn->in_flags & IPN_DELETE)) { + if (ipn->in_apr) + appr_free(ipn->in_apr); + KFREE(ipn); + nat_stats.ns_rules--; + } + } + + /* + * If there's a fragment table entry too for this nat entry, then + * dereference that as well. + */ + ipfr_forget((void *)natd); + aps_free(natd->nat_aps); + nat_stats.ns_inuse--; + KFREE(natd); +} + + +/* + * nat_flushtable - clear the NAT table of all mapping entries. + */ +static int nat_flushtable() +{ + register nat_t *nat, **natp; + register int j = 0; + + /* + * ALL NAT mappings deleted, so lets just make the deletions + * quicker. + */ + if (nat_table[0] != NULL) + bzero((char *)nat_table[0], + sizeof(nat_table[0]) * ipf_nattable_sz); + if (nat_table[1] != NULL) + bzero((char *)nat_table[1], + sizeof(nat_table[1]) * ipf_nattable_sz); + + for (natp = &nat_instances; (nat = *natp); ) { + *natp = nat->nat_next; + nat_delete(nat); + j++; + } + nat_stats.ns_inuse = 0; + return j; +} + + +/* + * nat_clearlist - delete all rules in the active NAT mapping list. + */ +static int nat_clearlist() +{ + register ipnat_t *n, **np = &nat_list; + int i = 0; + + if (nat_rules != NULL) + bzero((char *)nat_rules, sizeof(*nat_rules) * ipf_natrules_sz); + if (rdr_rules != NULL) + bzero((char *)rdr_rules, sizeof(*rdr_rules) * ipf_rdrrules_sz); + + while ((n = *np)) { + *np = n->in_next; + if (!n->in_use) { + if (n->in_apr) + appr_free(n->in_apr); + KFREE(n); + nat_stats.ns_rules--; + } else { + n->in_flags |= IPN_DELETE; + n->in_next = NULL; + } + i++; + } + nat_masks = 0; + rdr_masks = 0; + return i; +} + + +/* + * Create a new NAT table entry. + * NOTE: assumes write lock on ipf_nat has been obtained already. + */ +nat_t *nat_new(np, ip, fin, flags, direction) +ipnat_t *np; +ip_t *ip; +fr_info_t *fin; +u_int flags; +int direction; +{ + register u_32_t sum1, sum2, sumd, l; + u_short port = 0, sport = 0, dport = 0, nport = 0; + nat_t *nat, **natp, *natl = NULL; + struct in_addr in, inb; + tcphdr_t *tcp = NULL; + u_short nflags; + u_int hv; + + nflags = flags & np->in_flags; + if (flags & IPN_TCPUDP) { + tcp = (tcphdr_t *)fin->fin_dp; + sport = tcp->th_sport; + dport = tcp->th_dport; + } + + /* Give me a new nat */ + KMALLOC(nat, nat_t *); + if (nat == NULL) + return NULL; + + bzero((char *)nat, sizeof(*nat)); + nat->nat_flags = flags; + /* + * Search the current table for a match. + */ + if (direction == NAT_OUTBOUND) { + /* + * Values at which the search for a free resouce starts. + */ + u_32_t st_ip; + u_short st_port; + + /* + * If it's an outbound packet which doesn't match any existing + * record, then create a new port + */ + l = 0; + st_ip = np->in_nip; + st_port = np->in_pnext; + + do { + port = 0; + in.s_addr = np->in_nip; + if (l == 0) { + natl = nat_maplookup(fin->fin_ifp, flags, + ip->ip_src, ip->ip_dst); + if (natl != NULL) { + in = natl->nat_outip; +#ifndef sparc + in.s_addr = ntohl(in.s_addr); +#endif + } + } + + if ((np->in_outmsk == 0xffffffff) && + (np->in_pnext == 0)) { + if (l > 0) { + KFREE(nat); + return NULL; + } + } + + if (np->in_redir & NAT_MAPBLK) { + if ((l >= np->in_ppip) || ((l > 0) && + !(flags & IPN_TCPUDP))) { + KFREE(nat); + return NULL; + } + /* + * map-block - Calculate destination address. + */ + in.s_addr = ntohl(ip->ip_src.s_addr); + in.s_addr &= ntohl(~np->in_inmsk); + inb.s_addr = in.s_addr; + in.s_addr /= np->in_ippip; + in.s_addr &= ntohl(~np->in_outmsk); + in.s_addr += ntohl(np->in_outip); + /* + * Calculate destination port. + */ + if ((flags & IPN_TCPUDP) && + (np->in_ppip != 0)) { + port = ntohs(sport) + l; + port %= np->in_ppip; + port += np->in_ppip * + (inb.s_addr % np->in_ippip); + port += MAPBLK_MINPORT; + port = htons(port); + } + } else if (!in.s_addr && + (np->in_outmsk == 0xffffffff)) { + /* + * 0/32 - use the interface's IP address. + */ + if ((l > 0) || + fr_ifpaddr(fin->fin_ifp, &in) == -1) { + KFREE(nat); + return NULL; + } + } else if (!in.s_addr && !np->in_outmsk) { + /* + * 0/0 - use the original source address/port. + */ + if (l > 0) { + KFREE(nat); + return NULL; + } + in.s_addr = ntohl(ip->ip_src.s_addr); + } else if ((np->in_outmsk != 0xffffffff) && + (np->in_pnext == 0) && + ((l > 0) || (natl == NULL))) + np->in_nip++; + natl = NULL; + + if ((nflags & IPN_TCPUDP) && + ((np->in_redir & NAT_MAPBLK) == 0) && + (np->in_flags & IPN_AUTOPORTMAP)) { + if ((l > 0) && (l % np->in_ppip == 0)) { + if (l > np->in_space) { + KFREE(nat); + return NULL; + } else if ((l > np->in_ppip) && + np->in_outmsk != 0xffffffff) + np->in_nip++; + } + if (np->in_ppip != 0) { + port = ntohs(sport); + port += (l % np->in_ppip); + port %= np->in_ppip; + port += np->in_ppip * + (ntohl(ip->ip_src.s_addr) % + np->in_ippip); + port += MAPBLK_MINPORT; + port = htons(port); + } + } else if (((np->in_redir & NAT_MAPBLK) == 0) && + (nflags & IPN_TCPUDP) && + (np->in_pnext != 0)) { + port = htons(np->in_pnext++); + if (np->in_pnext > ntohs(np->in_pmax)) { + np->in_pnext = ntohs(np->in_pmin); + if (np->in_outmsk != 0xffffffff) + np->in_nip++; + } + } + + if (np->in_flags & IPN_RANGE) { + if (np->in_nip >= ntohl(np->in_outmsk)) + np->in_nip = ntohl(np->in_outip); + } else { + if ((np->in_outmsk != 0xffffffff) && + ((np->in_nip + 1) & ntohl(np->in_outmsk)) > + ntohl(np->in_outip)) + np->in_nip = ntohl(np->in_outip) + 1; + } + + if (!port && (flags & IPN_TCPUDP)) + port = sport; + + /* + * Here we do a lookup of the connection as seen from + * the outside. If an IP# pair already exists, try + * again. So if you have A->B becomes C->B, you can + * also have D->E become C->E but not D->B causing + * another C->B. Also take protocol and ports into + * account when determining whether a pre-existing + * NAT setup will cause an external conflict where + * this is appropriate. + */ + inb.s_addr = htonl(in.s_addr); + natl = nat_inlookup(fin->fin_ifp, flags, + (u_int)ip->ip_p, ip->ip_dst, inb, + (port << 16) | dport); + + /* + * Has the search wrapped around and come back to the + * start ? + */ + if ((natl != NULL) && + (np->in_pnext != 0) && (st_port == np->in_pnext) && + (np->in_nip != 0) && (st_ip == np->in_nip)) { + KFREE(nat); + return NULL; + } + l++; + } while (natl != NULL); + + if (np->in_space > 0) + np->in_space--; + + /* Setup the NAT table */ + nat->nat_inip = ip->ip_src; + nat->nat_outip.s_addr = htonl(in.s_addr); + nat->nat_oip = ip->ip_dst; + + sum1 = LONG_SUM(ntohl(ip->ip_src.s_addr)) + ntohs(sport); + sum2 = LONG_SUM(in.s_addr) + ntohs(port); + + if (flags & IPN_TCPUDP) { + nat->nat_inport = sport; + nat->nat_outport = port; /* sport */ + nat->nat_oport = dport; + } + } else { + /* + * Otherwise, it's an inbound packet. Most likely, we don't + * want to rewrite source ports and source addresses. Instead, + * we want to rewrite to a fixed internal address and fixed + * internal port. + */ + in.s_addr = ntohl(np->in_inip); + if (!(nport = np->in_pnext)) + nport = dport; + + /* + * When the redirect-to address is set to 0.0.0.0, just + * assume a blank `forwarding' of the packet. We don't + * setup any translation for this either. + */ + if ((in.s_addr == 0) && (nport == dport)) { + KFREE(nat); + return NULL; + } + + nat->nat_inip.s_addr = htonl(in.s_addr); + nat->nat_outip = ip->ip_dst; + nat->nat_oip = ip->ip_src; + + sum1 = LONG_SUM(ntohl(ip->ip_dst.s_addr)) + ntohs(dport); + sum2 = LONG_SUM(in.s_addr) + ntohs(nport); + + if (flags & IPN_TCPUDP) { + nat->nat_inport = nport; + nat->nat_outport = dport; + nat->nat_oport = sport; + } + } + + CALC_SUMD(sum1, sum2, sumd); + nat->nat_sumd = (sumd & 0xffff) + (sumd >> 16); + + if ((flags & IPN_TCPUDP) && ((sport != port) || (dport != nport))) { + if (direction == NAT_OUTBOUND) + sum1 = LONG_SUM(ntohl(ip->ip_src.s_addr)); + else + sum1 = LONG_SUM(ntohl(ip->ip_dst.s_addr)); + + sum2 = LONG_SUM(in.s_addr); + + CALC_SUMD(sum1, sum2, sumd); + nat->nat_ipsumd = (sumd & 0xffff) + (sumd >> 16); + } else + nat->nat_ipsumd = nat->nat_sumd; + + in.s_addr = htonl(in.s_addr); + nat->nat_next = nat_instances; + nat_instances = nat; + hv = NAT_HASH_FN(nat->nat_inip.s_addr, ipf_nattable_sz); + natp = &nat_table[0][hv]; + nat->nat_hstart[0] = natp; + nat->nat_hnext[0] = *natp; + *natp = nat; + hv = NAT_HASH_FN(nat->nat_outip.s_addr, ipf_nattable_sz); + natp = &nat_table[1][hv]; + nat->nat_hstart[1] = natp; + nat->nat_hnext[1] = *natp; + *natp = nat; + nat->nat_dir = direction; + nat->nat_ifp = fin->fin_ifp; + nat->nat_ptr = np; + nat->nat_p = ip->ip_p; + nat->nat_bytes = 0; + nat->nat_pkts = 0; + nat->nat_age = fr_defnatage; + nat->nat_fr = fin->fin_fr; + if (nat->nat_fr != NULL) { + ATOMIC_INC(nat->nat_fr->fr_ref); + } + if (direction == NAT_OUTBOUND) { + if (flags & IPN_TCPUDP) + tcp->th_sport = port; + } else { + if (flags & IPN_TCPUDP) + tcp->th_dport = nport; + } + nat_stats.ns_added++; + nat_stats.ns_inuse++; + np->in_use++; + return nat; +} + + +nat_t *nat_icmpinlookup(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + icmphdr_t *icmp; + tcphdr_t *tcp = NULL; + ip_t *oip; + int flags = 0, type; + + icmp = (icmphdr_t *)fin->fin_dp; + /* + * Does it at least have the return (basic) IP header ? + * Only a basic IP header (no options) should be with an ICMP error + * header. + */ + if ((ip->ip_hl != 5) || (ip->ip_len < ICMPERR_MINPKTLEN)) + return NULL; + type = icmp->icmp_type; + /* + * If it's not an error type, then return. + */ + if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && + (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && + (type != ICMP_PARAMPROB)) + return NULL; + + oip = (ip_t *)((char *)fin->fin_dp + 8); + if (ip->ip_len < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2)) + return NULL; + if (oip->ip_p == IPPROTO_TCP) + flags = IPN_TCP; + else if (oip->ip_p == IPPROTO_UDP) + flags = IPN_UDP; + if (flags & IPN_TCPUDP) { + tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); + return nat_inlookup(fin->fin_ifp, flags, (u_int)oip->ip_p, + oip->ip_dst, oip->ip_src, + (tcp->th_sport << 16) | tcp->th_dport); + } + return nat_inlookup(fin->fin_ifp, 0, (u_int)oip->ip_p, oip->ip_dst, + oip->ip_src, 0); +} + + +/* + * This should *ONLY* be used for incoming packets to make sure a NAT'd ICMP + * packet gets correctly recognised. + */ +nat_t *nat_icmpin(ip, fin, nflags) +ip_t *ip; +fr_info_t *fin; +u_int *nflags; +{ + u_32_t sum1, sum2, sumd; + struct in_addr in; + icmphdr_t *icmp; + nat_t *nat; + ip_t *oip; + int flags = 0; + + if (!(nat = nat_icmpinlookup(ip, fin))) + return NULL; + *nflags = IPN_ICMPERR; + icmp = (icmphdr_t *)fin->fin_dp; + oip = (ip_t *)&icmp->icmp_ip; + if (oip->ip_p == IPPROTO_TCP) + flags = IPN_TCP; + else if (oip->ip_p == IPPROTO_UDP) + flags = IPN_UDP; + /* + * Need to adjust ICMP header to include the real IP#'s and + * port #'s. Only apply a checksum change relative to the + * IP address change is it will be modified again in ip_natout + * for both address and port. Two checksum changes are + * necessary for the two header address changes. Be careful + * to only modify the checksum once for the port # and twice + * for the IP#. + */ + if (nat->nat_dir == NAT_OUTBOUND) { + sum1 = LONG_SUM(ntohl(oip->ip_src.s_addr)); + in = nat->nat_inip; + oip->ip_src = in; + } else { + sum1 = LONG_SUM(ntohl(oip->ip_dst.s_addr)); + in = nat->nat_outip; + oip->ip_dst = in; + } + + sum2 = LONG_SUM(ntohl(in.s_addr)); + + CALC_SUMD(sum1, sum2, sumd); + + if (nat->nat_dir == NAT_OUTBOUND) { + fix_incksum(&oip->ip_sum, sumd); + + sumd += (sumd & 0xffff); + while (sumd > 0xffff) + sumd = (sumd & 0xffff) + (sumd >> 16); + fix_outcksum(&icmp->icmp_cksum, sumd); + } else { + fix_outcksum(&oip->ip_sum, sumd); + + sumd += (sumd & 0xffff); + while (sumd > 0xffff) + sumd = (sumd & 0xffff) + (sumd >> 16); + fix_incksum(&icmp->icmp_cksum, sumd); + } + + + if ((flags & IPN_TCPUDP) != 0) { + tcphdr_t *tcp; + + /* XXX - what if this is bogus hl and we go off the end ? */ + tcp = (tcphdr_t *)((((char *)oip) + (oip->ip_hl << 2))); + + if (nat->nat_dir == NAT_OUTBOUND) { + if (tcp->th_sport != nat->nat_inport) { + sum1 = ntohs(tcp->th_sport); + sum2 = ntohs(nat->nat_inport); + CALC_SUMD(sum1, sum2, sumd); + tcp->th_sport = nat->nat_inport; + fix_outcksum(&icmp->icmp_cksum, sumd); + } + } else { + if (tcp->th_dport != nat->nat_outport) { + sum1 = ntohs(tcp->th_dport); + sum2 = ntohs(nat->nat_outport); + CALC_SUMD(sum1, sum2, sumd); + tcp->th_dport = nat->nat_outport; + fix_incksum(&icmp->icmp_cksum, sumd); + } + } + } + nat->nat_age = fr_defnaticmpage; + return nat; +} + + +/* + * NB: these lookups don't lock access to the list, it assume it has already + * been done! + */ +/* + * Lookup a nat entry based on the mapped destination ip address/port and + * real source address/port. We use this lookup when receiving a packet, + * we're looking for a table entry, based on the destination address. + * NOTE: THE PACKET BEING CHECKED (IF FOUND) HAS A MAPPING ALREADY. + */ +nat_t *nat_inlookup(ifp, flags, p, src, mapdst, ports) +void *ifp; +register u_int flags, p; +struct in_addr src , mapdst; +u_32_t ports; +{ + register u_short sport, mapdport; + register nat_t *nat; + register int nflags; + u_int hv; + + mapdport = ports >> 16; + sport = ports & 0xffff; + flags &= IPN_TCPUDP; + + hv = NAT_HASH_FN(mapdst.s_addr, ipf_nattable_sz); + nat = nat_table[1][hv]; + for (; nat; nat = nat->nat_hnext[1]) { + nflags = nat->nat_flags; + if ((!ifp || ifp == nat->nat_ifp) && + nat->nat_oip.s_addr == src.s_addr && + nat->nat_outip.s_addr == mapdst.s_addr && + (((p == 0) && (flags == (nat->nat_flags & IPN_TCPUDP))) + || (p == nat->nat_p)) && (!flags || + (((nat->nat_oport == sport) || (nflags & FI_W_DPORT)) && + ((nat->nat_outport == mapdport) || + (nflags & FI_W_SPORT))))) + return nat; + } + return NULL; +} + + +/* + * Lookup a nat entry based on the source 'real' ip address/port and + * destination address/port. We use this lookup when sending a packet out, + * we're looking for a table entry, based on the source address. + * NOTE: THE PACKET BEING CHECKED (IF FOUND) HAS A MAPPING ALREADY. + */ +nat_t *nat_outlookup(ifp, flags, p, src, dst, ports) +void *ifp; +register u_int flags, p; +struct in_addr src , dst; +u_32_t ports; +{ + register u_short sport, dport; + register nat_t *nat; + register int nflags; + u_int hv; + + sport = ports & 0xffff; + dport = ports >> 16; + flags &= IPN_TCPUDP; + + hv = NAT_HASH_FN(src.s_addr, ipf_nattable_sz); + nat = nat_table[0][hv]; + for (; nat; nat = nat->nat_hnext[0]) { + nflags = nat->nat_flags; + + if ((!ifp || ifp == nat->nat_ifp) && + nat->nat_inip.s_addr == src.s_addr && + nat->nat_oip.s_addr == dst.s_addr && + (((p == 0) && (flags == (nat->nat_flags & IPN_TCPUDP))) + || (p == nat->nat_p)) && (!flags || + ((nat->nat_inport == sport || nflags & FI_W_SPORT) && + (nat->nat_oport == dport || nflags & FI_W_DPORT)))) + return nat; + } + return NULL; +} + + +/* + * check if an ip address has already been allocated for a given mapping that + * is not doing port based translation. + */ +nat_t *nat_maplookup(ifp, flags, src, dst) +void *ifp; +register u_int flags; +struct in_addr src , dst; +{ + register nat_t *nat; + register int oflags; + u_int hv; + + hv = NAT_HASH_FN(src.s_addr, ipf_nattable_sz); + nat = nat_table[0][hv]; + for (; nat; nat = nat->nat_hnext[0]) { + oflags = (flags & IPN_TCPUDP) & nat->nat_ptr->in_flags; + if (oflags != 0) + continue; + + if ((!ifp || ifp == nat->nat_ifp) && + nat->nat_inip.s_addr == src.s_addr && + nat->nat_oip.s_addr == dst.s_addr) + return nat; + } + return NULL; +} + + +/* + * Lookup the NAT tables to search for a matching redirect + */ +nat_t *nat_lookupredir(np) +register natlookup_t *np; +{ + u_32_t ports; + nat_t *nat; + + ports = (np->nl_outport << 16) | np->nl_inport; + /* + * If nl_inip is non null, this is a lookup based on the real + * ip address. Else, we use the fake. + */ + if ((nat = nat_outlookup(NULL, np->nl_flags, 0, np->nl_inip, + np->nl_outip, ports))) { + np->nl_realip = nat->nat_outip; + np->nl_realport = nat->nat_outport; + } + return nat; +} + + +/* + * Packets going out on the external interface go through this. + * Here, the source address requires alteration, if anything. + */ +int ip_natout(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register ipnat_t *np = NULL; + register u_32_t ipa; + tcphdr_t *tcp = NULL; + u_short nflags = 0, sport = 0, dport = 0, *csump = NULL; + struct ifnet *ifp; + int natadd = 1; + frentry_t *fr; + u_int hv, msk; + u_32_t iph; + nat_t *nat; + int i; + + if (nat_list == NULL) + return 0; + + if ((fr = fin->fin_fr) && !(fr->fr_flags & FR_DUP) && + fr->fr_tif.fd_ifp && fr->fr_tif.fd_ifp != (void *)-1) + ifp = fr->fr_tif.fd_ifp; + else + ifp = fin->fin_ifp; + + if (!(ip->ip_off & IP_OFFMASK) && !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (ip->ip_p == IPPROTO_TCP) + nflags = IPN_TCP; + else if (ip->ip_p == IPPROTO_UDP) + nflags = IPN_UDP; + if ((nflags & IPN_TCPUDP)) { + tcp = (tcphdr_t *)fin->fin_dp; + sport = tcp->th_sport; + dport = tcp->th_dport; + } + } + + ipa = ip->ip_src.s_addr; + + READ_ENTER(&ipf_nat); + if ((ip->ip_off & (IP_OFFMASK|IP_MF)) && + (nat = ipfr_nat_knownfrag(ip, fin))) + natadd = 0; + else if ((nat = nat_outlookup(ifp, nflags, (u_int)ip->ip_p, ip->ip_src, + ip->ip_dst, (dport << 16) | sport))) { + nflags = nat->nat_flags; + if ((nflags & (FI_W_SPORT|FI_W_DPORT)) != 0) { + if ((nflags & FI_W_SPORT) && + (nat->nat_inport != sport)) + nat->nat_inport = sport; + else if ((nflags & FI_W_DPORT) && + (nat->nat_oport != dport)) + nat->nat_oport = dport; + if (nat->nat_outport == 0) + nat->nat_outport = sport; + nat->nat_flags &= ~(FI_W_DPORT|FI_W_SPORT); + nflags = nat->nat_flags; + } + } else { + RWLOCK_EXIT(&ipf_nat); + WRITE_ENTER(&ipf_nat); + /* + * If there is no current entry in the nat table for this IP#, + * create one for it (if there is a matching rule). + */ + msk = 0xffffffff; + i = 32; +maskloop: + iph = ipa & htonl(msk); + hv = NAT_HASH_FN(iph, ipf_natrules_sz); + for (np = nat_rules[hv]; np; np = np->in_mnext) + { + if ((np->in_ifp == ifp) && np->in_space && + (!(np->in_flags & IPN_RF) || + (np->in_flags & nflags)) && + ((ipa & np->in_inmsk) == np->in_inip) && + ((np->in_redir & (NAT_MAP|NAT_MAPBLK)) || + (np->in_pnext == sport))) { + if (*np->in_plabel && !appr_ok(ip, tcp, np)) + continue; + /* + * If it's a redirection, then we don't want to + * create new outgoing port stuff. + * Redirections are only for incoming + * connections. + */ + if (!(np->in_redir & (NAT_MAP|NAT_MAPBLK))) + continue; + if ((nat = nat_new(np, ip, fin, (u_int)nflags, + NAT_OUTBOUND))) { + np->in_hits++; +#ifdef IPFILTER_LOG + nat_log(nat, (u_int)np->in_redir); +#endif + break; + } + } + } + if ((np == NULL) && (i > 0)) { + do { + i--; + msk <<= 1; + } while ((i >= 0) && ((nat_masks & (1 << i)) == 0)); + if (i >= 0) + goto maskloop; + } + MUTEX_DOWNGRADE(&ipf_nat); + } + + if (nat) { + np = nat->nat_ptr; + if (natadd && fin->fin_fi.fi_fl & FI_FRAG) + ipfr_nat_newfrag(ip, fin, 0, nat); + ip->ip_src = nat->nat_outip; + MUTEX_ENTER(&ipf_rw); + nat->nat_age = fr_defnatage; + nat->nat_bytes += ip->ip_len; + nat->nat_pkts++; + MUTEX_EXIT(&ipf_rw); + + /* + * Fix up checksums, not by recalculating them, but + * simply computing adjustments. + */ +#if SOLARIS || defined(__sgi) + if (nat->nat_dir == NAT_OUTBOUND) + fix_outcksum(&ip->ip_sum, nat->nat_ipsumd); + else + fix_incksum(&ip->ip_sum, nat->nat_ipsumd); +#endif + + if (!(ip->ip_off & IP_OFFMASK) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + + if ((nat->nat_outport != 0) && (nflags & IPN_TCPUDP)) { + tcp->th_sport = nat->nat_outport; + fin->fin_data[0] = ntohs(tcp->th_sport); + } + + if (ip->ip_p == IPPROTO_TCP) { + csump = &tcp->th_sum; + MUTEX_ENTER(&ipf_rw); + fr_tcp_age(&nat->nat_age, + nat->nat_tcpstate, ip, fin, 1); + if (nat->nat_age < fr_defnaticmpage) + nat->nat_age = fr_defnaticmpage; +#ifdef LARGE_NAT + else if (nat->nat_age > DEF_NAT_AGE) + nat->nat_age = DEF_NAT_AGE; +#endif + /* + * Increase this because we may have + * "keep state" following this too and + * packet storms can occur if this is + * removed too quickly. + */ + if (nat->nat_age == fr_tcpclosed) + nat->nat_age = fr_tcplastack; + MUTEX_EXIT(&ipf_rw); + } else if (ip->ip_p == IPPROTO_UDP) { + udphdr_t *udp = (udphdr_t *)tcp; + + if (udp->uh_sum) + csump = &udp->uh_sum; + } + if (csump) { + if (nat->nat_dir == NAT_OUTBOUND) + fix_outcksum(csump, nat->nat_sumd); + else + fix_incksum(csump, nat->nat_sumd); + } + } + if ((np->in_apr != NULL) && (np->in_dport == 0 || + (tcp != NULL && dport == np->in_dport))) + (void) appr_check(ip, fin, nat); + ATOMIC_INC(nat_stats.ns_mapped[1]); + RWLOCK_EXIT(&ipf_nat); /* READ */ + return 1; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ + return 0; +} + + +/* + * Packets coming in from the external interface go through this. + * Here, the destination address requires alteration, if anything. + */ +int ip_natin(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register struct in_addr src; + register struct in_addr in; + register ipnat_t *np; + u_int nflags = 0, natadd = 1, hv, msk; + struct ifnet *ifp = fin->fin_ifp; + tcphdr_t *tcp = NULL; + u_short sport = 0, dport = 0, *csump = NULL; + nat_t *nat; + u_32_t iph; + int i; + + if (nat_list == NULL) + return 0; + + if (!(ip->ip_off & IP_OFFMASK) && !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (ip->ip_p == IPPROTO_TCP) + nflags = IPN_TCP; + else if (ip->ip_p == IPPROTO_UDP) + nflags = IPN_UDP; + if ((nflags & IPN_TCPUDP)) { + tcp = (tcphdr_t *)fin->fin_dp; + dport = tcp->th_dport; + sport = tcp->th_sport; + } + } + + in = ip->ip_dst; + /* make sure the source address is to be redirected */ + src = ip->ip_src; + + READ_ENTER(&ipf_nat); + + if ((ip->ip_p == IPPROTO_ICMP) && (nat = nat_icmpin(ip, fin, &nflags))) + ; + else if ((ip->ip_off & IP_OFFMASK) && + (nat = ipfr_nat_knownfrag(ip, fin))) + natadd = 0; + else if ((nat = nat_inlookup(fin->fin_ifp, nflags, (u_int)ip->ip_p, + ip->ip_src, in, (dport << 16) | sport))) { + nflags = nat->nat_flags; + if ((nflags & (FI_W_SPORT|FI_W_DPORT)) != 0) { + if ((nat->nat_oport != sport) && (nflags & FI_W_DPORT)) + nat->nat_oport = sport; + else if ((nat->nat_outport != dport) && + (nflags & FI_W_SPORT)) + nat->nat_outport = dport; + nat->nat_flags &= ~(FI_W_SPORT|FI_W_DPORT); + nflags = nat->nat_flags; + } + } else { + RWLOCK_EXIT(&ipf_nat); + WRITE_ENTER(&ipf_nat); + /* + * If there is no current entry in the nat table for this IP#, + * create one for it (if there is a matching rule). + */ + msk = 0xffffffff; + i = 32; +maskloop: + iph = in.s_addr & htonl(msk); + hv = NAT_HASH_FN(iph, ipf_rdrrules_sz); + for (np = rdr_rules[hv]; np; np = np->in_rnext) + if ((np->in_ifp == ifp) && + (!np->in_flags || (nflags & np->in_flags)) && + ((in.s_addr & np->in_outmsk) == np->in_outip) && + ((src.s_addr & np->in_srcmsk) == np->in_srcip) && + (np->in_redir & NAT_REDIRECT) && + (!np->in_pmin || np->in_pmin == dport)) { + if ((nat = nat_new(np, ip, fin, nflags, + NAT_INBOUND))) { + np->in_hits++; +#ifdef IPFILTER_LOG + nat_log(nat, (u_int)np->in_redir); +#endif + break; + } + } + if ((np == NULL) && (i > 0)) { + do { + i--; + msk <<= 1; + } while ((i >= 0) && ((rdr_masks & (1 << i)) == 0)); + if (i >= 0) + goto maskloop; + } + MUTEX_DOWNGRADE(&ipf_nat); + } + if (nat) { + np = nat->nat_ptr; + fin->fin_fr = nat->nat_fr; + if (natadd && fin->fin_fi.fi_fl & FI_FRAG) + ipfr_nat_newfrag(ip, fin, 0, nat); + if ((np->in_apr != NULL) && (np->in_dport == 0 || + (tcp != NULL && sport == np->in_dport))) + (void) appr_check(ip, fin, nat); + + MUTEX_ENTER(&ipf_rw); + if (nflags != IPN_ICMPERR) + nat->nat_age = fr_defnatage; + + nat->nat_bytes += ip->ip_len; + nat->nat_pkts++; + MUTEX_EXIT(&ipf_rw); + ip->ip_dst = nat->nat_inip; + fin->fin_fi.fi_dst = nat->nat_inip; + + /* + * Fix up checksums, not by recalculating them, but + * simply computing adjustments. + */ +#if SOLARIS || defined(__sgi) + if (nat->nat_dir == NAT_OUTBOUND) + fix_incksum(&ip->ip_sum, nat->nat_ipsumd); + else + fix_outcksum(&ip->ip_sum, nat->nat_ipsumd); +#endif + if (!(ip->ip_off & IP_OFFMASK) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + + if ((nat->nat_inport != 0) && (nflags & IPN_TCPUDP)) { + tcp->th_dport = nat->nat_inport; + fin->fin_data[1] = ntohs(tcp->th_dport); + } + + if (ip->ip_p == IPPROTO_TCP) { + csump = &tcp->th_sum; + MUTEX_ENTER(&ipf_rw); + fr_tcp_age(&nat->nat_age, + nat->nat_tcpstate, ip, fin, 0); + if (nat->nat_age < fr_defnaticmpage) + nat->nat_age = fr_defnaticmpage; +#ifdef LARGE_NAT + else if (nat->nat_age > DEF_NAT_AGE) + nat->nat_age = DEF_NAT_AGE; +#endif + /* + * Increase this because we may have + * "keep state" following this too and + * packet storms can occur if this is + * removed too quickly. + */ + if (nat->nat_age == fr_tcpclosed) + nat->nat_age = fr_tcplastack; + MUTEX_EXIT(&ipf_rw); + } else if (ip->ip_p == IPPROTO_UDP) { + udphdr_t *udp = (udphdr_t *)tcp; + + if (udp->uh_sum) + csump = &udp->uh_sum; + } + if (csump) { + if (nat->nat_dir == NAT_OUTBOUND) + fix_incksum(csump, nat->nat_sumd); + else + fix_outcksum(csump, nat->nat_sumd); + } + } + ATOMIC_INC(nat_stats.ns_mapped[0]); + RWLOCK_EXIT(&ipf_nat); /* READ */ + return 1; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ + return 0; +} + + +/* + * Free all memory used by NAT structures allocated at runtime. + */ +void ip_natunload() +{ + WRITE_ENTER(&ipf_nat); + (void) nat_clearlist(); + (void) nat_flushtable(); + RWLOCK_EXIT(&ipf_nat); + + if (nat_table[0] != NULL) { + KFREES(nat_table[0], sizeof(nat_t *) * ipf_nattable_sz); + nat_table[0] = NULL; + } + if (nat_table[1] != NULL) { + KFREES(nat_table[1], sizeof(nat_t *) * ipf_nattable_sz); + nat_table[1] = NULL; + } + if (nat_rules != NULL) { + KFREES(nat_rules, sizeof(ipnat_t *) * ipf_natrules_sz); + nat_rules = NULL; + } + if (rdr_rules != NULL) { + KFREES(rdr_rules, sizeof(ipnat_t *) * ipf_rdrrules_sz); + rdr_rules = NULL; + } +} + + +/* + * Slowly expire held state for NAT entries. Timeouts are set in + * expectation of this being called twice per second. + */ +void ip_natexpire() +{ + register struct nat *nat, **natp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_nat); + for (natp = &nat_instances; (nat = *natp); ) { + nat->nat_age--; + if (nat->nat_age) { + natp = &nat->nat_next; + continue; + } + *natp = nat->nat_next; +#ifdef IPFILTER_LOG + nat_log(nat, NL_EXPIRE); +#endif + nat_delete(nat); + nat_stats.ns_expire++; + } + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); +} + + +/* + */ +void ip_natsync(ifp) +void *ifp; +{ + register ipnat_t *n; + register nat_t *nat; + register u_32_t sum1, sum2, sumd; + struct in_addr in; + ipnat_t *np; + void *ifp2; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + /* + * Change IP addresses for NAT sessions for any protocol except TCP + * since it will break the TCP connection anyway. + */ + SPL_NET(s); + WRITE_ENTER(&ipf_nat); + for (nat = nat_instances; nat; nat = nat->nat_next) + if (((ifp == NULL) || (ifp == nat->nat_ifp)) && + !(nat->nat_flags & IPN_TCP) && (np = nat->nat_ptr) && + (np->in_outmsk == 0xffffffff) && !np->in_nip) { + ifp2 = nat->nat_ifp; + /* + * Change the map-to address to be the same as the + * new one. + */ + sum1 = nat->nat_outip.s_addr; + if (fr_ifpaddr(ifp2, &in) != -1) + nat->nat_outip.s_addr = htonl(in.s_addr); + sum2 = nat->nat_outip.s_addr; + + if (sum1 == sum2) + continue; + /* + * Readjust the checksum adjustment to take into + * account the new IP#. + */ + CALC_SUMD(sum1, sum2, sumd); + sumd += nat->nat_sumd; + nat->nat_sumd = (sumd & 0xffff) + (sumd >> 16); + } + + for (n = nat_list; (n != NULL); n = n->in_next) + if (n->in_ifp == ifp) + n->in_ifp = (void *)GETUNIT(n->in_ifname); + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); +} + + +#ifdef IPFILTER_LOG +void nat_log(nat, type) +struct nat *nat; +u_int type; +{ + struct ipnat *np; + struct natlog natl; + void *items[1]; + size_t sizes[1]; + int rulen, types[1]; + + natl.nl_inip = nat->nat_inip; + natl.nl_outip = nat->nat_outip; + natl.nl_origip = nat->nat_oip; + natl.nl_bytes = nat->nat_bytes; + natl.nl_pkts = nat->nat_pkts; + natl.nl_origport = nat->nat_oport; + natl.nl_inport = nat->nat_inport; + natl.nl_outport = nat->nat_outport; + natl.nl_type = type; + natl.nl_rule = -1; +#ifndef LARGE_NAT + if (nat->nat_ptr != NULL) { + for (rulen = 0, np = nat_list; np; np = np->in_next, rulen++) + if (np == nat->nat_ptr) { + natl.nl_rule = rulen; + break; + } + } +#endif + items[0] = &natl; + sizes[0] = sizeof(natl); + types[0] = 0; + + (void) ipllog(IPL_LOGNAT, NULL, items, sizes, types, 1); +} +#endif diff --git a/sys/netinet/ip_nat.h b/sys/netinet/ip_nat.h new file mode 100644 index 0000000..eeeebec --- /dev/null +++ b/sys/netinet/ip_nat.h @@ -0,0 +1,247 @@ +/* + * Copyright (C) 1995-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_nat.h 1.5 2/4/96 + * $Id: ip_nat.h,v 2.1.2.1 1999/08/14 04:47:54 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_NAT_H__ +#define __IP_NAT_H__ + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(__STDC__) || defined(__GNUC__) +#define SIOCADNAT _IOW('r', 80, struct ipnat) +#define SIOCRMNAT _IOW('r', 81, struct ipnat) +#define SIOCGNATS _IOR('r', 82, struct natstat) +#define SIOCGNATL _IOWR('r', 83, struct natlookup) +#define SIOCGFRST _IOR('r', 84, struct ipfrstat) +#define SIOCGIPST _IOR('r', 85, struct ips_stat) +#define SIOCFLNAT _IOWR('r', 86, int) +#define SIOCCNATL _IOWR('r', 87, int) +#else +#define SIOCADNAT _IOW(r, 80, struct ipnat) +#define SIOCRMNAT _IOW(r, 81, struct ipnat) +#define SIOCGNATS _IOR(r, 82, struct natstat) +#define SIOCGNATL _IOWR(r, 83, struct natlookup) +#define SIOCGFRST _IOR(r, 84, struct ipfrstat) +#define SIOCGIPST _IOR(r, 85, struct ips_stat) +#define SIOCFLNAT _IOWR(r, 86, int) +#define SIOCCNATL _IOWR(r, 87, int) +#endif + +#undef LARGE_NAT /* define this if you're setting up a system to NAT + * LARGE numbers of networks/hosts - i.e. in the + * hundreds or thousands. In such a case, you should + * also change the RDR_SIZE and NAT_SIZE below to more + * appropriate sizes. The figures below were used for + * a setup with 1000-2000 networks to NAT. + */ +#define NAT_SIZE 127 +#define RDR_SIZE 127 +#define NAT_TABLE_SZ 127 +#ifdef LARGE_NAT +#undef NAT_SIZE +#undef RDR_SIZE +#undef NAT_TABLE_SZ +#define NAT_SIZE 2047 +#define RDR_SIZE 2047 +#define NAT_TABLE_SZ 16383 +#endif +#ifndef APR_LABELLEN +#define APR_LABELLEN 16 +#endif + +#define DEF_NAT_AGE 1200 /* 10 minutes (600 seconds) */ + +typedef struct nat { + u_long nat_age; + int nat_flags; + u_32_t nat_sumd; + u_32_t nat_ipsumd; + void *nat_data; + void *nat_aps; /* proxy session */ + frentry_t *nat_fr; /* filter rule ptr if appropriate */ + struct in_addr nat_inip; + struct in_addr nat_outip; + struct in_addr nat_oip; /* other ip */ + U_QUAD_T nat_pkts; + U_QUAD_T nat_bytes; + u_short nat_oport; /* other port */ + u_short nat_inport; + u_short nat_outport; + u_short nat_use; + u_char nat_tcpstate[2]; + u_char nat_p; /* protocol for NAT */ + struct ipnat *nat_ptr; /* pointer back to the rule */ + struct nat *nat_next; + struct nat *nat_hnext[2]; + struct nat **nat_hstart[2]; + void *nat_ifp; + int nat_dir; +} nat_t; + +typedef struct ipnat { + struct ipnat *in_next; + struct ipnat *in_rnext; + struct ipnat *in_mnext; + void *in_ifp; + void *in_apr; + u_long in_space; + u_int in_use; + u_int in_hits; + struct in_addr in_nextip; + u_short in_pnext; + u_short in_ppip; /* ports per IP */ + u_short in_ippip; /* IP #'s per IP# */ + u_short in_flags; /* From here to in_dport must be reflected */ + u_short in_port[2]; /* correctly in IPN_CMPSIZ */ + struct in_addr in_in[2]; + struct in_addr in_out[2]; + struct in_addr in_src[2]; + int in_redir; /* 0 if it's a mapping, 1 if it's a hard redir */ + char in_ifname[IFNAMSIZ]; + char in_plabel[APR_LABELLEN]; /* proxy label */ + char in_p; /* protocol */ + u_short in_dport; +} ipnat_t; + +#define in_pmin in_port[0] /* Also holds static redir port */ +#define in_pmax in_port[1] +#define in_nip in_nextip.s_addr +#define in_inip in_in[0].s_addr +#define in_inmsk in_in[1].s_addr +#define in_outip in_out[0].s_addr +#define in_outmsk in_out[1].s_addr +#define in_srcip in_src[0].s_addr +#define in_srcmsk in_src[1].s_addr + +#define NAT_OUTBOUND 0 +#define NAT_INBOUND 1 + +#define NAT_MAP 0x01 +#define NAT_REDIRECT 0x02 +#define NAT_BIMAP (NAT_MAP|NAT_REDIRECT) +#define NAT_MAPBLK 0x04 + +#define MAPBLK_MINPORT 1024 /* don't use reserved ports for src port */ +#define USABLE_PORTS (65536 - MAPBLK_MINPORT) + +#define IPN_CMPSIZ (sizeof(ipnat_t) - offsetof(ipnat_t, in_flags)) + +typedef struct natlookup { + struct in_addr nl_inip; + struct in_addr nl_outip; + struct in_addr nl_realip; + int nl_flags; + u_short nl_inport; + u_short nl_outport; + u_short nl_realport; +} natlookup_t; + +typedef struct natstat { + u_long ns_mapped[2]; + u_long ns_rules; + u_long ns_added; + u_long ns_expire; + u_long ns_inuse; + u_long ns_logged; + u_long ns_logfail; + nat_t **ns_table[2]; + ipnat_t *ns_list; + void *ns_apslist; + u_int ns_nattab_sz; + u_int ns_rultab_sz; + u_int ns_rdrtab_sz; + nat_t *ns_instances; +} natstat_t; + +#define IPN_ANY 0x00 +#define IPN_TCP 0x01 +#define IPN_UDP 0x02 +#define IPN_TCPUDP (IPN_TCP|IPN_UDP) +#define IPN_DELETE 0x04 +#define IPN_ICMPERR 0x08 +#define IPN_RF (IPN_TCPUDP|IPN_DELETE|IPN_ICMPERR) +#define IPN_AUTOPORTMAP 0x10 +#define IPN_RANGE 0x20 +#define IPN_USERFLAGS (IPN_TCPUDP|IPN_AUTOPORTMAP|IPN_RANGE) + + +typedef struct natlog { + struct in_addr nl_origip; + struct in_addr nl_outip; + struct in_addr nl_inip; + u_short nl_origport; + u_short nl_outport; + u_short nl_inport; + u_short nl_type; + int nl_rule; + U_QUAD_T nl_pkts; + U_QUAD_T nl_bytes; +} natlog_t; + + +#define NL_NEWMAP NAT_MAP +#define NL_NEWRDR NAT_REDIRECT +#define NL_EXPIRE 0xffff + +#define NAT_HASH_FN(k,m) (((k) + ((k) >> 12)) % (m)) + +#define LONG_SUM(in) (((in) & 0xffff) + ((in) >> 16)) + +#define CALC_SUMD(s1, s2, sd) { \ + (s1) = ((s1) & 0xffff) + ((s1) >> 16); \ + (s2) = ((s2) & 0xffff) + ((s2) >> 16); \ + /* Do it twice */ \ + (s1) = ((s1) & 0xffff) + ((s1) >> 16); \ + (s2) = ((s2) & 0xffff) + ((s2) >> 16); \ + /* Because ~1 == -2, We really need ~1 == -1 */ \ + if ((s1) > (s2)) (s2)--; \ + (sd) = (s2) - (s1); \ + (sd) = ((sd) & 0xffff) + ((sd) >> 16); } + + +extern u_int ipf_nattable_sz; +extern u_int ipf_natrules_sz; +extern u_int ipf_rdrrules_sz; +extern void ip_natsync __P((void *)); +extern u_long fr_defnatage; +extern u_long fr_defnaticmpage; +extern nat_t **nat_table[2]; +extern nat_t *nat_instances; +extern ipnat_t **nat_rules; +extern ipnat_t **rdr_rules; +extern natstat_t nat_stats; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +extern int nat_ioctl __P((caddr_t, u_long, int)); +#else +extern int nat_ioctl __P((caddr_t, int, int)); +#endif +extern int nat_init __P((void)); +extern nat_t *nat_new __P((ipnat_t *, ip_t *, fr_info_t *, u_int, int)); +extern nat_t *nat_outlookup __P((void *, u_int, u_int, struct in_addr, + struct in_addr, u_32_t)); +extern nat_t *nat_inlookup __P((void *, u_int, u_int, struct in_addr, + struct in_addr, u_32_t)); +extern nat_t *nat_maplookup __P((void *, u_int, struct in_addr, + struct in_addr)); +extern nat_t *nat_lookupredir __P((natlookup_t *)); +extern nat_t *nat_icmpinlookup __P((ip_t *, fr_info_t *)); +extern nat_t *nat_icmpin __P((ip_t *, fr_info_t *, u_int *)); + +extern int ip_natout __P((ip_t *, fr_info_t *)); +extern int ip_natin __P((ip_t *, fr_info_t *)); +extern void ip_natunload __P((void)), ip_natexpire __P((void)); +extern void nat_log __P((struct nat *, u_int)); +extern void fix_incksum __P((u_short *, u_32_t)); +extern void fix_outcksum __P((u_short *, u_32_t)); + +#endif /* __IP_NAT_H__ */ diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c new file mode 100644 index 0000000..2ed22e6 --- /dev/null +++ b/sys/netinet/ip_output.c @@ -0,0 +1,1571 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +#ifdef vax +#include <machine/mtpr.h> +#endif +#include <machine/in_cksum.h> + +static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); + +#include <netinet/ip_fw.h> + +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +#ifdef IPFIREWALL_FORWARD_DEBUG +#define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ + (ntohl(a.s_addr)>>16)&0xFF,\ + (ntohl(a.s_addr)>>8)&0xFF,\ + (ntohl(a.s_addr))&0xFF); +#endif + +u_short ip_id; + +static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *)); +static void ip_mloopback + __P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int)); +static int ip_getmoptions + __P((struct sockopt *, struct ip_moptions *)); +static int ip_pcbopts __P((int, struct mbuf **, struct mbuf *)); +static int ip_setmoptions + __P((struct sockopt *, struct ip_moptions **)); + +#if defined(IPFILTER_LKM) || defined(IPFILTER) +int ip_optcopy __P((struct ip *, struct ip *)); +extern int (*fr_checkp) __P((struct ip *, int, struct ifnet *, int, struct mbuf **)); +#else +static int ip_optcopy __P((struct ip *, struct ip *)); +#endif + + +extern struct protosw inetsw[]; + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + */ +int +ip_output(m0, opt, ro, flags, imo) + struct mbuf *m0; + struct mbuf *opt; + struct route *ro; + int flags; + struct ip_moptions *imo; +{ + struct ip *ip, *mhip; + struct ifnet *ifp; + struct mbuf *m = m0; + int hlen = sizeof (struct ip); + int len, off, error = 0; + struct sockaddr_in *dst; + struct in_ifaddr *ia; + int isbroadcast; + u_int16_t divert_cookie; /* firewall cookie */ +#ifdef IPFIREWALL_FORWARD + int fwd_rewrite_src = 0; +#endif + struct ip_fw_chain *rule = NULL; + +#ifdef IPDIVERT + /* Get and reset firewall cookie */ + divert_cookie = ip_divert_cookie; + ip_divert_cookie = 0; +#else + divert_cookie = 0; +#endif + +#if defined(IPFIREWALL) && defined(DUMMYNET) + /* + * dummynet packet are prepended a vestigial mbuf with + * m_type = MT_DUMMYNET and m_data pointing to the matching + * rule. + */ + if (m->m_type == MT_DUMMYNET) { + /* + * the packet was already tagged, so part of the + * processing was already done, and we need to go down. + * opt, flags and imo have already been used, and now + * they are used to hold ifp, dst and NULL, respectively. + */ + rule = (struct ip_fw_chain *)(m->m_data) ; + m0 = m = m->m_next ; + ip = mtod(m, struct ip *); + dst = (struct sockaddr_in *)flags ; + ifp = (struct ifnet *)opt; + hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + opt = NULL ; + flags = 0 ; /* XXX is this correct ? */ + goto sendit; + } else + rule = NULL ; +#endif + +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip_output no HDR"); + if (!ro) + panic("ip_output no route, proto = %d", + mtod(m, struct ip *)->ip_p); +#endif + if (opt) { + m = ip_insertoptions(m, opt, &len); + hlen = len; + } + ip = mtod(m, struct ip *); + /* + * Fill in IP header. + */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); + ip->ip_off &= IP_DF; + ip->ip_id = htons(ip_id++); + ipstat.ips_localout++; + } else { + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } + + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if (ro->ro_rt == 0) { + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } + /* + * If routing to interface only, + * short circuit routing lookup. + */ +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) + if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else { + /* + * If this is the case, we probably don't want to allocate + * a protocol-cloned route since we didn't get one from the + * ULP. This lets TCP do its thing, while not burdening + * forwarding or ICMP with the overhead of cloning a route. + * Of course, we still want to do any cloning requested by + * the link layer, as this is probably required in all cases + * for correct operation (as it is for ARP). + */ + if (ro->ro_rt == 0) + rtalloc_ign(ro, RTF_PRCLONING); + if (ro->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_HOST) + isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_ifp != NULL) + ifp = imo->imo_multicast_ifp; + if (imo->imo_multicast_vif != -1) + ip->ip_src.s_addr = + ip_mcast_src(imo->imo_multicast_vif); + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + register struct in_ifaddr *ia1; + + for (ia1 = in_ifaddrhead.tqh_first; ia1; + ia1 = ia1->ia_link.tqe_next) + if (ia1->ia_ifp == ifp) { + ip->ip_src = IA_SIN(ia1)->sin_addr; + break; + } + } + + IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); + if (inm != NULL && + (imo == NULL || imo->imo_multicast_loop)) { + /* + * If we belong to the destination multicast group + * on the outgoing interface, and the caller did not + * forbid loopback, loop back a copy. + */ + ip_mloopback(ifp, m, dst, hlen); + } + else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * Check if rsvp daemon is running. If not, don't + * set ip_moptions. This ensures that the packet + * is multicast and not just sent down one link + * as prescribed by rsvpd. + */ + if (!rsvp_on) + imo = NULL; + if (ip_mforward(ip, ifp, m, imo) != 0) { + m_freem(m); + goto done; + } + } + } + + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. + * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy if this host actually belongs to the + * destination group on the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { + m_freem(m); + goto done; + } + + goto sendit; + } +#ifndef notdef + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + ip->ip_src = IA_SIN(ia)->sin_addr; +#ifdef IPFIREWALL_FORWARD + /* Keep note that we did this - if the firewall changes + * the next-hop, our interface may change, changing the + * default source IP. It's a shame so much effort happens + * twice. Oh well. + */ + fwd_rewrite_src++; +#endif /* IPFIREWALL_FORWARD */ + } +#endif /* notdef */ + /* + * Verify that we have any chance at all of being able to queue + * the packet or packet fragments + */ + if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= + ifp->if_snd.ifq_maxlen) { + error = ENOBUFS; + goto bad; + } + + /* + * Look for broadcast address and + * and verify user is allowed to send + * such a packet. + */ + if (isbroadcast) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if ((u_short)ip->ip_len > ifp->if_mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else { + m->m_flags &= ~M_BCAST; + } + +sendit: + /* + * IpHack's section. + * - Xlate: translate packet's addr/port (NAT). + * - Firewall: deny/allow/etc. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ +#if defined(IPFILTER) || defined(IPFILTER_LKM) + if (fr_checkp) { + struct mbuf *m1 = m; + + if ((error = (*fr_checkp)(ip, hlen, ifp, 1, &m1)) || !m1) + goto done; + ip = mtod(m = m1, struct ip *); + } +#endif + + /* + * Check with the firewall... + */ + if (ip_fw_chk_ptr) { + struct sockaddr_in *old = dst; + + off = (*ip_fw_chk_ptr)(&ip, + hlen, ifp, &divert_cookie, &m, &rule, &dst); + /* + * On return we must do the following: + * m == NULL -> drop the pkt + * 1<=off<= 0xffff -> DIVERT + * (off & 0x10000) -> send to a DUMMYNET pipe + * (off & 0x20000) -> TEE the packet + * dst != old -> IPFIREWALL_FORWARD + * off==0, dst==old -> accept + * If some of the above modules is not compiled in, then + * we should't have to check the corresponding condition + * (because the ipfw control socket should not accept + * unsupported rules), but better play safe and drop + * packets in case of doubt. + */ + if (!m) { /* firewall said to reject */ + error = EACCES; + goto done; + } + if (off == 0 && dst == old) /* common case */ + goto pass ; +#ifdef DUMMYNET + if ((off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! + */ + dummynet_io(off & 0xffff, DN_TO_IP_OUT, m,ifp,ro,dst,rule); + goto done; + } +#endif +#ifdef IPDIVERT + if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((off & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* Restore packet header fields to original values */ + HTONS(ip->ip_len); + HTONS(ip->ip_off); + + /* Deliver packet to divert input routine */ + ip_divert_cookie = divert_cookie; + divert_packet(m, 0, off & 0xffff); + + /* If 'tee', continue with original packet */ + if (clone != NULL) { + m = clone; + ip = mtod(m, struct ip *); + goto pass; + } + goto done; + } +#endif + +#ifdef IPFIREWALL_FORWARD + /* Here we check dst to make sure it's directly reachable on the + * interface we previously thought it was. + * If it isn't (which may be likely in some situations) we have + * to re-route it (ie, find a route for the next-hop and the + * associated interface) and set them here. This is nested + * forwarding which in most cases is undesirable, except where + * such control is nigh impossible. So we do it here. + * And I'm babbling. + */ + if (off == 0 && old != dst) { + struct in_ifaddr *ia; + + /* It's changed... */ + /* There must be a better way to do this next line... */ + static struct route sro_fwd, *ro_fwd = &sro_fwd; +#ifdef IPFIREWALL_FORWARD_DEBUG + printf("IPFIREWALL_FORWARD: New dst ip: "); + print_ip(dst->sin_addr); + printf("\n"); +#endif + /* + * We need to figure out if we have been forwarded + * to a local socket. If so then we should somehow + * "loop back" to ip_input, and get directed to the + * PCB as if we had received this packet. This is + * because it may be dificult to identify the packets + * you want to forward until they are being output + * and have selected an interface. (e.g. locally + * initiated packets) If we used the loopback inteface, + * we would not be able to control what happens + * as the packet runs through ip_input() as + * it is done through a ISR. + */ + for (ia = TAILQ_FIRST(&in_ifaddrhead); ia; + ia = TAILQ_NEXT(ia, ia_link)) { + /* + * If the addr to forward to is one + * of ours, we pretend to + * be the destination for this packet. + */ + if (IA_SIN(ia)->sin_addr.s_addr == + dst->sin_addr.s_addr) + break; + } + if (ia) { + /* tell ip_input "dont filter" */ + ip_fw_fwd_addr = dst; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = ifunit("lo0"); + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)ip->ip_off); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + ip_input(m); + goto done; + } + /* Some of the logic for this was + * nicked from above. + * + * This rewrites the cached route in a local PCB. + * Is this what we want to do? + */ + bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); + + ro_fwd->ro_rt = 0; + rtalloc_ign(ro_fwd, RTF_PRCLONING); + + if (ro_fwd->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + + ia = ifatoia(ro_fwd->ro_rt->rt_ifa); + ifp = ro_fwd->ro_rt->rt_ifp; + ro_fwd->ro_rt->rt_use++; + if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + isbroadcast = + (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + RTFREE(ro->ro_rt); + ro->ro_rt = ro_fwd->ro_rt; + dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + + /* + * If we added a default src ip earlier, + * which would have been gotten from the-then + * interface, do it again, from the new one. + */ + if (fwd_rewrite_src) + ip->ip_src = IA_SIN(ia)->sin_addr; + goto pass ; + } +#endif /* IPFIREWALL_FORWARD */ + /* + * if we get here, none of the above matches, and + * we have to drop the pkt + */ + m_freem(m); + error = EACCES; /* not sure this is the right error msg */ + goto done; + } + +pass: + /* + * If small enough for interface, can just send directly. + */ + if ((u_short)ip->ip_len <= ifp->if_mtu) { + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)ip->ip_off); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + /* + * This case can happen if the user changed the MTU + * of an interface after enabling IP on it. Because + * most netifs don't keep track of routes pointing to + * them, there is no way for one to update all its + * routes when the MTU is changed. + */ + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + ipstat.ips_cantfrag++; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_nextpkt; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_flags |= (m0->m_flags & M_MCAST); + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); + if (ip->ip_off & IP_MF) + mhip->ip_off |= IP_MF; + if (off + len >= (u_short)ip->ip_len) + len = (u_short)ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; /* ??? */ + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = (struct ifnet *)0; + mhip->ip_off = htons((u_short)mhip->ip_off); + mhip->ip_sum = 0; + if (mhip->ip_vhl == IP_VHL_BORING) { + mhip->ip_sum = in_cksum_hdr(mhip); + } else { + mhip->ip_sum = in_cksum(m, mhlen); + } + *mnext = m; + mnext = &m->m_nextpkt; + ipstat.ips_ofragments++; + } + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m = m0; + m_adj(m, hlen + firstlen - (u_short)ip->ip_len); + m->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m->m_pkthdr.len); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; + if (error == 0) + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + } +done: + return (error); +bad: + m_freem(m0); + goto done; +} + +/* + * Insert IP options into preformed packet. + * Adjust IP destination as required for IP source routing, + * as indicated by a non-zero in_addr at the start of the options. + * + * XXX This routine assumes that the packet has no options in place. + */ +static struct mbuf * +ip_insertoptions(m, opt, phlen) + register struct mbuf *m; + struct mbuf *opt; + int *phlen; +{ + register struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + register struct ip *ip = mtod(m, struct ip *); + unsigned optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) + return (m); /* XXX should fail */ + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_HEADER); + if (n == 0) + return (m); + n->m_pkthdr.len = m->m_pkthdr.len + optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy(p->ipopt_list, ip + 1, optlen); + *phlen = sizeof(struct ip) + optlen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, + * omitting those not copied during fragmentation. + */ +#if !defined(IPFILTER) && !defined(IPFILTER_LKM) +static +#endif +int +ip_optcopy(ip, jp) + struct ip *ip, *jp; +{ + register u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. */ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } else + optlen = cp[IPOPT_OLEN]; + /* bogus lengths should have been caught by ip_dooptions */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy(cp, dp, optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * IP socket option processing. + */ +int +ip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + error = optval = 0; + if (sopt->sopt_level != IPPROTO_IP) { + return (EINVAL); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: +#endif + { + struct mbuf *m; + if (sopt->sopt_valsize > MLEN) { + error = EMSGSIZE; + break; + } + MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + break; + } + m->m_len = sopt->sopt_valsize; + error = sooptcopyin(sopt, mtod(m, char *), m->m_len, + m->m_len); + + return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, + m)); + } + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case IP_TOS: + inp->inp_ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip_ttl = optval; + break; +#define OPTSET(bit) \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + OPTSET(INP_RECVIF); + break; + } + break; +#undef OPTSET + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_setmoptions(sopt, &inp->inp_moptions); + break; + + case IP_PORTRANGE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (optval) { + case IP_PORTRANGE_DEFAULT: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); + break; + + case IP_PORTRANGE_HIGH: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + break; + + case IP_PORTRANGE_LOW: + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; + break; + + default: + error = EINVAL; + break; + } + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_OPTIONS: + case IP_RETOPTS: + if (inp->inp_options) + error = sooptcopyout(sopt, + mtod(inp->inp_options, + char *), + inp->inp_options->m_len); + else + sopt->sopt_valsize = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + case IP_PORTRANGE: + switch (sopt->sopt_name) { + + case IP_TOS: + optval = inp->inp_ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip_ttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + optval = OPTBIT(INP_RECVIF); + break; + + case IP_PORTRANGE: + if (inp->inp_flags & INP_HIGHPORT) + optval = IP_PORTRANGE_HIGH; + else if (inp->inp_flags & INP_LOWPORT) + optval = IP_PORTRANGE_LOW; + else + optval = 0; + break; + } + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_getmoptions(sopt, inp->inp_moptions); + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Set up IP options in pcb for insertion in output packets. + * Store in mbuf with pointer in pcbopt, adding pseudo-option + * with destination address if source routed. + */ +static int +ip_pcbopts(optname, pcbopt, m) + int optname; + struct mbuf **pcbopt; + register struct mbuf *m; +{ + register int cnt, optlen; + register u_char *cp; + u_char opt; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == (struct mbuf *)0 || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m) + (void)m_free(m); + return (0); + } + +#ifndef vax + if (m->m_len % sizeof(int32_t)) + goto bad; +#endif + /* + * IP first-hop destination address will be stored before + * actual options; move other options back + * and clear it when none present. + */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); + bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= IPOPT_OLEN || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * user process specifies route as: + * ->A->B->C->D + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear in + * actual IP option, but is stored before the options. + */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. + */ + ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + + sizeof(struct in_addr)), + (caddr_t)&cp[IPOPT_OFFSET+1], + (unsigned)cnt + sizeof(struct in_addr)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * XXX + * The whole multicast option thing needs to be re-thought. + * Several of these options are equally applicable to non-multicast + * transmission, and one (IP_MULTICAST_TTL) totally duplicates a + * standard option (IP_TTL). + */ +/* + * Set the IP multicast options in response to user setsockopt(). + */ +static int +ip_setmoptions(sopt, imop) + struct sockopt *sopt; + struct ip_moptions **imop; +{ + int error = 0; + int i; + struct in_addr addr; + struct ip_mreq mreq; + struct ifnet *ifp; + struct ip_moptions *imo = *imop; + struct route ro; + struct sockaddr_in *dst; + int s; + + if (imo == NULL) { + /* + * No multicast option buffer attached to the pcb; + * allocate one and initialize to default values. + */ + imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + } + + switch (sopt->sopt_name) { + /* store an index number for the vif you wanna use in the send */ + case IP_MULTICAST_VIF: + if (legal_vif_num == 0) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + break; + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; + break; + + case IP_MULTICAST_IF: + /* + * Select the interface for outgoing multicast packets. + */ + error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); + if (error) + break; + /* + * INADDR_ANY is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. + */ + if (addr.s_addr == INADDR_ANY) { + imo->imo_multicast_ifp = NULL; + break; + } + /* + * The selected interface is identified by its local + * IP address. Find the interface and confirm that + * it supports multicasting. + */ + s = splimp(); + INADDR_TO_IFP(addr, ifp); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + splx(s); + error = EADDRNOTAVAIL; + break; + } + imo->imo_multicast_ifp = ifp; + splx(s); + break; + + case IP_MULTICAST_TTL: + /* + * Set the IP time-to-live for outgoing multicast packets. + * The original multicast API required a char argument, + * which is inconsistent with the rest of the socket API. + * We allow either a char or an int. + */ + if (sopt->sopt_valsize == 1) { + u_char ttl; + error = sooptcopyin(sopt, &ttl, 1, 1); + if (error) + break; + imo->imo_multicast_ttl = ttl; + } else { + u_int ttl; + error = sooptcopyin(sopt, &ttl, sizeof ttl, + sizeof ttl); + if (error) + break; + if (ttl > 255) + error = EINVAL; + else + imo->imo_multicast_ttl = ttl; + } + break; + + case IP_MULTICAST_LOOP: + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. The original multicast API required a + * char argument, which is inconsistent with the rest + * of the socket API. We allow either a char or an int. + */ + if (sopt->sopt_valsize == 1) { + u_char loop; + error = sooptcopyin(sopt, &loop, 1, 1); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } else { + u_int loop; + error = sooptcopyin(sopt, &loop, sizeof loop, + sizeof loop); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } + break; + + case IP_ADD_MEMBERSHIP: + /* + * Add a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + s = splimp(); + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) { + bzero((caddr_t)&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq.imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + INADDR_TO_IFP(mreq.imr_interface, ifp); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq.imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + splx(s); + break; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + splx(s); + break; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + splx(s); + break; + } + ++imo->imo_num_memberships; + splx(s); + break; + + case IP_DROP_MEMBERSHIP: + /* + * Drop a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + + s = splimp(); + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + INADDR_TO_IFP(mreq.imr_interface, ifp); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq.imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + splx(s); + break; + + default: + error = EOPNOTSUPP; + break; + } + + /* + * If all options have default values, no need to keep the mbuf. + */ + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_vif == -1 && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + free(*imop, M_IPMOPTS); + *imop = NULL; + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +static int +ip_getmoptions(sopt, imo) + struct sockopt *sopt; + register struct ip_moptions *imo; +{ + struct in_addr addr; + struct in_ifaddr *ia; + int error, optval; + u_char coptval; + + error = 0; + switch (sopt->sopt_name) { + case IP_MULTICAST_VIF: + if (imo != NULL) + optval = imo->imo_multicast_vif; + else + optval = -1; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + if (imo == NULL || imo->imo_multicast_ifp == NULL) + addr.s_addr = INADDR_ANY; + else { + IFP_TO_IA(imo->imo_multicast_ifp, ia); + addr.s_addr = (ia == NULL) ? INADDR_ANY + : IA_SIN(ia)->sin_addr.s_addr; + } + error = sooptcopyout(sopt, &addr, sizeof addr); + break; + + case IP_MULTICAST_TTL: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else + optval = coptval = imo->imo_multicast_ttl; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else + optval = coptval = imo->imo_multicast_loop; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); +} + +/* + * Discard the IP multicast options. + */ +void +ip_freemoptions(imo) + register struct ip_moptions *imo; +{ + register int i; + + if (imo != NULL) { + for (i = 0; i < imo->imo_num_memberships; ++i) + in_delmulti(imo->imo_membership[i]); + free(imo, M_IPMOPTS); + } +} + +/* + * Routine called from ip_output() to loop back a copy of an IP multicast + * packet to the input queue of a specified interface. Note that this + * calls the output routine of the loopback "driver", but with an interface + * pointer that might NOT be a loopback interface -- evil, but easier than + * replicating that code here. + */ +static void +ip_mloopback(ifp, m, dst, hlen) + struct ifnet *ifp; + register struct mbuf *m; + register struct sockaddr_in *dst; + int hlen; +{ + register struct ip *ip; + struct mbuf *copym; + + copym = m_copy(m, 0, M_COPYALL); + if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_pullup(copym, hlen); + if (copym != NULL) { + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)ip->ip_off); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(copym, hlen); + } + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ +#if 1 /* XXX */ + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } +#endif + +#ifdef notdef + copym->m_pkthdr.rcvif = ifp; + ip_input(copym); +#else + if_simloop(ifp, copym, (struct sockaddr *)dst, 0); +#endif + } +} diff --git a/sys/netinet/ip_proxy.c b/sys/netinet/ip_proxy.c new file mode 100644 index 0000000..3a03863 --- /dev/null +++ b/sys/netinet/ip_proxy.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 1997-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +/*static const char rcsid[] = "@(#)$Id: ip_proxy.c,v 2.2.2.1 1999/09/19 12:18:19 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(__FreeBSD__) && defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(__FreeBSD_version) +# include <sys/ioctl.h> +#endif +#include <sys/fcntl.h> +#include <sys/uio.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) +# if !defined(linux) +# include <sys/systm.h> +# else +# include <linux/string.h> +# endif +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if __FreeBSD__ > 2 +# include <sys/queue.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_state.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static ap_session_t *appr_new_session __P((aproxy_t *, ip_t *, + fr_info_t *, nat_t *)); +static int appr_fixseqack __P((fr_info_t *, ip_t *, ap_session_t *, int )); + + +#define AP_SESS_SIZE 53 + +#if defined(_KERNEL) && !defined(linux) +#include "netinet/ip_ftp_pxy.c" +#include "netinet/ip_rcmd_pxy.c" +#include "netinet/ip_raudio_pxy.c" +#endif + +ap_session_t *ap_sess_tab[AP_SESS_SIZE]; +ap_session_t *ap_sess_list = NULL; +aproxy_t ap_proxies[] = { +#ifdef IPF_FTP_PROXY + { "ftp", (char)IPPROTO_TCP, 0, 0, ippr_ftp_init, NULL, + ippr_ftp_in, ippr_ftp_out }, +#endif +#ifdef IPF_RCMD_PROXY + { "rcmd", (char)IPPROTO_TCP, 0, 0, ippr_rcmd_init, ippr_rcmd_new, + NULL, ippr_rcmd_out }, +#endif +#ifdef IPF_RAUDIO_PROXY + { "raudio", (char)IPPROTO_TCP, 0, 0, ippr_raudio_init, + ippr_raudio_new, ippr_raudio_in, ippr_raudio_out }, +#endif + { "", '\0', 0, 0, NULL, NULL } +}; + + +int appr_ok(ip, tcp, nat) +ip_t *ip; +tcphdr_t *tcp; +ipnat_t *nat; +{ + aproxy_t *apr = nat->in_apr; + u_short dport = nat->in_dport; + + if (!apr || (apr->apr_flags & APR_DELETE) || + (ip->ip_p != apr->apr_p)) + return 0; + if ((tcp && (tcp->th_dport != dport)) || (!tcp && dport)) + return 0; + return 1; +} + + +/* + * Allocate a new application proxy structure and fill it in with the + * relevant details. call the init function once complete, prior to + * returning. + */ +static ap_session_t *appr_new_session(apr, ip, fin, nat) +aproxy_t *apr; +ip_t *ip; +fr_info_t *fin; +nat_t *nat; +{ + register ap_session_t *aps; + + if (!apr || (apr->apr_flags & APR_DELETE) || (ip->ip_p != apr->apr_p)) + return NULL; + + KMALLOC(aps, ap_session_t *); + if (!aps) + return NULL; + bzero((char *)aps, sizeof(*aps)); + aps->aps_next = ap_sess_list; + aps->aps_p = ip->ip_p; + aps->aps_data = NULL; + aps->aps_apr = apr; + aps->aps_psiz = 0; + ap_sess_list = aps; + aps->aps_nat = nat; + nat->nat_aps = aps; + if (apr->apr_new != NULL) + (void) (*apr->apr_new)(fin, ip, aps, nat); + return aps; +} + + +/* + * check to see if a packet should be passed through an active proxy routine + * if one has been setup for it. + */ +int appr_check(ip, fin, nat) +ip_t *ip; +fr_info_t *fin; +nat_t *nat; +{ + ap_session_t *aps; + aproxy_t *apr; + tcphdr_t *tcp = NULL; + u_32_t sum; + int err; + + if (nat->nat_aps == NULL) + nat->nat_aps = appr_new_session(nat->nat_ptr->in_apr, ip, + fin, nat); + aps = nat->nat_aps; + if ((aps != NULL) && (aps->aps_p == ip->ip_p)) { + if (ip->ip_p == IPPROTO_TCP) { + tcp = (tcphdr_t *)fin->fin_dp; + /* + * verify that the checksum is correct. If not, then + * don't do anything with this packet. + */ +#if SOLARIS && defined(_KERNEL) + sum = fr_tcpsum(fin->fin_qfm, ip, tcp); +#else + sum = fr_tcpsum(*(mb_t **)fin->fin_mp, ip, tcp); +#endif + if (sum != tcp->th_sum) { + frstats[fin->fin_out].fr_tcpbad++; + return -1; + } + } + + apr = aps->aps_apr; + err = 0; + if (fin->fin_out != 0) { + if (apr->apr_outpkt != NULL) + err = (*apr->apr_outpkt)(fin, ip, aps, nat); + } else { + if (apr->apr_inpkt != NULL) + err = (*apr->apr_inpkt)(fin, ip, aps, nat); + } + + if (tcp != NULL) { + err = appr_fixseqack(fin, ip, aps, err); +#if SOLARIS && defined(_KERNEL) + tcp->th_sum = fr_tcpsum(fin->fin_qfm, ip, tcp); +#else + tcp->th_sum = fr_tcpsum(*(mb_t **)fin->fin_mp, ip, tcp); +#endif + } + aps->aps_bytes += ip->ip_len; + aps->aps_pkts++; + return 2; + } + return -1; +} + + +aproxy_t *appr_match(pr, name) +u_int pr; +char *name; +{ + aproxy_t *ap; + + for (ap = ap_proxies; ap->apr_p; ap++) + if ((ap->apr_p == pr) && + !strncmp(name, ap->apr_label, sizeof(ap->apr_label))) { + ap->apr_ref++; + return ap; + } + return NULL; +} + + +void appr_free(ap) +aproxy_t *ap; +{ + ap->apr_ref--; +} + + +void aps_free(aps) +ap_session_t *aps; +{ + ap_session_t *a, **ap; + + if (!aps) + return; + + for (ap = &ap_sess_list; (a = *ap); ap = &a->aps_next) + if (a == aps) { + *ap = a->aps_next; + break; + } + + if (a) { + if ((aps->aps_data != NULL) && (aps->aps_psiz != 0)) + KFREES(aps->aps_data, aps->aps_psiz); + KFREE(aps); + } +} + + +static int appr_fixseqack(fin, ip, aps, inc) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +int inc; +{ + int sel, ch = 0, out, nlen; + u_32_t seq1, seq2; + tcphdr_t *tcp; + + tcp = (tcphdr_t *)fin->fin_dp; + out = fin->fin_out; + nlen = ip->ip_len; + nlen -= (ip->ip_hl << 2) + (tcp->th_off << 2); + + if (out != 0) { + seq1 = (u_32_t)ntohl(tcp->th_seq); + sel = aps->aps_sel[out]; + + /* switch to other set ? */ + if ((aps->aps_seqmin[!sel] > aps->aps_seqmin[sel]) && + (seq1 > aps->aps_seqmin[!sel])) + sel = aps->aps_sel[out] = !sel; + + if (aps->aps_seqoff[sel]) { + seq2 = aps->aps_seqmin[sel] - aps->aps_seqoff[sel]; + if (seq1 > seq2) { + seq2 = aps->aps_seqoff[sel]; + seq1 += seq2; + tcp->th_seq = htonl(seq1); + ch = 1; + } + } + + if (inc && (seq1 > aps->aps_seqmin[!sel])) { + aps->aps_seqmin[!sel] = seq1 + nlen - 1; + aps->aps_seqoff[!sel] = aps->aps_seqoff[sel] + inc; + } + + /***/ + + seq1 = ntohl(tcp->th_ack); + sel = aps->aps_sel[1 - out]; + + /* switch to other set ? */ + if ((aps->aps_ackmin[!sel] > aps->aps_ackmin[sel]) && + (seq1 > aps->aps_ackmin[!sel])) + sel = aps->aps_sel[1 - out] = !sel; + + if (aps->aps_ackoff[sel] && (seq1 > aps->aps_ackmin[sel])) { + seq2 = aps->aps_ackoff[sel]; + tcp->th_ack = htonl(seq1 - seq2); + ch = 1; + } + } else { + seq1 = ntohl(tcp->th_seq); + sel = aps->aps_sel[out]; + + /* switch to other set ? */ + if ((aps->aps_ackmin[!sel] > aps->aps_ackmin[sel]) && + (seq1 > aps->aps_ackmin[!sel])) + sel = aps->aps_sel[out] = !sel; + + if (aps->aps_ackoff[sel]) { + seq2 = aps->aps_ackmin[sel] - + aps->aps_ackoff[sel]; + if (seq1 > seq2) { + seq2 = aps->aps_ackoff[sel]; + seq1 += seq2; + tcp->th_seq = htonl(seq1); + ch = 1; + } + } + + if (inc && (seq1 > aps->aps_ackmin[!sel])) { + aps->aps_ackmin[!sel] = seq1 + nlen - 1; + aps->aps_ackoff[!sel] = aps->aps_ackoff[sel] + inc; + } + + /***/ + + seq1 = ntohl(tcp->th_ack); + sel = aps->aps_sel[1 - out]; + + /* switch to other set ? */ + if ((aps->aps_seqmin[!sel] > aps->aps_seqmin[sel]) && + (seq1 > aps->aps_seqmin[!sel])) + sel = aps->aps_sel[1 - out] = !sel; + + if (aps->aps_seqoff[sel] && (seq1 > aps->aps_seqmin[sel])) { + seq2 = aps->aps_seqoff[sel]; + tcp->th_ack = htonl(seq1 - seq2); + ch = 1; + } + } + return ch ? 2 : 0; +} + + +int appr_init() +{ + aproxy_t *ap; + int err = 0; + + for (ap = ap_proxies; ap->apr_p; ap++) { + err = (*ap->apr_init)(); + if (err != 0) + break; + } + return err; +} diff --git a/sys/netinet/ip_proxy.h b/sys/netinet/ip_proxy.h new file mode 100644 index 0000000..9ccd46a --- /dev/null +++ b/sys/netinet/ip_proxy.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 1997-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_proxy.h,v 2.1.2.1 1999/09/19 12:18:20 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_PROXY_H__ +#define __IP_PROXY_H__ + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#ifndef APR_LABELLEN +#define APR_LABELLEN 16 +#endif +#define AP_SESS_SIZE 53 + +struct nat; +struct ipnat; + +typedef struct ap_tcp { + u_short apt_sport; /* source port */ + u_short apt_dport; /* destination port */ + short apt_sel[2]; /* {seq,ack}{off,min} set selector */ + short apt_seqoff[2]; /* sequence # difference */ + tcp_seq apt_seqmin[2]; /* don't change seq-off until after this */ + short apt_ackoff[2]; /* sequence # difference */ + tcp_seq apt_ackmin[2]; /* don't change seq-off until after this */ + u_char apt_state[2]; /* connection state */ +} ap_tcp_t; + +typedef struct ap_udp { + u_short apu_sport; /* source port */ + u_short apu_dport; /* destination port */ +} ap_udp_t; + +typedef struct ap_session { + struct aproxy *aps_apr; + union { + struct ap_tcp apu_tcp; + struct ap_udp apu_udp; + } aps_un; + u_int aps_flags; + U_QUAD_T aps_bytes; /* bytes sent */ + U_QUAD_T aps_pkts; /* packets sent */ + void *aps_nat; /* pointer back to nat struct */ + void *aps_data; /* private data */ + int aps_p; /* protocol */ + int aps_psiz; /* size of private data */ + struct ap_session *aps_hnext; + struct ap_session *aps_next; +} ap_session_t ; + +#define aps_sport aps_un.apu_tcp.apt_sport +#define aps_dport aps_un.apu_tcp.apt_dport +#define aps_sel aps_un.apu_tcp.apt_sel +#define aps_seqoff aps_un.apu_tcp.apt_seqoff +#define aps_seqmin aps_un.apu_tcp.apt_seqmin +#define aps_state aps_un.apu_tcp.apt_state +#define aps_ackoff aps_un.apu_tcp.apt_ackoff +#define aps_ackmin aps_un.apu_tcp.apt_ackmin + + +typedef struct aproxy { + char apr_label[APR_LABELLEN]; /* Proxy label # */ + u_char apr_p; /* protocol */ + int apr_ref; /* +1 per rule referencing it */ + int apr_flags; + int (* apr_init) __P((void)); + int (* apr_new) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); + int (* apr_inpkt) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); + int (* apr_outpkt) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); +} aproxy_t; + +#define APR_DELETE 1 + + +/* + * Real audio proxy structure and #defines + */ +typedef struct { + int rap_seenpna; + int rap_seenver; + int rap_version; + int rap_eos; /* End Of Startup */ + int rap_gotid; + int rap_gotlen; + int rap_mode; + int rap_sdone; + u_short rap_plport; + u_short rap_prport; + u_short rap_srport; + char rap_svr[19]; + u_32_t rap_sbf; /* flag to indicate which of the 19 bytes have + * been filled + */ + tcp_seq rap_sseq; +} raudio_t; + +#define RA_ID_END 0 +#define RA_ID_UDP 1 +#define RA_ID_ROBUST 7 + +#define RAP_M_UDP 1 +#define RAP_M_ROBUST 2 +#define RAP_M_TCP 4 +#define RAP_M_UDP_ROBUST (RAP_M_UDP|RAP_M_ROBUST) + + +extern ap_session_t *ap_sess_tab[AP_SESS_SIZE]; +extern ap_session_t *ap_sess_list; +extern aproxy_t ap_proxies[]; + +extern int appr_init __P((void)); +extern int appr_ok __P((ip_t *, tcphdr_t *, struct ipnat *)); +extern void appr_free __P((aproxy_t *)); +extern void aps_free __P((ap_session_t *)); +extern int appr_check __P((ip_t *, fr_info_t *, struct nat *)); +extern aproxy_t *appr_match __P((u_int, char *)); + +#endif /* __IP_PROXY_H__ */ diff --git a/sys/netinet/ip_raudio_pxy.c b/sys/netinet/ip_raudio_pxy.c new file mode 100644 index 0000000..b76eea5 --- /dev/null +++ b/sys/netinet/ip_raudio_pxy.c @@ -0,0 +1,274 @@ +/* + * $Id$ + * $FreeBSD$ + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define IPF_RAUDIO_PROXY + + +int ippr_raudio_init __P((void)); +int ippr_raudio_new __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_raudio_in __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_raudio_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); + +static frentry_t raudiofr; + + +/* + * Real Audio application proxy initialization. + */ +int ippr_raudio_init() +{ + bzero((char *)&raudiofr, sizeof(raudiofr)); + raudiofr.fr_ref = 1; + raudiofr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +/* + * Setup for a new proxy to handle Real Audio. + */ +int ippr_raudio_new(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + raudio_t *rap; + + + KMALLOCS(aps->aps_data, void *, sizeof(raudio_t)); + if (aps->aps_data != NULL) { + bzero(aps->aps_data, sizeof(raudio_t)); + rap = aps->aps_data; + aps->aps_psiz = sizeof(raudio_t); + rap->rap_mode = RAP_M_TCP; /* default is for TCP */ + } + return 0; +} + + + +int ippr_raudio_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + char membuf[512 + 1], *s; + int off, dlen, inc = 0; + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + raudio_t *rap = aps->aps_data; + u_short sp, dp, id = 0; + struct in_addr swip; + fr_info_t fi; + int len = 0; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + /* + * If we've already processed the start messages, then nothing left + * for the proxy to do. + */ + if (rap->rap_eos == 1) + return 0; + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + bzero(membuf, sizeof(membuf)); +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen <= 0) + return 0; + copyout_mblk(m, off, MIN(sizeof(membuf), dlen), membuf); +#else + m = *(mb_t **)fin->fin_mp; + + dlen = mbufchainlen(m) - off; + if (dlen <= 0) + return 0; + m_copydata(m, off, MIN(sizeof(membuf), dlen), membuf); +#endif + /* + * In all the startup parsing, ensure that we don't go outside + * the packet buffer boundary. + */ + /* + * Look for the start of connection "PNA" string if not seen yet. + */ + if (rap->rap_seenpna == 0) { + s = memstr("PNA", membuf, 3, dlen); + if (s == NULL) + return 0; + s += 3; + rap->rap_seenpna = 1; + } else + s = membuf; + + /* + * Directly after the PNA will be the version number of this + * connection. + */ + if (rap->rap_seenpna == 1 && rap->rap_seenver == 0) { + if ((s + 1) - membuf < dlen) { + rap->rap_version = (*s << 8) | *(s + 1); + s += 2; + rap->rap_seenver = 1; + } else + return 0; + } + + /* + * Now that we've been past the PNA and version number, we're into the + * startup messages block. This ends when a message with an ID of 0. + */ + while ((rap->rap_eos == 0) && ((s + 1) - membuf < dlen)) { + if (rap->rap_gotid == 0) { + id = (*s << 8) | *(s + 1); + s += 2; + rap->rap_gotid = 1; + if (id == RA_ID_END) { + rap->rap_eos = 1; + break; + } + } else if (rap->rap_gotlen == 0) { + len = (*s << 8) | *(s + 1); + s += 2; + rap->rap_gotlen = 1; + } + + if (rap->rap_gotid == 1 && rap->rap_gotlen == 1) { + if (id == RA_ID_UDP) { + rap->rap_mode &= ~RAP_M_TCP; + rap->rap_mode |= RAP_M_UDP; + rap->rap_plport = (*s << 8) | *(s + 1); + } else if (id == RA_ID_ROBUST) { + rap->rap_mode |= RAP_M_ROBUST; + rap->rap_prport = (*s << 8) | *(s + 1); + } + s += len; + rap->rap_gotlen = 0; + rap->rap_gotid = 0; + } + } + + /* + * Wait until we've seen the end of the start messages and even then + * only proceed further if we're using UDP. + */ + if ((rap->rap_eos == 0) || ((rap->rap_mode & RAP_M_UDP) != RAP_M_UDP)) + return 0; + sp = rap->rap_plport; + dp = 0; + + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_sport = htons(sp); + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + tcp2->th_win = htons(8192); + fi.fin_dp = (char *)tcp2; + fi.fin_data[0] = sp; + fi.fin_data[1] = 0; + fi.fin_fr = &raudiofr; + swip = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_DPORT, NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + ip->ip_src = swip; + + if ((rap->rap_mode & RAP_M_UDP_ROBUST) == RAP_M_UDP_ROBUST) { + sp = rap->rap_prport; + } + return inc; +} + + +int ippr_raudio_in(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + char membuf[IPF_MAXPORTLEN + 1], *s; + int off, dlen; + raudio_t *rap = aps->aps_data; + u_int a1, a2, a3, a4; + tcphdr_t *tcp; + tcp_seq seq; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + if ((rap->rap_sdone != 0) || + ((rap->rap_mode & RAP_M_UDP_ROBUST) != RAP_M_UDP_ROBUST)) + return 0; + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + m = *(mb_t **)fin->fin_mp; + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen <= 0) + return 0; + bzero(membuf, sizeof(membuf)); + copyout_mblk(m, off, MIN(sizeof(membuf), dlen), membuf); +#else + dlen = mbufchainlen(m) - off; + if (dlen <= 0) + return 0; + bzero(membuf, sizeof(membuf)); + m_copydata(m, off, MIN(sizeof(membuf), dlen), membuf); +#endif + + seq = ntohl(tcp->th_seq); + /* + * Check to see if the data in this packet is of interest to us. + * We only care for the first 19 bytes coming back from the server. + */ + if (rap->rap_sseq == 0) { + s = memstr("PNA", membuf, 3, dlen); + if (s == NULL) + return 0; + a1 = s - membuf; + dlen -= a1; + a1 = 0; + rap->rap_sseq = seq; + a2 = MIN(dlen, sizeof(rap->rap_svr)); + } else if (seq <= rap->rap_sseq + sizeof(rap->rap_svr)) { + /* + * seq # which is the start of data and from that the offset + * into the buffer array. + */ + a1 = seq - rap->rap_sseq; + a2 = MIN(dlen, sizeof(rap->rap_svr)); + a2 -= a1; + s = membuf; + } else + return 0; + + for (a3 = a1, a4 = a2; a4 > 0; a4--, a3++) { + rap->rap_sbf |= (1 << a3); + rap->rap_svr[a3] = *s++; + } + if (rap->rap_sbf == 0x7ffff) { /* 19 bits */ + s = rap->rap_svr + 13; + rap->rap_srport = (*s << 8) | *(s + 1); + } + return 0; +} diff --git a/sys/netinet/ip_rcmd_pxy.c b/sys/netinet/ip_rcmd_pxy.c new file mode 100644 index 0000000..f9dc5b3 --- /dev/null +++ b/sys/netinet/ip_rcmd_pxy.c @@ -0,0 +1,160 @@ +/* + * $Id$ + * $FreeBSD$ + */ +/* + * Simple RCMD transparent proxy for in-kernel use. For use with the NAT + * code. + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define isdigit(x) ((x) >= '0' && (x) <= '9') + +#define IPF_RCMD_PROXY + + +int ippr_rcmd_init __P((void)); +int ippr_rcmd_new __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_rcmd_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +u_short ipf_rcmd_atoi __P((char *)); +int ippr_rcmd_portmsg __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); + +static frentry_t rcmdfr; + + +/* + * RCMD application proxy initialization. + */ +int ippr_rcmd_init() +{ + bzero((char *)&rcmdfr, sizeof(rcmdfr)); + rcmdfr.fr_ref = 1; + rcmdfr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +/* + * Setup for a new RCMD proxy. + */ +int ippr_rcmd_new(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + + aps->aps_psiz = sizeof(u_32_t); + KMALLOCS(aps->aps_data, u_32_t *, sizeof(u_32_t)); + if (aps->aps_data == NULL) + return -1; + *(u_32_t *)aps->aps_data = 0; + aps->aps_sport = tcp->th_sport; + aps->aps_dport = tcp->th_dport; + return 0; +} + + +/* + * ipf_rcmd_atoi - implement a simple version of atoi + */ +u_short ipf_rcmd_atoi(ptr) +char *ptr; +{ + register char *s = ptr, c; + register u_short i = 0; + + while ((c = *s++) && isdigit(c)) { + i *= 10; + i += c - '0'; + } + return i; +} + + +int ippr_rcmd_portmsg(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + char portbuf[8], *s; + struct in_addr swip; + u_short sp, dp; + int off, dlen; + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + fr_info_t fi; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + m = *(mb_t **)fin->fin_mp; + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + bzero(portbuf, sizeof(portbuf)); + copyout_mblk(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#else + dlen = mbufchainlen(m) - off; + bzero(portbuf, sizeof(portbuf)); + m_copydata(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#endif + if ((*(u_32_t *)aps->aps_data != 0) && + (tcp->th_seq != *(u_32_t *)aps->aps_data)) + return 0; + + portbuf[sizeof(portbuf) - 1] = '\0'; + s = portbuf; + sp = ipf_rcmd_atoi(s); + if (!sp) + return 0; + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = htons(sp); + dp = htons(fin->fin_data[1]); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp); + if (ipn == NULL) { + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = sp; + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + fi.fin_data[0] = ntohs(sp); + fi.fin_data[1] = 0; + fi.fin_dp = (char *)tcp2; + swip = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_DPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + fi.fin_fr = &rcmdfr; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + ip->ip_src = swip; + } + return 0; +} + + +int ippr_rcmd_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + return ippr_rcmd_portmsg(fin, ip, aps, nat); +} diff --git a/sys/netinet/ip_state.c b/sys/netinet/ip_state.c new file mode 100644 index 0000000..95e8793 --- /dev/null +++ b/sys/netinet/ip_state.c @@ -0,0 +1,1105 @@ +/* + * Copyright (C) 1995-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-1995 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.3.2.9 1999/10/21 14:31:09 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__) +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#else +# ifdef linux +# include <linux/kernel.h> +# include <linux/module.h> +# endif +#endif +#if defined(KERNEL) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#else +# include <sys/ioctl.h> +#endif +#include <sys/time.h> +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif + +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#ifndef linux +# include <netinet/ip_var.h> +# include <netinet/tcp_fsm.h> +#endif +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_state.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +#endif + +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +#define TCP_CLOSE (TH_FIN|TH_RST) + +static ipstate_t **ips_table = NULL; +static int ips_num = 0; +static ips_stat_t ips_stats; +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_state, ipf_mutex; +extern kmutex_t ipf_rw; +#endif + +static int fr_matchsrcdst __P((ipstate_t *, struct in_addr, struct in_addr, + fr_info_t *, tcphdr_t *)); +static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *)); +static int fr_state_flush __P((int)); +static ips_stat_t *fr_statetstats __P((void)); +static void fr_delstate __P((ipstate_t *)); + + +#define FIVE_DAYS (2 * 5 * 86400) /* 5 days: half closed session */ + +#define TCP_MSL 240 /* 2 minutes */ +u_long fr_tcpidletimeout = FIVE_DAYS, + fr_tcpclosewait = 2 * TCP_MSL, + fr_tcplastack = 2 * TCP_MSL, + fr_tcptimeout = 2 * TCP_MSL, + fr_tcpclosed = 1, + fr_udptimeout = 240, + fr_icmptimeout = 120; +int fr_statemax = IPSTATE_MAX, + fr_statesize = IPSTATE_SIZE; +int fr_state_doflush = 0; + + +int fr_stateinit() +{ + KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *)); + if (ips_table != NULL) + bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *)); + else + return -1; + return 0; +} + + +static ips_stat_t *fr_statetstats() +{ + ips_stats.iss_active = ips_num; + ips_stats.iss_table = ips_table; + return &ips_stats; +} + + +/* + * flush state tables. two actions currently defined: + * which == 0 : flush all state table entries + * which == 1 : flush TCP connections which have started to close but are + * stuck for some reason. + */ +static int fr_state_flush(which) +int which; +{ + register int i; + register ipstate_t *is, **isp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + int delete, removed = 0; + + SPL_NET(s); + WRITE_ENTER(&ipf_state); + for (i = fr_statesize - 1; i >= 0; i--) + for (isp = &ips_table[i]; (is = *isp); ) { + delete = 0; + + switch (which) + { + case 0 : + delete = 1; + break; + case 1 : + if ((is->is_p == IPPROTO_TCP) && + (((is->is_state[0] <= TCPS_ESTABLISHED) && + (is->is_state[1] > TCPS_ESTABLISHED)) || + ((is->is_state[1] <= TCPS_ESTABLISHED) && + (is->is_state[0] > TCPS_ESTABLISHED)))) + delete = 1; + break; + } + + if (delete) { + *isp = is->is_next; + if (is->is_p == IPPROTO_TCP) + ips_stats.iss_fin++; + else + ips_stats.iss_expire++; + if (ips_table[i] == NULL) + ips_stats.iss_inuse--; +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_FLUSH); +#endif + fr_delstate(is); + ips_num--; + removed++; + } else + isp = &is->is_next; + } + if (fr_state_doflush) { + (void) fr_state_flush(1); + fr_state_doflush = 0; + } + RWLOCK_EXIT(&ipf_state); + SPL_X(s); + return removed; +} + + +int fr_state_ioctl(data, cmd, mode) +caddr_t data; +#if defined(__NetBSD__) || defined(__OpenBSD__) +u_long cmd; +#else +int cmd; +#endif +int mode; +{ + int arg, ret, error = 0; + + switch (cmd) + { + case SIOCIPFFL : + IRCOPY(data, (caddr_t)&arg, sizeof(arg)); + if (arg == 0 || arg == 1) { + ret = fr_state_flush(arg); + IWCOPY((caddr_t)&ret, data, sizeof(ret)); + } else + error = EINVAL; + break; + case SIOCGIPST : + IWCOPY((caddr_t)fr_statetstats(), data, sizeof(ips_stat_t)); + break; + case FIONREAD : +#ifdef IPFILTER_LOG + IWCOPY((caddr_t)&iplused[IPL_LOGSTATE], (caddr_t)data, + sizeof(iplused[IPL_LOGSTATE])); +#endif + break; + default : + error = EINVAL; + break; + } + return error; +} + + +/* + * Create a new ipstate structure and hang it off the hash table. + */ +ipstate_t *fr_addstate(ip, fin, flags) +ip_t *ip; +fr_info_t *fin; +u_int flags; +{ + register ipstate_t *is; + register u_int hv; + ipstate_t ips; + u_int pass; + + if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT)) + return NULL; + if (ips_num == fr_statemax) { + ips_stats.iss_max++; + fr_state_doflush = 1; + return NULL; + } + is = &ips; + bzero((char *)is, sizeof(*is)); + ips.is_age = 1; + ips.is_state[0] = 0; + ips.is_state[1] = 0; + /* + * Copy and calculate... + */ + hv = (is->is_p = ip->ip_p); + hv += (is->is_src.s_addr = ip->ip_src.s_addr); + hv += (is->is_dst.s_addr = ip->ip_dst.s_addr); + + switch (ip->ip_p) + { + case IPPROTO_ICMP : + { + struct icmp *ic = (struct icmp *)fin->fin_dp; + + switch (ic->icmp_type) + { + case ICMP_ECHO : + is->is_icmp.ics_type = ICMP_ECHOREPLY; /* XXX */ + hv += (is->is_icmp.ics_id = ic->icmp_id); + hv += (is->is_icmp.ics_seq = ic->icmp_seq); + break; + case ICMP_TSTAMP : + case ICMP_IREQ : + case ICMP_MASKREQ : + is->is_icmp.ics_type = ic->icmp_type + 1; + break; + default : + return NULL; + } + ATOMIC_INC(ips_stats.iss_icmp); + is->is_age = fr_icmptimeout; + break; + } + case IPPROTO_TCP : + { + register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + + /* + * The endian of the ports doesn't matter, but the ack and + * sequence numbers do as we do mathematics on them later. + */ + is->is_dport = tcp->th_dport; + is->is_sport = tcp->th_sport; + if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { + hv += tcp->th_dport; + hv += tcp->th_sport; + } + if (tcp->th_seq != 0) { + is->is_send = ntohl(tcp->th_seq) + ip->ip_len - + fin->fin_hlen - (tcp->th_off << 2) + + ((tcp->th_flags & TH_SYN) ? 1 : 0) + + ((tcp->th_flags & TH_FIN) ? 1 : 0); + is->is_maxsend = is->is_send + 1; + } + is->is_dend = 0; + is->is_maxswin = ntohs(tcp->th_win); + if (is->is_maxswin == 0) + is->is_maxswin = 1; + /* + * If we're creating state for a starting connection, start the + * timer on it as we'll never see an error if it fails to + * connect. + */ + MUTEX_ENTER(&ipf_rw); + ips_stats.iss_tcp++; + fr_tcp_age(&is->is_age, is->is_state, ip, fin, + tcp->th_sport == is->is_sport); + MUTEX_EXIT(&ipf_rw); + break; + } + case IPPROTO_UDP : + { + register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + + if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { + hv += (is->is_dport = tcp->th_dport); + hv += (is->is_sport = tcp->th_sport); + } + ATOMIC_INC(ips_stats.iss_udp); + is->is_age = fr_udptimeout; + break; + } + default : + return NULL; + } + + KMALLOC(is, ipstate_t *); + if (is == NULL) { + ATOMIC_INC(ips_stats.iss_nomem); + return NULL; + } + bcopy((char *)&ips, (char *)is, sizeof(*is)); + hv %= fr_statesize; + RW_UPGRADE(&ipf_mutex); + is->is_rule = fin->fin_fr; + if (is->is_rule != NULL) { + is->is_rule->fr_ref++; + pass = is->is_rule->fr_flags; + } else + pass = fr_flags; + MUTEX_DOWNGRADE(&ipf_mutex); + WRITE_ENTER(&ipf_state); + + is->is_rout = pass & FR_OUTQUE ? 1 : 0; + is->is_pass = pass; + is->is_pkts = 1; + is->is_bytes = ip->ip_len; + /* + * We want to check everything that is a property of this packet, + * but we don't (automatically) care about it's fragment status as + * this may change. + */ + is->is_opt = fin->fin_fi.fi_optmsk; + is->is_optmsk = 0xffffffff; + is->is_sec = fin->fin_fi.fi_secmsk; + is->is_secmsk = 0xffff; + is->is_auth = fin->fin_fi.fi_auth; + is->is_authmsk = 0xffff; + is->is_flags = fin->fin_fi.fi_fl & FI_CMP; + is->is_flags |= FI_CMP << 4; + is->is_flags |= flags & (FI_W_DPORT|FI_W_SPORT); + /* + * add into table. + */ + is->is_next = ips_table[hv]; + ips_table[hv] = is; + if (is->is_next == NULL) + ips_stats.iss_inuse++; + if (fin->fin_out) { + is->is_ifpin = NULL; + is->is_ifpout = fin->fin_ifp; + } else { + is->is_ifpin = fin->fin_ifp; + is->is_ifpout = NULL; + } + if (pass & FR_LOGFIRST) + is->is_pass &= ~(FR_LOGFIRST|FR_LOG); + ATOMIC_INC(ips_num); +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_NEW); +#endif + RWLOCK_EXIT(&ipf_state); + fin->fin_rev = (is->is_dst.s_addr != ip->ip_dst.s_addr); + if (fin->fin_fi.fi_fl & FI_FRAG) + ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); + return is; +} + + + +/* + * check to see if a packet with TCP headers fits within the TCP window. + * change timeout depending on whether new packet is a SYN-ACK returning for a + * SYN or a RST or FIN which indicate time to close up shop. + */ +int fr_tcpstate(is, fin, ip, tcp) +register ipstate_t *is; +fr_info_t *fin; +ip_t *ip; +tcphdr_t *tcp; +{ + register tcp_seq seq, ack, end; + register int ackskew; + tcpdata_t *fdata, *tdata; + u_short win, maxwin; + int ret = 0; + int source; + + /* + * Find difference between last checked packet and this packet. + */ + source = (ip->ip_src.s_addr == is->is_src.s_addr); + fdata = &is->is_tcp.ts_data[!source]; + tdata = &is->is_tcp.ts_data[source]; + seq = ntohl(tcp->th_seq); + ack = ntohl(tcp->th_ack); + win = ntohs(tcp->th_win); + end = seq + ip->ip_len - fin->fin_hlen - (tcp->th_off << 2) + + ((tcp->th_flags & TH_SYN) ? 1 : 0) + + ((tcp->th_flags & TH_FIN) ? 1 : 0); + + if (fdata->td_end == 0) { + /* + * Must be a (outgoing) SYN-ACK in reply to a SYN. + */ + fdata->td_end = end; + fdata->td_maxwin = 1; + fdata->td_maxend = end + 1; + } + + if (!(tcp->th_flags & TH_ACK)) { /* Pretend an ack was sent */ + ack = tdata->td_end; + win = 1; + } else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) && + (ack == 0)) { + /* gross hack to get around certain broken tcp stacks */ + ack = tdata->td_end; + } + + if (seq == end) + seq = end = fdata->td_end; + + maxwin = tdata->td_maxwin; + ackskew = tdata->td_end - ack; + +#define SEQ_GE(a,b) ((int)((a) - (b)) >= 0) +#define SEQ_GT(a,b) ((int)((a) - (b)) > 0) + if ((SEQ_GE(fdata->td_maxend, end)) && + (SEQ_GE(seq + maxwin, fdata->td_end - maxwin)) && +/* XXX what about big packets */ +#define MAXACKWINDOW 66000 + (ackskew >= -MAXACKWINDOW) && + (ackskew <= MAXACKWINDOW)) { + /* if ackskew < 0 then this should be due to fragented + * packets. There is no way to know the length of the + * total packet in advance. + * We do know the total length from the fragment cache though. + * Note however that there might be more sessions with + * exactly the same source and destination paramters in the + * state cache (and source and destination is the only stuff + * that is saved in the fragment cache). Note further that + * some TCP connections in the state cache are hashed with + * sport and dport as well which makes it not worthwhile to + * look for them. + * Thus, when ackskew is negative but still seems to belong + * to this session, we bump up the destinations end value. + */ + if (ackskew < 0) + tdata->td_end = ack; + + /* update max window seen */ + if (fdata->td_maxwin < win) + fdata->td_maxwin = win; + if (SEQ_GT(end, fdata->td_end)) + fdata->td_end = end; + if (SEQ_GE(ack + win, tdata->td_maxend)) { + tdata->td_maxend = ack + win; + if (win == 0) + tdata->td_maxend++; + } + + ATOMIC_INC(ips_stats.iss_hits); + is->is_pkts++; + is->is_bytes += ip->ip_len; + /* + * Nearing end of connection, start timeout. + */ + MUTEX_ENTER(&ipf_rw); + fr_tcp_age(&is->is_age, is->is_state, ip, fin, source); + MUTEX_EXIT(&ipf_rw); + ret = 1; + } + return ret; +} + + +static int fr_matchsrcdst(is, src, dst, fin, tcp) +ipstate_t *is; +struct in_addr src, dst; +fr_info_t *fin; +tcphdr_t *tcp; +{ + int ret = 0, rev, out, flags; + u_short sp, dp; + void *ifp; + + rev = fin->fin_rev = (is->is_dst.s_addr != dst.s_addr); + ifp = fin->fin_ifp; + out = fin->fin_out; + + if (tcp != NULL) { + flags = is->is_flags; + sp = tcp->th_sport; + dp = tcp->th_dport; + } else { + flags = 0; + sp = 0; + dp = 0; + } + + if (rev == 0) { + if (!out) { + if (is->is_ifpin == ifp) + ret = 1; + } else { + if (is->is_ifpout == NULL || is->is_ifpout == ifp) + ret = 1; + } + } else { + if (out) { + if (is->is_ifpin == ifp) + ret = 1; + } else { + if (is->is_ifpout == NULL || is->is_ifpout == ifp) + ret = 1; + } + } + if (ret == 0) + return 0; + ret = 0; + + if (rev == 0) { + if ((is->is_dst.s_addr == dst.s_addr) && + (is->is_src.s_addr == src.s_addr) && + (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) && + (dp == is->is_dport || flags & FI_W_DPORT)))) { + ret = 1; + } + } else { + if ((is->is_dst.s_addr == src.s_addr) && + (is->is_src.s_addr == dst.s_addr) && + (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) && + (dp == is->is_sport || flags & FI_W_SPORT)))) { + ret = 1; + } + } + if (ret == 0) + return 0; + + /* + * Whether or not this should be here, is questionable, but the aim + * is to get this out of the main line. + */ + if (tcp == NULL) + flags = is->is_flags & (FI_CMP|(FI_CMP<<4)); + + if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) || + ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) || + ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) || + ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth)) + return 0; + + if ((flags & (FI_W_SPORT|FI_W_DPORT))) { + if ((flags & FI_W_SPORT) != 0) { + if (rev == 0) { + is->is_sport = sp; + is->is_send = htonl(tcp->th_seq); + } else { + is->is_sport = dp; + is->is_send = htonl(tcp->th_ack); + } + is->is_maxsend = is->is_send + 1; + } else if ((flags & FI_W_DPORT) != 0) { + if (rev == 0) { + is->is_dport = dp; + is->is_dend = htonl(tcp->th_ack); + } else { + is->is_dport = sp; + is->is_dend = htonl(tcp->th_seq); + } + is->is_maxdend = is->is_dend + 1; + } + is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT); + } + + if (!rev) { + if (out && (out == is->is_rout)) { + if (!is->is_ifpout) + is->is_ifpout = ifp; + } else { + if (!is->is_ifpin) + is->is_ifpin = ifp; + } + } else { + if (!out && (out != is->is_rout)) { + if (!is->is_ifpin) + is->is_ifpin = ifp; + } else { + if (!is->is_ifpout) + is->is_ifpout = ifp; + } + } + return 1; +} + +frentry_t *fr_checkicmpmatchingstate(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register struct in_addr dst, src; + register ipstate_t *is, **isp; + register u_short sport, dport; + register u_char pr; + struct icmp *ic; + fr_info_t ofin; + u_int hv, dest; + tcphdr_t *tcp; + frentry_t *fr; + ip_t *oip; + int type; + + /* + * Does it at least have the return (basic) IP header ? + * Only a basic IP header (no options) should be with + * an ICMP error header. + */ + if ((ip->ip_hl != 5) || (ip->ip_len < ICMPERR_MINPKTLEN)) + return NULL; + ic = (struct icmp *)((char *)ip + fin->fin_hlen); + type = ic->icmp_type; + /* + * If it's not an error type, then return + */ + if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && + (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && + (type != ICMP_PARAMPROB)) + return NULL; + + oip = (ip_t *)((char *)fin->fin_dp + ICMPERR_ICMPHLEN); + if (ip->ip_len < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2)) + return NULL; + if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP)) + return NULL; + + tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); + dport = tcp->th_dport; + sport = tcp->th_sport; + + hv = (pr = oip->ip_p); + hv += (src.s_addr = oip->ip_src.s_addr); + hv += (dst.s_addr = oip->ip_dst.s_addr); + hv += dport; + hv += sport; + hv %= fr_statesize; + /* + * we make an fin entry to be able to feed it to + * matchsrcdst note that not all fields are encessary + * but this is the cleanest way. Note further we fill + * in fin_mp such that if someone uses it we'll get + * a kernel panic. fr_matchsrcdst does not use this. + * + * watch out here, as ip is in host order and oip in network + * order. Any change we make must be undone afterwards. + */ + oip->ip_len = ntohs(oip->ip_len); + fr_makefrip(oip->ip_hl << 2, oip, &ofin); + oip->ip_len = htons(oip->ip_len); + ofin.fin_ifp = fin->fin_ifp; + ofin.fin_out = !fin->fin_out; + ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) { + /* + * Only allow this icmp though if the + * encapsulated packet was allowed through the + * other way around. Note that the minimal amount + * of info present does not allow for checking against + * tcp internals such as seq and ack numbers. + */ + if ((is->is_p == pr) && + fr_matchsrcdst(is, src, dst, &ofin, tcp)) { + fr = is->is_rule; + ips_stats.iss_hits++; + /* + * we must swap src and dst here because the icmp + * comes the other way around + */ + dest = (is->is_dst.s_addr != src.s_addr); + is->is_pkts++; + is->is_bytes += ip->ip_len; + /* + * we deliberately do not touch the timeouts + * for the accompanying state table entry. + * It remains to be seen if that is correct. XXX + */ + RWLOCK_EXIT(&ipf_state); + return fr; + } + } + RWLOCK_EXIT(&ipf_state); + return NULL; +} + +/* + * Check if a packet has a registered state. + */ +frentry_t *fr_checkstate(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register struct in_addr dst, src; + register ipstate_t *is, **isp; + register u_char pr; + u_int hv, hvm, hlen, tryagain, pass; + struct icmp *ic; + frentry_t *fr; + tcphdr_t *tcp; + + if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT)) + return NULL; + + is = NULL; + hlen = fin->fin_hlen; + tcp = (tcphdr_t *)((char *)ip + hlen); + ic = (struct icmp *)tcp; + hv = (pr = ip->ip_p); + hv += (src.s_addr = ip->ip_src.s_addr); + hv += (dst.s_addr = ip->ip_dst.s_addr); + + /* + * Search the hash table for matching packet header info. + */ + switch (ip->ip_p) + { + case IPPROTO_ICMP : + hv += ic->icmp_id; + hv += ic->icmp_seq; + hv %= fr_statesize; + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) + if ((is->is_p == pr) && + (ic->icmp_id == is->is_icmp.ics_id) && + (ic->icmp_seq == is->is_icmp.ics_seq) && + fr_matchsrcdst(is, src, dst, fin, NULL)) { + if ((is->is_type == ICMP_ECHOREPLY) && + (ic->icmp_type == ICMP_ECHO)) + ; + else if (is->is_type != ic->icmp_type) + continue; + is->is_age = fr_icmptimeout; + break; + } + if (is != NULL) + break; + RWLOCK_EXIT(&ipf_state); + /* + * No matching icmp state entry. Perhaps this is a + * response to another state entry. + */ + fr = fr_checkicmpmatchingstate(ip, fin); + if (fr) + return fr; + break; + case IPPROTO_TCP : + { + register u_short dport = tcp->th_dport, sport = tcp->th_sport; + + tryagain = 0; +retry_tcp: + hvm = hv % fr_statesize; + WRITE_ENTER(&ipf_state); + for (isp = &ips_table[hvm]; (is = *isp); + isp = &is->is_next) + if ((is->is_p == pr) && + fr_matchsrcdst(is, src, dst, fin, tcp)) { + if (fr_tcpstate(is, fin, ip, tcp)) { + break; +#ifndef _KERNEL + if (tcp->th_flags & TCP_CLOSE) { + *isp = is->is_next; + isp = &ips_table[hvm]; + if (ips_table[hvm] == NULL) + ips_stats.iss_inuse--; + fr_delstate(is); + ips_num--; + } +#endif + break; + } + is = NULL; + break; + } + if (is != NULL) + break; + RWLOCK_EXIT(&ipf_state); + hv += dport; + hv += sport; + if (tryagain == 0) { + tryagain = 1; + goto retry_tcp; + } + break; + } + case IPPROTO_UDP : + { + register u_short dport = tcp->th_dport, sport = tcp->th_sport; + + tryagain = 0; +retry_udp: + hvm = hv % fr_statesize; + /* + * Nothing else to match on but ports. and IP#'s + */ + READ_ENTER(&ipf_state); + for (is = ips_table[hvm]; is; is = is->is_next) + if ((is->is_p == pr) && + fr_matchsrcdst(is, src, dst, fin, tcp)) { + is->is_age = fr_udptimeout; + break; + } + if (is != NULL) + break; + RWLOCK_EXIT(&ipf_state); + hv += dport; + hv += sport; + if (tryagain == 0) { + tryagain = 1; + goto retry_udp; + } + break; + } + default : + break; + } + if (is == NULL) { + ATOMIC_INC(ips_stats.iss_miss); + return NULL; + } + MUTEX_ENTER(&ipf_rw); + is->is_bytes += ip->ip_len; + ips_stats.iss_hits++; + is->is_pkts++; + MUTEX_EXIT(&ipf_rw); + fr = is->is_rule; + fin->fin_fr = fr; + pass = is->is_pass; + RWLOCK_EXIT(&ipf_state); + if (fin->fin_fi.fi_fl & FI_FRAG) + ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); + return fr; +} + + +static void fr_delstate(is) +ipstate_t *is; +{ + frentry_t *fr; + + fr = is->is_rule; + if (fr != NULL) { + ATOMIC_DEC(fr->fr_ref); + if (fr->fr_ref == 0) + KFREE(fr); + } + KFREE(is); +} + + +/* + * Free memory in use by all state info. kept. + */ +void fr_stateunload() +{ + register int i; + register ipstate_t *is, **isp; + + WRITE_ENTER(&ipf_state); + for (i = fr_statesize - 1; i >= 0; i--) + for (isp = &ips_table[i]; (is = *isp); ) { + *isp = is->is_next; + fr_delstate(is); + ips_num--; + } + ips_stats.iss_inuse = 0; + ips_num = 0; + RWLOCK_EXIT(&ipf_state); + KFREES(ips_table, fr_statesize * sizeof(ipstate_t *)); + ips_table = NULL; +} + + +/* + * Slowly expire held state for thingslike UDP and ICMP. Timeouts are set + * in expectation of this being called twice per second. + */ +void fr_timeoutstate() +{ + register int i; + register ipstate_t *is, **isp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_state); + for (i = fr_statesize - 1; i >= 0; i--) + for (isp = &ips_table[i]; (is = *isp); ) + if (is->is_age && !--is->is_age) { + *isp = is->is_next; + if (is->is_p == IPPROTO_TCP) + ips_stats.iss_fin++; + else + ips_stats.iss_expire++; + if (ips_table[i] == NULL) + ips_stats.iss_inuse--; +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_EXPIRE); +#endif + fr_delstate(is); + ips_num--; + } else + isp = &is->is_next; + RWLOCK_EXIT(&ipf_state); + SPL_X(s); +} + + +/* + * Original idea freom Pradeep Krishnan for use primarily with NAT code. + * (pkrishna@netcom.com) + */ +void fr_tcp_age(age, state, ip, fin, dir) +u_long *age; +u_char *state; +ip_t *ip; +fr_info_t *fin; +int dir; +{ + tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + u_char flags = tcp->th_flags; + int dlen, ostate; + + ostate = state[1 - dir]; + + dlen = ip->ip_len - fin->fin_hlen - (tcp->th_off << 2); + + if (flags & TH_RST) { + if (!(tcp->th_flags & TH_PUSH) && !dlen) { + *age = fr_tcpclosed; + state[dir] = TCPS_CLOSED; + } else { + *age = fr_tcpclosewait; + state[dir] = TCPS_CLOSE_WAIT; + } + return; + } + + *age = fr_tcptimeout; /* 1 min */ + + switch(state[dir]) + { + case TCPS_CLOSED: + if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) { + state[dir] = TCPS_ESTABLISHED; + *age = fr_tcpidletimeout; + } + case TCPS_FIN_WAIT_2: + if ((flags & TH_OPENING) == TH_OPENING) + state[dir] = TCPS_SYN_RECEIVED; + else if (flags & TH_SYN) + state[dir] = TCPS_SYN_SENT; + break; + case TCPS_SYN_RECEIVED: + case TCPS_SYN_SENT: + if ((flags & (TH_FIN|TH_ACK)) == TH_ACK) { + state[dir] = TCPS_ESTABLISHED; + *age = fr_tcpidletimeout; + } else if ((flags & (TH_FIN|TH_ACK)) == (TH_FIN|TH_ACK)) { + state[dir] = TCPS_CLOSE_WAIT; + if (!(flags & TH_PUSH) && !dlen && + ostate > TCPS_ESTABLISHED) + *age = fr_tcplastack; + else + *age = fr_tcpclosewait; + } + break; + case TCPS_ESTABLISHED: + if (flags & TH_FIN) { + state[dir] = TCPS_CLOSE_WAIT; + if (!(flags & TH_PUSH) && !dlen && + ostate > TCPS_ESTABLISHED) + *age = fr_tcplastack; + else + *age = fr_tcpclosewait; + } else { + if (ostate < TCPS_CLOSE_WAIT) + *age = fr_tcpidletimeout; + } + break; + case TCPS_CLOSE_WAIT: + if ((flags & TH_FIN) && !(flags & TH_PUSH) && !dlen && + ostate > TCPS_ESTABLISHED) { + *age = fr_tcplastack; + state[dir] = TCPS_LAST_ACK; + } else + *age = fr_tcpclosewait; + break; + case TCPS_LAST_ACK: + if (flags & TH_ACK) { + state[dir] = TCPS_FIN_WAIT_2; + if (!(flags & TH_PUSH) && !dlen && + ostate > TCPS_ESTABLISHED) + *age = fr_tcplastack; + else { + *age = fr_tcpclosewait; + state[dir] = TCPS_CLOSE_WAIT; + } + } + break; + } +} + + +#ifdef IPFILTER_LOG +void ipstate_log(is, type) +struct ipstate *is; +u_int type; +{ + struct ipslog ipsl; + void *items[1]; + size_t sizes[1]; + int types[1]; + + ipsl.isl_type = type; + ipsl.isl_pkts = is->is_pkts; + ipsl.isl_bytes = is->is_bytes; + ipsl.isl_src = is->is_src; + ipsl.isl_dst = is->is_dst; + ipsl.isl_p = is->is_p; + ipsl.isl_flags = is->is_flags; + if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) { + ipsl.isl_sport = is->is_sport; + ipsl.isl_dport = is->is_dport; + if (ipsl.isl_p == IPPROTO_TCP) { + ipsl.isl_state[0] = is->is_state[0]; + ipsl.isl_state[1] = is->is_state[1]; + } + } else if (ipsl.isl_p == IPPROTO_ICMP) + ipsl.isl_itype = is->is_icmp.ics_type; + else { + ipsl.isl_ps.isl_filler[0] = 0; + ipsl.isl_ps.isl_filler[1] = 0; + } + items[0] = &ipsl; + sizes[0] = sizeof(ipsl); + types[0] = 0; + + (void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1); +} +#endif diff --git a/sys/netinet/ip_state.h b/sys/netinet/ip_state.h new file mode 100644 index 0000000..1b7c392 --- /dev/null +++ b/sys/netinet/ip_state.h @@ -0,0 +1,165 @@ +/* + * Copyright (C) 1995-1998 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_state.h 1.3 1/12/96 (C) 1995 Darren Reed + * $Id: ip_state.h,v 2.1 1999/08/04 17:30:00 darrenr Exp $ + * $FreeBSD$ + */ +#ifndef __IP_STATE_H__ +#define __IP_STATE_H__ + +#define IPSTATE_SIZE 257 +#define IPSTATE_MAX 2048 /* Maximum number of states held */ + +#define PAIRS(s1,d1,s2,d2) ((((s1) == (s2)) && ((d1) == (d2))) ||\ + (((s1) == (d2)) && ((d1) == (s2)))) +#define IPPAIR(s1,d1,s2,d2) PAIRS((s1).s_addr, (d1).s_addr, \ + (s2).s_addr, (d2).s_addr) + + +typedef struct udpstate { + u_short us_sport; + u_short us_dport; +} udpstate_t; + +typedef struct icmpstate { + u_short ics_id; + u_short ics_seq; + u_char ics_type; +} icmpstate_t; + +typedef struct tcpdata { + u_32_t td_end; + u_32_t td_maxend; + u_short td_maxwin; +} tcpdata_t; + +typedef struct tcpstate { + u_short ts_sport; + u_short ts_dport; + tcpdata_t ts_data[2]; + u_char ts_state[2]; +} tcpstate_t; + +typedef struct ipstate { + struct ipstate *is_next; + u_long is_age; + u_int is_pass; + U_QUAD_T is_pkts; + U_QUAD_T is_bytes; + void *is_ifpin; + void *is_ifpout; + frentry_t *is_rule; + struct in_addr is_src; + struct in_addr is_dst; + u_char is_p; /* Protocol */ + u_char is_rout; /* Is rule in/out ? */ + u_32_t is_flags; + u_32_t is_opt; /* packet options set */ + u_32_t is_optmsk; /* " " mask */ + u_short is_sec; /* security options set */ + u_short is_secmsk; /* " " mask */ + u_short is_auth; /* authentication options set */ + u_short is_authmsk; /* " " mask */ + union { + icmpstate_t is_ics; + tcpstate_t is_ts; + udpstate_t is_us; + } is_ps; +} ipstate_t; + +#define is_icmp is_ps.is_ics +#define is_type is_icmp.ics_type +#define is_code is_icmp.ics_code +#define is_tcp is_ps.is_ts +#define is_udp is_ps.is_us +#define is_send is_tcp.ts_data[0].td_end +#define is_dend is_tcp.ts_data[1].td_end +#define is_maxswin is_tcp.ts_data[0].td_maxwin +#define is_maxdwin is_tcp.ts_data[1].td_maxwin +#define is_maxsend is_tcp.ts_data[0].td_maxend +#define is_maxdend is_tcp.ts_data[1].td_maxend +#define is_sport is_tcp.ts_sport +#define is_dport is_tcp.ts_dport +#define is_state is_tcp.ts_state + +#define TH_OPENING (TH_SYN|TH_ACK) +/* + * is_flags: + * Bits 0 - 3 are use as a mask with the current packet's bits to check for + * whether it is short, tcp/udp, a fragment or the presence of IP options. + * Bits 4 - 7 are set from the initial packet and contain what the packet + * anded with bits 0-3 must match. + * Bits 8,9 are used to indicate wildcard source/destination port matching. + */ + + +typedef struct ipslog { + U_QUAD_T isl_pkts; + U_QUAD_T isl_bytes; + struct in_addr isl_src; + struct in_addr isl_dst; + u_char isl_p; + u_char isl_flags; + u_char isl_state[2]; + u_short isl_type; + union { + u_short isl_filler[2]; + u_short isl_ports[2]; + u_short isl_icmp; + } isl_ps; +} ipslog_t; + +#define isl_sport isl_ps.isl_ports[0] +#define isl_dport isl_ps.isl_ports[1] +#define isl_itype isl_ps.isl_icmp + +#define ISL_NEW 0 +#define ISL_EXPIRE 0xffff +#define ISL_FLUSH 0xfffe + + +typedef struct ips_stat { + u_long iss_hits; + u_long iss_miss; + u_long iss_max; + u_long iss_tcp; + u_long iss_udp; + u_long iss_icmp; + u_long iss_nomem; + u_long iss_expire; + u_long iss_fin; + u_long iss_active; + u_long iss_logged; + u_long iss_logfail; + u_long iss_inuse; + ipstate_t **iss_table; +} ips_stat_t; + + +extern u_long fr_tcpidletimeout; +extern u_long fr_tcpclosewait; +extern u_long fr_tcplastack; +extern u_long fr_tcptimeout; +extern u_long fr_tcpclosed; +extern u_long fr_udptimeout; +extern u_long fr_icmptimeout; +extern int fr_stateinit __P((void)); +extern int fr_tcpstate __P((ipstate_t *, fr_info_t *, ip_t *, tcphdr_t *)); +extern ipstate_t *fr_addstate __P((ip_t *, fr_info_t *, u_int)); +extern frentry_t *fr_checkstate __P((ip_t *, fr_info_t *)); +extern void fr_timeoutstate __P((void)); +extern void fr_tcp_age __P((u_long *, u_char *, ip_t *, fr_info_t *, int)); +extern void fr_stateunload __P((void)); +extern void ipstate_log __P((struct ipstate *, u_int)); +#if defined(__NetBSD__) || defined(__OpenBSD__) +extern int fr_state_ioctl __P((caddr_t, u_long, int)); +#else +extern int fr_state_ioctl __P((caddr_t, int, int)); +#endif + +#endif /* __IP_STATE_H__ */ diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h new file mode 100644 index 0000000..c56a340 --- /dev/null +++ b/sys/netinet/ip_var.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_VAR_H_ +#define _NETINET_IP_VAR_H_ + +/* + * Overlay for ip header used by other protocols (tcp, udp). + */ +struct ipovly { + u_char ih_x1[9]; /* (unused) */ + u_char ih_pr; /* protocol */ + u_short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; + +/* + * Ip reassembly queue structure. Each fragment + * being reassembled is attached to one of these structures. + * They are timed out after ipq_ttl drops to 0, and may also + * be reclaimed if memory becomes tight. + */ +struct ipq { + struct ipq *next,*prev; /* to other reass headers */ + u_char ipq_ttl; /* time for reass q to live */ + u_char ipq_p; /* protocol of this fragment */ + u_short ipq_id; /* sequence id for reassembly */ + struct mbuf *ipq_frags; /* to ip headers of fragments */ + struct in_addr ipq_src,ipq_dst; +#ifdef IPDIVERT + u_int32_t ipq_div_info; /* ipfw divert port & flags */ + u_int16_t ipq_div_cookie; /* ipfw divert cookie */ +#endif +}; + +/* + * Structure stored in mbuf in inpcb.ip_options + * and passed to ip_output when ip options are in use. + * The actual length of the options (including ipopt_dst) + * is in m_len. + */ +#define MAX_IPOPTLEN 40 + +struct ipoption { + struct in_addr ipopt_dst; /* first-hop dst if source routed */ + char ipopt_list[MAX_IPOPTLEN]; /* options proper */ +}; + +/* + * Structure attached to inpcb.ip_moptions and + * passed to ip_output when IP multicast options are in use. + */ +struct ip_moptions { + struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ + u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ + u_char imo_multicast_loop; /* 1 => hear sends if a member */ + u_short imo_num_memberships; /* no. memberships this socket */ + struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; + u_long imo_multicast_vif; /* vif num outgoing multicasts */ +}; + +struct ipstat { + u_long ips_total; /* total packets received */ + u_long ips_badsum; /* checksum bad */ + u_long ips_tooshort; /* packet too short */ + u_long ips_toosmall; /* not enough data */ + u_long ips_badhlen; /* ip header length < data size */ + u_long ips_badlen; /* ip length < ip header length */ + u_long ips_fragments; /* fragments received */ + u_long ips_fragdropped; /* frags dropped (dups, out of space) */ + u_long ips_fragtimeout; /* fragments timed out */ + u_long ips_forward; /* packets forwarded */ + u_long ips_fastforward; /* packets fast forwarded */ + u_long ips_cantforward; /* packets rcvd for unreachable dest */ + u_long ips_redirectsent; /* packets forwarded on same net */ + u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_delivered; /* datagrams delivered to upper level*/ + u_long ips_localout; /* total ip packets generated here */ + u_long ips_odropped; /* lost packets due to nobufs, etc. */ + u_long ips_reassembled; /* total packets reassembled ok */ + u_long ips_fragmented; /* datagrams successfully fragmented */ + u_long ips_ofragments; /* output fragments created */ + u_long ips_cantfrag; /* don't fragment flag was set, etc. */ + u_long ips_badoptions; /* error in option processing */ + u_long ips_noroute; /* packets discarded due to no route */ + u_long ips_badvers; /* ip version != 4 */ + u_long ips_rawout; /* total raw ip packets generated */ + u_long ips_toolong; /* ip length > max ip packet size */ + u_long ips_notmember; /* multicasts for unregistered grps */ + u_long ips_nogif; /* no match gif found */ +}; + +#ifdef KERNEL + +/* flags passed to ip_output as last parameter */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define IP_RAWOUTPUT 0x2 /* raw ip header exists */ +#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ + +struct ip; +struct inpcb; +struct route; +struct sockopt; + +extern struct ipstat ipstat; +extern u_short ip_id; /* ip packet ctr, for ids */ +extern int ip_defttl; /* default IP ttl */ +extern int ipforwarding; /* ip forwarding */ +extern u_char ip_protox[]; +extern struct socket *ip_rsvpd; /* reservation protocol daemon */ +extern struct socket *ip_mrouter; /* multicast routing daemon */ +extern int (*legal_vif_num) __P((int)); +extern u_long (*ip_mcast_src) __P((int)); +extern int rsvp_on; +extern struct pr_usrreqs rip_usrreqs; + +int ip_ctloutput __P((struct socket *, struct sockopt *sopt)); +void ip_drain __P((void)); +void ip_freemoptions __P((struct ip_moptions *)); +void ip_init __P((void)); +extern int (*ip_mforward) __P((struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *)); +int ip_output __P((struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *)); +void ip_savecontrol __P((struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *)); +void ip_slowtimo __P((void)); +struct mbuf * + ip_srcroute __P((void)); +void ip_stripoptions __P((struct mbuf *, struct mbuf *)); +int rip_ctloutput __P((struct socket *, struct sockopt *)); +void rip_ctlinput __P((int, struct sockaddr *, void *)); +void rip_init __P((void)); +void rip_input __P((struct mbuf *, int)); +int rip_output __P((struct mbuf *, struct socket *, u_long)); +void ipip_input __P((struct mbuf *, int)); +void rsvp_input __P((struct mbuf *, int)); +int ip_rsvp_init __P((struct socket *)); +int ip_rsvp_done __P((void)); +int ip_rsvp_vif_init __P((struct socket *, struct sockopt *)); +int ip_rsvp_vif_done __P((struct socket *, struct sockopt *)); +void ip_rsvp_force_done __P((struct socket *)); + +#ifdef IPDIVERT +void div_init __P((void)); +void div_input __P((struct mbuf *, int)); +void divert_packet __P((struct mbuf *, int, int)); +extern struct pr_usrreqs div_usrreqs; +extern u_int16_t ip_divert_cookie; +#endif + +extern struct sockaddr_in *ip_fw_fwd_addr; + +#endif /* KERNEL */ + +#endif /* !_NETINET_IP_VAR_H_ */ diff --git a/sys/netinet/ipl.h b/sys/netinet/ipl.h new file mode 100644 index 0000000..f29b6fd --- /dev/null +++ b/sys/netinet/ipl.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 1993-1999 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ipl.h 1.21 6/5/96 + * $Id$ + * $FreeBSD$ + */ + +#ifndef __IPL_H__ +#define __IPL_H__ + +#define IPL_VERSION "IP Filter: v3.3.3" + +#endif diff --git a/sys/netinet/libalias/HISTORY b/sys/netinet/libalias/HISTORY new file mode 100644 index 0000000..575d1b4 --- /dev/null +++ b/sys/netinet/libalias/HISTORY @@ -0,0 +1,134 @@ +Version 1.0: August 11, 1996 (cjm) + +Version 1.1: August 20, 1996 (cjm) + - Host accepts incoming connections for ports 0 to 1023. + +Version 1.2: September 7, 1996 (cjm) + - Fragment handling error in alias_db.c corrected. + +Version 1.3: September 15, 1996 (cjm) + - Generalized mechanism for handling incoming + connections (no more 0 to 1023 restriction). + + - Increased ICMP support (will handle traceroute now). + + - Improved TCP close connection logic. + +Version 1.4: September 16, 1996 (cjm) + +Version 1.5: September 17, 1996 (cjm) + - Corrected error in handling incoming UDP packets + with zero checksum. + +Version 1.6: September 18, 1996 + - Simplified ICMP data storage. Will now handle + tracert from Win95 and NT as well as FreeBSD + traceroute, which uses UDP packets to non-existent + ports. + +Verstion 1.7: January 9, 1997 (cjm) + - Reduced malloc() activity for ICMP echo and + timestamp requests. + + - Added handling for out-of-order IP fragments. + + - Switched to differential checksum computation + for IP headers (TCP, UDP and ICMP checksums + were already differential). + + - Accepts FTP data connections from other than + port 20. This allows one ftp connections + from two hosts which are both running packet + aliasing. + + - Checksum error on FTP transfers. Problem + in code located by Martin Renters and + Brian Somers. + +Version 1.8: January 14, 1997 (cjm) + - Fixed data type error in function StartPoint() + in alias_db.c (this bug did not exist before v1.7) + Problem in code located by Ari Suutari. + +Version 1.9: February 1, 1997 (Eivind Eklund <perhaps@yes.no>) + - Added support for IRC DCC (ee) + + - Changed the aliasing routines to use ANSI style + throughout (ee) + + - Minor API changes for integration with other + programs than PPP (ee) + + - Fixed minor security hole in alias_ftp.c for + other applications of the aliasing software. + Hole could _not_ manifest in ppp+pktAlias, but + could potentially manifest in other applications + of the aliasing. (ee) + + - Connections initiated from packet aliasing + host machine will not have their port number + aliased unless it conflicts with an aliasing + port already being used. (There is an option + to disable this for debugging) (cjm) + + - Sockets will be allocated in cases where + there might be port interference with the + host machine. This can be disabled in cases + where the ppp host will be acting purely as a + masquerading router and not generate any + traffic of its own. + (cjm) + +Version 2.0: March, 1997 (cjm) + - Aliasing links are cleared only when a host interface address + changes. + + - PacketAliasPermanentLink() API added. + + - Option for only aliasing private, unregistered + IP addresses added. + + - Substantial rework to the aliasing lookup engine. + +Version 2.1: May, 1997 (cjm) + - Continuing rework to the aliasing lookup engine + to support multiple incoming addresses and static + NAT. PacketAliasRedirectPort() and + PacketAliasRedirectAddr() added to API. + + - Now supports outgoing as well as incoming ICMP + error messges. + +Version 2.2: July, 1997 (cjm) + - Rationalized API function names to all begin with + "PacketAlias..." Old function names are retained + for backwards compatitibility. + + - Packet aliasing engine will now free memory of + fragments which are never resolved after a timeout + period. Once a fragment is resolved, it becomes + the users responsibility to free the memory. + +Version 2.3: August 11, 1997 (cjm) + - Problem associated with socket file descriptor + accumulation in alias_db.c corrected. The sockets + had to be closed when a binding failed. Problem + in code located by Gordon Burditt. + +Version 2.4: September 1, 1997 (cjm) + - PKT_ALIAS_UNREGISTERED_ONLY option repaired. + This part of the code was incorrectly re-implemented + in version 2.1. + +Version 2.5: December, 1997 (ee) + - Added PKT_ALIAS_PUNCH_FW mode for firewall + bypass of FTP/IRC DCC data connections. Also added + improved TCP connection monitoring. + +Version 2.6: May, 1998 (amurai) + - Added supporting routine for NetBios over TCP/IP. + +Version 3.0: January 1, 1999 + - Transparent proxying support added. + - PPTP redirecting support added based on patches + contributed by Dru Nelson <dnelson@redwoodsoft.com>. diff --git a/sys/netinet/libalias/Makefile b/sys/netinet/libalias/Makefile new file mode 100644 index 0000000..e540648 --- /dev/null +++ b/sys/netinet/libalias/Makefile @@ -0,0 +1,15 @@ +# $FreeBSD$ + +LIB= alias +SHLIB_MAJOR= 3 +SHLIB_MINOR= 0 +CFLAGS+= -Wall -I${.CURDIR} +SRCS= alias.c alias_cuseeme.c alias_db.c alias_ftp.c alias_irc.c \ + alias_nbt.c alias_proxy.c alias_util.c +MAN3= libalias.3 + +beforeinstall: + ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 ${.CURDIR}/alias.h \ + ${DESTDIR}/usr/include + +.include <bsd.lib.mk> diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c new file mode 100644 index 0000000..9c072df --- /dev/null +++ b/sys/netinet/libalias/alias.c @@ -0,0 +1,1335 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */ +/* + Alias.c provides supervisory control for the functions of the + packet aliasing software. It consists of routines to monitor + TCP connection state, protocol-specific aliasing routines, + fragment handling and the following outside world functional + interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn, + PacketAliasIn and PacketAliasOut. + + The other C program files are briefly described. The data + structure framework which holds information needed to translate + packets is encapsulated in alias_db.c. Data is accessed by + function calls, so other segments of the program need not know + about the underlying data structures. Alias_ftp.c contains + special code for modifying the ftp PORT command used to establish + data connections, while alias_irc.c do the same for IRC + DCC. Alias_util.c contains a few utility routines. + + This software is placed into the public domain with no restrictions + on its distribution. + + Version 1.0 August, 1996 (cjm) + + Version 1.1 August 20, 1996 (cjm) + PPP host accepts incoming connections for ports 0 to 1023. + (Gary Roberts pointed out the need to handle incoming + connections.) + + Version 1.2 September 7, 1996 (cjm) + Fragment handling error in alias_db.c corrected. + (Tom Torrance helped fix this problem.) + + Version 1.4 September 16, 1996 (cjm) + - A more generalized method for handling incoming + connections, without the 0-1023 restriction, is + implemented in alias_db.c + - Improved ICMP support in alias.c. Traceroute + packet streams can now be correctly aliased. + - TCP connection closing logic simplified in + alias.c and now allows for additional 1 minute + "grace period" after FIN or RST is observed. + + Version 1.5 September 17, 1996 (cjm) + Corrected error in handling incoming UDP packets with 0 checksum. + (Tom Torrance helped fix this problem.) + + Version 1.6 September 18, 1996 (cjm) + Simplified ICMP aliasing scheme. Should now support + traceroute from Win95 as well as FreeBSD. + + Version 1.7 January 9, 1997 (cjm) + - Out-of-order fragment handling. + - IP checksum error fixed for ftp transfers + from aliasing host. + - Integer return codes added to all + aliasing/de-aliasing functions. + - Some obsolete comments cleaned up. + - Differential checksum computations for + IP header (TCP, UDP and ICMP were already + differential). + + Version 2.1 May 1997 (cjm) + - Added support for outgoing ICMP error + messages. + - Added two functions PacketAliasIn2() + and PacketAliasOut2() for dynamic address + control (e.g. round-robin allocation of + incoming packets). + + Version 2.2 July 1997 (cjm) + - Rationalized API function names to begin + with "PacketAlias..." + - Eliminated PacketAliasIn2() and + PacketAliasOut2() as poorly conceived. + + Version 2.3 Dec 1998 (dillon) + - Major bounds checking additions, see FreeBSD/CVS + + See HISTORY file for additional revisions. + + $FreeBSD$ +*/ + +#include <stdio.h> +#include <unistd.h> + +#include <sys/param.h> +#include <sys/types.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#ifndef IPPROTO_GRE +#define IPPROTO_GRE 47 +#define IPPROTO_ESP 50 +#define IPPROTO_AH 51 +#endif + +#include "alias_local.h" +#include "alias.h" + +#define NETBIOS_NS_PORT_NUMBER 137 +#define NETBIOS_DGM_PORT_NUMBER 138 +#define FTP_CONTROL_PORT_NUMBER 21 +#define IRC_CONTROL_PORT_NUMBER_1 6667 +#define IRC_CONTROL_PORT_NUMBER_2 6668 +#define CUSEEME_PORT_NUMBER 7648 + + + + +/* TCP Handling Routines + + TcpMonitorIn() -- These routines monitor TCP connections, and + TcpMonitorOut() delete a link when a connection is closed. + +These routines look for SYN, FIN and RST flags to determine when TCP +connections open and close. When a TCP connection closes, the data +structure containing packet aliasing information is deleted after +a timeout period. +*/ + +/* Local prototypes */ +static void TcpMonitorIn(struct ip *, struct alias_link *); + +static void TcpMonitorOut(struct ip *, struct alias_link *); + + +static void +TcpMonitorIn(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateIn(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateIn(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + +static void +TcpMonitorOut(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateOut(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateOut(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + + + + + +/* Protocol Specific Packet Aliasing Routines + + IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2(), IcmpAliasIn3() + IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2(), IcmpAliasOut3() + UdpAliasIn(), UdpAliasOut() + TcpAliasIn(), TcpAliasOut() + +These routines handle protocol specific details of packet aliasing. +One may observe a certain amount of repetitive arithmetic in these +functions, the purpose of which is to compute a revised checksum +without actually summing over the entire data packet, which could be +unnecessarily time consuming. + +The purpose of the packet aliasing routines is to replace the source +address of the outgoing packet and then correctly put it back for +any incoming packets. For TCP and UDP, ports are also re-mapped. + +For ICMP echo/timestamp requests and replies, the following scheme +is used: the id number is replaced by an alias for the outgoing +packet. + +ICMP error messages are handled by looking at the IP fragment +in the data section of the message. + +For TCP and UDP protocols, a port number is chosen for an outgoing +packet, and then incoming packets are identified by IP address and +port numbers. For TCP packets, there is additional logic in the event +that sequence and ack numbers have been altered (as is the case for +FTP data port commands). + +The port numbers used by the packet aliasing module are not true +ports in the Unix sense. No sockets are actually bound to ports. +They are more correctly thought of as placeholders. + +All packets go through the aliasing mechanism, whether they come from +the gateway machine or other machines on a local area network. +*/ + + +/* Local prototypes */ +static int IcmpAliasIn1(struct ip *); +static int IcmpAliasIn2(struct ip *); +static int IcmpAliasIn3(struct ip *); +static int IcmpAliasIn (struct ip *); + +static int IcmpAliasOut1(struct ip *); +static int IcmpAliasOut2(struct ip *); +static int IcmpAliasOut3(struct ip *); +static int IcmpAliasOut (struct ip *); + +static int UdpAliasOut(struct ip *); +static int UdpAliasIn (struct ip *); + +static int TcpAliasOut(struct ip *, int); +static int TcpAliasIn (struct ip *); + + +static int +IcmpAliasIn1(struct ip *pip) +{ +/* + De-alias incoming echo and timestamp replies +*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Get source address from ICMP data field and restore original data */ + link = FindIcmpIn(pip->ip_src, pip->ip_dst, ic->icmp_id); + if (link != NULL) + { + u_short original_id; + int accumulate; + + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum) + +/* Put original sequence number back in */ + ic->icmp_id = original_id; + +/* Put original address back into IP header */ + { + struct in_addr original_address; + + original_address = GetOriginalAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +IcmpAliasIn2(struct ip *pip) +{ +/* + Alias incoming ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = (struct ip *) ic->icmp_data; + + ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2)); + tc = (struct tcphdr *) ud; + ic2 = (struct icmp *) ud; + + if (ip->ip_p == IPPROTO_UDP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP); + else if (ip->ip_p == IPPROTO_TCP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + link = FindIcmpIn(ip->ip_dst, ip->ip_src, ic2->icmp_id); + else + link = NULL; + } else + link = NULL; + + if (link != NULL) + { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(link); + original_port = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum) + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_src = original_address; + ud->uh_sport = original_port; + } + else if (pip->ip_p == IPPROTO_ICMP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(link); + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic2->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum) + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address of original IP packet and seqence number of + embedded icmp datagram */ + ip->ip_src = original_address; + ic2->icmp_id = original_id; + } + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +IcmpAliasIn3(struct ip *pip) +{ + struct in_addr original_address; + + original_address = FindOriginalAddress(pip->ip_dst); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return PKT_ALIAS_OK; +} + + +static int +IcmpAliasIn(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasIn1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasIn2(pip); + break; + case ICMP_ECHO: + case ICMP_TSTAMP: + iresult = IcmpAliasIn3(pip); + break; + } + return(iresult); +} + + +static int +IcmpAliasOut1(struct ip *pip) +{ +/* + Alias ICMP echo and timestamp packets +*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Save overwritten data for when echo packet returns */ + link = FindIcmpOut(pip->ip_src, pip->ip_dst, ic->icmp_id); + if (link != NULL) + { + u_short alias_id; + int accumulate; + + alias_id = GetAliasPort(link); + +/* Since data field is being modified, adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum) + +/* Alias sequence number */ + ic->icmp_id = alias_id; + +/* Change source address */ + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut2(struct ip *pip) +{ +/* + Alias outgoing ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct in_addr alias_addr; + struct ip *ip; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = (struct ip *) ic->icmp_data; + + alias_addr = FindAliasAddress(ip->ip_src); + +/* Alias destination address in IP fragment */ + DifferentialChecksum(&ic->icmp_cksum, + (u_short *) &alias_addr, + (u_short *) &ip->ip_dst, + 2); + ip->ip_dst = alias_addr; + +/* alias source address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_addr, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_addr; + + return PKT_ALIAS_OK; +} + + +static int +IcmpAliasOut3(struct ip *pip) +{ +/* + Handle outgoing echo and timestamp replies. The + only thing which is done in this case is to alias + the source IP address of the packet. +*/ + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(pip->ip_src); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_addr, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_addr; + + return PKT_ALIAS_OK; +} + + +static int +IcmpAliasOut(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHO: + case ICMP_TSTAMP: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasOut1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasOut2(pip); + break; + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + iresult = IcmpAliasOut3(pip); + } + return(iresult); +} + + + +static int +PptpAliasIn(struct ip *pip) +{ +/* + Handle incoming PPTP packets. The + only thing which is done in this case is to alias + the dest IP address of the packet to our inside + machine. +*/ + struct in_addr alias_addr; + + if (!GetPptpAlias (&alias_addr)) + return PKT_ALIAS_IGNORED; + + if (pip->ip_src.s_addr != alias_addr.s_addr) { + + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_addr, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = alias_addr; + } + + return PKT_ALIAS_OK; +} + + +static int +PptpAliasOut(struct ip *pip) +{ +/* + Handle outgoing PPTP packets. The + only thing which is done in this case is to alias + the source IP address of the packet. +*/ + struct in_addr alias_addr; + + if (!GetPptpAlias (&alias_addr)) + return PKT_ALIAS_IGNORED; + + if (pip->ip_src.s_addr == alias_addr.s_addr) { + + alias_addr = FindAliasAddress(pip->ip_src); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_addr, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_addr; + } + + return PKT_ALIAS_OK; +} + + + +static int +UdpAliasIn(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + u_short alias_port; + int accumulate; + u_short *sptr; + int r = 0; + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + alias_port = ud->uh_dport; + ud->uh_dport = GetOriginalPort(link); + +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER ) + { + r = AliasHandleUdpNbt(pip, link, &original_address, ud->uh_dport); + } else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER ) + { + r = AliasHandleUdpNbtNS(pip, link, + &alias_address, + &alias_port, + &original_address, + &ud->uh_dport ); + } + + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeIn(pip, original_address); + +/* If UDP checksum is not zero, then adjust since destination port */ +/* is being unaliased and destination port is being altered. */ + if (ud->uh_sum != 0) + { + accumulate = alias_port; + accumulate -= ud->uh_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum) + } + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + /* + * If we cannot figure out the packet, ignore it. + */ + if (r < 0) + return(PKT_ALIAS_IGNORED); + else + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +UdpAliasOut(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP); + if (link != NULL) + { + u_short alias_port; + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + alias_port = GetAliasPort(link); + + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeOut(pip, link); + +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER ) + { + AliasHandleUdpNbt(pip, link, &alias_address, alias_port); + } else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER ) + { + AliasHandleUdpNbtNS(pip, link, + &pip->ip_src, + &ud->uh_sport, + &alias_address, + &alias_port); + } + +/* If UDP checksum is not zero, adjust since source port is */ +/* being aliased and source address is being altered */ + if (ud->uh_sum != 0) + { + int accumulate; + u_short *sptr; + + accumulate = ud->uh_sport; + accumulate -= alias_port; + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum) + } + +/* Put alias port in UDP header */ + ud->uh_sport = alias_port; + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + +static int +TcpAliasIn(struct ip *pip) +{ + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + struct in_addr proxy_address; + u_short alias_port; + u_short proxy_port; + int accumulate; + u_short *sptr; + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + proxy_address = GetProxyAddress(link); + alias_port = tc->th_dport; + tc->th_dport = GetOriginalPort(link); + proxy_port = GetProxyPort(link); + +/* Adjust TCP checksum since destination port is being unaliased */ +/* and destination port is being altered. */ + accumulate = alias_port; + accumulate -= tc->th_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a proxy, then modify the tcp source port and + checksum accumulation */ + if (proxy_port != 0) + { + accumulate += tc->th_sport; + tc->th_sport = proxy_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_address; + accumulate -= *sptr++; + accumulate -= *sptr; + } + +/* See if ack number needs to be modified */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaAckIn(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_ack; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_ack = htonl(ntohl(tc->th_ack) - delta); + sptr = (u_short *) &tc->th_ack; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Restore original IP address */ + sptr = (u_short *) &pip->ip_dst; + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = original_address; + sptr = (u_short *) &pip->ip_dst; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a transparent proxy packet, then modify the source + address */ + if (proxy_address.s_addr != 0) + { + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + pip->ip_src = proxy_address; + sptr = (u_short *) &pip->ip_src; + accumulate -= *sptr++; + accumulate -= *sptr; + } + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + +/* Monitor TCP connection state */ + TcpMonitorIn(pip, link); + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +TcpAliasOut(struct ip *pip, int maxpacketsize) +{ + int proxy_type; + u_short dest_port; + u_short proxy_server_port; + struct in_addr dest_address; + struct in_addr proxy_server_address; + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + proxy_type = ProxyCheck(pip, &proxy_server_address, &proxy_server_port); + + if (proxy_type == 0 && (packetAliasMode & PKT_ALIAS_PROXY_ONLY)) + return PKT_ALIAS_OK; + +/* If this is a transparent proxy, save original destination, + then alter the destination and adust checksums */ + dest_port = tc->th_dport; + dest_address = pip->ip_dst; + if (proxy_type != 0) + { + int accumulate; + u_short *sptr; + + accumulate = tc->th_dport; + tc->th_dport = proxy_server_port; + accumulate -= tc->th_dport; + + sptr = (u_short *) &(pip->ip_dst); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_server_address; + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + sptr = (u_short *) &(pip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = proxy_server_address; + sptr = (u_short *) &(pip->ip_dst); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP); + if (link !=NULL) + { + u_short alias_port; + struct in_addr alias_address; + int accumulate; + u_short *sptr; + +/* Save original destination address, if this is a proxy packet. + Also modify packet to include destination encoding. */ + if (proxy_type != 0) + { + SetProxyPort(link, dest_port); + SetProxyAddress(link, dest_address); + ProxyModify(link, pip, maxpacketsize, proxy_type); + } + +/* Get alias address and port */ + alias_port = GetAliasPort(link); + alias_address = GetAliasAddress(link); + +/* Monitor tcp connection state */ + TcpMonitorOut(pip, link); + +/* Special processing for IP encoding protocols */ + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == FTP_CONTROL_PORT_NUMBER) + AliasHandleFtpOut(pip, link, maxpacketsize); + if (ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_2) + AliasHandleIrcOut(pip, link, maxpacketsize); + +/* Adjust TCP checksum since source port is being aliased */ +/* and source address is being altered */ + accumulate = tc->th_sport; + tc->th_sport = alias_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* Modify sequence number if necessary */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaSeqOut(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_seq; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_seq = htonl(ntohl(tc->th_seq) + delta); + sptr = (u_short *) &tc->th_seq; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum) + +/* Change source address */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_src = alias_address; + sptr = (u_short *) &(pip->ip_src); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum) + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + + +/* Fragment Handling + + FragmentIn() + FragmentOut() + +The packet aliasing module has a limited ability for handling IP +fragments. If the ICMP, TCP or UDP header is in the first fragment +received, then the id number of the IP packet is saved, and other +fragments are identified according to their ID number and IP address +they were sent from. Pointers to unresolved fragments can also be +saved and recalled when a header fragment is seen. +*/ + +/* Local prototypes */ +static int FragmentIn(struct ip *); +static int FragmentOut(struct ip *); + + +static int +FragmentIn(struct ip *pip) +{ + struct alias_link *link; + + link = FindFragmentIn2(pip->ip_src, pip->ip_dst, pip->ip_id); + if (link != NULL) + { + struct in_addr original_address; + + GetFragmentAddr(link, &original_address); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_UNRESOLVED_FRAGMENT); +} + + +static int +FragmentOut(struct ip *pip) +{ + struct in_addr alias_address; + + alias_address = FindAliasAddress(pip->ip_src); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); +} + + + + + + +/* Outside World Access + + PacketAliasSaveFragment() + PacketAliasGetFragment() + PacketAliasFragmentIn() + PacketAliasIn() + PacketAliasOut() + +(prototypes in alias.h) +*/ + + +int +PacketAliasSaveFragment(char *ptr) +{ + int iresult; + struct alias_link *link; + struct ip *pip; + + pip = (struct ip *) ptr; + link = AddFragmentPtrLink(pip->ip_src, pip->ip_id); + iresult = PKT_ALIAS_ERROR; + if (link != NULL) + { + SetFragmentPtr(link, ptr); + iresult = PKT_ALIAS_OK; + } + return(iresult); +} + + +char * +PacketAliasGetFragment(char *ptr) +{ + struct alias_link *link; + char *fptr; + struct ip *pip; + + pip = (struct ip *) ptr; + link = FindFragmentPtr(pip->ip_src, pip->ip_id); + if (link != NULL) + { + GetFragmentPtr(link, &fptr); + SetFragmentPtr(link, NULL); + SetExpire(link, 0); /* Deletes link */ + + return(fptr); + } + else + { + return(NULL); + } +} + + +void +PacketAliasFragmentIn(char *ptr, /* Points to correctly de-aliased + header fragment */ + char *ptr_fragment /* Points to fragment which must + be de-aliased */ + ) +{ + struct ip *pip; + struct ip *fpip; + + pip = (struct ip *) ptr; + fpip = (struct ip *) ptr_fragment; + + DifferentialChecksum(&fpip->ip_sum, + (u_short *) &pip->ip_dst, + (u_short *) &fpip->ip_dst, + 2); + fpip->ip_dst = pip->ip_dst; +} + + +int +PacketAliasIn(char *ptr, int maxpacketsize) +{ + struct in_addr alias_addr; + struct ip *pip; + int iresult; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasOut(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + alias_addr = pip->ip_dst; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + iresult = PKT_ALIAS_IGNORED; + if ( (ntohs(pip->ip_off) & IP_OFFMASK) == 0 ) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasIn(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasIn(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasIn(pip); + break; + case IPPROTO_GRE: + case IPPROTO_ESP: + case IPPROTO_AH: + iresult = PptpAliasIn(pip); + break; + } + + if (ntohs(pip->ip_off) & IP_MF) + { + struct alias_link *link; + + link = FindFragmentIn1(pip->ip_src, alias_addr, pip->ip_id); + if (link != NULL) + { + iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT; + SetFragmentAddr(link, pip->ip_dst); + } + else + { + iresult = PKT_ALIAS_ERROR; + } + } + } + else + { + iresult = FragmentIn(pip); + } + + return(iresult); +} + + + +/* Unregistered address ranges */ + +/* 10.0.0.0 -> 10.255.255.255 */ +#define UNREG_ADDR_A_LOWER 0x0a000000 +#define UNREG_ADDR_A_UPPER 0x0affffff + +/* 172.16.0.0 -> 172.31.255.255 */ +#define UNREG_ADDR_B_LOWER 0xac100000 +#define UNREG_ADDR_B_UPPER 0xac1fffff + +/* 192.168.0.0 -> 192.168.255.255 */ +#define UNREG_ADDR_C_LOWER 0xc0a80000 +#define UNREG_ADDR_C_UPPER 0xc0a8ffff + +int +PacketAliasOut(char *ptr, /* valid IP packet */ + int maxpacketsize /* How much the packet data may grow + (FTP and IRC inline changes) */ + ) +{ + int iresult; + struct in_addr addr_save; + struct ip *pip; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasIn(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + addr_save = GetDefaultAliasAddress(); + if (packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY) + { + unsigned int addr; + int iclass; + + iclass = 0; + addr = ntohl(pip->ip_src.s_addr); + if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER) + iclass = 3; + else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER) + iclass = 2; + else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER) + iclass = 1; + + if (iclass == 0) + { + SetDefaultAliasAddress(pip->ip_src); + } + } + + iresult = PKT_ALIAS_IGNORED; + if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasOut(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasOut(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasOut(pip, maxpacketsize); + break; + case IPPROTO_GRE: + case IPPROTO_ESP: + case IPPROTO_AH: + iresult = PptpAliasOut(pip); + break; + } + } + else + { + iresult = FragmentOut(pip); + } + + SetDefaultAliasAddress(addr_save); + return(iresult); +} diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h new file mode 100644 index 0000000..9666b0f --- /dev/null +++ b/sys/netinet/libalias/alias.h @@ -0,0 +1,166 @@ +/*lint -save -library Flexelint comment for external headers */ + +/* + Alias.h defines the outside world interfaces for the packet + aliasing software. + + This software is placed into the public domain with no restrictions + on its distribution. + + $FreeBSD$ +*/ + + +#ifndef _ALIAS_H_ +#define _ALIAS_H_ + +#ifndef NULL +#define NULL 0 +#endif + +/* Alias link representative (incomplete struct) */ +struct alias_link; + +/* External interfaces (API) to packet aliasing engine */ + +/* Initialization and Control */ + extern void + PacketAliasInit(void); + + extern void + PacketAliasUninit(void); + + extern void + PacketAliasSetAddress(struct in_addr); + + extern unsigned int + PacketAliasSetMode(unsigned int, unsigned int); + +#ifndef NO_FW_PUNCH + extern void + PacketAliasSetFWBase(unsigned int, unsigned int); +#endif + +/* Packet Handling */ + extern int + PacketAliasIn(char *, int maxpacketsize); + + extern int + PacketAliasOut(char *, int maxpacketsize); + +/* Port and Address Redirection */ + extern struct alias_link * + PacketAliasRedirectPort(struct in_addr, u_short, + struct in_addr, u_short, + struct in_addr, u_short, + u_char); + + extern int + PacketAliasPptp(struct in_addr); + + + extern struct alias_link * + PacketAliasRedirectAddr(struct in_addr, + struct in_addr); + + extern void + PacketAliasRedirectDelete(struct alias_link *); + +/* Fragment Handling */ + extern int + PacketAliasSaveFragment(char *); + + extern char * + PacketAliasGetFragment(char *); + + extern void + PacketAliasFragmentIn(char *, char *); + +/* Miscellaneous Functions */ + extern void + PacketAliasSetTarget(struct in_addr addr); + + extern int + PacketAliasCheckNewLink(void); + + extern u_short + PacketAliasInternetChecksum(u_short *, int); + +/* Transparent Proxying */ + extern int + PacketAliasProxyRule(const char *); + + +/********************** Mode flags ********************/ +/* Set these flags using SetPacketAliasMode() */ + +/* If PKT_ALIAS_LOG is set, a message will be printed to + /var/log/alias.log every time a link is created or deleted. This + is useful for debugging */ +#define PKT_ALIAS_LOG 0x01 + +/* If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. + to ftp, telnet or web servers will be prevented by the aliasing + mechanism. */ +#define PKT_ALIAS_DENY_INCOMING 0x02 + +/* If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from + the same port as they originated on. This allows eg rsh to work + *99% of the time*, but _not_ 100%. (It will be slightly flakey + instead of not working at all.) This mode bit is set by + PacketAliasInit(), so it is a default mode of operation. */ +#define PKT_ALIAS_SAME_PORTS 0x04 + +/* If PKT_ALIAS_USE_SOCKETS is set, then when partially specified + links (e.g. destination port and/or address is zero), the packet + aliasing engine will attempt to allocate a socket for the aliasing + port it chooses. This will avoid interference with the host + machine. Fully specified links do not require this. This bit + is set after a call to PacketAliasInit(), so it is a default + mode of operation.*/ +#define PKT_ALIAS_USE_SOCKETS 0x08 + +/* If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with with + unregistered source addresses will be aliased (along with those + of the ppp host maching itself. Private addresses are those + in the following ranges: + 10.0.0.0 -> 10.255.255.255 + 172.16.0.0 -> 172.31.255.255 + 192.168.0.0 -> 192.168.255.255 */ +#define PKT_ALIAS_UNREGISTERED_ONLY 0x10 + +/* If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic + aliasing links will be reset whenever PacketAliasSetAddress() + changes the default aliasing address. If the default aliasing + address is left unchanged by this functions call, then the + table of dynamic aliasing links will be left intact. This + bit is set after a call to PacketAliasInit(). */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + +#ifndef NO_FW_PUNCH +/* If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections + will create a 'hole' in the firewall to allow the transfers to + work. Where (IPFW "line-numbers") the hole is created is + controlled by PacketAliasSetFWBase(base, size). The hole will be + attached to that particular alias_link, so when the link goes away + so do the hole. */ +#define PKT_ALIAS_PUNCH_FW 0x40 +#endif + +/* If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + transparent proxying performed */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() + and PacketAliasOut() are reversed */ +#define PKT_ALIAS_REVERSE 0x80 + +/* Return Codes */ +#define PKT_ALIAS_ERROR -1 +#define PKT_ALIAS_OK 1 +#define PKT_ALIAS_IGNORED 2 +#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3 +#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4 + +#endif +/*lint -restore */ diff --git a/sys/netinet/libalias/alias_cuseeme.c b/sys/netinet/libalias/alias_cuseeme.c new file mode 100644 index 0000000..b1b95f4 --- /dev/null +++ b/sys/netinet/libalias/alias_cuseeme.c @@ -0,0 +1,120 @@ +/*- + * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> + * with the aid of code written by + * Junichi SATOH <junichi@astec.co.jp> 1996, 1997. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> + +#include "alias_local.h" + +/* CU-SeeMe Data Header */ +struct cu_header { + u_int16_t dest_family; + u_int16_t dest_port; + u_int32_t dest_addr; + int16_t family; + u_int16_t port; + u_int32_t addr; + u_int32_t seq; + u_int16_t msg; + u_int16_t data_type; + u_int16_t packet_len; +}; + +/* Open Continue Header */ +struct oc_header { + u_int16_t client_count; /* Number of client info structs */ + u_int32_t seq_no; + char user_name[20]; + char reserved[4]; /* flags, version stuff, etc */ +}; + +/* client info structures */ +struct client_info { + u_int32_t address; /* Client address */ + char reserved[8]; /* Flags, pruning bitfield, packet counts etc */ +}; + +void +AliasHandleCUSeeMeOut(struct ip *pip, struct alias_link *link) +{ + struct udphdr *ud; + + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + if(ud->uh_ulen >= sizeof(struct cu_header)) { + struct cu_header *cu; + struct alias_link *cu_link; + + cu = (struct cu_header *)(ud + 1); + if (cu->addr) + cu->addr = (u_int32_t)GetAliasAddress(link).s_addr; + + cu_link = FindUdpTcpOut(pip->ip_src, GetDestAddress(link), + ud->uh_dport, 0, IPPROTO_UDP); + +#ifndef NO_FW_PUNCH + if (cu_link) + PunchFWHole(cu_link); +#endif + } +} + +void +AliasHandleCUSeeMeIn(struct ip *pip, struct in_addr original_addr) +{ + struct in_addr alias_addr; + struct udphdr *ud; + struct cu_header *cu; + struct oc_header *oc; + struct client_info *ci; + char *end; + int i; + + alias_addr.s_addr = pip->ip_dst.s_addr; + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + cu = (struct cu_header *)(ud + 1); + oc = (struct oc_header *)(cu + 1); + ci = (struct client_info *)(oc + 1); + end = (char *)cu + ud->uh_ulen; + + if ((char *)oc <= end) { + if(cu->dest_addr) + cu->dest_addr = (u_int32_t)original_addr.s_addr; + if(ntohs(cu->data_type) == 101) + /* Find and change our address */ + for(i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++) + if(ci->address == (u_int32_t)alias_addr.s_addr) { + ci->address = (u_int32_t)original_addr.s_addr; + break; + } + } +} diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c new file mode 100644 index 0000000..618162c --- /dev/null +++ b/sys/netinet/libalias/alias_db.c @@ -0,0 +1,2394 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- + Alias_db.c encapsulates all data structures used for storing + packet aliasing data. Other parts of the aliasing software + access data through functions provided in this file. + + Data storage is based on the notion of a "link", which is + established for ICMP echo/reply packets, UDP datagrams and + TCP stream connections. A link stores the original source + and destination addresses. For UDP and TCP, it also stores + source and destination port numbers, as well as an alias + port number. Links are also used to store information about + fragments. + + There is a facility for sweeping through and deleting old + links as new packets are sent through. A simple timeout is + used for ICMP and UDP links. TCP links are left alone unless + there is an incomplete connection, in which case the link + can be deleted after a certain amount of time. + + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: August, 1996 (cjm) + + Version 1.4: September 16, 1996 (cjm) + Facility for handling incoming links added. + + Version 1.6: September 18, 1996 (cjm) + ICMP data handling simplified. + + Version 1.7: January 9, 1997 (cjm) + Fragment handling simplified. + Saves pointers for unresolved fragments. + Permits links for unspecied remote ports + or unspecified remote addresses. + Fixed bug which did not properly zero port + table entries after a link was deleted. + Cleaned up some obsolete comments. + + Version 1.8: January 14, 1997 (cjm) + Fixed data type error in StartPoint(). + (This error did not exist prior to v1.7 + and was discovered and fixed by Ari Suutari) + + Version 1.9: February 1, 1997 + Optionally, connections initiated from packet aliasing host + machine will will not have their port number aliased unless it + conflicts with an aliasing port already being used. (cjm) + + All options earlier being #ifdef'ed now are available through + a new interface, SetPacketAliasMode(). This allow run time + control (which is now available in PPP+pktAlias through the + 'alias' keyword). (ee) + + Added ability to create an alias port without + either destination address or port specified. + port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee) + + Removed K&R style function headers + and general cleanup. (ee) + + Added packetAliasMode to replace compiler #defines's (ee) + + Allocates sockets for partially specified + ports if ALIAS_USE_SOCKETS defined. (cjm) + + Version 2.0: March, 1997 + SetAliasAddress() will now clean up alias links + if the aliasing address is changed. (cjm) + + PacketAliasPermanentLink() function added to support permanent + links. (J. Fortes suggested the need for this.) + Examples: + + (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port + + (192.168.0.2, port 21) <-> alias port 3604, known dest addr + unknown dest port + + These permament links allow for incoming connections to + machines on the local network. They can be given with a + user-chosen amount of specificity, with increasing specificity + meaning more security. (cjm) + + Quite a bit of rework to the basic engine. The portTable[] + array, which kept track of which ports were in use was replaced + by a table/linked list structure. (cjm) + + SetExpire() function added. (cjm) + + DeleteLink() no longer frees memory association with a pointer + to a fragment (this bug was first recognized by E. Eklund in + v1.9). + + Version 2.1: May, 1997 (cjm) + Packet aliasing engine reworked so that it can handle + multiple external addresses rather than just a single + host address. + + PacketAliasRedirectPort() and PacketAliasRedirectAddr() + added to the API. The first function is a more generalized + version of PacketAliasPermanentLink(). The second function + implements static network address translation. + + See HISTORY file for additional revisions. + + $FreeBSD$ +*/ + + +/* System include files */ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> + +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> + +/* BSD network include files */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include "alias.h" +#include "alias_local.h" + + + +/* + Constants (note: constants are also defined + near relevant functions or structs) +*/ + +/* Sizes of input and output link tables */ +#define LINK_TABLE_OUT_SIZE 101 +#define LINK_TABLE_IN_SIZE 4001 + +/* Parameters used for cleanup of expired links */ +#define ALIAS_CLEANUP_INTERVAL_SECS 60 +#define ALIAS_CLEANUP_MAX_SPOKES 30 + +/* Timeouts (in seconds) for different link types */ +#define ICMP_EXPIRE_TIME 60 +#define UDP_EXPIRE_TIME 60 +#define FRAGMENT_ID_EXPIRE_TIME 10 +#define FRAGMENT_PTR_EXPIRE_TIME 30 + +/* TCP link expire time for different cases */ +/* When the link has been used and closed - minimal grace time to + allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */ +#ifndef TCP_EXPIRE_DEAD +# define TCP_EXPIRE_DEAD 10 +#endif + +/* When the link has been used and closed on one side - the other side + is allowed to still send data */ +#ifndef TCP_EXPIRE_SINGLEDEAD +# define TCP_EXPIRE_SINGLEDEAD 90 +#endif + +/* When the link isn't yet up */ +#ifndef TCP_EXPIRE_INITIAL +# define TCP_EXPIRE_INITIAL 300 +#endif + +/* When the link is up */ +#ifndef TCP_EXPIRE_CONNECTED +# define TCP_EXPIRE_CONNECTED 86400 +#endif + + +/* Dummy port number codes used for FindLinkIn/Out() and AddLink(). + These constants can be anything except zero, which indicates an + unknown port number. */ + +#define NO_DEST_PORT 1 +#define NO_SRC_PORT 1 + + + +/* Data Structures + + The fundamental data structure used in this program is + "struct alias_link". Whenever a TCP connection is made, + a UDP datagram is sent out, or an ICMP echo request is made, + a link record is made (if it has not already been created). + The link record is identified by the source address/port + and the destination address/port. In the case of an ICMP + echo request, the source port is treated as being equivalent + with the 16-bit id number of the ICMP packet. + + The link record also can store some auxiliary data. For + TCP connections that have had sequence and acknowledgment + modifications, data space is available to track these changes. + A state field is used to keep track in changes to the tcp + connection state. Id numbers of fragments can also be + stored in the auxiliary space. Pointers to unresolved + framgents can also be stored. + + The link records support two independent chainings. Lookup + tables for input and out tables hold the initial pointers + the link chains. On input, the lookup table indexes on alias + port and link type. On output, the lookup table indexes on + source addreess, destination address, source port, destination + port and link type. +*/ + +struct ack_data_record /* used to save changes to ack/seq numbers */ +{ + u_long ack_old; + u_long ack_new; + int delta; + int active; +}; + +struct tcp_state /* Information about tcp connection */ +{ + int in; /* State for outside -> inside */ + int out; /* State for inside -> outside */ + int index; /* Index to ack data array */ + int ack_modified; /* Indicates whether ack and seq numbers */ + /* been modified */ +}; + +#define N_LINK_TCP_DATA 3 /* Number of distinct ack number changes + saved for a modified TCP stream */ +struct tcp_dat +{ + struct tcp_state state; + struct ack_data_record ack[N_LINK_TCP_DATA]; + int fwhole; /* Which firewall record is used for this hole? */ +}; + +struct alias_link /* Main data structure */ +{ + struct in_addr src_addr; /* Address and port information */ + struct in_addr dst_addr; + struct in_addr alias_addr; + struct in_addr proxy_addr; + u_short src_port; + u_short dst_port; + u_short alias_port; + u_short proxy_port; + + int link_type; /* Type of link: tcp, udp, icmp, frag */ + +/* values for link_type */ +#define LINK_ICMP 1 +#define LINK_UDP 2 +#define LINK_TCP 3 +#define LINK_FRAGMENT_ID 4 +#define LINK_FRAGMENT_PTR 5 +#define LINK_ADDR 6 + + int flags; /* indicates special characteristics */ + +/* flag bits */ +#define LINK_UNKNOWN_DEST_PORT 0x01 +#define LINK_UNKNOWN_DEST_ADDR 0x02 +#define LINK_PERMANENT 0x04 +#define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */ +#define LINK_UNFIREWALLED 0x08 + + int timestamp; /* Time link was last accessed */ + int expire_time; /* Expire time for link */ + + int sockfd; /* socket descriptor */ + + u_int start_point_out; /* Index number in output lookup table */ + u_int start_point_in; + struct alias_link *next_out; /* Linked list pointers for input and */ + struct alias_link *last_out; /* output tables */ + struct alias_link *next_in; /* . */ + struct alias_link *last_in; /* . */ + + union /* Auxiliary data */ + { + char *frag_ptr; + struct in_addr frag_addr; + struct tcp_dat *tcp; + } data; +}; + + + + + +/* Global Variables + + The global variables listed here are only accessed from + within alias_db.c and so are prefixed with the static + designation. +*/ + +int packetAliasMode; /* Mode flags */ + /* - documented in alias.h */ + +static struct in_addr aliasAddress; /* Address written onto source */ + /* field of IP packet. */ + +static struct in_addr targetAddress; /* IP address incoming packets */ + /* are sent to if no aliasing */ + /* link already exists */ + +static struct in_addr nullAddress; /* Used as a dummy parameter for */ + /* some function calls */ +static struct alias_link * +linkTableOut[LINK_TABLE_OUT_SIZE]; /* Lookup table of pointers to */ + /* chains of link records. Each */ +static struct alias_link * /* link record is doubly indexed */ +linkTableIn[LINK_TABLE_IN_SIZE]; /* into input and output lookup */ + /* tables. */ + +static int icmpLinkCount; /* Link statistics */ +static int udpLinkCount; +static int tcpLinkCount; +static int fragmentIdLinkCount; +static int fragmentPtrLinkCount; +static int sockCount; + +static int cleanupIndex; /* Index to chain of link table */ + /* being inspected for old links */ + +static int timeStamp; /* System time in seconds for */ + /* current packet */ + +static int lastCleanupTime; /* Last time IncrementalCleanup() */ + /* was called */ + +static int houseKeepingResidual; /* used by HouseKeeping() */ + +static int deleteAllLinks; /* If equal to zero, DeleteLink() */ + /* will not remove permanent links */ + +static FILE *monitorFile; /* File descriptor for link */ + /* statistics monitoring file */ + +static int newDefaultLink; /* Indicates if a new aliasing */ + /* link has been created after a */ + /* call to PacketAliasIn/Out(). */ + +#ifndef NO_FW_PUNCH +static int fireWallFD = -1; /* File descriptor to be able to */ + /* control firewall. Opened by */ + /* PacketAliasSetMode on first */ + /* setting the PKT_ALIAS_PUNCH_FW */ + /* flag. */ +#endif + +static int pptpAliasFlag; /* Indicates if PPTP aliasing is */ + /* on or off */ +static struct in_addr pptpAliasAddr; /* Address of source of PPTP */ + /* packets. */ + + + + + + + +/* Internal utility routines (used only in alias_db.c) + +Lookup table starting points: + StartPointIn() -- link table initial search point for + incoming packets + StartPointOut() -- port table initial search point for + outgoing packets + +Miscellaneous: + SeqDiff() -- difference between two TCP sequences + ShowAliasStats() -- send alias statistics to a monitor file +*/ + + +/* Local prototypes */ +static u_int StartPointIn(struct in_addr, u_short, int); + +static u_int StartPointOut(struct in_addr, struct in_addr, + u_short, u_short, int); + +static int SeqDiff(u_long, u_long); + +static void ShowAliasStats(void); + +#ifndef NO_FW_PUNCH +/* Firewall control */ +static void InitPunchFW(void); +static void UninitPunchFW(void); +static void ClearFWHole(struct alias_link *link); +#endif + +/* Log file control */ +static void InitPacketAliasLog(void); +static void UninitPacketAliasLog(void); + +static u_int +StartPointIn(struct in_addr alias_addr, + u_short alias_port, + int link_type) +{ + u_int n; + + n = alias_addr.s_addr; + n += alias_port; + n += link_type; + return(n % LINK_TABLE_IN_SIZE); +} + + +static u_int +StartPointOut(struct in_addr src_addr, struct in_addr dst_addr, + u_short src_port, u_short dst_port, int link_type) +{ + u_int n; + + n = src_addr.s_addr; + n += dst_addr.s_addr; + n += src_port; + n += dst_port; + n += link_type; + + return(n % LINK_TABLE_OUT_SIZE); +} + + +static int +SeqDiff(u_long x, u_long y) +{ +/* Return the difference between two TCP sequence numbers */ + +/* + This function is encapsulated in case there are any unusual + arithmetic conditions that need to be considered. +*/ + + return (ntohl(y) - ntohl(x)); +} + + +static void +ShowAliasStats(void) +{ +/* Used for debugging */ + + if (monitorFile) + { + fprintf(monitorFile, "icmp=%d, udp=%d, tcp=%d, frag_id=%d frag_ptr=%d", + icmpLinkCount, + udpLinkCount, + tcpLinkCount, + fragmentIdLinkCount, + fragmentPtrLinkCount); + + fprintf(monitorFile, " / tot=%d (sock=%d)\n", + icmpLinkCount + udpLinkCount + + tcpLinkCount + + fragmentIdLinkCount + + fragmentPtrLinkCount, + sockCount); + + fflush(monitorFile); + } +} + + + + + +/* Internal routines for finding, deleting and adding links + +Port Allocation: + GetNewPort() -- find and reserve new alias port number + GetSocket() -- try to allocate a socket for a given port + +Link creation and deletion: + CleanupAliasData() - remove all link chains from lookup table + IncrementalCleanup() - look for stale links in a single chain + DeleteLink() - remove link + AddLink() - add link + ReLink() - change link + +Link search: + FindLinkOut() - find link for outgoing packets + FindLinkIn() - find link for incoming packets +*/ + +/* Local prototypes */ +static int GetNewPort(struct alias_link *, int); + +static u_short GetSocket(u_short, int *, int); + +static void CleanupAliasData(void); + +static void IncrementalCleanup(void); + +static void DeleteLink(struct alias_link *); + +static struct alias_link * +AddLink(struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +ReLink(struct alias_link *, + struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +FindLinkOut(struct in_addr, struct in_addr, u_short, u_short, int, int); + +static struct alias_link * +FindLinkIn(struct in_addr, struct in_addr, u_short, u_short, int, int); + + +#define ALIAS_PORT_BASE 0x08000 +#define ALIAS_PORT_MASK 0x07fff +#define GET_NEW_PORT_MAX_ATTEMPTS 20 + +#define GET_ALIAS_PORT -1 +#define GET_ALIAS_ID GET_ALIAS_PORT + +/* GetNewPort() allocates port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because GetNewPort() looks for + unused triplets: (dest addr, dest port, alias port). */ + +static int +GetNewPort(struct alias_link *link, int alias_port_param) +{ + int i; + int max_trials; + u_short port_sys; + u_short port_net; + +/* + Description of alias_port_param for GetNewPort(). When + this parameter is zero or positive, it precisely specifies + the port number. GetNewPort() will return this number + without check that it is in use. + + Whis this parameter is -1, it indicates to get a randomly + selected port number. +*/ + + if (alias_port_param == GET_ALIAS_PORT) + { + /* + * The aliasing port is automatically selected + * by one of two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (packetAliasMode & PKT_ALIAS_SAME_PORTS) + { + /* + * When the ALIAS_SAME_PORTS option is + * chosen, the first try will be the + * actual source port. If this is already + * in use, the remainder of the trials + * will be random. + */ + port_net = link->src_port; + port_sys = ntohs(port_net); + } + else + { + /* First trial and all subsequent are random. */ + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + } + else if (alias_port_param >= 0 && alias_port_param < 0x10000) + { + link->alias_port = (u_short) alias_port_param; + return(0); + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetNewPort(): "); + fprintf(stderr, "input parameter error\n"); +#endif + return(-1); + } + + +/* Port number search */ + for (i=0; i<max_trials; i++) + { + int go_ahead; + struct alias_link *search_result; + + search_result = FindLinkIn(link->dst_addr, link->alias_addr, + link->dst_port, port_net, + link->link_type, 0); + + if (search_result == NULL) + go_ahead = 1; + else if (!(link->flags & LINK_PARTIALLY_SPECIFIED) + && (search_result->flags & LINK_PARTIALLY_SPECIFIED)) + go_ahead = 1; + else + go_ahead = 0; + + if (go_ahead) + { + if ((packetAliasMode & PKT_ALIAS_USE_SOCKETS) + && (link->flags & LINK_PARTIALLY_SPECIFIED)) + { + if (GetSocket(port_net, &link->sockfd, link->link_type)) + { + link->alias_port = port_net; + return(0); + } + } + else + { + link->alias_port = port_net; + return(0); + } + } + + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetnewPort(): "); + fprintf(stderr, "could not find free port\n"); +#endif + + return(-1); +} + + +static u_short +GetSocket(u_short port_net, int *sockfd, int link_type) +{ + int err; + int sock; + struct sockaddr_in sock_addr; + + if (link_type == LINK_TCP) + sock = socket(AF_INET, SOCK_STREAM, 0); + else if (link_type == LINK_UDP) + sock = socket(AF_INET, SOCK_DGRAM, 0); + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "incorrect link type\n"); +#endif + return(0); + } + + if (sock < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "socket() error %d\n", *sockfd); +#endif + return(0); + } + + sock_addr.sin_family = AF_INET; + sock_addr.sin_addr.s_addr = htonl(INADDR_ANY); + sock_addr.sin_port = port_net; + + err = bind(sock, + (struct sockaddr *) &sock_addr, + sizeof(sock_addr)); + if (err == 0) + { + sockCount++; + *sockfd = sock; + return(1); + } + else + { + close(sock); + return(0); + } +} + + +static void +CleanupAliasData(void) +{ + struct alias_link *link; + int i, icount; + + icount = 0; + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + { + link = linkTableOut[i]; + while (link != NULL) + { + struct alias_link *link_next; + link_next = link->next_out; + icount++; + DeleteLink(link); + link = link_next; + } + } + + cleanupIndex =0; +} + + +static void +IncrementalCleanup(void) +{ + int icount; + struct alias_link *link; + + icount = 0; + link = linkTableOut[cleanupIndex++]; + while (link != NULL) + { + int idelta; + struct alias_link *link_next; + + link_next = link->next_out; + idelta = timeStamp - link->timestamp; + switch (link->link_type) + { + case LINK_ICMP: + case LINK_UDP: + case LINK_FRAGMENT_ID: + case LINK_FRAGMENT_PTR: + if (idelta > link->expire_time) + { + DeleteLink(link); + icount++; + } + break; + case LINK_TCP: + if (idelta > link->expire_time) + { + struct tcp_dat *tcp_aux; + + tcp_aux = link->data.tcp; + if (tcp_aux->state.in != ALIAS_TCP_STATE_CONNECTED + || tcp_aux->state.out != ALIAS_TCP_STATE_CONNECTED) + { + DeleteLink(link); + icount++; + } + } + break; + } + link = link_next; + } + + if (cleanupIndex == LINK_TABLE_OUT_SIZE) + cleanupIndex = 0; +} + +void +DeleteLink(struct alias_link *link) +{ + struct alias_link *link_last; + struct alias_link *link_next; + +/* Don't do anything if the link is marked permanent */ + if (deleteAllLinks == 0 && link->flags & LINK_PERMANENT) + return; + +#ifndef NO_FW_PUNCH +/* Delete associatied firewall hole, if any */ + ClearFWHole(link); +#endif + +/* Adjust output table pointers */ + link_last = link->last_out; + link_next = link->next_out; + + if (link_last != NULL) + link_last->next_out = link_next; + else + linkTableOut[link->start_point_out] = link_next; + + if (link_next != NULL) + link_next->last_out = link_last; + +/* Adjust input table pointers */ + link_last = link->last_in; + link_next = link->next_in; + + if (link_last != NULL) + link_last->next_in = link_next; + else + linkTableIn[link->start_point_in] = link_next; + + if (link_next != NULL) + link_next->last_in = link_last; + +/* Close socket, if one has been allocated */ + if (link->sockfd != -1) + { + sockCount--; + close(link->sockfd); + } + +/* Link-type dependent cleanup */ + switch(link->link_type) + { + case LINK_ICMP: + icmpLinkCount--; + break; + case LINK_UDP: + udpLinkCount--; + break; + case LINK_TCP: + tcpLinkCount--; + if (link->data.tcp != NULL) + free(link->data.tcp); + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount--; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount--; + if (link->data.frag_ptr != NULL) + free(link->data.frag_ptr); + break; + } + +/* Free memory */ + free(link); + +/* Write statistics, if logging enabled */ + if (packetAliasMode & PKT_ALIAS_LOG) + { + ShowAliasStats(); + } +} + + +static struct alias_link * +AddLink(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + u_int start_point; /* zero, equal to alias port */ + struct alias_link *link; + struct alias_link *first_link; + + link = malloc(sizeof(struct alias_link)); + if (link != NULL) + { + /* Basic initialization */ + link->src_addr = src_addr; + link->dst_addr = dst_addr; + link->alias_addr = alias_addr; + link->proxy_addr.s_addr = 0; + link->src_port = src_port; + link->dst_port = dst_port; + link->proxy_port = 0; + link->link_type = link_type; + link->sockfd = -1; + link->flags = 0; + link->timestamp = timeStamp; + + /* Expiration time */ + switch (link_type) + { + case LINK_ICMP: + link->expire_time = ICMP_EXPIRE_TIME; + break; + case LINK_UDP: + link->expire_time = UDP_EXPIRE_TIME; + break; + case LINK_TCP: + link->expire_time = TCP_EXPIRE_INITIAL; + break; + case LINK_FRAGMENT_ID: + link->expire_time = FRAGMENT_ID_EXPIRE_TIME; + break; + case LINK_FRAGMENT_PTR: + link->expire_time = FRAGMENT_PTR_EXPIRE_TIME; + break; + } + + /* Determine alias flags */ + if (dst_addr.s_addr == 0) + link->flags |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + link->flags |= LINK_UNKNOWN_DEST_PORT; + + /* Determine alias port */ + if (GetNewPort(link, alias_port_param) != 0) + { + free(link); + return(NULL); + } + + /* Set up pointers for output lookup table */ + start_point = StartPointOut(src_addr, dst_addr, + src_port, dst_port, link_type); + first_link = linkTableOut[start_point]; + + link->last_out = NULL; + link->next_out = first_link; + link->start_point_out = start_point; + + if (first_link != NULL) + first_link->last_out = link; + + linkTableOut[start_point] = link; + + /* Set up pointers for input lookup table */ + start_point = StartPointIn(alias_addr, link->alias_port, link_type); + first_link = linkTableIn[start_point]; + + link->last_in = NULL; + link->next_in = first_link; + link->start_point_in = start_point; + + if (first_link != NULL) + first_link->last_in = link; + + linkTableIn[start_point] = link; + + /* Link-type dependent initialization */ + switch(link_type) + { + struct tcp_dat *aux_tcp; + + case LINK_ICMP: + icmpLinkCount++; + break; + case LINK_UDP: + udpLinkCount++; + break; + case LINK_TCP: + aux_tcp = malloc(sizeof(struct tcp_dat)); + link->data.tcp = aux_tcp; + if (aux_tcp != NULL) + { + int i; + + tcpLinkCount++; + aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.index = 0; + aux_tcp->state.ack_modified = 0; + for (i=0; i<N_LINK_TCP_DATA; i++) + aux_tcp->ack[i].active = 0; + aux_tcp->fwhole = -1; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink: "); + fprintf(stderr, " cannot allocate auxiliary TCP data\n"); +#endif + } + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount++; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount++; + break; + } + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink(): "); + fprintf(stderr, "malloc() call failed.\n"); +#endif + } + + if (packetAliasMode & PKT_ALIAS_LOG) + { + ShowAliasStats(); + } + + return(link); +} + +static struct alias_link * +ReLink(struct alias_link *old_link, + struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + struct alias_link *new_link; /* zero, equal to alias port */ + + new_link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port_param, + link_type); +#ifndef NO_FW_PUNCH + if (new_link != NULL && + old_link->link_type == LINK_TCP && + old_link->data.tcp && + old_link->data.tcp->fwhole > 0) { + PunchFWHole(new_link); + } +#endif + DeleteLink(old_link); + return new_link; +} + +static struct alias_link * +_FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); + link = linkTableOut[i]; + while (link != NULL) + { + if (link->src_addr.s_addr == src_addr.s_addr + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->src_port == src_port + && link->link_type == link_type) + { + link->timestamp = timeStamp; + break; + } + link = link->next_out; + } + +/* Search for partially specified links. */ + if (link == NULL) + { + if (dst_port != 0) + { + link = _FindLinkOut(src_addr, dst_addr, src_port, 0, + link_type, 0); + if (link != NULL && replace_partial_links) + { + link = ReLink(link, + src_addr, dst_addr, link->alias_addr, + src_port, dst_port, link->alias_port, + link_type); + } + } + else if (dst_addr.s_addr != 0) + { + link = _FindLinkOut(src_addr, nullAddress, src_port, 0, + link_type, 0); + } + } + + return(link); +} + +static struct alias_link * +FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkOut(src_addr, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default source address + (i.e. device interface address) without knowing + in advance what that address is. */ + if (aliasAddress.s_addr != 0 && + src_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkOut(nullAddress, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + +struct alias_link * +_FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + int flags_in; + u_int start_point; + struct alias_link *link; + struct alias_link *link_fully_specified; + struct alias_link *link_unknown_all; + struct alias_link *link_unknown_dst_addr; + struct alias_link *link_unknown_dst_port; + +/* Initialize pointers */ + link_fully_specified = NULL; + link_unknown_all = NULL; + link_unknown_dst_addr = NULL; + link_unknown_dst_port = NULL; + +/* If either the dest addr or port is unknown, the search + loop will have to know about this. */ + + flags_in = 0; + if (dst_addr.s_addr == 0) + flags_in |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + flags_in |= LINK_UNKNOWN_DEST_PORT; + +/* Search loop */ + start_point = StartPointIn(alias_addr, alias_port, link_type); + link = linkTableIn[start_point]; + while (link != NULL) + { + int flags; + + flags = flags_in | link->flags; + if (!(flags & LINK_PARTIALLY_SPECIFIED)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->link_type == link_type) + { + link_fully_specified = link; + break; + } + } + else if ((flags & LINK_UNKNOWN_DEST_ADDR) + && (flags & LINK_UNKNOWN_DEST_PORT)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type) + { + if (link_unknown_all == NULL) + link_unknown_all = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_ADDR) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_port == dst_port) + { + if (link_unknown_dst_addr == NULL) + link_unknown_dst_addr = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_PORT) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_addr.s_addr == dst_addr.s_addr) + { + if (link_unknown_dst_port == NULL) + link_unknown_dst_port = link; + } + } + link = link->next_in; + } + + + + if (link_fully_specified != NULL) + { + link_fully_specified->timestamp = timeStamp; + return link_fully_specified; + } + else if (link_unknown_dst_port != NULL) + { + return replace_partial_links + ? ReLink(link_unknown_dst_port, + link_unknown_dst_port->src_addr, dst_addr, alias_addr, + link_unknown_dst_port->src_port, dst_port, alias_port, + link_type) + : link_unknown_dst_port; + } + else if (link_unknown_dst_addr != NULL) + { + return replace_partial_links + ? ReLink(link_unknown_dst_addr, + link_unknown_dst_addr->src_addr, dst_addr, alias_addr, + link_unknown_dst_addr->src_port, dst_port, alias_port, + link_type) + : link_unknown_dst_addr; + } + else if (link_unknown_all != NULL) + { + return replace_partial_links + ? ReLink(link_unknown_all, + link_unknown_all->src_addr, dst_addr, alias_addr, + link_unknown_all->src_port, dst_port, alias_port, + link_type) + : link_unknown_all; + } + else + { + return(NULL); + } +} + +struct alias_link * +FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkIn(dst_addr, alias_addr, dst_port, alias_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default aliasing address + (i.e. device interface address) without knowing + in advance what that address is. */ + if (aliasAddress.s_addr != 0 && + alias_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkIn(dst_addr, nullAddress, dst_port, alias_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + + + +/* External routines for finding/adding links + +-- "external" means outside alias_db.c, but within alias*.c -- + + FindIcmpIn(), FindIcmpOut() + FindFragmentIn1(), FindFragmentIn2() + AddFragmentPtrLink(), FindFragmentPtr() + FindUdpTcpIn(), FindUdpTcpOut() + FindOriginalAddress(), FindAliasAddress() + +(prototypes in alias_local.h) +*/ + + +struct alias_link * +FindIcmpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short id_alias) +{ + return FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, id_alias, + LINK_ICMP, 0); +} + + +struct alias_link * +FindIcmpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short id) +{ + struct alias_link * link; + + link = FindLinkOut(src_addr, dst_addr, + id, NO_DEST_PORT, + LINK_ICMP, 0); + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + id, NO_DEST_PORT, GET_ALIAS_ID, + LINK_ICMP); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn1(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short ip_id) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); + + if (link == NULL) + { + link = AddLink(nullAddress, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn2(struct in_addr dst_addr, /* Doesn't add a link if one */ + struct in_addr alias_addr, /* is not found. */ + u_short ip_id) +{ + return FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); +} + + +struct alias_link * +AddFragmentPtrLink(struct in_addr dst_addr, + u_short ip_id) +{ + return AddLink(nullAddress, dst_addr, nullAddress, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR); +} + + +struct alias_link * +FindFragmentPtr(struct in_addr dst_addr, + u_short ip_id) +{ + return FindLinkIn(dst_addr, nullAddress, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR, 0); +} + + +struct alias_link * +FindUdpTcpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkIn(dst_addr, alias_addr, + dst_port, alias_port, + link_type, 1); + + if (!(packetAliasMode & PKT_ALIAS_DENY_INCOMING) + && !(packetAliasMode & PKT_ALIAS_PROXY_ONLY) + && link == NULL) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + alias_port, dst_port, alias_port, + link_type); + } + + return(link); +} + + +struct alias_link * +FindUdpTcpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkOut(src_addr, dst_addr, src_port, dst_port, link_type, 1); + + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, GET_ALIAS_PORT, + link_type); + } + + return(link); +} + + +struct in_addr +FindOriginalAddress(struct in_addr alias_addr) +{ + struct alias_link *link; + + link = FindLinkIn(nullAddress, alias_addr, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + newDefaultLink = 1; + if (targetAddress.s_addr != 0) + return targetAddress; + else + return alias_addr; + } + else + { + if (link->src_addr.s_addr == 0) + return aliasAddress; + else + return link->src_addr; + } +} + + +struct in_addr +FindAliasAddress(struct in_addr original_addr) +{ + struct alias_link *link; + + link = FindLinkOut(original_addr, nullAddress, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + return aliasAddress; + } + else + { + if (link->alias_addr.s_addr == 0) + return aliasAddress; + else + return link->alias_addr; + } +} + + +/* External routines for getting or changing link data + (external to alias_db.c, but internal to alias*.c) + + SetFragmentData(), GetFragmentData() + SetFragmentPtr(), GetFragmentPtr() + SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut() + GetOriginalAddress(), GetDestAddress(), GetAliasAddress() + GetOriginalPort(), GetAliasPort() + SetAckModified(), GetAckModified() + GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq() +*/ + + +void +SetFragmentAddr(struct alias_link *link, struct in_addr src_addr) +{ + link->data.frag_addr = src_addr; +} + + +void +GetFragmentAddr(struct alias_link *link, struct in_addr *src_addr) +{ + *src_addr = link->data.frag_addr; +} + + +void +SetFragmentPtr(struct alias_link *link, char *fptr) +{ + link->data.frag_ptr = fptr; +} + + +void +GetFragmentPtr(struct alias_link *link, char **fptr) +{ + *fptr = link->data.frag_ptr; +} + + +void +SetStateIn(struct alias_link *link, int state) +{ + /* TCP input state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED) { + link->expire_time = TCP_EXPIRE_DEAD; + } else { + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + } + link->data.tcp->state.in = state; + break; + case ALIAS_TCP_STATE_CONNECTED: + link->expire_time = TCP_EXPIRE_CONNECTED; + /*FALLTHROUGH*/ + case ALIAS_TCP_STATE_NOT_CONNECTED: + link->data.tcp->state.in = state; + break; + default: + abort(); + } +} + + +void +SetStateOut(struct alias_link *link, int state) +{ + /* TCP output state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED) { + link->expire_time = TCP_EXPIRE_DEAD; + } else { + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + } + link->data.tcp->state.out = state; + break; + case ALIAS_TCP_STATE_CONNECTED: + link->expire_time = TCP_EXPIRE_CONNECTED; + /*FALLTHROUGH*/ + case ALIAS_TCP_STATE_NOT_CONNECTED: + link->data.tcp->state.out = state; + break; + default: + abort(); + } +} + + +int +GetStateIn(struct alias_link *link) +{ + /* TCP input state */ + return link->data.tcp->state.in; +} + + +int +GetStateOut(struct alias_link *link) +{ + /* TCP output state */ + return link->data.tcp->state.out; +} + + +struct in_addr +GetOriginalAddress(struct alias_link *link) +{ + if (link->src_addr.s_addr == 0) + return aliasAddress; + else + return(link->src_addr); +} + + +struct in_addr +GetDestAddress(struct alias_link *link) +{ + return(link->dst_addr); +} + + +struct in_addr +GetAliasAddress(struct alias_link *link) +{ + if (link->alias_addr.s_addr == 0) + return aliasAddress; + else + return link->alias_addr; +} + + +struct in_addr +GetDefaultAliasAddress() +{ + return aliasAddress; +} + + +void +SetDefaultAliasAddress(struct in_addr alias_addr) +{ + aliasAddress = alias_addr; +} + + +u_short +GetOriginalPort(struct alias_link *link) +{ + return(link->src_port); +} + + +u_short +GetAliasPort(struct alias_link *link) +{ + return(link->alias_port); +} + +u_short +GetDestPort(struct alias_link *link) +{ + return(link->dst_port); +} + +void +SetAckModified(struct alias_link *link) +{ +/* Indicate that ack numbers have been modified in a TCP connection */ + link->data.tcp->state.ack_modified = 1; +} + + +struct in_addr +GetProxyAddress(struct alias_link *link) +{ + return link->proxy_addr; +} + + +void +SetProxyAddress(struct alias_link *link, struct in_addr addr) +{ + link->proxy_addr = addr; +} + + +u_short +GetProxyPort(struct alias_link *link) +{ + return link->proxy_port; +} + + +void +SetProxyPort(struct alias_link *link, u_short port) +{ + link->proxy_port = port; +} + + +int +GetAckModified(struct alias_link *link) +{ +/* See if ack numbers have been modified */ + return link->data.tcp->state.ack_modified; +} + + +int +GetDeltaAckIn(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the ack number has been altered for an incoming +TCP packet. To do this, a circular list is ack numbers where the TCP +packet size was altered is searched. +*/ + + int i; + struct tcphdr *tc; + int delta, ack_diff_min; + u_long ack; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + ack = tc->th_ack; + + delta = 0; + ack_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int ack_diff; + + ack_diff = SeqDiff(x.ack_new, ack); + if (ack_diff >= 0) + { + if (ack_diff_min >= 0) + { + if (ack_diff < ack_diff_min) + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + else + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + } + } + return (delta); +} + + +int +GetDeltaSeqOut(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the seq number has been altered for an outgoing +TCP packet. To do this, a circular list is ack numbers where the TCP +packet size was altered is searched. +*/ + + int i; + struct tcphdr *tc; + int delta, seq_diff_min; + u_long seq; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + seq = tc->th_seq; + + delta = 0; + seq_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int seq_diff; + + seq_diff = SeqDiff(x.ack_old, seq); + if (seq_diff >= 0) + { + if (seq_diff_min >= 0) + { + if (seq_diff < seq_diff_min) + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + else + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + } + } + return (delta); +} + + +void +AddSeq(struct ip *pip, struct alias_link *link, int delta) +{ +/* +When a TCP packet has been altered in length, save this +information in a circular list. If enough packets have +been altered, then this list will begin to overwrite itself. +*/ + + struct tcphdr *tc; + struct ack_data_record x; + int hlen, tlen, dlen; + int i; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + x.ack_old = htonl(ntohl(tc->th_seq) + dlen); + x.ack_new = htonl(ntohl(tc->th_seq) + dlen + delta); + x.delta = delta; + x.active = 1; + + i = link->data.tcp->state.index; + link->data.tcp->ack[i] = x; + + i++; + if (i == N_LINK_TCP_DATA) + link->data.tcp->state.index = 0; + else + link->data.tcp->state.index = i; +} + +void +SetExpire(struct alias_link *link, int expire) +{ + if (expire == 0) + { + link->flags &= ~LINK_PERMANENT; + DeleteLink(link); + } + else if (expire == -1) + { + link->flags |= LINK_PERMANENT; + } + else if (expire > 0) + { + link->expire_time = expire; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/SetExpire(): "); + fprintf(stderr, "error in expire parameter\n"); +#endif + } +} + +void +ClearCheckNewLink(void) +{ + newDefaultLink = 0; +} + + +/* Miscellaneous Functions + + HouseKeeping() + InitPacketAliasLog() + UninitPacketAliasLog() +*/ + +/* + Whenever an outgoing or incoming packet is handled, HouseKeeping() + is called to find and remove timed-out aliasing links. Logic exists + to sweep through the entire table and linked list structure + every 60 seconds. + + (prototype in alias_local.h) +*/ + +void +HouseKeeping(void) +{ + int i, n, n100; + struct timeval tv; + struct timezone tz; + + /* + * Save system time (seconds) in global variable timeStamp for + * use by other functions. This is done so as not to unnecessarily + * waste timeline by making system calls. + */ + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + + /* Compute number of spokes (output table link chains) to cover */ + n100 = LINK_TABLE_OUT_SIZE * 100 + houseKeepingResidual; + n100 *= timeStamp - lastCleanupTime; + n100 /= ALIAS_CLEANUP_INTERVAL_SECS; + + n = n100/100; + + /* Handle different cases */ + if (n > ALIAS_CLEANUP_MAX_SPOKES) + { + n = ALIAS_CLEANUP_MAX_SPOKES; + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n > 0) + { + lastCleanupTime = timeStamp; + houseKeepingResidual = n100 - 100*n; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/HouseKeeping(): "); + fprintf(stderr, "something unexpected in time values\n"); +#endif + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + } +} + + +/* Init the log file and enable logging */ +static void +InitPacketAliasLog(void) +{ + if ((~packetAliasMode & PKT_ALIAS_LOG) + && (monitorFile = fopen("/var/log/alias.log", "w"))) + { + packetAliasMode |= PKT_ALIAS_LOG; + fprintf(monitorFile, + "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n"); + } +} + + +/* Close the log-file and disable logging. */ +static void +UninitPacketAliasLog(void) +{ + if (monitorFile) { + fclose(monitorFile); + monitorFile = NULL; + } + packetAliasMode &= ~PKT_ALIAS_LOG; +} + + + + + + +/* Outside world interfaces + +-- "outside world" means other than alias*.c routines -- + + PacketAliasRedirectPort() + PacketAliasRedirectAddr() + PacketAliasRedirectDelete() + PacketAliasSetAddress() + PacketAliasInit() + PacketAliasUninit() + PacketAliasSetMode() + +(prototypes in alias.h) +*/ + +/* Redirection from a specific public addr:port to a + a private addr:port */ +struct alias_link * +PacketAliasRedirectPort(struct in_addr src_addr, u_short src_port, + struct in_addr dst_addr, u_short dst_port, + struct in_addr alias_addr, u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch(proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: +#ifdef DEBUG + fprintf(stderr, "PacketAliasRedirectPort(): "); + fprintf(stderr, "only TCP and UDP protocols allowed\n"); +#endif + return NULL; + } + + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectPort(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + +/* Translate PPTP packets to a machine on the inside + */ +int +PacketAliasPptp(struct in_addr src_addr) +{ + + pptpAliasAddr = src_addr; /* Address of the inside PPTP machine */ + pptpAliasFlag = src_addr.s_addr != INADDR_NONE; + + return 1; +} + +int GetPptpAlias (struct in_addr* alias_addr) +{ + if (pptpAliasFlag) + *alias_addr = pptpAliasAddr; + + return pptpAliasFlag; +} + +/* Static address translation */ +struct alias_link * +PacketAliasRedirectAddr(struct in_addr src_addr, + struct in_addr alias_addr) +{ + struct alias_link *link; + + link = AddLink(src_addr, nullAddress, alias_addr, + 0, 0, 0, + LINK_ADDR); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectAddr(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + + +void +PacketAliasRedirectDelete(struct alias_link *link) +{ +/* This is a dangerous function to put in the API, + because an invalid pointer can crash the program. */ + + deleteAllLinks = 1; + DeleteLink(link); + deleteAllLinks = 0; +} + + +void +PacketAliasSetAddress(struct in_addr addr) +{ + if (packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE + && aliasAddress.s_addr != addr.s_addr) + CleanupAliasData(); + + aliasAddress = addr; +} + + +void +PacketAliasSetTarget(struct in_addr target_addr) +{ + targetAddress = target_addr; +} + + +void +PacketAliasInit(void) +{ + int i; + struct timeval tv; + struct timezone tz; + static int firstCall = 1; + + if (firstCall == 1) + { + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + lastCleanupTime = tv.tv_sec; + houseKeepingResidual = 0; + + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + linkTableOut[i] = NULL; + for (i=0; i<LINK_TABLE_IN_SIZE; i++) + linkTableIn[i] = NULL; + + atexit(PacketAliasUninit); + firstCall = 0; + } + else + { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + } + + aliasAddress.s_addr = 0; + targetAddress.s_addr = 0; + + icmpLinkCount = 0; + udpLinkCount = 0; + tcpLinkCount = 0; + fragmentIdLinkCount = 0; + fragmentPtrLinkCount = 0; + sockCount = 0; + + cleanupIndex =0; + + packetAliasMode = PKT_ALIAS_SAME_PORTS + | PKT_ALIAS_USE_SOCKETS + | PKT_ALIAS_RESET_ON_ADDR_CHANGE; + + pptpAliasFlag = 0; +} + +void +PacketAliasUninit(void) { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + UninitPacketAliasLog(); +#ifndef NO_FW_PUNCH + UninitPunchFW(); +#endif +} + + +/* Change mode for some operations */ +unsigned int +PacketAliasSetMode( + unsigned int flags, /* Which state to bring flags to */ + unsigned int mask /* Mask of which flags to affect (use 0 to do a + probe for flag values) */ +) +{ +/* Enable logging? */ + if (flags & mask & PKT_ALIAS_LOG) + { + InitPacketAliasLog(); /* Do the enable */ + } else +/* _Disable_ logging? */ + if (~flags & mask & PKT_ALIAS_LOG) { + UninitPacketAliasLog(); + } + +#ifndef NO_FW_PUNCH +/* Start punching holes in the firewall? */ + if (flags & mask & PKT_ALIAS_PUNCH_FW) { + InitPunchFW(); + } else +/* Stop punching holes in the firewall? */ + if (~flags & mask & PKT_ALIAS_PUNCH_FW) { + UninitPunchFW(); + } +#endif + +/* Other flags can be set/cleared without special action */ + packetAliasMode = (flags & mask) | (packetAliasMode & ~mask); + return packetAliasMode; +} + + +int +PacketAliasCheckNewLink(void) +{ + return newDefaultLink; +} + + +#ifndef NO_FW_PUNCH + +/***************** + Code to support firewall punching. This shouldn't really be in this + file, but making variables global is evil too. + ****************/ + +/* Firewall include files */ +#include <sys/queue.h> +#include <net/if.h> +#include <netinet/ip_fw.h> +#include <string.h> +#include <err.h> + +static void ClearAllFWHoles(void); + +static int fireWallBaseNum; /* The first firewall entry free for our use */ +static int fireWallNumNums; /* How many entries can we use? */ +static int fireWallActiveNum; /* Which entry did we last use? */ +static char *fireWallField; /* bool array for entries */ + +#define fw_setfield(field, num) \ +do { \ + (field)[num] = 1; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_clrfield(field, num) \ +do { \ + (field)[num] = 0; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_tstfield(field, num) ((field)[num]) + +void +PacketAliasSetFWBase(unsigned int base, unsigned int num) { + fireWallBaseNum = base; + fireWallNumNums = num; +} + +static void +InitPunchFW(void) { + fireWallField = malloc(fireWallNumNums); + if (fireWallField) { + memset(fireWallField, 0, fireWallNumNums); + if (fireWallFD < 0) { + fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + } + ClearAllFWHoles(); + fireWallActiveNum = fireWallBaseNum; + } +} + +static void +UninitPunchFW(void) { + ClearAllFWHoles(); + if (fireWallFD >= 0) + close(fireWallFD); + fireWallFD = -1; + if (fireWallField) + free(fireWallField); + fireWallField = NULL; + packetAliasMode &= ~PKT_ALIAS_PUNCH_FW; +} + +/* Make a certain link go through the firewall */ +void +PunchFWHole(struct alias_link *link) { + int r; /* Result code */ + struct ip_fw rule; /* On-the-fly built rule */ + int fwhole; /* Where to punch hole */ + +/* Don't do anything unless we are asked to */ + if ( !(packetAliasMode & PKT_ALIAS_PUNCH_FW) || + fireWallFD < 0 || + link->link_type != LINK_TCP || + !link->data.tcp) + return; + + memset(&rule, 0, sizeof rule); + +/** Build rule **/ + + /* Find empty slot */ + for (fwhole = fireWallActiveNum; + fwhole < fireWallBaseNum + fireWallNumNums && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole >= fireWallBaseNum + fireWallNumNums || + fw_tstfield(fireWallField, fwhole)) { + for (fwhole = fireWallBaseNum; + fwhole < fireWallActiveNum && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole == fireWallActiveNum) { + /* No rule point empty - we can't punch more holes. */ + fireWallActiveNum = fireWallBaseNum; +#ifdef DEBUG + fprintf(stderr, "libalias: Unable to create firewall hole!\n"); +#endif + return; + } + } + /* Start next search at next position */ + fireWallActiveNum = fwhole+1; + + /* Build generic part of the two rules */ + rule.fw_number = fwhole; + rule.fw_nports = 1; /* Number of source ports; dest ports follow */ + rule.fw_flg = IP_FW_F_ACCEPT; + rule.fw_prot = IPPROTO_TCP; + rule.fw_smsk.s_addr = INADDR_BROADCAST; + rule.fw_dmsk.s_addr = INADDR_BROADCAST; + + /* Build and apply specific part of the rules */ + rule.fw_src = GetOriginalAddress(link); + rule.fw_dst = GetDestAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetOriginalPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetDestPort(link)); + + /* Skip non-bound links - XXX should not be strictly necessary, + but seems to leave hole if not done. Leak of non-bound links? + (Code should be left even if the problem is fixed - it is a + clear optimization) */ + if (rule.fw_uar.fw_pts[0] != 0 && rule.fw_uar.fw_pts[1] != 0) { + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)"); +#endif + rule.fw_src = GetDestAddress(link); + rule.fw_dst = GetOriginalAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetDestPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetOriginalPort(link)); + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)"); +#endif + } +/* Indicate hole applied */ + link->data.tcp->fwhole = fwhole; + fw_setfield(fireWallField, fwhole); +} + +/* Remove a hole in a firewall associated with a particular alias + link. Calling this too often is harmless. */ +static void +ClearFWHole(struct alias_link *link) { + if (link->link_type == LINK_TCP && link->data.tcp) { + int fwhole = link->data.tcp->fwhole; /* Where is the firewall hole? */ + struct ip_fw rule; + + if (fwhole < 0) + return; + + memset(&rule, 0, sizeof rule); + rule.fw_number = fwhole; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + fw_clrfield(fireWallField, fwhole); + link->data.tcp->fwhole = -1; + } +} + +/* Clear out the entire range dedicated to firewall holes. */ +static void +ClearAllFWHoles(void) { + struct ip_fw rule; /* On-the-fly built rule */ + int i; + + if (fireWallFD < 0) + return; + + memset(&rule, 0, sizeof rule); + for (i = fireWallBaseNum; i < fireWallBaseNum + fireWallNumNums; i++) { + rule.fw_number = i; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + } + memset(fireWallField, 0, fireWallNumNums); +} +#endif diff --git a/sys/netinet/libalias/alias_ftp.c b/sys/netinet/libalias/alias_ftp.c new file mode 100644 index 0000000..ca2d53b --- /dev/null +++ b/sys/netinet/libalias/alias_ftp.c @@ -0,0 +1,231 @@ +/* + Alias_ftp.c performs special processing for FTP sessions under + TCP. Specifically, when a PORT command from the client side + is sent, it is intercepted and modified. The address is changed + to the gateway machine and an aliasing port is used. + + For this routine to work, the PORT command must fit entirely + into a single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new PORT command will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: August, 1996 (cjm) + + Version 1.6 + Brian Somers and Martin Renters identified an IP checksum + error for modified IP packets. + + Version 1.7: January 9, 1996 (cjm) + Differental checksum computation for change + in IP packet length. + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + withing the packet alising module. + + See HISTORY file for record of revisions. + + $FreeBSD$ +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +static void NewFtpPortCommand(struct ip *, struct alias_link *, struct in_addr, u_short, int); + + + +void +AliasHandleFtpOut( +struct ip *pip, /* IP packet to examine/patch */ +struct alias_link *link, /* The link to go through (aliased port) */ +int maxpacketsize /* The maximum size this packet can grow to (including headers) */) +{ + int hlen, tlen, dlen; + struct in_addr true_addr; + u_short true_port; + char *sptr; + struct tcphdr *tc; + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Return is data length is too long or too short */ + if (dlen<10 || dlen>80) + return; + +/* Place string pointer and beginning of data */ + sptr = (char *) pip; + sptr += hlen; + +/* Parse through string using state diagram method */ + { + char ch, zero; + int i, state; + u_long a1, a2, a3, a4; + u_short p1, p2; + + a1=0; a2=0; a3=0; a4=0; p1=0; p2=0; + zero = '0'; + state=-4; + for (i=0; i<dlen; i++) + { + ch = sptr[i]; + switch (state) + { + case -4: if (ch == 'P') state=-3; else return; break; + case -3: if (ch == 'O') state=-2; else return; break; + case -2: if (ch == 'R') state=-1; else return; break; + case -1: if (ch == 'T') state= 0; else return; break; + + case 0 : + if (isdigit(ch)) {a1=ch-zero; state=1 ;} break; + case 1 : + if (isdigit(ch)) a1=10*a1+ch-zero; else state=2 ; break; + case 2 : + if (isdigit(ch)) {a2=ch-zero; state=3 ;} break; + case 3 : + if (isdigit(ch)) a2=10*a2+ch-zero; else state=4 ; break; + case 4 : + if (isdigit(ch)) {a3=ch-zero; state=5 ;} break; + case 5 : + if (isdigit(ch)) a3=10*a3+ch-zero; else state=6 ; break; + case 6 : + if (isdigit(ch)) {a4=ch-zero; state=7 ;} break; + case 7 : + if (isdigit(ch)) a4=10*a4+ch-zero; else state=8 ; break; + case 8 : + if (isdigit(ch)) {p1=ch-zero; state=9 ;} break; + case 9 : + if (isdigit(ch)) p1=10*p1+ch-zero; else state=10; break; + case 10: + if (isdigit(ch)) {p2=ch-zero; state=11;} break; + case 11: + if (isdigit(ch)) p2=10*p2+ch-zero; break; + } + } + + if (state == 11) + { + true_port = htons((p1<<8) + p2); + true_addr.s_addr = htonl((a1<<24) + (a2<<16) +(a3<<8) + a4); + NewFtpPortCommand(pip, link, true_addr, true_port, maxpacketsize); + } + } +} + +static void +NewFtpPortCommand(struct ip *pip, + struct alias_link *link, + struct in_addr true_addr, + u_short true_port, + int maxpacketsize) +{ + struct alias_link *ftp_link; + +/* Establish link to address and port found in PORT command */ + ftp_link = FindUdpTcpOut(true_addr, GetDestAddress(link), + true_port, 0, IPPROTO_TCP); + + if (ftp_link != NULL) + { + int slen, hlen, tlen, dlen; + struct tcphdr *tc; + +#ifndef NO_FW_PUNCH +/* Punch hole in firewall */ + PunchFWHole(ftp_link); +#endif + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Create new PORT command */ + { + char stemp[80]; + char *sptr; + u_short alias_port; + u_char *ptr; + int a1, a2, a3, a4, p1, p2; + struct in_addr alias_address; + +/* Decompose alias address into quad format */ + alias_address = GetAliasAddress(link); + ptr = (u_char *) &alias_address.s_addr; + a1 = *ptr++; a2=*ptr++; a3=*ptr++; a4=*ptr; + +/* Decompose alias port into pair format */ + alias_port = GetAliasPort(ftp_link); + ptr = (char *) &alias_port; + p1 = *ptr++; p2=*ptr; + +/* Generate command string */ + sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n", + a1,a2,a3,a4,p1,p2); + +/* Save string length for IP header modification */ + slen = strlen(stemp); + +/* Copy into IP packet */ + sptr = (char *) pip; sptr += hlen; + strncpy(sptr, stemp, maxpacketsize-hlen); + } + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen-dlen); + } + +/* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + slen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + +/* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + } + else + { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n"); +#endif + } +} diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c new file mode 100644 index 0000000..8c52c1f --- /dev/null +++ b/sys/netinet/libalias/alias_irc.c @@ -0,0 +1,318 @@ +/* Alias_irc.c intercepts packages contain IRC CTCP commands, and + changes DCC commands to export a port on the aliasing host instead + of an aliased host. + + For this routine to work, the DCC command must fit entirely into a + single TCP packet. This will usually happen, but is not + guaranteed. + + The interception is likely to change the length of the packet. + The handling of this is copied more-or-less verbatim from + ftp_alias.c + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: Eivind Eklund <perhaps@yes.no> (ee) 97-01-29 + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + withing the packet alising module. + + $FreeBSD$ +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <limits.h> + +#include "alias_local.h" + +/* Local defines */ +#define DBprintf(a) + + +void +AliasHandleIrcOut(struct ip *pip, /* IP packet to examine */ + struct alias_link *link, /* Which link are we on? */ + int maxsize /* Maximum size of IP packet including headers */ + ) +{ + int hlen, tlen, dlen; + struct in_addr true_addr; + u_short true_port; + char *sptr; + struct tcphdr *tc; + int i; /* Iterator through the source */ + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Return if data length is too short - assume an entire PRIVMSG in each packet. */ + if (dlen<sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a")-1) + return; + +/* Place string pointer at beginning of data */ + sptr = (char *) pip; + sptr += hlen; + maxsize -= hlen; /* We're interested in maximum size of data, not packet */ + + /* Search for a CTCP command [Note 1] */ + for( i=0; i<dlen; i++ ) { + if(sptr[i]=='\001') + goto lFOUND_CTCP; + } + return; /* No CTCP commands in */ + /* Handle CTCP commands - the buffer may have to be copied */ +lFOUND_CTCP: + { + char newpacket[65536]; /* Estimate of maximum packet size :) */ + int copyat = i; /* Same */ + int iCopy = 0; /* How much data have we written to copy-back string? */ + unsigned long org_addr; /* Original IP address */ + unsigned short org_port; /* Original source port address */ + lCTCP_START: + if( i >= dlen || iCopy >= sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start character */ + /* Start of a CTCP */ + if( i+4 >= dlen ) /* Too short for DCC */ + goto lBAD_CTCP; + if( sptr[i+0] != 'D' ) + goto lBAD_CTCP; + if( sptr[i+1] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+2] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+3] != ' ' ) + goto lBAD_CTCP; + /* We have a DCC command - handle it! */ + i+= 4; /* Skip "DCC " */ + if( iCopy+4 > sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = 'D'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = ' '; + + DBprintf(("Found DCC\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring command...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during command\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done command - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-command)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring filename...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during filename\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done filename - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-filename)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Fetching IP address\n")); + /* Fetch IP address */ + org_addr = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_addr > ULONG_MAX/10UL ) { /* Terminate on overflow */ + DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i])); + goto lBAD_CTCP; + } + org_addr *= 10; + org_addr += sptr[i++]-'0'; + } + DBprintf(("Skipping space\n")); + if( i+1 >= dlen || sptr[i] != ' ' ) { + DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i+1, dlen, sptr[i])); + goto lBAD_CTCP; + } + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway, so we might + as well play it safe */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("Packet failure - space overflow.\n")); + goto lPACKET_DONE; + } + } + DBprintf(("Fetching port number\n")); + /* Fetch source port */ + org_port = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_port > 6554 ) { /* Terminate on overflow (65536/10 rounded up*/ + DBprintf(("DCC: port number overflow\n")); + goto lBAD_CTCP; + } + org_port *= 10; + org_port += sptr[i++]-'0'; + } + /* Skip illegal addresses (or early termination) */ + if( i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ') ) { + DBprintf(("Bad port termination\n")); + goto lBAD_CTCP; + } + DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port)); + + /* We've got the address and port - now alias it */ + { + struct alias_link *dcc_link; + struct in_addr destaddr; + + + true_port = htons(org_port); + true_addr.s_addr = htonl(org_addr); + destaddr.s_addr = 0; + + /* Steal the FTP_DATA_PORT - it doesn't really matter, and this + would probably allow it through at least _some_ + firewalls. */ + dcc_link = FindUdpTcpOut (true_addr, + destaddr, + true_port, + 0, IPPROTO_TCP); + DBprintf(("Got a DCC link\n")); + if ( dcc_link ) { + struct in_addr alias_address; /* Address from aliasing */ + u_short alias_port; /* Port given by aliasing */ + +#ifndef NO_FW_PUNCH + /* Generate firewall hole as appropriate */ + PunchFWHole(dcc_link); +#endif + + alias_address = GetAliasAddress(link); + iCopy += snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%lu ", (u_long)htonl(alias_address.s_addr)); + if( iCopy >= sizeof(newpacket) ) { /* Truncated/fit exactly - bad news */ + DBprintf(("DCC constructed packet overflow.\n")); + goto lBAD_CTCP; + } + alias_port = GetAliasPort(dcc_link); + iCopy += snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%u", htons(alias_port) ); + /* Done - truncated cases will be taken care of by lBAD_CTCP */ + DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port)); + } + } + /* An uninteresting CTCP - state entered right after '\001' has + been pushed. Also used to copy the rest of a DCC, after IP + address and port has been handled */ + lBAD_CTCP: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lNORMAL_TEXT; + } + } + goto lPACKET_DONE; + /* Normal text */ + lNORMAL_TEXT: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lCTCP_START; + } + } + /* Handle the end of a packet */ + lPACKET_DONE: + iCopy = iCopy > maxsize-copyat ? maxsize-copyat : iCopy; + memcpy(sptr+copyat, newpacket, iCopy); + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+copyat+iCopy-dlen); + } + + /* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + iCopy + copyat); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + return; + } +} + +/* Notes: + [Note 1] + The initial search will most often fail; it could be replaced with a 32-bit specific search. + Such a search would be done for 32-bit unsigned value V: + V ^= 0x01010101; (Search is for null bytes) + if( ((V-0x01010101)^V) & 0x80808080 ) { + (found a null bytes which was a 01 byte) + } + To assert that the processor is 32-bits, do + extern int ircdccar[32]; (32 bits) + extern int ircdccar[CHAR_BIT*sizeof(unsigned int)]; + which will generate a type-error on all but 32-bit machines. + + [Note 2] This routine really ought to be replaced with one that + creates a transparent proxy on the aliasing host, to allow arbitary + changes in the TCP stream. This should not be too difficult given + this base; I (ee) will try to do this some time later. + */ diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h new file mode 100644 index 0000000..6b4389c --- /dev/null +++ b/sys/netinet/libalias/alias_local.h @@ -0,0 +1,172 @@ +/* -*- mode: c; tab-width: 3; c-basic-offset: 3; -*- + Alias_local.h contains the function prototypes for alias.c, + alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well + as any future add-ons). It also includes macros, globals and + struct definitions shared by more than one alias*.c file. + + This include file is intended to be used only within the aliasing + software. Outside world interfaces are defined in alias.h + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: August, 1996 (cjm) + + <updated several times by original author and Eivind Eklund> + + $FreeBSD$ +*/ +#ifndef ALIAS_LOCAL_H +#define ALIAS_LOCAL_H + + +/* + Macros + */ + +/* + The following macro is used to update an + internet checksum. "delta" is a 32-bit + accumulation of all the changes to the + checksum (adding in new 16-bit words and + subtracting out old words), and "cksum" + is the checksum value to be updated. +*/ +#define ADJUST_CHECKSUM(acc, cksum) { \ + acc += cksum; \ + if (acc < 0) \ + { \ + acc = -acc; \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) ~acc; \ + } \ + else \ + { \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) acc; \ + } \ +} + + +/* + Globals +*/ + +extern int packetAliasMode; + + +/* + Structs +*/ + +struct alias_link; /* Incomplete structure */ + + +/* + Prototypes +*/ + +/* General utilities */ +u_short IpChecksum(struct ip *); +u_short TcpChecksum(struct ip *); +void DifferentialChecksum(u_short *, u_short *, u_short *, int); + +/* Internal data access */ +struct alias_link * +FindIcmpIn(struct in_addr, struct in_addr, u_short); + +struct alias_link * +FindIcmpOut(struct in_addr, struct in_addr, u_short); + +struct alias_link * +FindFragmentIn1(struct in_addr, struct in_addr, u_short); + +struct alias_link * +FindFragmentIn2(struct in_addr, struct in_addr, u_short); + +struct alias_link * +AddFragmentPtrLink(struct in_addr, u_short); + +struct alias_link * +FindFragmentPtr(struct in_addr, u_short); + +struct alias_link * +FindUdpTcpIn (struct in_addr, struct in_addr, u_short, u_short, u_char); + +struct alias_link * +FindUdpTcpOut(struct in_addr, struct in_addr, u_short, u_short, u_char); + +struct in_addr +FindOriginalAddress(struct in_addr); + +struct in_addr +FindAliasAddress(struct in_addr); + +/* External data access/modification */ +void GetFragmentAddr(struct alias_link *, struct in_addr *); +void SetFragmentAddr(struct alias_link *, struct in_addr); +void GetFragmentPtr(struct alias_link *, char **); +void SetFragmentPtr(struct alias_link *, char *); +void SetStateIn(struct alias_link *, int); +void SetStateOut(struct alias_link *, int); +int GetStateIn(struct alias_link *); +int GetStateOut(struct alias_link *); +struct in_addr GetOriginalAddress(struct alias_link *); +struct in_addr GetDestAddress(struct alias_link *); +struct in_addr GetAliasAddress(struct alias_link *); +struct in_addr GetDefaultAliasAddress(void); +void SetDefaultAliasAddress(struct in_addr); +u_short GetOriginalPort(struct alias_link *); +u_short GetAliasPort(struct alias_link *); +struct in_addr GetProxyAddress(struct alias_link *); +void SetProxyAddress(struct alias_link *, struct in_addr); +u_short GetProxyPort(struct alias_link *); +void SetProxyPort(struct alias_link *, u_short); +void SetAckModified(struct alias_link *); +int GetAckModified(struct alias_link *); +int GetDeltaAckIn(struct ip *, struct alias_link *); +int GetDeltaSeqOut(struct ip *, struct alias_link *); +void AddSeq(struct ip *, struct alias_link *, int); +void SetExpire(struct alias_link *, int); +void ClearCheckNewLink(void); +#ifndef NO_FW_PUNCH +void PunchFWHole(struct alias_link *); +#endif + + +/* Housekeeping function */ +void HouseKeeping(void); + +/* Tcp specfic routines */ +/*lint -save -library Suppress flexelint warnings */ + +/* FTP routines */ +void AliasHandleFtpOut(struct ip *, struct alias_link *, int); + +/* IRC routines */ +void AliasHandleIrcOut(struct ip *, struct alias_link *, int); + +/* NetBIOS routines */ +int AliasHandleUdpNbt(struct ip *, struct alias_link *, struct in_addr *, u_short); +int AliasHandleUdpNbtNS(struct ip *, struct alias_link *, struct in_addr *, u_short *, struct in_addr *, u_short *); + +/* CUSeeMe routines */ +void AliasHandleCUSeeMeOut(struct ip *, struct alias_link *); +void AliasHandleCUSeeMeIn(struct ip *, struct in_addr); + +/* Transparent proxy routines */ +int ProxyCheck(struct ip *, struct in_addr *, u_short *); +void ProxyModify(struct alias_link *, struct ip *, int, int); + + +enum alias_tcp_state { + ALIAS_TCP_STATE_NOT_CONNECTED, + ALIAS_TCP_STATE_CONNECTED, + ALIAS_TCP_STATE_DISCONNECTED +}; + +int GetPptpAlias (struct in_addr*); +/*lint -restore */ +#endif /* defined(ALIAS_LOCAL_H) */ diff --git a/sys/netinet/libalias/alias_nbt.c b/sys/netinet/libalias/alias_nbt.c new file mode 100644 index 0000000..b5afedf --- /dev/null +++ b/sys/netinet/libalias/alias_nbt.c @@ -0,0 +1,711 @@ +/* + * Written by Atsushi Murai <amurai@spec.co.jp> + * + * Copyright (C) 1998, System Planning and Engineering Co. All rights reserverd. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the System Planning and Engineering Co. The name of the + * SPEC may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * $FreeBSD$ + * + * TODO: + * oClean up. + * oConsidering for word alignment for other platform. + */ +/* + alias_nbt.c performs special processing for NetBios over TCP/IP + sessions by UDP. + + Initial version: May, 1998 (Atsushi Murai <amurai@spec.co.jp>) + + See HISTORY file for record of revisions. +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +#define ADJUST_CHECKSUM(acc, cksum) { \ + acc += cksum; \ + if (acc < 0) \ + { \ + acc = -acc; \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) ~acc; \ + } \ + else \ + { \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) acc; \ + } \ +} + +typedef struct { + struct in_addr oldaddr; + u_short oldport; + struct in_addr newaddr; + u_short newport; + u_short *uh_sum; +} NBTArguments; + +typedef struct { + unsigned char type; + unsigned char flags; + u_short id; + struct in_addr source_ip; + u_short source_port; + u_short len; + u_short offset; +} NbtDataHeader; + +#define OpQuery 0 +#define OpUnknown 4 +#define OpRegist 5 +#define OpRelease 6 +#define OpWACK 7 +#define OpRefresh 8 +typedef struct { + u_short nametrid; + u_short dir:1, opcode:4, nmflags:7, rcode:4; + u_short qdcount; + u_short ancount; + u_short nscount; + u_short arcount; +} NbtNSHeader; + +#define FMT_ERR 0x1 +#define SRV_ERR 0x2 +#define IMP_ERR 0x4 +#define RFS_ERR 0x5 +#define ACT_ERR 0x6 +#define CFT_ERR 0x7 + + +#ifdef DEBUG +static void PrintRcode( u_char rcode ) { + + switch (rcode) { + case FMT_ERR: + printf("\nFormat Error."); + case SRV_ERR: + printf("\nSever failure."); + case IMP_ERR: + printf("\nUnsupported request error.\n"); + case RFS_ERR: + printf("\nRefused error.\n"); + case ACT_ERR: + printf("\nActive error.\n"); + case CFT_ERR: + printf("\nName in conflict error.\n"); + default: + printf("\n???=%0x\n", rcode ); + + } +} +#endif + + +/* Handling Name field */ +static u_char *AliasHandleName ( u_char *p, char *pmax ) { + + u_char *s; + u_char c; + int compress; + + /* Following length field */ + + if (p == NULL || (char *)p >= pmax) + return(NULL); + + if (*p & 0xc0 ) { + p = p + 2; + if ((char *)p > pmax) + return(NULL); + return ((u_char *)p); + } + while ( ( *p & 0x3f) != 0x00 ) { + s = p + 1; + if ( *p == 0x20 ) + compress = 1; + else + compress = 0; + + /* Get next length field */ + p = (u_char *)(p + (*p & 0x3f) + 1); + if ((char *)p > pmax) { + p = NULL; + break; + } +#ifdef DEBUG + printf(":"); +#endif + while (s < p) { + if ( compress == 1 ) { + c = (u_char )(((((*s & 0x0f) << 4) | (*(s+1) & 0x0f)) - 0x11)); +#ifdef DEBUG + if (isprint( c ) ) + printf("%c", c ); + else + printf("<0x%02x>", c ); +#endif + s +=2; + } else { +#ifdef DEBUG + printf("%c", *s); +#endif + s++; + } + } +#ifdef DEBUG + printf(":"); +#endif + fflush(stdout); + } + + /* Set up to out of Name field */ + if (p == NULL || (char *)p >= pmax) + p = NULL; + else + p++; + return ((u_char *)p); +} + +/* + * NetBios Datagram Handler (IP/UDP) + */ +#define DGM_DIRECT_UNIQ 0x10 +#define DGM_DIRECT_GROUP 0x11 +#define DGM_BROADCAST 0x12 +#define DGM_ERROR 0x13 +#define DGM_QUERY 0x14 +#define DGM_POSITIVE_RES 0x15 +#define DGM_NEGATIVE_RES 0x16 + +int AliasHandleUdpNbt( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short alias_port +) { + struct udphdr * uh; + NbtDataHeader *ndh; + u_char *p = NULL; + char *pmax; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + ndh = (NbtDataHeader *)((char *)uh + (sizeof (struct udphdr))); + if ((char *)(ndh + 1) > pmax) + return(-1); +#ifdef DEBUG + printf("\nType=%02x,", ndh->type ); +#endif + switch ( ndh->type ) { + case DGM_DIRECT_UNIQ: + case DGM_DIRECT_GROUP: + case DGM_BROADCAST: + p = (u_char *)ndh + 14; + p = AliasHandleName ( p, pmax ); /* Source Name */ + p = AliasHandleName ( p, pmax ); /* Destination Name */ + break; + case DGM_ERROR: + p = (u_char *)ndh + 11; + break; + case DGM_QUERY: + case DGM_POSITIVE_RES: + case DGM_NEGATIVE_RES: + p = (u_char *)ndh + 10; + p = AliasHandleName ( p, pmax ); /* Destination Name */ + break; + } + if (p == NULL || (char *)p > pmax) + p = NULL; +#ifdef DEBUG + printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); +#endif + /* Doing a IP address and Port number Translation */ + if ( uh->uh_sum != 0 ) { + int acc; + u_short *sptr; + acc = ndh->source_port; + acc -= alias_port; + sptr = (u_short *) &(ndh->source_ip); + acc += *sptr++; + acc += *sptr; + sptr = (u_short *) alias_address; + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, uh->uh_sum) + } + ndh->source_ip = *alias_address; + ndh->source_port = alias_port; +#ifdef DEBUG + printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); + fflush(stdout); +#endif + return((p == NULL) ? -1 : 0); +} +/* Question Section */ +#define QS_TYPE_NB 0x0020 +#define QS_TYPE_NBSTAT 0x0021 +#define QS_CLAS_IN 0x0001 +typedef struct { + u_short type; /* The type of Request */ + u_short class; /* The class of Request */ +} NBTNsQuestion; + +static u_char * +AliasHandleQuestion( + u_short count, + NBTNsQuestion *q, + char *pmax, + NBTArguments *nbtarg) +{ + + while ( count != 0 ) { + /* Name Filed */ + q = (NBTNsQuestion *)AliasHandleName((u_char *)q, pmax); + + if (q == NULL || (char *)(q + 1) > pmax) { + q = NULL; + break; + } + + /* Type and Class filed */ + switch ( ntohs(q->type) ) { + case QS_TYPE_NB: + case QS_TYPE_NBSTAT: + q= q+1; + break; + default: +#ifdef DEBUG + printf("\nUnknown Type on Question %0x\n", ntohs(q->type) ); +#endif + break; + } + count--; + } + + /* Set up to out of Question Section */ + return ((u_char *)q); +} + +/* Resource Record */ +#define RR_TYPE_A 0x0001 +#define RR_TYPE_NS 0x0002 +#define RR_TYPE_NULL 0x000a +#define RR_TYPE_NB 0x0020 +#define RR_TYPE_NBSTAT 0x0021 +#define RR_CLAS_IN 0x0001 +#define SizeOfNsResource 8 +typedef struct { + u_short type; + u_short class; + unsigned int ttl; + u_short rdlen; +} NBTNsResource; + +#define SizeOfNsRNB 6 +typedef struct { + u_short g:1, ont:2, resv:13; + struct in_addr addr; +} NBTNsRNB; + +static u_char * +AliasHandleResourceNB( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsRNB *nb; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + /* Check out a length */ + bcount = ntohs(q->rdlen); + + /* Forward to Resource NB position */ + nb = (NBTNsRNB *)((u_char *)q + SizeOfNsResource); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s, %dbytes] ",inet_ntoa(nbtarg->newaddr ), bcount); +#endif + while ( nb != NULL && bcount != 0 ) { + if ((char *)(nb + 1) > pmax) { + nb = NULL; + break; + } +#ifdef DEBUG + printf("<%s>", inet_ntoa(nb->addr) ); +#endif + if (!bcmp(&nbtarg->oldaddr,&nb->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(nb->addr); + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &(nbtarg->newaddr); + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum) + } + + nb->addr = nbtarg->newaddr; +#ifdef DEBUG + printf("O"); +#endif + } +#ifdef DEBUG + else { + printf("."); + } +#endif + nb=(NBTNsRNB *)((u_char *)nb + SizeOfNsRNB); + bcount -= SizeOfNsRNB; + } + if (nb == NULL || (char *)(nb + 1) > pmax) { + nb = NULL; + } + + return ((u_char *)nb); +} + +#define SizeOfResourceA 6 +typedef struct { + struct in_addr addr; +} NBTNsResourceA; + +static u_char * +AliasHandleResourceA( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceA *a; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource A position */ + a = (NBTNsResourceA *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("Arec [%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s]",inet_ntoa(nbtarg->newaddr )); +#endif + while ( bcount != 0 ) { + if (a == NULL || (char *)(a + 1) > pmax) + return(NULL); +#ifdef DEBUG + printf("..%s", inet_ntoa(a->addr) ); +#endif + if ( !bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(a->addr); /* Old */ + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &nbtarg->newaddr; /* New */ + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum) + } + + a->addr = nbtarg->newaddr; + } + a++; /*XXXX*/ + bcount -= SizeOfResourceA; + } + if (a == NULL || (char *)(a + 1) > pmax) + a = NULL; + return ((u_char *)a); +} + +typedef struct { + u_short opcode:4, flags:8, resv:4; +} NBTNsResourceNULL; + +static u_char * +AliasHandleResourceNULL( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ + while ( bcount != 0 ) { + if ((char *)(n + 1) > pmax) { + n = NULL; + break; + } + n++; + bcount -= sizeof(NBTNsResourceNULL); + } + if ((char *)(n + 1) > pmax) + n = NULL; + + return ((u_char *)n); +} + +static u_char * +AliasHandleResourceNS( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)n, pmax ); /* XXX */ + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +typedef struct { + u_short numnames; +} NBTNsResourceNBSTAT; + +static u_char * +AliasHandleResourceNBSTAT( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNBSTAT *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NBSTAT position */ + n = (NBTNsResourceNBSTAT *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +static u_char * +AliasHandleResource( + u_short count, + NBTNsResource *q, + char *pmax, + NBTArguments + *nbtarg) +{ + while ( count != 0 ) { + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)q, pmax ); + + if (q == NULL || (char *)(q + 1) > pmax) + break; +#ifdef DEBUG + printf("type=%02x, count=%d\n", ntohs(q->type), count ); +#endif + + /* Type and Class filed */ + switch ( ntohs(q->type) ) { + case RR_TYPE_NB: + q = (NBTNsResource *)AliasHandleResourceNB( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_A: + q = (NBTNsResource *)AliasHandleResourceA( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NS: + q = (NBTNsResource *)AliasHandleResourceNS( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NULL: + q = (NBTNsResource *)AliasHandleResourceNULL( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NBSTAT: + q = (NBTNsResource *)AliasHandleResourceNBSTAT( + q, + pmax, + nbtarg + ); + break; + default: +#ifdef DEBUG + printf( + "\nUnknown Type of Resource %0x\n", + ntohs(q->type) + ); +#endif + break; + } + count--; + } + fflush(stdout); + return ((u_char *)q); +} + +int AliasHandleUdpNbtNS( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short *alias_port, + struct in_addr *original_address, + u_short *original_port ) +{ + struct udphdr * uh; + NbtNSHeader * nsh; + u_char * p; + char *pmax; + NBTArguments nbtarg; + + /* Set up Common Parameter */ + nbtarg.oldaddr = *alias_address; + nbtarg.oldport = *alias_port; + nbtarg.newaddr = *original_address; + nbtarg.newport = *original_port; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + nbtarg.uh_sum = &(uh->uh_sum); + nsh = (NbtNSHeader *)((char *)uh + (sizeof(struct udphdr))); + p = (u_char *)(nsh + 1); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + if ((char *)(nsh + 1) > pmax) + return(-1); + +#ifdef DEBUG + printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x" + ", an=%04x, ns=%04x, ar=%04x, [%d]-->", + nsh->dir ? "Response": "Request", + nsh->nametrid, + nsh->opcode, + nsh->nmflags, + nsh->rcode, + ntohs(nsh->qdcount), + ntohs(nsh->ancount), + ntohs(nsh->nscount), + ntohs(nsh->arcount), + (u_char *)p -(u_char *)nsh + ); +#endif + + /* Question Entries */ + if (ntohs(nsh->qdcount) !=0 ) { + p = AliasHandleQuestion( + ntohs(nsh->qdcount), + (NBTNsQuestion *)p, + pmax, + &nbtarg + ); + } + + /* Answer Resource Records */ + if (ntohs(nsh->ancount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->ancount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Authority Resource Recodrs */ + if (ntohs(nsh->nscount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->nscount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Additional Resource Recodrs */ + if (ntohs(nsh->arcount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->arcount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + +#ifdef DEBUG + PrintRcode(nsh->rcode); +#endif + return ((p == NULL) ? -1 : 0); +} diff --git a/sys/netinet/libalias/alias_proxy.c b/sys/netinet/libalias/alias_proxy.c new file mode 100644 index 0000000..02e1765 --- /dev/null +++ b/sys/netinet/libalias/alias_proxy.c @@ -0,0 +1,807 @@ +/* file: alias_proxy.c + + This file encapsulates special operations related to transparent + proxy redirection. This is where packets with a particular destination, + usually tcp port 80, are redirected to a proxy server. + + When packets are proxied, the destination address and port are + modified. In certain cases, it is necessary to somehow encode + the original address/port info into the packet. Two methods are + presently supported: addition of a [DEST addr port] string at the + beginning a of tcp stream, or inclusion of an optional field + in the IP header. + + There is one public API function: + + PacketAliasProxyRule() -- Adds and deletes proxy + rules. + + Rules are stored in a linear linked list, so lookup efficiency + won't be too good for large lists. + + + Initial development: April, 1998 (cjm) + + $FreeBSD$ +*/ + + +/* System includes */ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> + +#include <sys/types.h> +#include <sys/socket.h> + +/* BSD IPV4 includes */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <arpa/inet.h> + +#include "alias_local.h" /* Functions used by alias*.c */ +#include "alias.h" /* Public API functions for libalias */ + + + +/* + Data structures + */ + +/* + * A linked list of arbitrary length, based on struct proxy_entry is + * used to store proxy rules. + */ +struct proxy_entry +{ +#define PROXY_TYPE_ENCODE_NONE 1 +#define PROXY_TYPE_ENCODE_TCPSTREAM 2 +#define PROXY_TYPE_ENCODE_IPHDR 3 + int rule_index; + int proxy_type; + u_char proto; + u_short proxy_port; + u_short server_port; + + struct in_addr server_addr; + + struct in_addr src_addr; + struct in_addr src_mask; + + struct in_addr dst_addr; + struct in_addr dst_mask; + + struct proxy_entry *next; + struct proxy_entry *last; +}; + + + +/* + File scope variables +*/ + +static struct proxy_entry *proxyList; + + + +/* Local (static) functions: + + IpMask() -- Utility function for creating IP + masks from integer (1-32) specification. + IpAddr() -- Utility function for converting string + to IP address + IpPort() -- Utility function for converting string + to port number + RuleAdd() -- Adds an element to the rule list. + RuleDelete() -- Removes an element from the rule list. + RuleNumberDelete() -- Removes all elements from the rule list + having a certain rule number. + ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning + of a TCP stream. + ProxyEncodeIpHeader() -- Adds an IP option indicating the true + destination of a proxied IP packet +*/ + +static int IpMask(int, struct in_addr *); +static int IpAddr(char *, struct in_addr *); +static int IpPort(char *, int, int *); +static void RuleAdd(struct proxy_entry *); +static void RuleDelete(struct proxy_entry *); +static int RuleNumberDelete(int); +static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int); +static void ProxyEncodeIpHeader(struct ip *, int); + +static int +IpMask(int nbits, struct in_addr *mask) +{ + int i; + u_int imask; + + if (nbits < 0 || nbits > 32) + return -1; + + imask = 0; + for (i=0; i<nbits; i++) + imask = (imask >> 1) + 0x80000000; + mask->s_addr = htonl(imask); + + return 0; +} + +static int +IpAddr(char *s, struct in_addr *addr) +{ + if (inet_aton(s, addr) == 0) + return -1; + else + return 0; +} + +static int +IpPort(char *s, int proto, int *port) +{ + int n; + + n = sscanf(s, "%d", port); + if (n != 1) + { + struct servent *se; + + if (proto == IPPROTO_TCP) + se = getservbyname(s, "tcp"); + else if (proto == IPPROTO_UDP) + se = getservbyname(s, "udp"); + else + return -1; + + if (se == NULL) + return -1; + + *port = (u_int) ntohs(se->s_port); + } + + return 0; +} + +void +RuleAdd(struct proxy_entry *entry) +{ + int rule_index; + struct proxy_entry *ptr; + struct proxy_entry *ptr_last; + + if (proxyList == NULL) + { + proxyList = entry; + entry->last = NULL; + entry->next = NULL; + return; + } + + rule_index = entry->rule_index; + ptr = proxyList; + ptr_last = NULL; + while (ptr != NULL) + { + if (ptr->rule_index >= rule_index) + { + if (ptr_last == NULL) + { + entry->next = proxyList; + entry->last = NULL; + proxyList->last = entry; + proxyList = entry; + return; + } + + ptr_last->next = entry; + ptr->last = entry; + entry->last = ptr->last; + entry->next = ptr; + return; + } + ptr_last = ptr; + ptr = ptr->next; + } + + ptr_last->next = entry; + entry->last = ptr_last; + entry->next = NULL; +} + +static void +RuleDelete(struct proxy_entry *entry) +{ + if (entry->last != NULL) + entry->last->next = entry->next; + else + proxyList = entry->next; + + if (entry->next != NULL) + entry->next->last = entry->last; + + free(entry); +} + +static int +RuleNumberDelete(int rule_index) +{ + int err; + struct proxy_entry *ptr; + + err = -1; + ptr = proxyList; + while (ptr != NULL) + { + struct proxy_entry *ptr_next; + + ptr_next = ptr->next; + if (ptr->rule_index == rule_index) + { + err = 0; + RuleDelete(ptr); + } + + ptr = ptr_next; + } + + return err; +} + +static void +ProxyEncodeTcpStream(struct alias_link *link, + struct ip *pip, + int maxpacketsize) +{ + int slen; + char buffer[40]; + struct tcphdr *tc; + +/* Compute pointer to tcp header */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + +/* Don't modify if once already modified */ + + if (GetAckModified (link)) + return; + +/* Translate destination address and port to string form */ + snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]", + inet_ntoa(GetProxyAddress (link)), (u_int) ntohs(GetProxyPort (link))); + +/* Pad string out to a multiple of two in length */ + slen = strlen(buffer); + switch (slen % 2) + { + case 0: + strcat(buffer, " \n"); + slen += 2; + break; + case 1: + strcat(buffer, "\n"); + slen += 1; + } + +/* Check for packet overflow */ + if ((ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize) + return; + +/* Shift existing TCP data and insert destination string */ + { + int dlen; + int hlen; + u_char *p; + + hlen = (pip->ip_hl + tc->th_off) << 2; + dlen = ntohs (pip->ip_len) - hlen; + +/* Modify first packet that has data in it */ + + if (dlen == 0) + return; + + p = (char *) pip; + p += hlen; + + memmove(p + slen, p, dlen); + memcpy(p, buffer, slen); + } + +/* Save information about modfied sequence number */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen); + } + +/* Update IP header packet length and checksum */ + { + int accumulate; + + accumulate = pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + slen); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + +/* Update TCP checksum, Use TcpChecksum since so many things have + already changed. */ + + tc->th_sum = 0; + tc->th_sum = TcpChecksum (pip); +} + +static void +ProxyEncodeIpHeader(struct ip *pip, + int maxpacketsize) +{ +#define OPTION_LEN_BYTES 8 +#define OPTION_LEN_INT16 4 +#define OPTION_LEN_INT32 2 + u_char option[OPTION_LEN_BYTES]; + +#ifdef DEBUG + fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip)); +#endif + +/* Check to see that there is room to add an IP option */ + if (pip->ip_hl > (0x0f - OPTION_LEN_INT32)) + return; + +/* Build option and copy into packet */ + { + u_char *ptr; + struct tcphdr *tc; + + ptr = (u_char *) pip; + ptr += 20; + memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20); + + option[0] = 0x64; /* class: 3 (reserved), option 4 */ + option[1] = OPTION_LEN_BYTES; + + memcpy(&option[2], (u_char *) &pip->ip_dst, 4); + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + memcpy(&option[6], (u_char *) &tc->th_sport, 2); + + memcpy(ptr, option, 8); + } + +/* Update checksum, header length and packet length */ + { + int i; + int accumulate; + u_short *sptr; + + sptr = (u_short *) option; + accumulate = 0; + for (i=0; i<OPTION_LEN_INT16; i++) + accumulate -= *(sptr++); + + sptr = (u_short *) pip; + accumulate += *sptr; + pip->ip_hl += OPTION_LEN_INT32; + accumulate -= *sptr; + + accumulate += pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } +#undef OPTION_LEN_BYTES +#undef OPTION_LEN_INT16 +#undef OPTION_LEN_INT32 +#ifdef DEBUG + fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip)); +#endif +} + + +/* Functions by other packet alias source files + + ProxyCheck() -- Checks whether an outgoing packet should + be proxied. + ProxyModify() -- Encodes the original destination address/port + for a packet which is to be redirected to + a proxy server. +*/ + +int +ProxyCheck(struct ip *pip, + struct in_addr *proxy_server_addr, + u_short *proxy_server_port) +{ + u_short dst_port; + struct in_addr src_addr; + struct in_addr dst_addr; + struct proxy_entry *ptr; + + src_addr = pip->ip_src; + dst_addr = pip->ip_dst; + dst_port = ((struct tcphdr *) ((char *) pip + (pip->ip_hl << 2))) + ->th_dport; + + ptr = proxyList; + while (ptr != NULL) + { + u_short proxy_port; + + proxy_port = ptr->proxy_port; + if ((dst_port == proxy_port || proxy_port == 0) + && pip->ip_p == ptr->proto + && src_addr.s_addr != ptr->server_addr.s_addr) + { + struct in_addr src_addr_masked; + struct in_addr dst_addr_masked; + + src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr; + dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr; + + if ((src_addr_masked.s_addr == ptr->src_addr.s_addr) + && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) + { + if ((*proxy_server_port = ptr->server_port) == 0) + *proxy_server_port = dst_port; + *proxy_server_addr = ptr->server_addr; + return ptr->proxy_type; + } + } + ptr = ptr->next; + } + + return 0; +} + +void +ProxyModify(struct alias_link *link, + struct ip *pip, + int maxpacketsize, + int proxy_type) +{ + switch (proxy_type) + { + case PROXY_TYPE_ENCODE_IPHDR: + ProxyEncodeIpHeader(pip, maxpacketsize); + break; + + case PROXY_TYPE_ENCODE_TCPSTREAM: + ProxyEncodeTcpStream(link, pip, maxpacketsize); + break; + } +} + + +/* + Public API functions +*/ + +int +PacketAliasProxyRule(const char *cmd) +{ +/* + * This function takes command strings of the form: + * + * server <addr>[:<port>] + * [port <port>] + * [rule n] + * [proto tcp|udp] + * [src <addr>[/n]] + * [dst <addr>[/n]] + * [type encode_tcp_stream|encode_ip_hdr|no_encode] + * + * delete <rule number> + * + * Subfields can be in arbitrary order. Port numbers and addresses + * must be in either numeric or symbolic form. An optional rule number + * is used to control the order in which rules are searched. If two + * rules have the same number, then search order cannot be guaranteed, + * and the rules should be disjoint. If no rule number is specified, + * then 0 is used, and group 0 rules are always checked before any + * others. + */ + int i, n, len; + int cmd_len; + int token_count; + int state; + char *token; + char buffer[256]; + char str_port[sizeof(buffer)]; + char str_server_port[sizeof(buffer)]; + + int rule_index; + int proto; + int proxy_type; + int proxy_port; + int server_port; + struct in_addr server_addr; + struct in_addr src_addr, src_mask; + struct in_addr dst_addr, dst_mask; + struct proxy_entry *proxy_entry; + +/* Copy command line into a buffer */ + cmd_len = strlen(cmd); + if (cmd_len > (sizeof(buffer) - 1)) + return -1; + strcpy(buffer, cmd); + +/* Convert to lower case */ + len = strlen(buffer); + for (i=0; i<len; i++) + buffer[i] = tolower(buffer[i]); + +/* Set default proxy type */ + +/* Set up default values */ + rule_index = 0; + proxy_type = PROXY_TYPE_ENCODE_NONE; + proto = IPPROTO_TCP; + proxy_port = 0; + server_addr.s_addr = 0; + server_port = 0; + src_addr.s_addr = 0; + IpMask(0, &src_mask); + dst_addr.s_addr = 0; + IpMask(0, &dst_mask); + + str_port[0] = 0; + str_server_port[0] = 0; + +/* Parse command string with state machine */ +#define STATE_READ_KEYWORD 0 +#define STATE_READ_TYPE 1 +#define STATE_READ_PORT 2 +#define STATE_READ_SERVER 3 +#define STATE_READ_RULE 4 +#define STATE_READ_DELETE 5 +#define STATE_READ_PROTO 6 +#define STATE_READ_SRC 7 +#define STATE_READ_DST 8 + state = STATE_READ_KEYWORD; + token = strtok(buffer, " \t"); + token_count = 0; + while (token != NULL) + { + token_count++; + switch (state) + { + case STATE_READ_KEYWORD: + if (strcmp(token, "type") == 0) + state = STATE_READ_TYPE; + else if (strcmp(token, "port") == 0) + state = STATE_READ_PORT; + else if (strcmp(token, "server") == 0) + state = STATE_READ_SERVER; + else if (strcmp(token, "rule") == 0) + state = STATE_READ_RULE; + else if (strcmp(token, "delete") == 0) + state = STATE_READ_DELETE; + else if (strcmp(token, "proto") == 0) + state = STATE_READ_PROTO; + else if (strcmp(token, "src") == 0) + state = STATE_READ_SRC; + else if (strcmp(token, "dst") == 0) + state = STATE_READ_DST; + else + return -1; + break; + + case STATE_READ_TYPE: + if (strcmp(token, "encode_ip_hdr") == 0) + proxy_type = PROXY_TYPE_ENCODE_IPHDR; + else if (strcmp(token, "encode_tcp_stream") == 0) + proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM; + else if (strcmp(token, "no_encode") == 0) + proxy_type = PROXY_TYPE_ENCODE_NONE; + else + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_PORT: + strcpy(str_port, token); + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SERVER: + { + int err; + char *p; + char s[sizeof(buffer)]; + + p = token; + while (*p != ':' && *p != 0) + p++; + + if (*p != ':') + { + err = IpAddr(token, &server_addr); + if (err) + return -1; + } + else + { + *p = ' '; + + n = sscanf(token, "%s %s", s, str_server_port); + if (n != 2) + return -1; + + err = IpAddr(s, &server_addr); + if (err) + return -1; + } + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_RULE: + n = sscanf(token, "%d", &rule_index); + if (n != 1 || rule_index < 0) + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_DELETE: + { + int err; + int rule_to_delete; + + if (token_count != 2) + return -1; + + n = sscanf(token, "%d", &rule_to_delete); + if (n != 1) + return -1; + err = RuleNumberDelete(rule_to_delete); + if (err) + return -1; + return 0; + } + + case STATE_READ_PROTO: + if (strcmp(token, "tcp") == 0) + proto = IPPROTO_TCP; + else if (strcmp(token, "udp") == 0) + proto = IPPROTO_UDP; + else + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SRC: + case STATE_READ_DST: + { + int err; + char *p; + struct in_addr mask; + struct in_addr addr; + + p = token; + while (*p != '/' && *p != 0) + p++; + + if (*p != '/') + { + IpMask(32, &mask); + err = IpAddr(token, &addr); + if (err) + return -1; + } + else + { + int n; + int nbits; + char s[sizeof(buffer)]; + + *p = ' '; + n = sscanf(token, "%s %d", s, &nbits); + if (n != 2) + return -1; + + err = IpAddr(s, &addr); + if (err) + return -1; + + err = IpMask(nbits, &mask); + if (err) + return -1; + } + + if (state == STATE_READ_SRC) + { + src_addr = addr; + src_mask = mask; + } + else + { + dst_addr = addr; + dst_mask = mask; + } + } + state = STATE_READ_KEYWORD; + break; + + default: + return -1; + break; + } + + token = strtok(NULL, " \t"); + } +#undef STATE_READ_KEYWORD +#undef STATE_READ_TYPE +#undef STATE_READ_PORT +#undef STATE_READ_SERVER +#undef STATE_READ_RULE +#undef STATE_READ_DELETE +#undef STATE_READ_PROTO +#undef STATE_READ_SRC +#undef STATE_READ_DST + +/* Convert port strings to numbers. This needs to be done after + the string is parsed, because the prototype might not be designated + before the ports (which might be symbolic entries in /etc/services) */ + + if (strlen(str_port) != 0) + { + int err; + + err = IpPort(str_port, proto, &proxy_port); + if (err) + return -1; + } + else + { + proxy_port = 0; + } + + if (strlen(str_server_port) != 0) + { + int err; + + err = IpPort(str_server_port, proto, &server_port); + if (err) + return -1; + } + else + { + server_port = 0; + } + +/* Check that at least the server address has been defined */ + if (server_addr.s_addr == 0) + return -1; + +/* Add to linked list */ + proxy_entry = malloc(sizeof(struct proxy_entry)); + if (proxy_entry == NULL) + return -1; + + proxy_entry->proxy_type = proxy_type; + proxy_entry->rule_index = rule_index; + proxy_entry->proto = proto; + proxy_entry->proxy_port = htons(proxy_port); + proxy_entry->server_port = htons(server_port); + proxy_entry->server_addr = server_addr; + proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr; + proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr; + proxy_entry->src_mask = src_mask; + proxy_entry->dst_mask = dst_mask; + + RuleAdd(proxy_entry); + + return 0; +} diff --git a/sys/netinet/libalias/alias_util.c b/sys/netinet/libalias/alias_util.c new file mode 100644 index 0000000..41ca6cc --- /dev/null +++ b/sys/netinet/libalias/alias_util.c @@ -0,0 +1,141 @@ +/* + Alias_util.h contains general utilities used by other functions + in the packet aliasing module. At the moment, there are functions + for computing IP header and TCP packet checksums. + + The checksum routines are based upon example code in a Unix networking + text written by Stevens (sorry, I can't remember the title -- but + at least this is a good author). + + Initial Version: August, 1996 (cjm) + + Version 1.7: January 9, 1997 + Added differential checksum update function. + + $FreeBSD$ +*/ + +/* +Note: the checksum routines assume that the actual checksum word has +been zeroed out. If the checksum workd is filled with the proper value, +then these routines will give a result of zero (useful for testing +purposes); +*/ + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias.h" +#include "alias_local.h" + +u_short +PacketAliasInternetChecksum(u_short *ptr, int nbytes) +{ + int sum, oddbyte; + + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + return(~sum); +} + +u_short +IpChecksum(struct ip *pip) +{ + return( PacketAliasInternetChecksum((u_short *) pip, + (pip->ip_hl << 2)) ); + +} + +u_short +TcpChecksum(struct ip *pip) +{ + u_short *ptr; + struct tcphdr *tc; + int nhdr, ntcp, nbytes; + int sum, oddbyte; + + nhdr = pip->ip_hl << 2; + ntcp = ntohs(pip->ip_len) - nhdr; + + tc = (struct tcphdr *) ((char *) pip + nhdr); + ptr = (u_short *) tc; + +/* Add up TCP header and data */ + nbytes = ntcp; + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + +/* "Pseudo-header" data */ + ptr = (u_short *) &(pip->ip_dst); + sum += *ptr++; + sum += *ptr; + ptr = (u_short *) &(pip->ip_src); + sum += *ptr++; + sum += *ptr; + sum += htons((u_short) ntcp); + sum += htons((u_short) pip->ip_p); + +/* Roll over carry bits */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + +/* Return checksum */ + return((u_short) ~sum); +} + + +void +DifferentialChecksum(u_short *cksum, u_short *new, u_short *old, int n) +{ + int i; + int accumulate; + + accumulate = *cksum; + for (i=0; i<n; i++) + { + accumulate -= *new++; + accumulate += *old++; + } + + if (accumulate < 0) + { + accumulate = -accumulate; + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) ~accumulate; + } + else + { + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) accumulate; + } +} + diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3 new file mode 100644 index 0000000..f3f187d --- /dev/null +++ b/sys/netinet/libalias/libalias.3 @@ -0,0 +1,884 @@ +.\" $FreeBSD$ +.\" +.Dd July, 1997 +.Dt LIBALIAS 3 +.Os +.Sh NAME +.Nm libalias +.Nd packet aliasing library for masquerading and address translation (NAT) +.Sh SYNOPSIS +.Fd #include <sys/types.h> +.Fd #include <netinet/in.h> +.Fd #include <alias.h> + +Function prototypes are given in the main body +of the text. +.Sh DESCRIPTION +The +.Nm +library is a collection of +functions for aliasing and de-aliasing +of IP packets, intended for masquerading and +network address translation (NAT). +.Sh CONTENTS +.Bd -literal -offset left +1. Introduction +2. Initialization and Control + 2.1 PacketAliasInit() + 2.2 PacketAliasUninit() + 2.3 PacketAliasSetAddress() + 2.4 PacketAliasSetMode() + 2.5 PacketAliasSetFWBase() +3. Packet Handling + 3.1 PacketAliasIn() + 3.2 PacketAliasOut() +4. Port and Address Redirection + 4.1 PacketAliasRedirectPort() + 4.2 PacketAliasRedirectAddr() + 4.3 PacketAliasRedirectDelete() + 4.4 PacketAliasProxyRule() + 4.5 PacketAliasPptp() +5. Fragment Handling + 5.1 PacketAliasSaveFragment() + 5.2 PacketAliasGetFragment() + 5.3 PacketAliasFragmentIn() +6. Miscellaneous Functions + 6.1 PacketAliasSetTarget() + 6.2 PacketAliasCheckNewLink() + 6.3 PacketAliasInternetChecksum() +7. Authors +8. Acknowledgments + +Appendix A: Conceptual Background + A.1 Aliasing Links + A.2 Static and Dynamic Links + A.3 Partially Specified Links + A.4 Dynamic Link Creation +.Ed +.Sh 1. Introduction +This library is a moderately portable +set of functions designed to assist +in the process of IP masquerading and +network address translation. Outgoing +packets from a local network with +unregistered IP addresses can be aliased +to appear as if they came from an +accessible IP address. Incoming packets +are then de-aliased so that they are sent +to the correct machine on the local network. + +A certain amount of flexibility is built +into the packet aliasing engine. In +the simplest mode of operation, a +many-to-one address mapping takes place +between local network and the packet +aliasing host. This is known as IP +masquerading. In addition, one-to-one +mappings between local and public addresses +can also be implemented, which is known as +static NAT. In between these extremes, +different groups of private addresses +can be linked to different public addresses, +comprising several distinct many-to-one +mappings. Also, a given public address +and port can be statically redirected to +a private address/port. + +The packet aliasing engine was designed +to operate in user space outside of the +kernel, without any access to private +kernel data structure, but the source code +can also be ported to a kernel environment. +.Sh 2. Initialization and Control +Two specific functions, PacketAliasInit() +and PacketAliasSetAddress(), must always be +called before any packet handling may be +performed. In addition, the operating mode +of the packet aliasing engine can be customized +by calling PacketAliasSetMode(). +.Ss 2.1 PacketAliasInit() + +.Ft void +.Fn PacketAliasInit "void" + +This function has no argument or return +value and is used to initialize internal +data structures. The following mode bits +are always set after calling +PacketAliasInit(). See section 2.3 for +the meaning of these mode bits. +.Bd -literal -offset indent + PKT_ALIAS_USE_SAME_PORTS + PKT_ALIAS_USE_SOCKETS + PKT_ALIAS_RESET_ON_ADDR_CHANGE + +.Ed +This function will always return the packet +aliasing engine to the same initial state. +PacketAliasSetAddress() must be called afterwards, +and any desired changes from the default mode +bits listed above require a call to +PacketAliasSetMode(). + +It is mandatory that this function be called +at the beginning of a program prior to any +packet handling. +.Ss 2.2 PacketAliasUninit() + +.Ft void +.Fn PacketAliasUninit "void" + +This function has no argument or return +value and is used to clear any resources +attached to internal data structures. + +This functions should be called when a +program stop using the aliasing engine; +it do, among other things, clear out any +firewall holes. To provide backwards +compatibility and extra security, it is +added to the atexit() chain by +PacketAliasInit(). Calling it multiple +times is harmless. +.Ss 2.3 PacketAliasSetAddress() + +.Ft void +.Fn PacketAliasSetAddress "struct in_addr addr" + +This function sets the source address to which +outgoing packets from the local area network +are aliased. All outgoing packets are remapped +to this address unless overridden by a static +address mapping established by +PacketAliasRedirectAddr(). + +If the PKT_ALIAS_RESET_ON_ADDR_CHANGE mode bit +is set (the default mode of operation), then +the internal aliasing link tables will be reset +any time the aliasing address changes. +This is useful +for interfaces such as ppp where the IP +address may or may not change on successive +dial-up attempts. + +If the PKT_ALIAS_RESET_ON_ADDR_CHANGE mode bit +is set to zero, this function can also be used to +dynamically change the aliasing address on a +packet to packet basis (it is a low overhead +call). + +It is mandatory that this function be called +prior to any packet handling. +.Ss 2.4 PacketAliasSetMode() + +.Ft unsigned int +.Fn PacketAliasSetMode "unsigned int mode" "unsigned int mask" + +This function sets or clears mode bits +according to the value of +.Em mode . +Only bits marked in +.Em mask +are affected. The following mode bits are +defined in alias.h: +.Bl -hang -offset left +.It PKT_ALIAS_LOG. +Enables logging /var/log/alias.log. The log file +shows total numbers of links (icmp, tcp, udp) each +time an aliasing link is created or deleted. Mainly +useful for debugging when the log file is viewed +continuously with "tail -f". +.It PKT_ALIAS_DENY_INCOMING. +If this mode bit is set, all incoming packets +associated with new TCP connections or new +UDP transactions will be marked for being +ignored (PacketAliasIn() return code +PKT_ALIAS_IGNORED) by the calling program. +Response packets to connections or transactions +initiated from the packet aliasing host or +local network will be unaffected. This mode +bit is useful for implementing a one-way firewall. +.It PKT_ALIAS_SAME_PORTS. +If this mode bit is set, the packet aliasing +engine will attempt to leave the alias port +numbers unchanged from the actual local port +number. This can be done as long as the +quintuple (proto, alias addr, alias port, +remote addr, remote port) is unique. If a +conflict exists, a new aliasing port number is +chosen even if this mode bit is set. +.It PKT_ALIAS_USE_SOCKETS. +This bit should be set when the packet +aliasing host originates network traffic as +well as forwards it. When the packet aliasing +host is waiting for a connection from an +unknown host address or unknown port number +(e.g. an FTP data connection), this mode bit +specifies that a socket be allocated as a place +holder to prevent port conflicts. Once a +connection is established, usually within a +minute or so, the socket is closed. +.It PKT_ALIAS_UNREGISTERED_ONLY. +If this mode bit is set, traffic on the +local network which does not originate from +unregistered address spaces will be ignored. +Standard Class A, B and C unregistered addresses +are: +.Bd -literal -offset indent + 10.0.0.0 -> 10.255.255.255 (Class A subnet) + 172.16.0.0 -> 172.31.255.255 (Class B subnets) + 192.168.0.0 -> 192.168.255.255 (Class C subnets) + +.Ed +This option is useful in the case that +packet aliasing host has both registered and +unregistered subnets on different interfaces. +The registered subnet is fully accessible to +the outside world, so traffic from it doesn't +need to be passed through the packet aliasing +engine. +.It PKT_ALIAS_RESET_ON_ADDR_CHANGE. +When this mode bit is set and +PacketAliasSetAddress() is called to change +the aliasing address, the internal link table +of the packet aliasing engine will be cleared. +This operating mode is useful for ppp links +where the interface address can sometimes +change or remain the same between dial-ups. +If this mode bit is not set, the link table +will never be reset in the event of an +address change. +.It PKT_ALIAS_PUNCH_FW. +This option makes libalias `punch holes' in an +ipfw based firewall for FTP/IRC DCC connections. +The holes punched are bound by from/to IP address +and port; it will not be possible to use a hole +for another connection. A hole is removed when +the connection that uses it dies. To cater to +unexpected death of a program using libalias (e.g +kill -9), changing the state of the flag will +clear the entire ipfw range allocated for holes. +This will also happen on the initial call to +PacketAliasSetFWBase(). This call must happen +prior to setting this flag. +.It PKT_ALIAS_REVERSE. +This option makes libalias reverse the way it +handles incoming and outgoing packets, allowing +it to be fed data that passes through the internal +interface rather than the external one. + +.El + +.Ss 2.5 PacketAliasSetFWBase() + +.Ft void +.Fn PacketAliasSetFWBase "unsigned int base" "unsigned int num" + +Set IPFW range allocated for punching firewall holes (with the +PKT_ALIAS_PUNCH_FW flag). The range will be cleared for all rules on +initialization. +.Sh 3. Packet Handling +The packet handling functions are used to +modify incoming (remote->local) and outgoing +(local->remote) packets. The calling program +is responsible for receiving and sending +packets via network interfaces. + +Along with PacketAliasInit() and PacketAliasSetAddress(), +the two packet handling functions, PacketAliasIn() +and PacketAliasOut(), comprise minimal set of functions +needed for a basic IP masquerading implementation. +.Ss 3.1 PacketAliasIn() + +.Ft int +.Fn PacketAliasIn "char *buffer" "int maxpacketsize" + +An incoming packet coming from a remote machine to +the local network is de-aliased by this function. +The IP packet is pointed to by +.Em buffer , +and +.Em maxpacketsize +indicates the size of the data structure containing +the packet and should be at least as large as the +actual packet size. + +Return codes: +.Bl -hang -offset left +.It PKT_ALIAS_ERROR. +An internal error within the packet aliasing +engine occurred. +.It PKT_ALIAS_OK. +The packet aliasing process was successful. +.It PKT_ALIAS_IGNORED. +The packet was ignored and not de-aliased. +This can happen if the protocol is unrecognized, +possibly an ICMP message type is not handled or +if incoming packets for new connections are being +ignored (see PKT_ALIAS_DENY_INCOMING in section +2.2). +.It PKT_ALIAS_UNRESOLVED_FRAGMENT. +This is returned when a fragment cannot be +resolved because the header fragment has not +been sent yet. In this situation, fragments +must be saved with PacketAliasSaveFragment() +until a header fragment is found. +.It PKT_ALIAS_FOUND_HEADER_FRAGMENT. +The packet aliasing process was successful, +and a header fragment was found. This is a +signal to retrieve any unresolved fragments +with PacketAliasGetFragment() and de-alias +them with PacketAliasFragmentIn(). +.El +.Ss 3.2 PacketAliasOut() + +.Ft int +.Fn PacketAliasOut "char *buffer" "int maxpacketsize" + +An outgoing packet coming from the local network +to a remote machine is aliased by this function. +The IP packet is pointed to by +.Em buffer , +and +.Em maxpacketsize +indicates the maximum packet size permissible +should the packet length be changed. IP encoding +protocols place address and port information in +the encapsulated data stream which have to be +modified and can account for changes in packet +length. Well known examples of such protocols +are FTP and IRC DCC. + +Return codes: +.Bl -hang -offset left +.It PKT_ALIAS_ERROR. +An internal error within the packet aliasing +engine occurred. +.It PKT_ALIAS_OK. +The packet aliasing process was successful. +.It PKT_ALIAS_IGNORED. +The packet was ignored and not de-aliased. +This can happen if the protocol is unrecognized, +or possibly an ICMP message type is not handled. +.El +.Sh 4. Port and Address Redirection +The functions described in this section allow machines +on the local network to be accessible in some degree +to new incoming connections from the external network. +Individual ports can be re-mapped or static network +address translations can be designated. +.Ss 4.1 PacketAliasRedirectPort() + +.Ft struct alias_link * +.Fo PacketAliasRedirectPort +.Fa "struct in_addr local_addr" +.Fa "u_short local_port" +.Fa "struct in_addr remote_addr" +.Fa "u_short remote_port" +.Fa "struct in_addr alias_addr" +.Fa "u_short alias_port" +.Fa "u_char proto" +.Fc + +This function specifies that traffic from a +given remote address/port to an alias address/port +be redirected to a specified local address/port. +The parameter +.Em proto +can be either IPPROTO_TCP or IPPROTO_UDP, as +defined in <netinet/in.h>. + +If +.Em local_addr +or +.Em alias_addr +is zero, this indicates that the packet aliasing +address as established by PacketAliasSetAddress() +is to be used. Even if PacketAliasSetAddress() is +called to change the address after PacketAliasRedirectPort() +is called, a zero reference will track this change. + +If +.Em remote_addr +is zero, this indicates to redirect packets from +any remote address. Likewise, if +.Em remote_port +is zero, this indicates to redirect packets originating +from any remote port number. Almost always, the remote +port specification will be zero, but non-zero remote +addresses can sometimes be useful for firewalling. +If two calls to PacketAliasRedirectPort() overlap in +their address/port specifications, then the most recent +call will have precedence. + +This function returns a pointer which can subsequently +be used by PacketAliasRedirectDelete(). If NULL is +returned, then the function call did not complete +successfully. + +All port numbers are in network address byte order, +so it is necessary to use htons() to convert these +parameters from internally readable numbers to +network byte order. Addresses are also in network +byte order, which is implicit in the use of the +.Em struct in_addr +data type. +.Ss 4.2 PacketAliasRedirectAddr() + +.Ft struct alias_link * +.Fo PacketAliasRedirectAddr +.Fa "struct in_addr local_addr" +.Fa "struct in_addr alias_addr" +.Fc + +This function desgnates that all incoming +traffic to +.Em alias_addr +be redirected to +.Em local_addr. +Similarly, all outgoing traffic from +.Em local_addr +is aliased to +.Em alias_addr . + +If +.Em local_addr +or +.Em alias_addr +is zero, this indicates that the packet aliasing +address as established by PacketAliasSetAddress() +is to be used. Even if PacketAliasSetAddress() is +called to change the address after PacketAliasRedirectAddr() +is called, a zero reference will track this change. + +If subsequent calls to PacketAliasRedirectAddr() +use the same aliasing address, all new incoming +traffic to this aliasing address will be redirected +to the local address made in the last function call. +New traffic generated by any of the local machines, designated +in the several function calls, will be aliased to +the same address. Consider the following example: +.Bd -literal -offset left + PacketAliasRedirectAddr(inet_aton("192.168.0.2"), + inet_aton("141.221.254.101")); + PacketAliasRedirectAddr(inet_aton("192.168.0.3"), + inet_aton("141.221.254.101")); + PacketAliasRedirectAddr(inet_aton("192.168.0.4"), + inet_aton("141.221.254.101")); +.Ed + +Any outgoing connections such as telnet or ftp +from 192.168.0.2, 192.168.0.3, 192.168.0.4 will +appear to come from 141.221.254.101. Any incoming +connections to 141.221.254.101 will be directed +to 192.168.0.4. + +Any calls to PacketAliasRedirectPort() will +have precedence over address mappings designated +by PacketAliasRedirectAddr(). + +This function returns a pointer which can subsequently +be used by PacketAliasRedirectDelete(). If NULL is +returned, then the function call did not complete +successfully. +.Ss 4.3 PacketAliasRedirectDelete() + +.Ft void +.Fn PacketAliasRedirectDelete "struct alias_link *ptr" + +This function will delete a specific static redirect +rule entered by PacketAliasRedirectPort() or +PacketAliasRedirectAddr(). The parameter +.Em ptr +is the pointer returned by either of the redirection +functions. If an invalid pointer is passed to +PacketAliasRedirectDelete(), then a program crash +or unpredictable operation could result, so it is +necessary to be careful using this function. +.Ss 4.4 PacketAliasProxyRule() + +.Ft int +.Fn PacketAliasProxyRule "const char *cmd" + +The passed +.Ar cmd +string consists of one or more pairs of words. The first word in each +pair is a token and the second is the value that should be applied for +that token. Tokens and their argument types are as follows: + +.Bl -tag -offset XXX -width XXX +.It type encode_ip_hdr|encode_tcp_stream|no_encode +In order to support transparent proxying, it is necessary to somehow +pass the original address and port information into the new destination +server. If +.Dq encode_ip_hdr +is specified, the original address and port is passed as an extra IP +option. If +.Dq encode_tcp_stream +is specified, the original address and port is passed as the first +piece of data in the tcp stream in the format +.Dq DEST Ar IP port . +.It port Ar portnum +Only packets with the destination port +.Ar portnum +are proxied. +.It server Ar host[:portnum] +This specifies the +.Ar host +and +.Ar portnum +that the data is to be redirected to. +.Ar host +must be an IP address rather than a DNS host name. If +.Ar portnum +is not specified, the destination port number is not changed. +.Pp +The +.Ar server +specification is mandatory unless the +.Dq delete +command is being used. +.It rule Ar index +Normally, each call to +.Fn PacketAliasProxyRule +inserts the next rule at the start of a linear list of rules. If an +.Ar index +is specified, the new rule will be checked after all rules with lower +indices. Calls to +.Fn PacketAliasProxyRule +that do not specify a rule are assigned rule 0. +.It delete Ar index +This token and its argument must not be used with any other tokens. When +used, all existing rules with the given +.Ar index +are deleted. +.It proto tcp|udp +If specified, only packets of the given protocol type are matched. +.It src Ar IP[/bits] +If specified, only packets with a source address matching the given +.Ar IP +are matched. If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. +.It dst Ar IP[/bits] +If specified, only packets with a destination address matching the given +.Ar IP +are matched. If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. +.El + +This function is usually used to redirect outgoing connections for +internal machines that are not permitted certain types of internet +access, or to restrict access to certain external machines. +.Ss 4.5 PacketAliasPptp() + +.Ft extern int +.Fn PacketAliasPptp "struct in_addr addr" + +This function causes any +.Em G Ns No eneral +.Em R Ns No outing +.Em E Ns No ncapsulation +.Pq Dv IPPROTO_GRE +packets to be aliased using +.Ar addr +rather than the address set via +.Fn PacketAliasSetAddress . +This allows the uses of the +.Em P Ns No oint +to +.Em P Ns No oint +.Em T Ns No unneling +.Em P Ns No rotocol +on a machine on the internal network. +.Pp +If the passed address is +.Dv INADDR_NONE +.Pq 255.255.255.255 , +.Dv PPTP +aliasing is disabled. +.Sh 5. Fragment Handling +The functions in this section are used to deal with +incoming fragments. + +Outgoing fragments are handled within PacketAliasOut() +by changing the address according to any +applicable mapping set by PacketAliasRedirectAddress(), +or the default aliasing address set by +PacketAliasSetAddress(). + +Incoming fragments are handled in one of two ways. +If the header of a fragmented IP packet has already +been seen, then all subsequent fragments will be +re-mapped in the same manner the header fragment +was. Fragments which arrive before the header +are saved and then retrieved once the header fragment +has been resolved. +.Ss 5.1 PacketAliasSaveFragment() + +.Ft int +.Fn PacketAliasSaveFragment "char *ptr" + +When PacketAliasIn() returns +PKT_ALIAS_UNRESOLVED_FRAGMENT, this +function can be used to save the pointer to +the unresolved fragment. + +It is implicitly assumed that +.Em ptr +points to a block of memory allocated by +malloc(). If the fragment is never +resolved, the packet aliasing engine will +automatically free the memory after a +timeout period. [Eventually this function +should be modified so that a callback +function for freeing memory is passed as +an argument.] + +This function returns PKT_ALIAS_OK if it +was successful and PKT_ALIAS_ERROR if there +was an error. + +.Ss 5.2 PacketAliasGetFragment() + +.Ft char * +.Fn PacketAliasGetFragment "char *buffer" + +This function can be used to retrieve fragment +pointers saved by PacketAliasSaveFragment(). +The IP header fragment pointed to by +.Em buffer +is the header fragment indicated when +PacketAliasIn() returns PKT_ALIAS_FOUND_HEADER_FRAGMENT. +Once a a fragment pointer is retrieved, it +becomes the calling program's responsibility +to free the dynamically allocated memory for +the fragment. + +PacketAliasGetFragment() can be called +sequentially until there are no more fragments +available, at which time it returns NULL. +.Ss 5.3 PacketAliasFragmentIn() + +.Ft void +.Fn PacketAliasFragmentIn "char *header" "char *fragment" + +When a fragment is retrieved with +PacketAliasGetFragment(), it can then be +de-aliased with a call to PacketAliasFragmentIn(). +.Em header +is the pointer to a header fragment used as a +template, and +.Em fragment +is the pointer to the packet to be de-aliased. +.Sh 6. Miscellaneous Functions + +.Ss 6.1 PacketAliasSetTarget() + +.Ft void +.Fn PacketAliasSetTarget "struct in_addr addr" + +When an incoming packet not associated with +any pre-existing aliasing link arrives at the +host machine, it will be sent to the address +indicated by a call to PacketAliasSetTarget(). + +If this function is not called, or is called +with a zero address argument, then all new +incoming packets go to the address set by +PacketAliasSetAddress. +.Ss 6.2 PacketAliasCheckNewLink() + +.Ft int +.Fn PacketAliasCheckNewLink "void" + +This function returns a non-zero value when +a new aliasing link is created. In circumstances +where incoming traffic is being sequentially +sent to different local servers, this function +can be used to trigger when PacketAliasSetTarget() +is called to change the default target address. +.Ss 6.3 PacketAliasInternetChecksum() + +.Ft u_short +.Fn PacketAliasInternetChecksum "u_short *buffer" "int nbytes" + +This is a utility function that does not seem +to be available elswhere and is included as a +convenience. It computes the internet checksum, +which is used in both IP and protocol-specific +headers (TCP, UDP, ICMP). + +.Em buffer +points to the data block to be checksummed, and +.Em nbytes +is the number of bytes. The 16-bit checksum +field should be zeroed before computing the checksum. + +Checksums can also be verified by operating on a block +of data including its checksum. If the checksum is +valid, PacketAliasInternetChecksum() will return zero. +.Sh 7. Authors +Charles Mott (cmott@srv.net), versions 1.0 - 1.8, 2.0 - 2.4. + +Eivind Eklund (eivind@freebsd.org), versions 1.8b, 1.9 and +2.5. Added IRC DCC support as well as contributing a number of +architectural improvements; added the firewall bypass +for FTP/IRC DCC. +.Sh 8. Acknowledgments + +Listed below, in approximate chronological +order, are individuals who have provided +valuable comments and/or debugging assistance. + +.Bl -inset -compact -offset left +.It Gary Roberts +.It Tom Torrance +.It Reto Burkhalter +.It Martin Renters +.It Brian Somers +.It Paul Traina +.It Ari Suutari +.It Dave Remien +.It J. Fortes +.It Andrzej Bialeki +.It Gordon Burditt +.El +.Sh Appendix: Conceptual Background +This appendix is intended for those who +are planning to modify the source code or want +to create somewhat esoteric applications using +the packet aliasing functions. + +The conceptual framework under which the +packet aliasing engine operates is described here. +Central to the discussion is the idea of an +"aliasing link" which describes the relationship +for a given packet transaction between the local +machine, aliased identity and remote machine. It +is discussed how such links come into existence +and are destroyed. +.Ss A.1 Aliasing Links +There is a notion of an "aliasing link", +which is 7-tuple describing a specific +translation: +.Bd -literal -offset indent +(local addr, local port, alias addr, alias port, + remote addr, remote port, protocol) +.Ed + +Outgoing packets have the local address and +port number replaced with the alias address +and port number. Incoming packets undergo the +reverse process. The packet aliasing engine +attempts to match packets against an internal +table of aliasing links to determine how to +modify a given IP packet. Both the IP +header and protocol dependent headers are +modified as necessary. Aliasing links are +created and deleted as necessary according +to network traffic. + +Protocols can be TCP, UDP or even ICMP in +certain circumstances. (Some types of ICMP +packets can be aliased according to sequence +or id number which acts as an equivalent port +number for identifying how individual packets +should be handled.) + +Each aliasing link must have a unique +combination of the following five quantities: +alias address/port, remote address/port +and protocol. This ensures that several +machines on a local network can share the +same aliased IP address. In cases where +conflicts might arise, the aliasing port +is chosen so that uniqueness is maintained. +.Ss A.2 Static and Dynamic Links +Aliasing links can either be static or dynamic. +Static links persist indefinitely and represent +fixed rules for translating IP packets. Dynamic +links come into existence for a specific TCP +connection or UDP transaction or ICMP echo +sequence. For the case of TCP, the connection +can be monitored to see when the associated +aliasing link should be deleted. Aliasing links +for UDP transactions (and ICMP echo and timestamp +requests) work on a simple timeout rule. When +no activity is observed on a dynamic link for +a certain amount of time it is automatically +deleted. Timeout rules also apply to TCP +connections which do not open or close +properly. +.Ss A.3 Partially Specified Aliasing Links +Aliasing links can be partially specified, +meaning that the remote address and/or remote +ports are unknown. In this case, when a packet +matching the incomplete specification is found, +a fully specified dynamic link is created. If +the original partially specified link is dynamic, +it will be deleted after the fully specified link +is created, otherwise it will persist. + +For instance, a partially specified link might +be +.Bd -literal -offset indent +(192.168.0.4, 23, 204.228.203.215, 8066, 0, 0, tcp) +.Ed + +The zeros denote unspecified components for +the remote address and port. If this link were +static it would have the effect of redirecting +all incoming traffic from port 8066 of +204.228.203.215 to port 23 (telnet) of machine +192.168.0.4 on the local network. Each +individual telnet connection would initiate +the creation of a distinct dynamic link. +.Ss A.4 Dynamic Link Creation +In addition to aliasing links, there are +also address mappings that can be stored +within the internal data table of the packet +aliasing mechanism. +.Bd -literal -offset indent +(local addr, alias addr) +.Ed + +Address mappings are searched when creating +new dynamic links. + +All outgoing packets from the local network +automatically create a dynamic link if +they do not match an already existing fully +specified link. If an address mapping exists +for the outgoing packet, this determines +the alias address to be used. If no mapping +exists, then a default address, usually the +address of the packet aliasing host, is used. +If necessary, this default address can be +changed as often as each individual packet +arrives. + +The aliasing port number is determined +such that the new dynamic link does not +conflict with any existing links. In the +default operating mode, the packet aliasing +engine attempts to set the aliasing port +equal to the local port number. If this +results in a conflict, then port numbers +are randomly chosen until a unique aliasing +link can be established. In an alternate +operating mode, the first choice of an +aliasing port is also random and unrelated +to the local port number. + diff --git a/sys/netinet/mlfk_ipl.c b/sys/netinet/mlfk_ipl.c new file mode 100644 index 0000000..f19a6e1 --- /dev/null +++ b/sys/netinet/mlfk_ipl.c @@ -0,0 +1,180 @@ +/* + * Copyright 1999 Guido van Rooij. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/conf.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> + + +#include <netinet/ipl.h> +#include <netinet/ip_compat.h> +#include <netinet/ip_fil.h> +#include <netinet/ip_state.h> +#include <netinet/ip_nat.h> +#include <netinet/ip_auth.h> +#include <netinet/ip_frag.h> + +static dev_t ipf_devs[IPL_LOGMAX + 1]; + +SYSCTL_DECL(_net_inet); +SYSCTL_NODE(_net_inet, OID_AUTO, ipf, CTLFLAG_RW, 0, "IPF"); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_flags, CTLFLAG_RW, &fr_flags, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_pass, CTLFLAG_RW, &fr_pass, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_active, CTLFLAG_RD, &fr_active, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpidletimeout, CTLFLAG_RW, + &fr_tcpidletimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpclosewait, CTLFLAG_RW, + &fr_tcpclosewait, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcplastack, CTLFLAG_RW, + &fr_tcplastack, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcptimeout, CTLFLAG_RW, + &fr_tcptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpclosed, CTLFLAG_RW, + &fr_tcpclosed, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_udptimeout, CTLFLAG_RW, + &fr_udptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_icmptimeout, CTLFLAG_RW, + &fr_icmptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_defnatage, CTLFLAG_RW, + &fr_defnatage, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_ipfrttl, CTLFLAG_RW, + &fr_ipfrttl, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, ipl_unreach, CTLFLAG_RW, + &ipl_unreach, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, ipl_inited, CTLFLAG_RD, + &ipl_inited, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_authsize, CTLFLAG_RD, + &fr_authsize, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_authused, CTLFLAG_RD, + &fr_authused, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_defaultauthage, CTLFLAG_RW, + &fr_defaultauthage, 0, ""); + +#define CDEV_MAJOR 79 +static struct cdevsw ipl_cdevsw = { + /* open */ iplopen, + /* close */ iplclose, + /* read */ iplread, + /* write */ nowrite, + /* ioctl */ iplioctl, + /* poll */ nopoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ipl", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, + /* bmaj */ -1 +}; + +static int +ipfilter_modevent(module_t mod, int type, void *unused) +{ + char *c; + int i, error = 0; + + switch (type) { + case MOD_LOAD : + error = iplattach(); + if (error) + break; + + c = NULL; + for(i=strlen(IPL_NAME); i>0; i--) + if (IPL_NAME[i] == '/') { + c = &IPL_NAME[i+1]; + break; + } + if (!c) + c = IPL_NAME; + ipf_devs[IPL_LOGIPF] = + make_dev(&ipl_cdevsw, IPL_LOGIPF, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_NAT); i>0; i--) + if (IPL_NAT[i] == '/') { + c = &IPL_NAT[i+1]; + break; + } + if (!c) + c = IPL_NAT; + ipf_devs[IPL_LOGNAT] = + make_dev(&ipl_cdevsw, IPL_LOGNAT, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_STATE); i>0; i--) + if (IPL_STATE[i] == '/') { + c = &IPL_STATE[i+1]; + break; + } + if (!c) + c = IPL_STATE; + ipf_devs[IPL_LOGSTATE] = + make_dev(&ipl_cdevsw, IPL_LOGSTATE, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_AUTH); i>0; i--) + if (IPL_AUTH[i] == '/') { + c = &IPL_AUTH[i+1]; + break; + } + if (!c) + c = IPL_AUTH; + ipf_devs[IPL_LOGAUTH] = + make_dev(&ipl_cdevsw, IPL_LOGAUTH, 0, 0, 0600, c); + + break; + case MOD_UNLOAD : + destroy_dev(ipf_devs[IPL_LOGIPF]); + destroy_dev(ipf_devs[IPL_LOGNAT]); + destroy_dev(ipf_devs[IPL_LOGSTATE]); + destroy_dev(ipf_devs[IPL_LOGAUTH]); + cdevsw_remove(&ipl_cdevsw); + error = ipldetach(); + break; + default: + error = EINVAL; + break; + } + return error; +} + +static moduledata_t ipfiltermod = { + IPL_VERSION, + ipfilter_modevent, + 0 +}; +DECLARE_MODULE(ipfilter, ipfiltermod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c new file mode 100644 index 0000000..ba36bd3 --- /dev/null +++ b/sys/netinet/raw_ip.c @@ -0,0 +1,637 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/ip_mroute.h> + +#include <netinet/ip_fw.h> + +#include "opt_ipdn.h" +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +struct inpcbhead ripcb; +struct inpcbinfo ripcbinfo; + +/* + * Nominal space allocated to a raw ip socket. + */ +#define RIPSNDQ 8192 +#define RIPRCVQ 8192 + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +void +rip_init() +{ + LIST_INIT(&ripcb); + ripcbinfo.listhead = &ripcb; + /* + * XXX We don't use the hash list for raw IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); + ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); + ripcbinfo.ipi_zone = zinit("ripcb", sizeof(struct inpcb), + maxsockets, ZONE_INTERRUPT, 0); +} + +static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; +/* + * Setup generic address and protocol structures + * for raw_input routine, then pass them along with + * mbuf chain. + */ +void +rip_input(m, iphlen) + struct mbuf *m; + int iphlen; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct inpcb *inp; + struct inpcb *last = 0; + struct mbuf *opts = 0; + + ripsrc.sin_addr = ip->ip_src; + for (inp = ripcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_ip_p && inp->inp_ip_p != ip->ip_p) + continue; + if (inp->inp_laddr.s_addr && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (last) { + struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + if (n) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, n); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, n, + opts) == 0) { + /* should notify about lost packet */ + m_freem(n); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + opts = 0; + } + } + last = inp; + } + if (last) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, m, opts) == 0) { + m_freem(m); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Generate IP header and pass packet to ip_output. + * Tack on options user may have setup with control call. + */ +int +rip_output(m, so, dst) + struct mbuf *m; + struct socket *so; + u_long dst; +{ + register struct ip *ip; + register struct inpcb *inp = sotoinpcb(so); + int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + + /* + * If the user handed us a complete IP packet, use it. + * Otherwise, allocate an mbuf for a header and fill it in. + */ + if ((inp->inp_flags & INP_HDRINCL) == 0) { + if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + M_PREPEND(m, sizeof(struct ip), M_WAIT); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_off = 0; + ip->ip_p = inp->inp_ip_p; + ip->ip_len = m->m_pkthdr.len; + ip->ip_src = inp->inp_laddr; + ip->ip_dst.s_addr = dst; + ip->ip_ttl = MAXTTL; + } else { + if (m->m_pkthdr.len > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + ip = mtod(m, struct ip *); + /* don't allow both user specified and setsockopt options, + and don't allow packet length sizes that will crash */ + if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) + && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { + m_freem(m); + return EINVAL; + } + if (ip->ip_id == 0) + ip->ip_id = htons(ip_id++); + /* XXX prevent ip_output from overwriting header fields */ + flags |= IP_RAWOUTPUT; + ipstat.ips_rawout++; + } + return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + inp->inp_moptions)); +} + +/* + * Raw IP socket option processing. + */ +int +rip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + if (sopt->sopt_level != IPPROTO_IP) + return (EINVAL); + + error = 0; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + optval = inp->inp_flags & INP_HDRINCL; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_FW_GET: + if (ip_fw_ctl_ptr == 0) + error = ENOPROTOOPT; + else + error = ip_fw_ctl_ptr(sopt); + break; + +#ifdef DUMMYNET + case IP_DUMMYNET_GET: + if (ip_dn_ctl_ptr == NULL) + error = ENOPROTOOPT ; + else + error = ip_dn_ctl_ptr(sopt); + break ; +#endif /* DUMMYNET */ + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_get(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + break; + + case IP_FW_ADD: + case IP_FW_DEL: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_RESETLOG: + if (ip_fw_ctl_ptr == 0) + error = ENOPROTOOPT; + else + error = ip_fw_ctl_ptr(sopt); + break; + +#ifdef DUMMYNET + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: + case IP_DUMMYNET_FLUSH: + if (ip_dn_ctl_ptr == NULL) + error = ENOPROTOOPT ; + else + error = ip_dn_ctl_ptr(sopt); + break ; +#endif + + case IP_RSVP_ON: + error = ip_rsvp_init(so); + break; + + case IP_RSVP_OFF: + error = ip_rsvp_done(); + break; + + /* XXX - should be combined */ + case IP_RSVP_VIF_ON: + error = ip_rsvp_vif_init(so, sopt); + break; + + case IP_RSVP_VIF_OFF: + error = ip_rsvp_vif_done(so, sopt); + break; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_set(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + } + + return (error); +} + +/* + * This function exists solely to receive the PRC_IFDOWN messages which + * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, + * and calls in_ifadown() to remove all routes corresponding to that address. + * It also receives the PRC_IFUP messages from if_up() and reinstalls the + * interface routes. + */ +void +rip_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct in_ifaddr *ia; + struct ifnet *ifp; + int err; + int flags; + + switch (cmd) { + case PRC_IFDOWN: + for (ia = in_ifaddrhead.tqh_first; ia; + ia = ia->ia_link.tqe_next) { + if (ia->ia_ifa.ifa_addr == sa + && (ia->ia_flags & IFA_ROUTE)) { + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ia->ia_ifp, ia); + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. + */ + in_ifadown(&ia->ia_ifa); + break; + } + } + break; + + case PRC_IFUP: + for (ia = in_ifaddrhead.tqh_first; ia; + ia = ia->ia_link.tqe_next) { + if (ia->ia_ifa.ifa_addr == sa) + break; + } + if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) + return; + flags = RTF_UP; + ifp = ia->ia_ifa.ifa_ifp; + + if ((ifp->if_flags & IFF_LOOPBACK) + || (ifp->if_flags & IFF_POINTOPOINT)) + flags |= RTF_HOST; + + err = rtinit(&ia->ia_ifa, RTM_ADD, flags); + if (err == 0) + ia->ia_flags |= IFA_ROUTE; + break; + } +} + +u_long rip_sendspace = RIPSNDQ; +u_long rip_recvspace = RIPRCVQ; + +SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, + &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); +SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, + &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); + +static int +rip_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("rip_attach"); + if (p && (error = suser(p)) != 0) + return error; + + s = splnet(); + error = in_pcballoc(so, &ripcbinfo, p); + splx(s); + if (error) + return error; + error = soreserve(so, rip_sendspace, rip_recvspace); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = proto; + return 0; +} + +static int +rip_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("rip_detach"); + if (so == ip_mrouter) + ip_mrouter_done(); + ip_rsvp_force_done(so); + if (so == ip_rsvpd) + ip_rsvp_done(); + in_pcbdetach(inp); + return 0; +} + +static int +rip_abort(struct socket *so) +{ + soisdisconnected(so); + return rip_detach(so); +} + +static int +rip_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return rip_abort(so); +} + +static int +rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + + if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) || + (addr->sin_addr.s_addr && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) + return EADDRNOTAVAIL; + inp->inp_laddr = addr->sin_addr; + return 0; +} + +static int +rip_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + if (TAILQ_EMPTY(&ifnet)) + return EADDRNOTAVAIL; + if ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) + return EAFNOSUPPORT; + inp->inp_faddr = addr->sin_addr; + soisconnected(so); + return 0; +} + +static int +rip_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + register u_long dst; + + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + m_freem(m); + return EISCONN; + } + dst = inp->inp_faddr.s_addr; + } else { + if (nam == NULL) { + m_freem(m); + return ENOTCONN; + } + dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; + } + return rip_output(m, so, dst); +} + +static int +rip_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = ripcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = ripcbinfo.ipi_gencnt; + n = ripcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = ripcbinfo.listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = ripcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = ripcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, + rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); + +struct pr_usrreqs rip_usrreqs = { + rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, + pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, + pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h new file mode 100644 index 0000000..436f23a --- /dev/null +++ b/sys/netinet/tcp.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +typedef u_int32_t tcp_seq; +typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ + +/* + * TCP header. + * Per RFC 793, September, 1981. + */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOPT_NOP 1 +#define TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 /* Experimental */ +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 /* Experimental */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_TSTAMP_HDR \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) + +#define TCPOPT_CC 11 /* CC options: RFC-1644 */ +#define TCPOPT_CCNEW 12 +#define TCPOPT_CCECHO 13 +#define TCPOLEN_CC 6 +#define TCPOLEN_CC_APPA (TCPOLEN_CC+2) +#define TCPOPT_CC_HDR(ccopt) \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|(ccopt)<<8|TCPOLEN_CC) + +/* + * Default maximum segment size for TCP. + * With an IP MSS of 576, this is 536, + * but 512 is probably more convenient. + * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). + */ +#define TCP_MSS 512 + +/* + * Default maximum segment size for TCP6. + * With an IP6 MSS of 1280, this is 1220, + * but 1024 is probably more convenient. (xxx kazu in doubt) + * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + */ +#define TCP6_MSS 1024 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ + +/* + * User-settable options (used with setsockopt). + */ +#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#define TCP_MAXSEG 0x02 /* set maximum segment size */ +#define TCP_NOPUSH 0x04 /* don't push last block of write */ +#define TCP_NOOPT 0x08 /* don't use TCP options */ + +#endif diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c new file mode 100644 index 0000000..a01607d --- /dev/null +++ b/sys/netinet/tcp_debug.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_tcpdebug.h" + +#ifndef INET +#error The option TCPDEBUG requires option INET. +#endif + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/protosw.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_debug.h> + +#ifdef TCPDEBUG +static int tcpconsdebug = 0; +#endif + +static struct tcp_debug tcp_debug[TCP_NDEBUG]; +static int tcp_debx; + +/* + * Tcp debug routines + */ +void +tcp_trace(act, ostate, tp, ti, req) + short act, ostate; + struct tcpcb *tp; + struct tcpiphdr *ti; + int req; +{ + tcp_seq seq, ack; + int len, flags; + struct tcp_debug *td = &tcp_debug[tcp_debx++]; + + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + td->td_time = iptime(); + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp) + td->td_cb = *tp; + else + bzero((caddr_t)&td->td_cb, sizeof (*tp)); + if (ti) + td->td_ti = *ti; + else + bzero((caddr_t)&td->td_ti, sizeof (*ti)); + td->td_req = req; +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp) + printf("%p %s:", tp, tcpstates[ostate]); + else + printf("???????? "); + printf("%s ", tanames[act]); + switch (act) { + + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ti == 0) + break; + seq = ti->ti_seq; + ack = ti->ti_ack; + len = ti->ti_len; + if (act == TA_OUTPUT) { + seq = ntohl(seq); + ack = ntohl(ack); + len = ntohs((u_short)len); + } + if (act == TA_OUTPUT) + len -= sizeof (struct tcphdr); + if (len) + printf("[%x..%x)", seq, seq+len); + else + printf("%x", seq); + printf("@%x, urp=%x", ack, ti->ti_urp); + flags = ti->ti_flags; + if (flags) { + char *cp = "<"; +#define pf(f) { \ + if (ti->ti_flags & TH_##f) { \ + printf("%s%s", cp, #f); \ + cp = ","; \ + } \ +} + pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG); + printf(">"); + } + break; + + case TA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", tcptimers[req>>8]); + break; + } + if (tp) + printf(" -> %s", tcpstates[tp->t_state]); + /* print out internal state of tp !?! */ + printf("\n"); + if (tp == 0) + return; + printf( + "\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n", + (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up, + (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max); + printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n", + (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd); +#endif /* TCPDEBUG */ +} diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h new file mode 100644 index 0000000..c359cb4 --- /dev/null +++ b/sys/netinet/tcp_debug.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_DEBUG_H_ +#define _NETINET_TCP_DEBUG_H_ + +struct tcp_debug { + n_time td_time; + short td_act; + short td_ostate; + caddr_t td_tcb; + struct tcpiphdr td_ti; + short td_req; + struct tcpcb td_cb; +}; + +#define TA_INPUT 0 +#define TA_OUTPUT 1 +#define TA_USER 2 +#define TA_RESPOND 3 +#define TA_DROP 4 + +#ifdef TANAMES +static char *tanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define TCP_NDEBUG 100 + +#ifndef KERNEL +/* XXX common variables for broken applications. */ +struct tcp_debug tcp_debug[TCP_NDEBUG]; +int tcp_debx; +#endif + +#endif /* !_NETINET_TCP_DEBUG_H_ */ diff --git a/sys/netinet/tcp_fsm.h b/sys/netinet/tcp_fsm.h new file mode 100644 index 0000000..37752c4 --- /dev/null +++ b/sys/netinet/tcp_fsm.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_FSM_H_ +#define _NETINET_TCP_FSM_H_ + +/* + * TCP FSM state definitions. + * Per RFC793, September, 1981. + */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* for KAME src sync over BSD*'s */ +#define TCP6_NSTATES TCP_NSTATES +#define TCP6S_CLOSED TCPS_CLOSED +#define TCP6S_LISTEN TCPS_LISTEN +#define TCP6S_SYN_SENT TCPS_SYN_SENT +#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED +#define TCP6S_ESTABLISHED TCPS_ESTABLISHED +#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT +#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1 +#define TCP6S_CLOSING TCPS_CLOSING +#define TCP6S_LAST_ACK TCPS_LAST_ACK +#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2 +#define TCP6S_TIME_WAIT TCPS_TIME_WAIT + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) +#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. + * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally + * determined by state, with the proviso that TH_FIN is sent only + * if all data queued for output is included in the segment. + */ +static u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, /* 0, CLOSED */ + 0, /* 1, LISTEN */ + TH_SYN, /* 2, SYN_SENT */ + TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */ + TH_ACK, /* 4, ESTABLISHED */ + TH_ACK, /* 5, CLOSE_WAIT */ + TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */ + TH_FIN|TH_ACK, /* 7, CLOSING */ + TH_FIN|TH_ACK, /* 8, LAST_ACK */ + TH_ACK, /* 9, FIN_WAIT_2 */ + TH_ACK, /* 10, TIME_WAIT */ +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +char *tcpstates[] = { + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif + +#endif diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c new file mode 100644 index 0000000..33a478c --- /dev/null +++ b/sys/netinet/tcp_input.c @@ -0,0 +1,2351 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +static struct tcpiphdr tcp_saveti; +#endif + +static int tcprexmtthresh = 3; +tcp_seq tcp_iss; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +#ifdef TCP_RESTRICT_RST +static int restrict_rst = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW, + &restrict_rst, 0, "Restrict RST emission"); +#endif + +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; + +static void tcp_dooptions __P((struct tcpcb *, + u_char *, int, struct tcpiphdr *, struct tcpopt *)); +static void tcp_pulloutofband __P((struct socket *, + struct tcpiphdr *, struct mbuf *)); +static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *)); +static void tcp_xmit_timer __P((struct tcpcb *, int)); + + +/* + * Insert segment ti into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->t_segq == NULL && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + if (tcp_delack_enabled) \ + callout_reset(tp->tt_delack, tcp_delacktime, \ + tcp_timer_delack, tp); \ + else \ + tp->t_flags |= TF_ACKNOW; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + flags = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} + +static int +tcp_reass(tp, ti, m) + register struct tcpcb *tp; + register struct tcpiphdr *ti; + struct mbuf *m; +{ + struct mbuf *q; + struct mbuf *p; + struct mbuf *nq; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + +#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header) + + /* + * Call with ti==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (ti == 0) + goto present; + + m->m_pkthdr.header = ti; + + /* + * Find a segment which begins after this one does. + */ + for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt) + if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq; + if (i > 0) { + if (i >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + m_freem(m); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + ti->ti_len -= i; + ti->ti_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += ti->ti_len; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq; + if (i <= 0) + break; + if (i < GETTCP(q)->ti_len) { + GETTCP(q)->ti_seq += i; + GETTCP(q)->ti_len -= i; + m_adj(q, i); + break; + } + + nq = q->m_nextpkt; + if (p) + p->m_nextpkt = nq; + else + tp->t_segq = nq; + m_freem(q); + q = nq; + } + + if (p == NULL) { + m->m_nextpkt = tp->t_segq; + tp->t_segq = m; + } else { + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = tp->t_segq; + if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += GETTCP(q)->ti_len; + flags = GETTCP(q)->ti_flags & TH_FIN; + nq = q->m_nextpkt; + tp->t_segq = nq; + q->m_nextpkt = NULL; + if (so->so_state & SS_CANTRCVMORE) + m_freem(q); + else + sbappend(&so->so_rcv, q); + q = nq; + } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt); + sorwakeup(so); + return (flags); + +#undef GETTCP +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +void +tcp_input(m, iphlen) + register struct mbuf *m; + int iphlen; +{ + register struct tcpiphdr *ti; + register struct inpcb *inp; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + register struct tcpcb *tp = 0; + register int tiflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + struct in_addr laddr; + int dropsocket = 0; + int iss = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ +#ifdef TCPDEBUG + short ostate = 0; +#endif + + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + ti = mtod(m, struct tcpiphdr *); + if (iphlen > sizeof (struct ip)) + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + + /* + * Checksum extended TCP header and data. + */ + tlen = ((struct ip *)ti)->ip_len; + len = sizeof (struct ip) + tlen; + bzero(ti->ti_x1, sizeof(ti->ti_x1)); + ti->ti_len = (u_short)tlen; + HTONS(ti->ti_len); + ti->ti_sum = in_cksum(m, len); + if (ti->ti_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = ti->ti_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; + ti->ti_len = tlen; + if (off > sizeof (struct tcphdr)) { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + optlen = off - sizeof (struct tcphdr); + optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); + } + tiflags = ti->ti_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is incompatible with RFC1644 extensions (T/TCP). + */ + if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(ti->ti_seq); + NTOHL(ti->ti_ack); + NTOHS(ti->ti_win); + NTOHS(ti->ti_urp); + + /* + * Drop TCP, IP headers and TCP options. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + + /* + * Locate pcb for segment. + */ +findpcb: +#ifdef IPFIREWALL_FORWARD + if (ip_fw_fwd_addr != NULL) { + /* + * Diverted. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (!ip_fw_fwd_addr->sin_port) { + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, + ti->ti_sport, ip_fw_fwd_addr->sin_addr, + ti->ti_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ti->ti_src, ti->ti_sport, + ip_fw_fwd_addr->sin_addr, + ntohs(ip_fw_fwd_addr->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + ip_fw_fwd_addr = NULL; + } else +#endif /* IPFIREWALL_FORWARD */ + + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif); + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ti->ti_dst)); + switch (log_in_vain) { + case 1: + if(tiflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + buf, ntohs(ti->ti_dport), + inet_ntoa(ti->ti_src), + ntohs(ti->ti_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src), + ntohs(ti->ti_sport), tiflags); + break; + default: + break; + } + } +#ifdef ICMP_BANDLIM + if (badport_bandlim(1) < 0) + goto drop; +#endif + if (blackhole) { + switch (blackhole) { + case 1: + if (tiflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + goto dropwithreset; + } + tp = intotcpcb(inp); + if (tp == 0) + goto dropwithreset; + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((tiflags & TH_SYN) == 0) + tiwin = ti->ti_win << tp->snd_scale; + else + tiwin = ti->ti_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + tcp_saveti = *ti; + } +#endif + if (so->so_options & SO_ACCEPTCONN) { + register struct tcpcb *tp0 = tp; + struct socket *so2; + if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (tiflags & TH_ACK) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + goto drop; + } + so2 = sonewconn(so, 0); + if (so2 == 0) { + tcpstat.tcps_listendrop++; + so2 = sodropablereq(so); + if (so2) { + tcp_drop(sototcpcb(so2), ETIMEDOUT); + so2 = sonewconn(so, 0); + } + if (!so2) + goto drop; + } + so = so2; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; + inp->inp_laddr = ti->ti_dst; + inp->inp_lport = ti->ti_dport; + if (in_pcbinshash(inp) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + goto drop; + } + inp->inp_options = ip_srcroute(); + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); + + /* Compute proper scaling value from buffer space */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < + so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, ti, &to); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flag & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + ti->ti_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (ti->ti_len == 0) { + if (SEQ_GT(ti->ti_ack, tp->snd_una) && + SEQ_LEQ(ti->ti_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flag & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = ti->ti_ack; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (ti->ti_ack == tp->snd_una && + tp->t_segq == NULL && + ti->ti_len <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += ti->ti_len; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += ti->ti_len; + /* + * Add data to socket buffer. + */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (tcp_delack_enabled) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + register struct sockaddr_in *sin; + + if (tiflags & TH_RST) + goto drop; + if (tiflags & TH_ACK) + goto dropwithreset; + if ((tiflags & TH_SYN) == 0) + goto drop; + if ((ti->ti_dport == ti->ti_sport) && + (ti->ti_dst.s_addr == ti->ti_src.s_addr)) + goto drop; + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + */ + if (m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_NOWAIT); + if (sin == NULL) + goto drop; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ti->ti_src; + sin->sin_port = ti->ti_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ti->ti_dst; + if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { + inp->inp_laddr = laddr; + FREE(sin, M_SONAME); + goto drop; + } + FREE(sin, M_SONAME); + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + tcp_dooptions(tp, optp, optlen, ti, &to); + if (iss) + tp->iss = iss; + else + tp->iss = tcp_iss; + tcp_iss += TCP_ISSINCR/4; + tp->irs = ti->ti_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tiwin; /* initial send-window */ + tp->cc_send = CC_INC(tcp_ccgen); + tp->cc_recv = to.to_cc; + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, + * if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. + * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + if ((to.to_flag & TOF_CC) != 0) { + if (((tp->t_flags & TF_NOPUSH) != 0) && + taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { + + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (tcp_delack_enabled && ((tiflags & TH_FIN) || + (ti->ti_len != 0 && + in_localaddr(inp->inp_faddr)))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + /* + * Limit the `virtual advertised window' to TCP_MAXWIN + * here. Even if we requested window scaling, it will + * become effective only later when our SYN is acked. + */ + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); + tcpstat.tcps_connects++; + soisconnected(so); + callout_reset(tp->tt_keep, tcp_keepinit, + tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + /* else do standard 3-way handshake */ + } else { + /* + * No CC option, but maybe CC.NEW: + * invalidate cached value. + */ + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->snd_una) || + SEQ_GT(ti->ti_ack, tp->snd_max))) + goto dropwithreset; + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->iss) || + SEQ_GT(ti->ti_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + if (tiflags & TH_RST) { + if (tiflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((tiflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = ti->ti_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = ti->ti_seq; + tcp_rcvseqinit(tp); + if (tiflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flag & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (tcp_delack_enabled && ti->ti_len != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + tiflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. + */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flag & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance ti->ti_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + ti->ti_seq++; + if (ti->ti_len > tp->rcv_wnd) { + todrop = ti->ti_len - tp->rcv_wnd; + m_adj(m, -todrop); + ti->ti_len = tp->rcv_wnd; + tiflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = ti->ti_seq - 1; + tp->rcv_up = ti->ti_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (tiflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((tiflags & TH_SYN) && + (to.to_flag & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) + goto dropwithreset; + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (tiflags & TH_RST) { + if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs)) + goto dropwithreset; + + todrop = tp->rcv_nxt - ti->ti_seq; + if (todrop > 0) { + if (tiflags & TH_SYN) { + tiflags &= ~TH_SYN; + ti->ti_seq++; + if (ti->ti_urp > 1) + ti->ti_urp--; + else + tiflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > ti->ti_len + || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + tiflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = ti->ti_len; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + m_adj(m, todrop); + ti->ti_seq += todrop; + ti->ti_len -= todrop; + if (ti->ti_urp > todrop) + ti->ti_urp -= todrop; + else { + tiflags &= ~TH_URG; + ti->ti_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (tiflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + iss = tp->snd_nxt + TCP_ISSINCR; + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + ti->ti_len -= todrop; + tiflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (tiflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((tiflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(inp)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcpiphdr *)0, + (struct mbuf *)0); + tp->snd_wl1 = ti->ti_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < ti->ti_ack <= tp->snd_max + * then advance tp->snd_una to ti->ti_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + */ + if (!callout_active(tp->tt_rexmt) || + ti->ti_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (to.to_flag & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (ti->ti_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((tiflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, ti->ti_seq) || + (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || + (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (ti->ti_len == 0 && + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = ti->ti_seq; + tp->snd_wl2 = ti->ti_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((tiflags & TH_URG) && ti->ti_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { + ti->ti_urp = 0; /* XXX */ + tiflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { + tp->rcv_up = ti->ti_seq + ti->ti_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (ti->ti_urp <= (u_long)ti->ti_len +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, ti, m); + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((ti->ti_len || (tiflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + TCP_REASS(tp, ti, m, so, tiflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + tiflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (tiflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) && + (SEQ_GT(tp->snd_una, ti->ti_ack) || + SEQ_GT(ti->ti_ack, tp->snd_max)) ) + goto dropwithreset; +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: +#ifdef TCP_RESTRICT_RST + if (restrict_rst) + goto drop; +#endif + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + if (tiflags & TH_ACK) + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + else { + if (tiflags & TH_SYN) + ti->ti_len++; + tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, + TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +} + +static void +tcp_dooptions(tp, cp, cnt, ti, to) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcpiphdr *ti; + struct tcpopt *to; +{ + u_short mss = 0; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flag |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + NTOHL(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + NTOHL(to->to_tsecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (ti->ti_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to->to_tsval; + tp->ts_recent_age = ticks; + } + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flag |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (ti->ti_flags & TH_SYN) + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + NTOHL(to->to_ccecho); + break; + } + } + if (ti->ti_flags & TH_SYN) + tcp_mss(tp, mss); /* sets t_maxseg */ +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, ti, m) + struct socket *so; + struct tcpiphdr *ti; + register struct mbuf *m; +{ + int cnt = ti->ti_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; + + inp = tp->t_inpcb; + if ((rt = tcp_rtlookup(inp)) == NULL) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + else + { + mss = ifp->if_mtu - sizeof(struct tcpiphdr); + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. + */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if (in_localaddr(inp->inp_faddr)) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; + + rt = tcp_rtlookup(tp->t_inpcb); + if (rt == NULL) + return tcp_mssdflt; + + return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); +} diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c new file mode 100644 index 0000000..d029f07 --- /dev/null +++ b/sys/netinet/tcp_output.c @@ -0,0 +1,777 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_tcpdebug.h" + +#include <stddef.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + +static int path_mtu_discovery = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, + &path_mtu_discovery, 1, "Enable Path MTU Discovery"); + +int ss_fltsz = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz, 1, "Slow start flight size"); + +int ss_fltsz_local = TCP_MAXWIN; /* something large */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz_local, 1, "Slow start flight size for local networks"); + +/* + * Tcp output routine: figure out what should be sent and send it. + */ +int +tcp_output(tp) + register struct tcpcb *tp; +{ + register struct socket *so = tp->t_inpcb->inp_socket; + register long len, win; + int off, flags, error; + register struct mbuf *m; + register struct tcpiphdr *ti; + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen; + int idle, sendalot; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->snd_max == tp->snd_una); + if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ + if (in_localaddr(tp->t_inpcb->inp_faddr)) + tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * ss_fltsz; + } +again: + sendalot = 0; + off = tp->snd_nxt - tp->snd_una; + win = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unsent data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + win = 1; + } else { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } + + len = (long)ulmin(so->so_snd.sb_cc, win) - off; + + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + flags &= ~TH_SYN; + off--, len++; + if (len > 0 && tp->t_state == TCPS_SYN_SENT && + taop->tao_ccsent == 0) + return 0; + } + + /* + * Be careful not to send data and/or FIN on SYN segments + * in cases when no CC option will be sent. + * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && + ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || + ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (win == 0) { + callout_stop(tp->tt_rexmt); + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (!callout_active(tp->tt_persist)) + tcp_setpersist(tp); + } + } + if (len > tp->t_maxseg) { + len = tp->t_maxseg; + sendalot = 1; + } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + + win = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. If connection is idle + * and can send all data, a maximum segment, + * at least a maximum default-size segment do it, + * or are forced, do it; otherwise don't bother. + * If peer's buffer is tiny, then send + * when window is at least half open. + * If retransmitting (possibly after persist timer forced us + * to send into a small window), then must resend. + */ + if (len) { + if (len == tp->t_maxseg) + goto send; + if (!(tp->t_flags & TF_MORETOCOME) && + (idle || tp->t_flags & TF_NODELAY) && + (tp->t_flags & TF_NOPUSH) == 0 && + len + off >= so->so_snd.sb_cc) + goto send; + if (tp->t_force) + goto send; + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + */ + if (win > 0) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * callout_active(tp->tt_persist) + * is true when we are in persist state. + * tp->t_force + * is set when we are called to send a persist packet. + * callout_active(tp->tt_rexmt) + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ + return (0); + +send: + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. + * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN + */ + optlen = 0; + hdrlen = sizeof (struct tcpiphdr); + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + if ((tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + opt[0] = TCPOPT_MAXSEG; + opt[1] = TCPOLEN_MAXSEG; + mss = htons((u_short) tcp_mssopt(tp)); + (void)memcpy(opt + 2, &mss, sizeof(mss)); + optlen = TCPOLEN_MAXSEG; + + if ((tp->t_flags & TF_REQ_SCALE) && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_SCALE))) { + *((u_int32_t *)(opt + optlen)) = htonl( + TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | + TCPOLEN_WINDOW << 8 | + tp->request_r_scale); + optlen += 4; + } + } + } + + /* + * Send a timestamp and echo-reply if this is a SYN and our side + * wants to use timestamps (TF_REQ_TSTMP is set) or both our side + * and our peer have sent timestamps in our SYN's. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (flags & TH_RST) == 0 && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_TSTMP))) { + u_int32_t *lp = (u_int32_t *)(opt + optlen); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(ticks); + *lp = htonl(tp->ts_recent); + optlen += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send `CC-family' options if our side wants to use them (TF_REQ_CC), + * options are allowed (!TF_NOOPT) and it's not a RST. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (flags & TH_RST) == 0) { + switch (flags & (TH_SYN|TH_ACK)) { + /* + * This is a normal ACK, send CC if we received CC before + * from our peer. + */ + case TH_ACK: + if (!(tp->t_flags & TF_RCVD_CC)) + break; + /*FALLTHROUGH*/ + + /* + * We can only get here in T/TCP's SYN_SENT* state, when + * we're a sending a non-SYN segment without waiting for + * the ACK of our SYN. A check above assures that we only + * do this if our peer understands T/TCP. + */ + case 0: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + + optlen += 4; + break; + + /* + * This is our initial SYN, check whether we have to use + * CC or CC.new. + */ + case TH_SYN: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? + TCPOPT_CCNEW : TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + optlen += 4; + break; + + /* + * This is a SYN,ACK; send CC and CC.echo if we received + * CC from our peer. + */ + case (TH_SYN|TH_ACK): + if (tp->t_flags & TF_RCVD_CC) { + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_send); + optlen += 4; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CCECHO; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_recv); + optlen += 4; + } + break; + } + } + + hdrlen += optlen; + + if (tp->t_inpcb->inp_options) { + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + } else { + ipoptlen = 0; + } + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. + */ + if (len + optlen + ipoptlen > tp->t_maxopd) { + /* + * If there is still more to send, don't close the connection. + */ + flags &= ~TH_FIN; + len = tp->t_maxopd - optlen - ipoptlen; + sendalot = 1; + } + +/*#ifdef DIAGNOSTIC*/ + if (max_linkhdr + hdrlen > MHLEN) + panic("tcphdr too big"); +/*#endif*/ + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ + if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. + */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(so->so_snd.sb_mb, off, (int) len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + } else { + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + ti = mtod(m, struct tcpiphdr *); + if (tp->t_template == 0) + panic("tcp_output"); + (void)memcpy(ti, tp->t_template, sizeof (struct tcpiphdr)); + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (len || (flags & (TH_SYN|TH_FIN)) + || callout_active(tp->tt_persist)) + ti->ti_seq = htonl(tp->snd_nxt); + else + ti->ti_seq = htonl(tp->snd_max); + ti->ti_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, ti + 1, optlen); + ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + ti->ti_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) + win = 0; + if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) + win = (long)(tp->rcv_adv - tp->rcv_nxt); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + ti->ti_win = htons((u_short) (win>>tp->rcv_scale)); + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + ti->ti_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. + */ + if (len + optlen) + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + + optlen + len)); + ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (!callout_active(tp->tt_rexmt) && + tp->snd_nxt != tp->snd_una) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + if (callout_active(tp->tt_persist)) { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); +#endif + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ + m->m_pkthdr.len = hdrlen + len; + { + struct rtentry *rt; + ((struct ip *)ti)->ip_len = m->m_pkthdr.len; + ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ + /* + * See if we should do MTU discovery. We do it only if the following + * are true: + * 1) we have a valid route to the destination + * 2) the MTU is not locked (if it is, then discovery has been + * disabled) + */ + if (path_mtu_discovery + && (rt = tp->t_inpcb->inp_route.ro_rt) + && rt->rt_flags & RTF_UP + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + ((struct ip *)ti)->ip_off |= IP_DF; + } + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + so->so_options & SO_DONTROUTE, 0); + } + if (error) { +out: + if (error == ENOBUFS) { + tcp_quench(tp->t_inpcb, 0); + return (0); + } + if (error == EMSGSIZE) { + /* + * ip_output() will have already fixed the route + * for us. tcp_mtudisc() will, as its last action, + * initiate retransmission, so it is important to + * not do so here. + */ + tcp_mtudisc(tp->t_inpcb, 0); + return 0; + } + if ((error == EHOSTUNREACH || error == ENETDOWN) + && TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + return (error); + } + tcpstat.tcps_sndtotal++; + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent. + */ + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~TF_ACKNOW; + if (tcp_delack_enabled) + callout_stop(tp->tt_delack); + if (sendalot) + goto again; + return (0); +} + +void +tcp_setpersist(tp) + register struct tcpcb *tp; +{ + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int tt; + + if (callout_active(tp->tt_rexmt)) + panic("tcp_setpersist: retransmit pending"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c new file mode 100644 index 0000000..33a478c --- /dev/null +++ b/sys/netinet/tcp_reass.c @@ -0,0 +1,2351 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +static struct tcpiphdr tcp_saveti; +#endif + +static int tcprexmtthresh = 3; +tcp_seq tcp_iss; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +#ifdef TCP_RESTRICT_RST +static int restrict_rst = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, restrict_rst, CTLFLAG_RW, + &restrict_rst, 0, "Restrict RST emission"); +#endif + +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; + +static void tcp_dooptions __P((struct tcpcb *, + u_char *, int, struct tcpiphdr *, struct tcpopt *)); +static void tcp_pulloutofband __P((struct socket *, + struct tcpiphdr *, struct mbuf *)); +static int tcp_reass __P((struct tcpcb *, struct tcpiphdr *, struct mbuf *)); +static void tcp_xmit_timer __P((struct tcpcb *, int)); + + +/* + * Insert segment ti into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->t_segq == NULL && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + if (tcp_delack_enabled) \ + callout_reset(tp->tt_delack, tcp_delacktime, \ + tcp_timer_delack, tp); \ + else \ + tp->t_flags |= TF_ACKNOW; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + flags = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} + +static int +tcp_reass(tp, ti, m) + register struct tcpcb *tp; + register struct tcpiphdr *ti; + struct mbuf *m; +{ + struct mbuf *q; + struct mbuf *p; + struct mbuf *nq; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + +#define GETTCP(m) ((struct tcpiphdr *)m->m_pkthdr.header) + + /* + * Call with ti==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (ti == 0) + goto present; + + m->m_pkthdr.header = ti; + + /* + * Find a segment which begins after this one does. + */ + for (q = tp->t_segq, p = NULL; q; p = q, q = q->m_nextpkt) + if (SEQ_GT(GETTCP(q)->ti_seq, ti->ti_seq)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = GETTCP(p)->ti_seq + GETTCP(p)->ti_len - ti->ti_seq; + if (i > 0) { + if (i >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + m_freem(m); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + ti->ti_len -= i; + ti->ti_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += ti->ti_len; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + register int i = (ti->ti_seq + ti->ti_len) - GETTCP(q)->ti_seq; + if (i <= 0) + break; + if (i < GETTCP(q)->ti_len) { + GETTCP(q)->ti_seq += i; + GETTCP(q)->ti_len -= i; + m_adj(q, i); + break; + } + + nq = q->m_nextpkt; + if (p) + p->m_nextpkt = nq; + else + tp->t_segq = nq; + m_freem(q); + q = nq; + } + + if (p == NULL) { + m->m_nextpkt = tp->t_segq; + tp->t_segq = m; + } else { + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = tp->t_segq; + if (!q || GETTCP(q)->ti_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += GETTCP(q)->ti_len; + flags = GETTCP(q)->ti_flags & TH_FIN; + nq = q->m_nextpkt; + tp->t_segq = nq; + q->m_nextpkt = NULL; + if (so->so_state & SS_CANTRCVMORE) + m_freem(q); + else + sbappend(&so->so_rcv, q); + q = nq; + } while (q && GETTCP(q)->ti_seq == tp->rcv_nxt); + sorwakeup(so); + return (flags); + +#undef GETTCP +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +void +tcp_input(m, iphlen) + register struct mbuf *m; + int iphlen; +{ + register struct tcpiphdr *ti; + register struct inpcb *inp; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + register struct tcpcb *tp = 0; + register int tiflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + struct in_addr laddr; + int dropsocket = 0; + int iss = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ +#ifdef TCPDEBUG + short ostate = 0; +#endif + + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + ti = mtod(m, struct tcpiphdr *); + if (iphlen > sizeof (struct ip)) + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + + /* + * Checksum extended TCP header and data. + */ + tlen = ((struct ip *)ti)->ip_len; + len = sizeof (struct ip) + tlen; + bzero(ti->ti_x1, sizeof(ti->ti_x1)); + ti->ti_len = (u_short)tlen; + HTONS(ti->ti_len); + ti->ti_sum = in_cksum(m, len); + if (ti->ti_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = ti->ti_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; + ti->ti_len = tlen; + if (off > sizeof (struct tcphdr)) { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + optlen = off - sizeof (struct tcphdr); + optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); + } + tiflags = ti->ti_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is incompatible with RFC1644 extensions (T/TCP). + */ + if (drop_synfin && (tiflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(ti->ti_seq); + NTOHL(ti->ti_ack); + NTOHS(ti->ti_win); + NTOHS(ti->ti_urp); + + /* + * Drop TCP, IP headers and TCP options. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + + /* + * Locate pcb for segment. + */ +findpcb: +#ifdef IPFIREWALL_FORWARD + if (ip_fw_fwd_addr != NULL) { + /* + * Diverted. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (!ip_fw_fwd_addr->sin_port) { + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, + ti->ti_sport, ip_fw_fwd_addr->sin_addr, + ti->ti_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ti->ti_src, ti->ti_sport, + ip_fw_fwd_addr->sin_addr, + ntohs(ip_fw_fwd_addr->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + ip_fw_fwd_addr = NULL; + } else +#endif /* IPFIREWALL_FORWARD */ + + inp = in_pcblookup_hash(&tcbinfo, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, 1, m->m_pkthdr.rcvif); + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ti->ti_dst)); + switch (log_in_vain) { + case 1: + if(tiflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + buf, ntohs(ti->ti_dport), + inet_ntoa(ti->ti_src), + ntohs(ti->ti_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + buf, ntohs(ti->ti_dport), inet_ntoa(ti->ti_src), + ntohs(ti->ti_sport), tiflags); + break; + default: + break; + } + } +#ifdef ICMP_BANDLIM + if (badport_bandlim(1) < 0) + goto drop; +#endif + if (blackhole) { + switch (blackhole) { + case 1: + if (tiflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + goto dropwithreset; + } + tp = intotcpcb(inp); + if (tp == 0) + goto dropwithreset; + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((tiflags & TH_SYN) == 0) + tiwin = ti->ti_win << tp->snd_scale; + else + tiwin = ti->ti_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + tcp_saveti = *ti; + } +#endif + if (so->so_options & SO_ACCEPTCONN) { + register struct tcpcb *tp0 = tp; + struct socket *so2; + if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (tiflags & TH_ACK) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + goto drop; + } + so2 = sonewconn(so, 0); + if (so2 == 0) { + tcpstat.tcps_listendrop++; + so2 = sodropablereq(so); + if (so2) { + tcp_drop(sototcpcb(so2), ETIMEDOUT); + so2 = sonewconn(so, 0); + } + if (!so2) + goto drop; + } + so = so2; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; + inp->inp_laddr = ti->ti_dst; + inp->inp_lport = ti->ti_dport; + if (in_pcbinshash(inp) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + goto drop; + } + inp->inp_options = ip_srcroute(); + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); + + /* Compute proper scaling value from buffer space */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < + so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, ti, &to); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flag & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + ti->ti_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (ti->ti_len == 0) { + if (SEQ_GT(ti->ti_ack, tp->snd_una) && + SEQ_LEQ(ti->ti_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flag & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = ti->ti_ack; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (ti->ti_ack == tp->snd_una && + tp->t_segq == NULL && + ti->ti_len <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += ti->ti_len; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += ti->ti_len; + /* + * Add data to socket buffer. + */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (tcp_delack_enabled) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + register struct sockaddr_in *sin; + + if (tiflags & TH_RST) + goto drop; + if (tiflags & TH_ACK) + goto dropwithreset; + if ((tiflags & TH_SYN) == 0) + goto drop; + if ((ti->ti_dport == ti->ti_sport) && + (ti->ti_dst.s_addr == ti->ti_src.s_addr)) + goto drop; + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + */ + if (m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_NOWAIT); + if (sin == NULL) + goto drop; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ti->ti_src; + sin->sin_port = ti->ti_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ti->ti_dst; + if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { + inp->inp_laddr = laddr; + FREE(sin, M_SONAME); + goto drop; + } + FREE(sin, M_SONAME); + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + tcp_dooptions(tp, optp, optlen, ti, &to); + if (iss) + tp->iss = iss; + else + tp->iss = tcp_iss; + tcp_iss += TCP_ISSINCR/4; + tp->irs = ti->ti_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tiwin; /* initial send-window */ + tp->cc_send = CC_INC(tcp_ccgen); + tp->cc_recv = to.to_cc; + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, + * if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. + * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + if ((to.to_flag & TOF_CC) != 0) { + if (((tp->t_flags & TF_NOPUSH) != 0) && + taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { + + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (tcp_delack_enabled && ((tiflags & TH_FIN) || + (ti->ti_len != 0 && + in_localaddr(inp->inp_faddr)))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + /* + * Limit the `virtual advertised window' to TCP_MAXWIN + * here. Even if we requested window scaling, it will + * become effective only later when our SYN is acked. + */ + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); + tcpstat.tcps_connects++; + soisconnected(so); + callout_reset(tp->tt_keep, tcp_keepinit, + tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + /* else do standard 3-way handshake */ + } else { + /* + * No CC option, but maybe CC.NEW: + * invalidate cached value. + */ + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->snd_una) || + SEQ_GT(ti->ti_ack, tp->snd_max))) + goto dropwithreset; + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->iss) || + SEQ_GT(ti->ti_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + if (tiflags & TH_RST) { + if (tiflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((tiflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = ti->ti_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = ti->ti_seq; + tcp_rcvseqinit(tp); + if (tiflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flag & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (tcp_delack_enabled && ti->ti_len != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + tiflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. + */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flag & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance ti->ti_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + ti->ti_seq++; + if (ti->ti_len > tp->rcv_wnd) { + todrop = ti->ti_len - tp->rcv_wnd; + m_adj(m, -todrop); + ti->ti_len = tp->rcv_wnd; + tiflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = ti->ti_seq - 1; + tp->rcv_up = ti->ti_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (tiflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((tiflags & TH_SYN) && + (to.to_flag & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) + goto dropwithreset; + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (tiflags & TH_RST) { + if (SEQ_GEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(ti->ti_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(ti->ti_seq, tp->irs)) + goto dropwithreset; + + todrop = tp->rcv_nxt - ti->ti_seq; + if (todrop > 0) { + if (tiflags & TH_SYN) { + tiflags &= ~TH_SYN; + ti->ti_seq++; + if (ti->ti_urp > 1) + ti->ti_urp--; + else + tiflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > ti->ti_len + || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + tiflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = ti->ti_len; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + m_adj(m, todrop); + ti->ti_seq += todrop; + ti->ti_len -= todrop; + if (ti->ti_urp > todrop) + ti->ti_urp -= todrop; + else { + tiflags &= ~TH_URG; + ti->ti_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (tiflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + iss = tp->snd_nxt + TCP_ISSINCR; + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + ti->ti_len -= todrop; + tiflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (tiflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((tiflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(inp)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcpiphdr *)0, + (struct mbuf *)0); + tp->snd_wl1 = ti->ti_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < ti->ti_ack <= tp->snd_max + * then advance tp->snd_una to ti->ti_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + */ + if (!callout_active(tp->tt_rexmt) || + ti->ti_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (to.to_flag & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (ti->ti_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((tiflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, ti->ti_seq) || + (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || + (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (ti->ti_len == 0 && + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = ti->ti_seq; + tp->snd_wl2 = ti->ti_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((tiflags & TH_URG) && ti->ti_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { + ti->ti_urp = 0; /* XXX */ + tiflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { + tp->rcv_up = ti->ti_seq + ti->ti_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (ti->ti_urp <= (u_long)ti->ti_len +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, ti, m); + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((ti->ti_len || (tiflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + TCP_REASS(tp, ti, m, so, tiflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + tiflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (tiflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (tiflags & TH_ACK) && + (SEQ_GT(tp->snd_una, ti->ti_ack) || + SEQ_GT(ti->ti_ack, tp->snd_max)) ) + goto dropwithreset; +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: +#ifdef TCP_RESTRICT_RST + if (restrict_rst) + goto drop; +#endif + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + if (tiflags & TH_ACK) + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + else { + if (tiflags & TH_SYN) + ti->ti_len++; + tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, + TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +} + +static void +tcp_dooptions(tp, cp, cnt, ti, to) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcpiphdr *ti; + struct tcpopt *to; +{ + u_short mss = 0; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flag |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + NTOHL(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + NTOHL(to->to_tsecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (ti->ti_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to->to_tsval; + tp->ts_recent_age = ticks; + } + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flag |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (ti->ti_flags & TH_SYN) + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + NTOHL(to->to_ccecho); + break; + } + } + if (ti->ti_flags & TH_SYN) + tcp_mss(tp, mss); /* sets t_maxseg */ +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, ti, m) + struct socket *so; + struct tcpiphdr *ti; + register struct mbuf *m; +{ + int cnt = ti->ti_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; + + inp = tp->t_inpcb; + if ((rt = tcp_rtlookup(inp)) == NULL) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + else + { + mss = ifp->if_mtu - sizeof(struct tcpiphdr); + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. + */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if (in_localaddr(inp->inp_faddr)) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; + + rt = tcp_rtlookup(tp->t_inpcb); + if (rt == NULL) + return tcp_mssdflt; + + return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); +} diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h new file mode 100644 index 0000000..de50dd5 --- /dev/null +++ b/sys/netinet/tcp_seq.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_SEQ_H_ +#define _NETINET_TCP_SEQ_H_ +/* + * TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* + * TCP connection counts are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define CC_LT(a,b) ((int)((a)-(b)) < 0) +#define CC_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define CC_GT(a,b) ((int)((a)-(b)) > 0) +#define CC_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* Macro to increment a CC: skip 0 which has a special meaning */ +#define CC_INC(c) (++(c) == 0 ? ++(c) : (c)) + +/* + * Macros to initialize tcp sequence numbers for + * send and receive from initial send and receive + * sequence numbers. + */ +#define tcp_rcvseqinit(tp) \ + (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1 + +#define tcp_sendseqinit(tp) \ + (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ + (tp)->iss + +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz) + /* timestamp wrap-around time */ + +#ifdef KERNEL +extern tcp_cc tcp_ccgen; /* global connection count */ + +/* + * Increment for tcp_iss each second. + * This is designed to increment at the standard 250 KB/s, + * but with a random component averaging 128 KB. + * We also increment tcp_iss by a quarter of this amount + * each time we use the value for a new connection. + * If defined, the tcp_random18() macro should produce a + * number in the range [0-0x3ffff] that is hard to predict. + */ +#ifndef tcp_random18 +#define tcp_random18() ((random() >> 14) & 0x3ffff) +#endif +#define TCP_ISSINCR (122*1024 + tcp_random18()) + +extern tcp_seq tcp_iss; /* tcp initial send seq # */ +#else +#define TCP_ISSINCR (250*1024) /* increment for tcp_iss each second */ +#endif /* KERNEL */ +#endif /* _NETINET_TCP_SEQ_H_ */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c new file mode 100644 index 0000000..fd3e435 --- /dev/null +++ b/sys/netinet/tcp_subr.c @@ -0,0 +1,839 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <vm/vm_zone.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, ""); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +static int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +static int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static void tcp_cleartaocache __P((void)); +static void tcp_notify __P((struct inpcb *, int)); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). + */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize; + + tcp_iss = random(); /* wrong, but better than a constant */ + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, + ZONE_INTERRUPT, 0); + + if (max_protohdr < sizeof(struct tcpiphdr)) + max_protohdr = sizeof(struct tcpiphdr); + if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) + panic("tcp_init"); +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. + */ +struct tcpiphdr * +tcp_template(tp) + struct tcpcb *tp; +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcpiphdr *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcpiphdr); + n = mtod(m, struct tcpiphdr *); + } + bzero(n->ti_x1, sizeof(n->ti_x1)); + n->ti_pr = IPPROTO_TCP; + n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + n->ti_src = inp->inp_laddr; + n->ti_dst = inp->inp_faddr; + n->ti_sport = inp->inp_lport; + n->ti_dport = inp->inp_fport; + n->ti_seq = 0; + n->ti_ack = 0; + n->ti_x2 = 0; + n->ti_off = 5; + n->ti_flags = 0; + n->ti_win = 0; + n->ti_sum = 0; + n->ti_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(tp, ti, m, ack, seq, flags) + struct tcpcb *tp; + register struct tcpiphdr *ti; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + + if (tp) { + if (!(flags & TH_RST)) + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + ro = &tp->t_inpcb->inp_route; + } else { + ro = &sro; + bzero(ro, sizeof *ro); + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; +#ifdef TCP_COMPAT_42 + tlen = 1; +#else + tlen = 0; +#endif + m->m_data += max_linkhdr; + *mtod(m, struct tcpiphdr *) = *ti; + ti = mtod(m, struct tcpiphdr *); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ti; + m->m_len = sizeof (struct tcpiphdr); + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } + xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long); + xchg(ti->ti_dport, ti->ti_sport, n_short); +#undef xchg + } + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); + tlen += sizeof (struct tcpiphdr); + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + bzero(ti->ti_x1, sizeof(ti->ti_x1)); + ti->ti_seq = htonl(seq); + ti->ti_ack = htonl(ack); + ti->ti_x2 = 0; + ti->ti_off = sizeof (struct tcphdr) >> 2; + ti->ti_flags = flags; + if (tp) + ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + else + ti->ti_win = htons((u_short)win); + ti->ti_urp = 0; + ti->ti_sum = 0; + ti->ti_sum = in_cksum(m, tlen); + ((struct ip *)ti)->ip_len = tlen; + ((struct ip *)ti)->ip_ttl = ip_defttl; +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, ti, 0); +#endif + (void) ip_output(m, NULL, ro, 0, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + tp->t_segq = NULL; + tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); + callout_init(tp->tt_persist = &it->inp_tp_persist); + callout_init(tp->tt_keep = &it->inp_tp_keep); + callout_init(tp->tt_2msl = &it->inp_tp_2msl); + callout_init(tp->tt_delack = &it->inp_tp_delack); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct mbuf *q; + register struct mbuf *nq; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16 && + (rt = inp->inp_route.ro_rt) && + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + register u_long i = 0; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. + */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + /* free the reassembly queue, if any */ + for (q = tp->t_segq; q; q = nq) { + nq = q->m_nextpkt; + tp->t_segq = nq; + m_freem(q); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + inp->inp_ppcb = NULL; + soisdisconnected(so); + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + */ +static void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + register struct socket *so = inp->inp_socket; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + so->so_error = error; + else + tp->t_softerror = error; + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +} + +static int +tcp_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred SYSCTL_HANDLER_ARGS +{ + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + register struct ip *ip = vip; + register struct tcphdr *th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, + cmd, notify); + } else + in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +void +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; + + if (tp) { + rt = tcp_rtlookup(inp); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inp) + struct inpcb *inp; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inp->inp_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inp) + struct inpcb *inp; +{ + struct rtentry *rt = tcp_rtlookup(inp); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c new file mode 100644 index 0000000..591fb18 --- /dev/null +++ b/sys/netinet/tcp_timer.c @@ -0,0 +1,456 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +static int +sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS +{ + int error, s, tt; + + tt = *(int *)oidp->oid_arg1; + s = tt * 1000 / hz; + + error = sysctl_handle_int(oidp, &s, 0, req); + if (error || !req->newptr) + return (error); + + tt = s * hz / 1000; + if (tt < 1) + return (EINVAL); + + *(int *)oidp->oid_arg1 = tt; + return (0); +} + +int tcp_keepinit; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepidle; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepintvl; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_delacktime; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, + CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + "Time before a delayed ACK is sent"); + +int tcp_msl; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); + +static int always_keepalive = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, + &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + +static int tcp_keepcnt = TCPTV_KEEPCNT; + /* max idle probes */ +int tcp_maxpersistidle; + /* max idle time in persist */ +int tcp_maxidle; + +/* + * Tcp protocol timeout routine called every 500 ms. + * Updates timestamps used for TCP + * causes finite state machine actions if timers expire. + */ +void +tcp_slowtimo() +{ + int s; + + s = splnet(); + + tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + + tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */ +#ifdef TCP_COMPAT_42 + if ((int)tcp_iss < 0) + tcp_iss = TCP_ISSINCR; /* XXX */ +#endif + splx(s); +} + +/* + * Cancel all timers for TCP tp. + */ +void +tcp_canceltimers(tp) + struct tcpcb *tp; +{ + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_rexmt); +} + +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ + +/* + * TCP timer processing. + */ +void +tcp_timer_delack(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + + s = splnet(); + if (callout_pending(tp->tt_delack)) { + splx(s); + return; + } + callout_deactivate(tp->tt_delack); + + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_delack++; + (void) tcp_output(tp); + splx(s); +} + +void +tcp_timer_2msl(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_2msl)) { + splx(s); + return; + } + callout_deactivate(tp->tt_2msl); + /* + * 2 MSL timeout in shutdown went off. If we're closed but + * still waiting for peer to close and connection has been idle + * too long, or if 2MSL time is up from TIME_WAIT, delete connection + * control block. Otherwise, check again in a bit. + */ + if (tp->t_state != TCPS_TIME_WAIT && + (ticks - tp->t_rcvtime) <= tcp_maxidle) + callout_reset(tp->tt_2msl, tcp_keepintvl, + tcp_timer_2msl, tp); + else + tp = tcp_close(tp); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_keep(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_keep)) { + splx(s); + return; + } + callout_deactivate(tp->tt_keep); + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. + */ + tcpstat.tcps_keepprobe++; +#ifdef TCP_COMPAT_42 + /* + * The keepalive packet must have nonzero length + * to get a 4.2 host to respond. + */ + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt - 1, tp->snd_una - 1, 0); +#else + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); +#endif + callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + } else + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); + return; + +dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_persist(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_persist)) { + splx(s); + return; + } + callout_deactivate(tp->tt_persist); + /* + * Persistance timer into zero window. + * Force a byte to be output, if possible. + */ + tcpstat.tcps_persisttimeo++; + /* + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || + (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + tcpstat.tcps_persistdrop++; + tp = tcp_drop(tp, ETIMEDOUT); + goto out; + } + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + +out: +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_rexmt(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + int rexmt; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_rexmt)) { + splx(s); + return; + } + callout_deactivate(tp->tt_rexmt); + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + tcpstat.tcps_timeoutdrop++; + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + goto out; + } + if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can + * be recovered if this turns out to be a "bad" retransmit. + * A retransmit is considered "bad" if an ACK for this + * segment is received within RTT/2 interval; the assumption + * here is that the ACK was already in flight. See + * "On Estimating End-to-End Network Path Properties" by + * Allman and Paxson for more details. + */ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + } + tcpstat.tcps_rexmttimeo++; + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtttime = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) + */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + (void) tcp_output(tp); + +out: +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h new file mode 100644 index 0000000..1f8d1b7 --- /dev/null +++ b/sys/netinet/tcp_timer.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_TIMER_H_ +#define _NETINET_TCP_TIMER_H_ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. + * We do this at most as often as TCPT_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If an + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection. Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending: + * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK> + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ +#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */ +#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_MIN ( 1*hz) /* minimum allowable value */ +#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ + +#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */ + +#ifdef TCPTIMERS +static char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. + */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ + (tv) = (value); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + else if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while(0) + +#ifdef KERNEL +extern int tcp_keepinit; /* time to establish connection */ +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxpersistidle; +extern int tcp_msl; +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; + +void tcp_timer_2msl __P((void *xtp)); +void tcp_timer_keep __P((void *xtp)); +void tcp_timer_persist __P((void *xtp)); +void tcp_timer_rexmt __P((void *xtp)); +void tcp_timer_delack __P((void *xtp)); + +#endif /* KERNEL */ + +#endif /* !_NETINET_TCP_TIMER_H_ */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c new file mode 100644 index 0000000..fd3e435 --- /dev/null +++ b/sys/netinet/tcp_timewait.c @@ -0,0 +1,839 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <vm/vm_zone.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, ""); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +static int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +static int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static void tcp_cleartaocache __P((void)); +static void tcp_notify __P((struct inpcb *, int)); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). + */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize; + + tcp_iss = random(); /* wrong, but better than a constant */ + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, + ZONE_INTERRUPT, 0); + + if (max_protohdr < sizeof(struct tcpiphdr)) + max_protohdr = sizeof(struct tcpiphdr); + if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) + panic("tcp_init"); +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. + */ +struct tcpiphdr * +tcp_template(tp) + struct tcpcb *tp; +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcpiphdr *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcpiphdr); + n = mtod(m, struct tcpiphdr *); + } + bzero(n->ti_x1, sizeof(n->ti_x1)); + n->ti_pr = IPPROTO_TCP; + n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + n->ti_src = inp->inp_laddr; + n->ti_dst = inp->inp_faddr; + n->ti_sport = inp->inp_lport; + n->ti_dport = inp->inp_fport; + n->ti_seq = 0; + n->ti_ack = 0; + n->ti_x2 = 0; + n->ti_off = 5; + n->ti_flags = 0; + n->ti_win = 0; + n->ti_sum = 0; + n->ti_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(tp, ti, m, ack, seq, flags) + struct tcpcb *tp; + register struct tcpiphdr *ti; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + + if (tp) { + if (!(flags & TH_RST)) + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + ro = &tp->t_inpcb->inp_route; + } else { + ro = &sro; + bzero(ro, sizeof *ro); + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; +#ifdef TCP_COMPAT_42 + tlen = 1; +#else + tlen = 0; +#endif + m->m_data += max_linkhdr; + *mtod(m, struct tcpiphdr *) = *ti; + ti = mtod(m, struct tcpiphdr *); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ti; + m->m_len = sizeof (struct tcpiphdr); + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } + xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, n_long); + xchg(ti->ti_dport, ti->ti_sport, n_short); +#undef xchg + } + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); + tlen += sizeof (struct tcpiphdr); + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + bzero(ti->ti_x1, sizeof(ti->ti_x1)); + ti->ti_seq = htonl(seq); + ti->ti_ack = htonl(ack); + ti->ti_x2 = 0; + ti->ti_off = sizeof (struct tcphdr) >> 2; + ti->ti_flags = flags; + if (tp) + ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + else + ti->ti_win = htons((u_short)win); + ti->ti_urp = 0; + ti->ti_sum = 0; + ti->ti_sum = in_cksum(m, tlen); + ((struct ip *)ti)->ip_len = tlen; + ((struct ip *)ti)->ip_ttl = ip_defttl; +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, ti, 0); +#endif + (void) ip_output(m, NULL, ro, 0, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + tp->t_segq = NULL; + tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); + callout_init(tp->tt_persist = &it->inp_tp_persist); + callout_init(tp->tt_keep = &it->inp_tp_keep); + callout_init(tp->tt_2msl = &it->inp_tp_2msl); + callout_init(tp->tt_delack = &it->inp_tp_delack); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct mbuf *q; + register struct mbuf *nq; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16 && + (rt = inp->inp_route.ro_rt) && + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + register u_long i = 0; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. + */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + /* free the reassembly queue, if any */ + for (q = tp->t_segq; q; q = nq) { + nq = q->m_nextpkt; + tp->t_segq = nq; + m_freem(q); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + inp->inp_ppcb = NULL; + soisdisconnected(so); + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + */ +static void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + register struct socket *so = inp->inp_socket; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + so->so_error = error; + else + tp->t_softerror = error; + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +} + +static int +tcp_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = tcbinfo.listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred SYSCTL_HANDLER_ARGS +{ + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection"); + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + register struct ip *ip = vip; + register struct tcphdr *th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, + cmd, notify); + } else + in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +void +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; + + if (tp) { + rt = tcp_rtlookup(inp); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inp) + struct inpcb *inp; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inp->inp_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inp) + struct inpcb *inp; +{ + struct rtentry *rt = tcp_rtlookup(inp); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c new file mode 100644 index 0000000..7d7f71f --- /dev/null +++ b/sys/netinet/tcp_usrreq.c @@ -0,0 +1,821 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; /* XXX ??? */ + +static int tcp_attach __P((struct socket *, struct proc *)); +static int tcp_connect __P((struct tcpcb *, struct sockaddr *, + struct proc *)); +static struct tcpcb * + tcp_disconnect __P((struct tcpcb *)); +static struct tcpcb * + tcp_usrclosed __P((struct tcpcb *)); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, int proto, struct proc *p) +{ + int s = splnet(); + int error; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = 0; + TCPDEBUG0; + + TCPDEBUG1(); + if (inp) { + error = EISCONN; + goto out; + } + + error = tcp_attach(so, p); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + tp = sototcpcb(so); +out: + TCPDEBUG2(PRU_ATTACH); + splx(s); + return error; +} + +/* + * pru_detach() detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a pru_disconnect(), + * which may finish later; embryonic TCB's can just + * be discarded here. + */ +static int +tcp_usr_detach(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + TCPDEBUG0; + + if (inp == 0) { + splx(s); + return EINVAL; /* XXX */ + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tp = tcp_disconnect(tp); + + TCPDEBUG2(PRU_DETACH); + splx(s); + return error; +} + +#define COMMON_START() TCPDEBUG0; \ + do { \ + if (inp == 0) { \ + splx(s); \ + return EINVAL; \ + } \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ + } while(0) + +#define COMMON_END(req) out: TCPDEBUG2(req); splx(s); return error; goto out + + +/* + * Give the socket an address. + */ +static int +tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + error = in_pcbbind(inp, nam, p); + if (error) + goto out; + COMMON_END(PRU_BIND); + +} + +/* + * Prepare to accept connections. + */ +static int +tcp_usr_listen(struct socket *so, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (inp->inp_lport == 0) + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + + prison_remote_ip(p, 0, &sinp->sin_addr.s_addr); + + if ((error = tcp_connect(tp, nam, p)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static int +tcp_usr_disconnect(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_disconnect(tp); + COMMON_END(PRU_DISCONNECT); +} + +/* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ +static int +tcp_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + in_setpeeraddr(so, nam); + COMMON_END(PRU_ACCEPT); +} + +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + COMMON_END(PRU_SHUTDOWN); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tcp_output(tp); + COMMON_END(PRU_RCVD); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pru_* routines + * generally are caller-frees. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + TCPDEBUG0; + + if (inp == NULL) { + /* + * OOPS! we lost a race, the TCP session got reset after + * we checked SS_CANTSENDMORE, eg: while doing uiomove or a + * network interrupt in the non-splnet() section of sosend(). + */ + if (m) + m_freem(m); + if (control) + m_freem(control); + error = ECONNRESET; /* XXX EPIPE? */ + tp = NULL; + TCPDEBUG1(); + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if (control) { + /* TCP doesn't do control messages (rights, creds, etc) */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if(!(flags & PRUS_OOB)) { + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + error = tcp_connect(tp, nam, p); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + socantsendmore(so); + tp = tcp_usrclosed(tp); + } + if (tp != NULL) { + if (flags & PRUS_MORETOCOME) + tp->t_flags |= TF_MORETOCOME; + error = tcp_output(tp); + if (flags & PRUS_MORETOCOME) + tp->t_flags &= ~TF_MORETOCOME; + } + } else { + if (sbspace(&so->so_snd) < -512) { + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + error = tcp_connect(tp, nam, p); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_force = 1; + error = tcp_output(tp); + tp->t_force = 0; + } + COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); +} + +/* + * Abort the TCP. + */ +static int +tcp_usr_abort(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_drop(tp, ECONNABORTED); + COMMON_END(PRU_ABORT); +} + +/* + * Receive out-of-band data. + */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if ((so->so_oobmark == 0 && + (so->so_state & SS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + COMMON_END(PRU_RCVOOB); +} + +/* xxx - should be const */ +struct pr_usrreqs tcp_usrreqs = { + tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind, + tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach, + tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbladdr to do the routing and to choose + * a local host address (interface). If there is an existing incarnation + * of the same connection in TIME-WAIT state and if the remote host was + * sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(tp, nam, p) + register struct tcpcb *tp; + struct sockaddr *nam; + struct proc *p; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *ifaddr; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in_pcbladdr(inp, nam, &ifaddr); + if (error) + return error; + oinp = in_pcblookup_hash(inp->inp_pcbinfo, + sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr + : ifaddr->sin_addr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ifaddr->sin_addr; + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in_pcbdisconnect(inp); + return ENOBUFS; + } + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} + +/* + * The new sockopt interface makes it possible for us to block in the + * copyin/out step (if we take a page fault). Taking a page fault at + * splnet() is probably a Bad Thing. (Since sockets and pcbs both now + * use TSM, there probably isn't any need for this function to run at + * splnet() any more. This needs more examination.) + */ +int +tcp_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, opt, optval, s; + struct inpcb *inp; + struct tcpcb *tp; + + error = 0; + s = splnet(); /* XXX */ + inp = sotoinpcb(so); + if (inp == NULL) { + splx(s); + return (ECONNRESET); + } + if (sopt->sopt_level != IPPROTO_TCP) { + error = ip_ctloutput(so, sopt); + splx(s); + return (error); + } + tp = intotcpcb(inp); + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + case TCP_NOOPT: + case TCP_NOPUSH: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case TCP_NODELAY: + opt = TF_NODELAY; + break; + case TCP_NOOPT: + opt = TF_NOOPT; + break; + case TCP_NOPUSH: + opt = TF_NOPUSH; + break; + default: + opt = 0; /* dead code to fool gcc */ + break; + } + + if (optval) + tp->t_flags |= opt; + else + tp->t_flags &= ~opt; + break; + + case TCP_MAXSEG: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + if (optval > 0 && optval <= tp->t_maxseg) + tp->t_maxseg = optval; + else + error = EINVAL; + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + optval = tp->t_flags & TF_NODELAY; + break; + case TCP_MAXSEG: + optval = tp->t_maxseg; + break; + case TCP_NOOPT: + optval = tp->t_flags & TF_NOOPT; + break; + case TCP_NOPUSH: + optval = tp->t_flags & TF_NOPUSH; + break; + default: + error = ENOPROTOOPT; + break; + } + if (error == 0) + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + } + splx(s); + return (error); +} + +/* + * tcp_sendspace and tcp_recvspace are the default send and receive window + * sizes, respectively. These are obsolescent (this information should + * be set by the route). + */ +u_long tcp_sendspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, + &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); +u_long tcp_recvspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * bufer space, and entering LISTEN state if to accept connections. + */ +static int +tcp_attach(so, p) + struct socket *so; + struct proc *p; +{ + register struct tcpcb *tp; + struct inpcb *inp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + error = in_pcballoc(so, &tcbinfo, p); + if (error) + return (error); + inp = sotoinpcb(so); + inp->inp_vflag |= INP_IPV4; + tp = tcp_newtcpcb(inp); + if (tp == 0) { + int nofd = so->so_state & SS_NOFDREF; /* XXX */ + + so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ + in_pcbdetach(inp); + so->so_state |= nofd; + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +static struct tcpcb * +tcp_disconnect(tp) + register struct tcpcb *tp; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (tp->t_state < TCPS_ESTABLISHED) + tp = tcp_close(tp); + else if ((so->so_options & SO_LINGER) && so->so_linger == 0) + tp = tcp_drop(tp, 0); + else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tp = tcp_usrclosed(tp); + if (tp) + (void) tcp_output(tp); + } + return (tp); +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static struct tcpcb * +tcp_usrclosed(tp) + register struct tcpcb *tp; +{ + + switch (tp->t_state) { + + case TCPS_CLOSED: + case TCPS_LISTEN: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + break; + + case TCPS_SYN_SENT: + case TCPS_SYN_RECEIVED: + tp->t_flags |= TF_NEEDFIN; + break; + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { + soisdisconnected(tp->t_inpcb->inp_socket); + /* To prevent the connection hanging in FIN_WAIT_2 forever. */ + if (tp->t_state == TCPS_FIN_WAIT_2) + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + return (tp); +} + diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h new file mode 100644 index 0000000..0258b77 --- /dev/null +++ b/sys/netinet/tcp_var.h @@ -0,0 +1,390 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ +/* + * Kernel variables for tcp. + */ + +/* + * Tcp control block, one per tcp; fields: + * Organized for 16 byte cacheline efficiency. + */ +struct tcpcb { + struct mbuf *t_segq; + int t_dupacks; /* consecutive dup acks recd */ + struct tcpiphdr *t_template; /* skeletal packet for transmit */ + + struct callout *tt_rexmt; /* retransmit timer */ + struct callout *tt_persist; /* retransmit persistence */ + struct callout *tt_keep; /* keepalive */ + struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */ + struct callout *tt_delack; /* delayed ACK timer */ + + struct inpcb *t_inpcb; /* back pointer to internet pcb */ + int t_state; /* state of this connection */ + u_int t_flags; +#define TF_ACKNOW 0x00001 /* ack peer immediately */ +#define TF_DELACK 0x00002 /* ack, but try to delay it */ +#define TF_NODELAY 0x00004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x00008 /* don't use tcp options */ +#define TF_SENTFIN 0x00010 /* have sent FIN */ +#define TF_REQ_SCALE 0x00020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x00040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x00080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x00100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x00200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x00400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x00800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x01000 /* don't push */ +#define TF_REQ_CC 0x02000 /* have/will request CC */ +#define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + u_int t_maxopd; /* mss plus options */ + + u_long t_rcvtime; /* inactivity time */ + u_long t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_long t_rttupdated; /* number of times rtt sampled */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_long ts_recent; /* timestamp echo data */ + + u_long ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; +/* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ +/* experimental */ + u_long snd_cwnd_prev; /* cwnd prior to retransmit */ + u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_long t_badrxtwin; /* window for retransmit recovery */ +}; + +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions. + */ +struct tcpopt { + u_long to_flag; /* which options are present */ +#define TOF_TS 0x0001 /* timestamp */ +#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ +#define TOF_CCNEW 0x0004 +#define TOF_CCECHO 0x0008 + u_long to_tsval; + u_long to_tsecr; + tcp_cc to_cc; /* holds CC or CCnew */ + tcp_cc to_ccecho; +}; + +/* + * The TAO cache entry which is stored in the protocol family specific + * portion of the route metrics. + */ +struct rmxp_tao { + tcp_cc tao_cc; /* latest CC in valid SYN */ + tcp_cc tao_ccsent; /* latest CC sent to peer */ + u_short tao_mssopt; /* peer's cached MSS */ +#ifdef notyet + u_short tao_flags; /* cache status flags */ +#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */ +#define TAOF_OK 0x0002 /* peer does understand rfc1644 */ +#define TAOF_UNDEF 0 /* we don't know yet */ +#endif /* notyet */ +}; +#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) + +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; + u_long tcps_cachedrtt; /* times cached RTT in route updated */ + u_long tcps_cachedrttvar; /* times cached rttvar updated */ + u_long tcps_cachedssthresh; /* times cached ssthresh updated */ + u_long tcps_usedrtt; /* times RTT initialized from route */ + u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ + u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + u_long tcps_persistdrop; /* timeout in persist state */ + u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ + u_long tcps_mturesent; /* resends due to MTU discovery */ + u_long tcps_listendrop; /* listen queue overflows */ +}; + +/* + * TCB structure exported to user-land via sysctl(3). + * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been + * included. Not all of our clients do. + */ +#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) +struct xtcpcb { + size_t xt_len; + struct inpcb xt_inp; + struct tcpcb xt_tp; + struct xsocket xt_socket; + u_quad_t xt_alignment_hack; +}; +#endif + +/* + * Names for TCP sysctl objects + */ +#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ +#define TCPCTL_DO_RFC1644 2 /* use RFC-1644 extensions */ +#define TCPCTL_MSSDFLT 3 /* MSS default */ +#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_RTTDFLT 5 /* default RTT estimate */ +#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ +#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ +#define TCPCTL_SENDSPACE 8 /* send buffer space */ +#define TCPCTL_RECVSPACE 9 /* receive buffer space */ +#define TCPCTL_KEEPINIT 10 /* receive buffer space */ +#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ +#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ +#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ +#define TCPCTL_MAXID 14 + +#define TCPCTL_NAMES { \ + { 0, 0 }, \ + { "rfc1323", CTLTYPE_INT }, \ + { "rfc1644", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "rttdflt", CTLTYPE_INT }, \ + { "keepidle", CTLTYPE_INT }, \ + { "keepintvl", CTLTYPE_INT }, \ + { "sendspace", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "keepinit", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ + { "delacktime", CTLTYPE_INT }, \ + { "v6mssdflt", CTLTYPE_INT }, \ +} + + +#ifdef KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_tcp); +#endif + +extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ +extern struct inpcbinfo tcbinfo; +extern struct tcpstat tcpstat; /* tcp statistics */ +extern int tcp_mssdflt; /* XXX */ +extern int tcp_delack_enabled; +extern int ss_fltsz; +extern int ss_fltsz_local; + +void tcp_canceltimers __P((struct tcpcb *)); +struct tcpcb * + tcp_close __P((struct tcpcb *)); +void tcp_ctlinput __P((int, struct sockaddr *, void *)); +int tcp_ctloutput __P((struct socket *, struct sockopt *)); +struct tcpcb * + tcp_drop __P((struct tcpcb *, int)); +void tcp_drain __P((void)); +void tcp_fasttimo __P((void)); +struct rmxp_tao * + tcp_gettaocache __P((struct inpcb *)); +void tcp_init __P((void)); +void tcp_input __P((struct mbuf *, int)); +void tcp_mss __P((struct tcpcb *, int)); +int tcp_mssopt __P((struct tcpcb *)); +void tcp_mtudisc __P((struct inpcb *, int)); +struct tcpcb * + tcp_newtcpcb __P((struct inpcb *)); +int tcp_output __P((struct tcpcb *)); +void tcp_quench __P((struct inpcb *, int)); +void tcp_respond __P((struct tcpcb *, + struct tcpiphdr *, struct mbuf *, tcp_seq, tcp_seq, int)); +struct rtentry * + tcp_rtlookup __P((struct inpcb *)); +void tcp_setpersist __P((struct tcpcb *)); +void tcp_slowtimo __P((void)); +struct tcpiphdr * + tcp_template __P((struct tcpcb *)); +struct tcpcb * + tcp_timers __P((struct tcpcb *, int)); +void tcp_trace __P((int, int, struct tcpcb *, struct tcpiphdr *, int)); + +extern struct pr_usrreqs tcp_usrreqs; +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; + +#endif /* KERNEL */ + +#endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet/tcpip.h b/sys/netinet/tcpip.h new file mode 100644 index 0000000..2cf3df1 --- /dev/null +++ b/sys/netinet/tcpip.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCPIP_H_ +#define _NETINET_TCPIP_H_ + +/* + * Tcp+ip header, after ip options removed. + */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#ifdef notyet +/* + * Tcp+ip header, after ip options removed but including TCP options. + */ +struct full_tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ + char ti_o[TCP_MAXOLEN]; /* space for tcp options */ +}; +#endif /* notyet */ +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp + +#endif diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h new file mode 100644 index 0000000..635267f --- /dev/null +++ b/sys/netinet/udp.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * Udp protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +#endif diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c new file mode 100644 index 0000000..03e0ab7 --- /dev/null +++ b/sys/netinet/udp_usrreq.c @@ -0,0 +1,891 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/ip_icmp.h> +#include <netinet/icmp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. + */ +#ifndef COMPAT_42 +static int udpcksum = 1; +#else +static int udpcksum = 0; /* XXX */ +#endif +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, + &udpcksum, 0, ""); + +int log_in_vain = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming UDP packets"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send port unreachables for refused connects"); + +struct inpcbhead udb; /* from udp_var.h */ +#define udb6 udb /* for KAME src sync over BSD*'s */ +struct inpcbinfo udbinfo; + +#ifndef UDBHASHSIZE +#define UDBHASHSIZE 16 +#endif + +struct udpstat udpstat; /* from udp_var.h */ +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, + &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); + +static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; +#ifdef INET6 +struct udp_in6 { + struct sockaddr_in6 uin6_sin; + u_char uin6_init_done : 1; +} udp_in6 = { + { sizeof(udp_in6.uin6_sin), AF_INET6 }, + 0 +}; +struct udp_ip6 { + struct ip6_hdr uip6_ip6; + u_char uip6_init_done : 1; +} udp_ip6; +#endif /* INET6 */ + +static void udp_append __P((struct inpcb *last, struct ip *ip, + struct mbuf *n, int off)); +#ifdef INET6 +static void ip_2_ip6_hdr __P((struct ip6_hdr *ip6, struct ip *ip)); +#endif + +static int udp_detach __P((struct socket *so)); +static int udp_output __P((struct inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct proc *)); + +void +udp_init() +{ + LIST_INIT(&udb); + udbinfo.listhead = &udb; + udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); + udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, + &udbinfo.porthashmask); + udbinfo.ipi_zone = zinit("udpcb", sizeof(struct inpcb), maxsockets, + ZONE_INTERRUPT, 0); +} + +void +udp_input(m, off, proto) + register struct mbuf *m; + int off, proto; +{ + int iphlen = off; + register struct ip *ip; + register struct udphdr *uh; + register struct inpcb *inp; + struct mbuf *opts = 0; + int len; + struct ip save_ip; + struct sockaddr *append_sa; + + udpstat.udps_ipackets++; + + /* + * Strip IP options, if any; should skip this, + * make available to user, and use on returned packets, + * but we don't yet have a way to check the checksum + * with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. + */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + udpstat.udps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len || len < sizeof(struct udphdr)) { + udpstat.udps_badlen++; + goto bad; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip = *ip; + + /* + * Checksum extended UDP header and data. + */ + if (uh->uh_sum) { + bzero(((struct ipovly *)ip)->ih_x1, 9); + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); + if (uh->uh_sum) { + udpstat.udps_badsum++; + m_freem(m); + return; + } + } + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + struct inpcb *last; + /* + * Deliver a multicast or broadcast datagram to *all* sockets + * for which the local and remote addresses and ports match + * those of the incoming datagram. This allows more than + * one process to receive multi/broadcasts on the same port. + * (This really ought to be done for unicast datagrams as + * well, but that would cause problems with existing + * applications that open both address-specific sockets and + * a wildcard socket listening to the same port -- they would + * end up receiving duplicates of every unicast datagram. + * Those applications open the multiple sockets to overcome an + * inadequacy of the UDP socket interface, but for backwards + * compatibility we avoid the problem here rather than + * fixing the interface. Maybe 4.5BSD will remedy this?) + */ + + /* + * Construct sockaddr format source address. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + /* + * Locate pcb(s) for datagram. + * (Algorithm copied from raw_intr().) + */ + last = NULL; +#ifdef INET6 + udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; +#endif + LIST_FOREACH(inp, &udb, inp_list) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == NULL) + continue; +#endif + if (inp->inp_lport != uh->uh_dport) + continue; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (inp->inp_laddr.s_addr != + ip->ip_dst.s_addr) + continue; + } + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != + ip->ip_src.s_addr || + inp->inp_fport != uh->uh_sport) + continue; + } + + if (last != NULL) { + struct mbuf *n; + +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject_so(m, last->inp_socket)) + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + else +#endif /*IPSEC*/ + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) + udp_append(last, ip, n, + iphlen + + sizeof(struct udphdr)); + } + last = inp; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids searching + * through all pcbs in the common case of a non-shared + * port. It * assumes that an application will never + * clear these options after setting them. + */ + if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. + * (No need to send an ICMP Port Unreachable + * for a broadcast or multicast datgram.) + */ + udpstat.udps_noportbcast++; + goto bad; + } +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject_so(m, last->inp_socket)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /*IPSEC*/ + udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); + return; + } + /* + * Locate pcb for datagram. + */ + inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + if (inp == NULL) { + if (log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), + ntohs(uh->uh_sport)); + } + udpstat.udps_noport++; + if (m->m_flags & (M_BCAST | M_MCAST)) { + udpstat.udps_noportbcast++; + goto bad; + } + *ip = save_ip; +#ifdef ICMP_BANDLIM + if (badport_bandlim(0) < 0) + goto bad; +#endif + if (!blackhole) + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + else + goto bad; + return; + } +#ifdef IPSEC + if (ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /*IPSEC*/ + + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + if (inp->inp_flags & INP_CONTROLOPTS + || inp->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + int savedflags; + + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + savedflags = inp->inp_flags; + inp->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); + inp->inp_flags = savedflags; + } else +#endif + ip_savecontrol(inp, &opts, ip, m); + } + iphlen += sizeof(struct udphdr); + m_adj(m, iphlen); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + append_sa = (struct sockaddr *)&udp_in6; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(inp->inp_socket); + return; +bad: + m_freem(m); + if (opts) + m_freem(opts); + return; +} + +#if defined(INET6) +static void +ip_2_ip6_hdr(ip6, ip) + struct ip6_hdr *ip6; + struct ip *ip; +{ + bzero(ip6, sizeof(*ip6)); + + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_plen = ip->ip_len; + ip6->ip6_nxt = ip->ip_p; + ip6->ip6_hlim = ip->ip_ttl; + ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = + IPV6_ADDR_INT32_SMP; + ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; + ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; +} +#endif + +/* + * subroutine of udp_input(), mainly for source code readability. + * caller must properly init udp_ip6 and udp_in6 beforehand. + */ +static void +udp_append(last, ip, n, off) + struct inpcb *last; + struct ip *ip; + struct mbuf *n; + int off; +{ + struct sockaddr *append_sa; + struct mbuf *opts = 0; + + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + int savedflags; + + if (udp_ip6.uip6_init_done == 0) { + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + udp_ip6.uip6_init_done = 1; + } + savedflags = last->inp_flags; + last->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n); + last->inp_flags = savedflags; + } else +#endif + ip_savecontrol(last, &opts, ip, n); + } +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + if (udp_in6.uin6_init_done == 0) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + udp_in6.uin6_init_done = 1; + } + append_sa = (struct sockaddr *)&udp_in6.uin6_sin; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + m_adj(n, off); + if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { + m_freem(n); + if (opts) + m_freem(opts); + udpstat.udps_fullsock++; + } else + sorwakeup(last->inp_socket); +} + +/* + * Notify a udp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +void +udp_notify(inp, errno) + register struct inpcb *inp; + int errno; +{ + inp->inp_socket->so_error = errno; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); +} + +void +udp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + register struct ip *ip = vip; + register struct udphdr *uh; + + if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + in_pcbnotify(&udb, sa, uh->uh_dport, ip->ip_src, uh->uh_sport, + cmd, udp_notify); + } else + in_pcbnotify(&udb, sa, 0, zeroin_addr, 0, cmd, udp_notify); +} + +static int +udp_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = udbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = udbinfo.ipi_gencnt; + n = udbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = udbinfo.listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = udbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = udbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + udp_pcblist, "S,xinpcb", "List of active UDP sockets"); + +static int +udp_getcred SYSCTL_HANDLER_ARGS +{ + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, udp_getcred, "S,ucred", "Get the ucred of a UDP connection"); + +static int +udp_output(inp, m, addr, control, p) + register struct inpcb *inp; + struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; + struct proc *p; +{ + register struct udpiphdr *ui; + register int len = m->m_pkthdr.len; + struct in_addr laddr; + struct sockaddr_in *sin; + int s = 0, error = 0; + + if (control) + m_freem(control); /* XXX */ + + if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { + error = EMSGSIZE; + goto release; + } + + if (addr) { + sin = (struct sockaddr_in *)addr; + prison_remote_ip(p, 0, &sin->sin_addr.s_addr); + laddr = inp->inp_laddr; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + /* + * Must block input while temporarily connected. + */ + s = splnet(); + error = in_pcbconnect(inp, addr, p); + if (error) { + splx(s); + goto release; + } + } else { + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + /* + * Calculate data length and get a mbuf + * for UDP and IP headers. + */ + M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); + if (m == 0) { + error = ENOBUFS; + if (addr) + splx(s); + goto release; + } + + /* + * Fill in mbuf with extended UDP header + * and addresses and length put into network format. + */ + ui = mtod(m, struct udpiphdr *); + bzero(ui->ui_x1, sizeof(ui->ui_x1)); + ui->ui_pr = IPPROTO_UDP; + ui->ui_len = htons((u_short)len + sizeof (struct udphdr)); + ui->ui_src = inp->inp_laddr; + ui->ui_dst = inp->inp_faddr; + ui->ui_sport = inp->inp_lport; + ui->ui_dport = inp->inp_fport; + ui->ui_ulen = ui->ui_len; + + /* + * Stuff checksum and output datagram. + */ + ui->ui_sum = 0; + if (udpcksum) { + if ((ui->ui_sum = in_cksum(m, sizeof (struct udpiphdr) + len)) == 0) + ui->ui_sum = 0xffff; + } + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ + udpstat.udps_opackets++; + +#ifdef IPSEC + m->m_pkthdr.rcvif = (struct ifnet *)inp->inp_socket; +#endif /*IPSEC*/ + + error = ip_output(m, inp->inp_options, &inp->inp_route, + inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST), + inp->inp_moptions); + + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; /* XXX rehash? */ + splx(s); + } + return (error); + +release: + m_freem(m); + return (error); +} + +u_long udp_sendspace = 9216; /* really max datagram size */ + /* 40 1K datagrams */ +SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, + &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); + +u_long udp_recvspace = 40 * (1024 + +#ifdef INET6 + sizeof(struct sockaddr_in6) +#else + sizeof(struct sockaddr_in) +#endif + ); +SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &udp_recvspace, 0, "Maximum incoming UDP datagram size"); + +static int +udp_abort(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; /* ??? possible? panic instead? */ + soisdisconnected(so); + s = splnet(); + in_pcbdetach(inp); + splx(s); + return 0; +} + +static int +udp_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int s, error; + + inp = sotoinpcb(so); + if (inp != 0) + return EINVAL; + + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &udbinfo, p); + splx(s); + if (error) + return error; + + inp = (struct inpcb *)so->so_pcb; + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_ttl = ip_defttl; +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error != 0) { + in_pcbdetach(inp); + return error; + } +#endif /*IPSEC*/ + return 0; +} + +static int +udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s, error; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + s = splnet(); + error = in_pcbbind(inp, nam, p); + splx(s); + return error; +} + +static int +udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s, error; + struct sockaddr_in *sin; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + if (inp->inp_faddr.s_addr != INADDR_ANY) + return EISCONN; + s = splnet(); + sin = (struct sockaddr_in *)nam; + prison_remote_ip(p, 0, &sin->sin_addr.s_addr); + error = in_pcbconnect(inp, nam, p); + splx(s); + if (error == 0) + soisconnected(so); + return error; +} + +static int +udp_detach(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + s = splnet(); + in_pcbdetach(inp); + splx(s); + return 0; +} + +static int +udp_disconnect(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + if (inp->inp_faddr.s_addr == INADDR_ANY) + return ENOTCONN; + + s = splnet(); + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + splx(s); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + return 0; +} + +static int +udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct proc *p) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) { + m_freem(m); + return EINVAL; + } + return udp_output(inp, m, addr, control, p); +} + +int +udp_shutdown(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + socantsendmore(so); + return 0; +} + +struct pr_usrreqs udp_usrreqs = { + udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, + pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, + pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; + diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h new file mode 100644 index 0000000..491632e --- /dev/null +++ b/sys/netinet/udp_var.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_VAR_H_ +#define _NETINET_UDP_VAR_H_ + +/* + * UDP kernel structures and variables. + */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +struct udpcb { + /* XXX - these should be by reference so we can do options quickly */ + struct ip udb_ip; + struct udphdr udb_uh; + struct sockaddr_in udb_conn; + struct in_hostcache *udb_hc; + struct mbuf *udb_queue; +}; +#define inptoudpcb(inp) ((struct udpdb *)(inp)->inp_ppcb) + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_nosum; /* no checksum */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ + u_long udps_fastout; /* output packets on fast path */ + /* of no socket on port, arrived as multicast */ + u_long udps_noportmcast; +}; + +/* + * Names for UDP sysctl objects + */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_STATS 2 /* statistics (read-only) */ +#define UDPCTL_MAXDGRAM 3 /* max datagram size */ +#define UDPCTL_RECVSPACE 4 /* default receive buffer space */ +#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ +#define UDPCTL_MAXID 6 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "maxdgram", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ +} + +#ifdef KERNEL +SYSCTL_DECL(_net_inet_udp); + +extern struct pr_usrreqs udp_usrreqs; +extern struct inpcbhead udb; +extern struct inpcbinfo udbinfo; +extern u_long udp_sendspace; +extern u_long udp_recvspace; +extern struct udpstat udpstat; +extern int log_in_vain; + +void udp_ctlinput __P((int, struct sockaddr *, void *)); +void udp_init __P((void)); +void udp_input __P((struct mbuf *, int, int)); + +void udp_notify __P((struct inpcb *inp, int errno)); +int udp_shutdown __P((struct socket *so)); +#endif + +#endif |