Diffstat (limited to 'sys/netinet')
79 files changed, 48450 insertions, 0 deletions
diff --git a/sys/netinet/accf_data.c b/sys/netinet/accf_data.c new file mode 100644 index 0000000..2058b06 --- /dev/null +++ b/sys/netinet/accf_data.c @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2000 Alfred Perlstein <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/signalvar.h> +#include <sys/socketvar.h> + +/* accept filter that holds a socket until data arrives */ + +static void sohasdata(struct socket *so, void *arg, int waitflag); + +static struct accept_filter accf_data_filter = { + "dataready", + sohasdata, + NULL, + NULL +}; + +static moduledata_t accf_data_mod = { + "accf_data", + accept_filt_generic_mod_event, + &accf_data_filter +}; + +DECLARE_MODULE(accf_data, accf_data_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static void +sohasdata(struct socket *so, void *arg, int waitflag) +{ + + if (!soreadable(so)) + return; + + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} diff --git a/sys/netinet/accf_http.c b/sys/netinet/accf_http.c new file mode 100644 index 0000000..686e563 --- /dev/null +++ b/sys/netinet/accf_http.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2000 Paycounter, Inc. + * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
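/*
 * Illustrative userland sketch, not part of this diff: how a server might
 * attach the "dataready" accept filter registered by accf_data.c above.
 * Assumes the accf_data module is loaded and uses the standard FreeBSD
 * SO_ACCEPTFILTER socket option with struct accept_filter_arg from
 * <sys/socket.h>; attach_dataready() is a hypothetical helper name.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
attach_dataready(int lsock)	/* lsock: a socket already in listen(2) state */
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "dataready");	/* "httpready" for accf_http below */
	/* accept(2) will not return this connection until data has arrived */
	return (setsockopt(lsock, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa)));
}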
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/socketvar.h> + +/* check for GET/HEAD */ +static void sohashttpget(struct socket *so, void *arg, int waitflag); +/* check for HTTP/1.0 or HTTP/1.1 */ +static void soparsehttpvers(struct socket *so, void *arg, int waitflag); +/* check for end of HTTP/1.x request */ +static void soishttpconnected(struct socket *so, void *arg, int waitflag); +/* strcmp on an mbuf chain */ +static int mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp); +/* strncmp on an mbuf chain */ +static int mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, + int max, char *cmp); +/* socketbuffer is full */ +static int sbfull(struct sockbuf *sb); + +static struct accept_filter accf_http_filter = { + "httpready", + sohashttpget, + NULL, + NULL +}; + +static moduledata_t accf_http_mod = { + "accf_http", + accept_filt_generic_mod_event, + &accf_http_filter +}; + +DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static int parse_http_version = 1; + +SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, +"HTTP accept filter"); +SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, +&parse_http_version, 1, +"Parse http version so that non 1.x requests work"); + +#ifdef ACCF_HTTP_DEBUG +#define DPRINT(fmt, args...) \ + do { \ + printf("%s:%d: " fmt "\n", __func__, __LINE__, ##args); \ + } while (0) +#else +#define DPRINT(fmt, args...) 
+#endif + +static int +sbfull(struct sockbuf *sb) +{ + + DPRINT("sbfull, cc(%ld) >= hiwat(%ld): %d, " + "mbcnt(%ld) >= mbmax(%ld): %d", + sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat, + sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax); + return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + */ +static int +mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++) { + if (*cmp == '\0') + return (1); + else if (*cmp != *(mtod(m, char *) + offset)) + return (0); + } + if (*cmp == '\0') + return (1); + offset = 0; + } + } + return (0); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + * stop at 'max' characters + */ +static int +mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, int max, char *cmp) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++, max--) { + if (max == 0 || *cmp == '\0') + return (1); + else if (*cmp != *(mtod(m, char *) + offset)) + return (0); + } + if (max == 0 || *cmp == '\0') + return (1); + offset = 0; + } + } + return (0); +} + +#define STRSETUP(sptr, slen, str) \ + do { \ + sptr = str; \ + slen = sizeof(str) - 1; \ + } while(0) + +static void +sohashttpget(struct socket *so, void *arg, int waitflag) +{ + + if ((so->so_state & SS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { + struct mbuf *m; + char *cmp; + int cmplen, cc; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc - 1; + if (cc < 1) + return; + switch (*mtod(m, char *)) { + case 'G': + STRSETUP(cmp, cmplen, "ET "); + break; + case 'H': + STRSETUP(cmp, cmplen, "EAD "); + break; + default: + goto fallout; + } + if (cc < cmplen) { + if (mbufstrncmp(m, m->m_nextpkt, 1, cc, cmp) == 1) { + DPRINT("short cc (%d) but mbufstrncmp ok", cc); + return; + } else { + DPRINT("short cc (%d) mbufstrncmp failed", cc); + goto fallout; + } + } + if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) { + DPRINT("mbufstrcmp ok"); + if (parse_http_version == 0) + soishttpconnected(so, arg, waitflag); + else + soparsehttpvers(so, arg, waitflag); + return; + } + DPRINT("mbufstrcmp bad"); + } + +fallout: + DPRINT("fallout"); + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} + +static void +soparsehttpvers(struct socket *so, void *arg, int waitflag) +{ + struct mbuf *m, *n; + int i, cc, spaces, inspaces; + + if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto fallout; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc; + inspaces = spaces = 0; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + for (i = 0; i < m->m_len; i++, cc--) { + switch (*(mtod(m, char *) + i)) { + case ' ': + /* tabs? '\t' */ + if (!inspaces) { + spaces++; + inspaces = 1; + } + break; + case '\r': + case '\n': + DPRINT("newline"); + goto fallout; + default: + if (spaces != 2) { + inspaces = 0; + break; + } + + /* + * if we don't have enough characters + * left (cc < sizeof("HTTP/1.0") - 1) + * then see if the remaining ones + * are a request we can parse. 
+ */ + if (cc < sizeof("HTTP/1.0") - 1) { + if (mbufstrncmp(m, n, i, cc, + "HTTP/1.") == 1) { + DPRINT("ok"); + goto readmore; + } else { + DPRINT("bad"); + goto fallout; + } + } else if ( + mbufstrcmp(m, n, i, "HTTP/1.0") || + mbufstrcmp(m, n, i, "HTTP/1.1")) { + DPRINT("ok"); + soishttpconnected(so, + arg, waitflag); + return; + } else { + DPRINT("bad"); + goto fallout; + } + } + } + } + } +readmore: + DPRINT("readmore"); + /* + * if we hit here we haven't hit something + * we don't understand or a newline, so try again + */ + so->so_upcall = soparsehttpvers; + so->so_rcv.sb_flags |= SB_UPCALL; + return; + +fallout: + DPRINT("fallout"); + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} + + +#define NCHRS 3 + +static void +soishttpconnected(struct socket *so, void *arg, int waitflag) +{ + char a, b, c; + struct mbuf *m, *n; + int ccleft, copied; + + DPRINT("start"); + if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto gotit; + + /* + * Walk the socketbuffer and copy the last NCHRS (3) into a, b, and c + * copied - how much we've copied so far + * ccleft - how many bytes remaining in the socketbuffer + * just loop over the mbufs subtracting from 'ccleft' until we only + * have NCHRS left + */ + copied = 0; + ccleft = so->so_rcv.sb_cc; + if (ccleft < NCHRS) + goto readmore; + a = b = c = '\0'; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + ccleft -= m->m_len; + if (ccleft <= NCHRS) { + char *src; + int tocopy; + + tocopy = (NCHRS - ccleft) - copied; + src = mtod(m, char *) + (m->m_len - tocopy); + + while (tocopy--) { + switch (copied++) { + case 0: + a = *src++; + break; + case 1: + b = *src++; + break; + case 2: + c = *src++; + break; + } + } + } + } + } + if (c == '\n' && (b == '\n' || (b == '\r' && a == '\n'))) { + /* we have all request headers */ + goto gotit; + } + +readmore: + so->so_upcall = soishttpconnected; + so->so_rcv.sb_flags |= SB_UPCALL; + return; + +gotit: + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h new file mode 100644 index 0000000..2b1f529 --- /dev/null +++ b/sys/netinet/icmp6.h @@ -0,0 +1,740 @@ +/* $FreeBSD$ */ +/* $KAME: icmp6.h,v 1.46 2001/04/27 15:09:48 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
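/*
 * Simplified illustration, not part of this diff: the end-of-request test
 * used by soishttpconnected() in accf_http.c above, restated for a flat
 * buffer instead of an mbuf chain.  The filter completes once the request
 * ends in a blank line, i.e. the trailing bytes are "\n\n" or "\r\n" after
 * a newline.  http_headers_complete() is a hypothetical helper name.
 */
static int
http_headers_complete(const char *buf, int len)
{
	char a, b, c;

	if (len < 3)
		return (0);
	a = buf[len - 3];	/* third-to-last byte */
	b = buf[len - 2];	/* second-to-last byte */
	c = buf[len - 1];	/* last byte received */
	return (c == '\n' && (b == '\n' || (b == '\r' && a == '\n')));
}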
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_ICMP6_H_ +#define _NETINET_ICMP6_H_ + +#define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) + - sizeof(struct icmp6_hdr) */ + +struct icmp6_hdr { + u_int8_t icmp6_type; /* type field */ + u_int8_t icmp6_code; /* code field */ + u_int16_t icmp6_cksum; /* checksum field */ + union { + u_int32_t icmp6_un_data32[1]; /* type-specific field */ + u_int16_t icmp6_un_data16[2]; /* type-specific field */ + u_int8_t icmp6_un_data8[4]; /* type-specific field */ + } icmp6_dataun; +} __attribute__((__packed__)); + +#define icmp6_data32 icmp6_dataun.icmp6_un_data32 +#define icmp6_data16 icmp6_dataun.icmp6_un_data16 +#define icmp6_data8 icmp6_dataun.icmp6_un_data8 +#define icmp6_pptr icmp6_data32[0] /* parameter prob */ +#define icmp6_mtu icmp6_data32[0] /* packet too big */ +#define icmp6_id icmp6_data16[0] /* echo request/reply */ +#define icmp6_seq icmp6_data16[1] /* echo request/reply */ +#define icmp6_maxdelay icmp6_data16[0] /* mcast group membership */ + +#define ICMP6_DST_UNREACH 1 /* dest unreachable, codes: */ +#define ICMP6_PACKET_TOO_BIG 2 /* packet too big */ +#define ICMP6_TIME_EXCEEDED 3 /* time exceeded, code: */ +#define ICMP6_PARAM_PROB 4 /* ip6 header bad */ + +#define ICMP6_ECHO_REQUEST 128 /* echo service */ +#define ICMP6_ECHO_REPLY 129 /* echo reply */ +#define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */ +#define MLD_LISTENER_QUERY 130 /* multicast listener query */ +#define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */ +#define MLD_LISTENER_REPORT 131 /* multicast listener report */ +#define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */ +#define MLD_LISTENER_DONE 132 /* multicast listener done */ + +#ifndef _KERNEL +/* the followings are for backward compatibility to old KAME apps. */ +#define MLD6_LISTENER_QUERY MLD_LISTENER_QUERY +#define MLD6_LISTENER_REPORT MLD_LISTENER_REPORT +#define MLD6_LISTENER_DONE MLD_LISTENER_DONE +#endif + +#define ND_ROUTER_SOLICIT 133 /* router solicitation */ +#define ND_ROUTER_ADVERT 134 /* router advertisment */ +#define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */ +#define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisment */ +#define ND_REDIRECT 137 /* redirect */ + +#define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */ + +#define ICMP6_WRUREQUEST 139 /* who are you request */ +#define ICMP6_WRUREPLY 140 /* who are you reply */ +#define ICMP6_FQDN_QUERY 139 /* FQDN query */ +#define ICMP6_FQDN_REPLY 140 /* FQDN reply */ +#define ICMP6_NI_QUERY 139 /* node information request */ +#define ICMP6_NI_REPLY 140 /* node information reply */ + +/* The definitions below are experimental. 
TBA */ +#define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */ +#define MLD_MTRACE 201 /* mtrace messages */ + +#define ICMP6_HADISCOV_REQUEST 202 /* XXX To be defined */ +#define ICMP6_HADISCOV_REPLY 203 /* XXX To be defined */ + +#ifndef _KERNEL +#define MLD6_MTRACE_RESP MLD_MTRACE_RESP +#define MLD6_MTRACE MLD_MTRACE +#endif + +#define ICMP6_MAXTYPE 203 + +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ +#define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 /* not a neighbor(obsolete) */ +#define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ + +#define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ +#define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ + +#define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */ +#define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */ +#define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */ + +#define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */ + +#define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */ +#define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */ +#define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */ + +#define ICMP6_NI_SUCCESS 0 /* node information successful reply */ +#define ICMP6_NI_REFUSED 1 /* node information request is refused */ +#define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */ + +#define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */ +#define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */ +#define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */ + +/* Used in kernel only */ +#define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */ +#define ND_REDIRECT_ROUTER 1 /* redirect to a better router */ + +/* + * Multicast Listener Discovery + */ +struct mld_hdr { + struct icmp6_hdr mld_icmp6_hdr; + struct in6_addr mld_addr; /* multicast address */ +} __attribute__((__packed__)); + +/* definitions to provide backward compatibility to old KAME applications */ +#ifndef _KERNEL +#define mld6_hdr mld_hdr +#define mld6_type mld_type +#define mld6_code mld_code +#define mld6_cksum mld_cksum +#define mld6_maxdelay mld_maxdelay +#define mld6_reserved mld_reserved +#define mld6_addr mld_addr +#endif + +/* shortcut macro definitions */ +#define mld_type mld_icmp6_hdr.icmp6_type +#define mld_code mld_icmp6_hdr.icmp6_code +#define mld_cksum mld_icmp6_hdr.icmp6_cksum +#define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] +#define mld_reserved mld_icmp6_hdr.icmp6_data16[1] + +/* + * Neighbor Discovery + */ + +struct nd_router_solicit { /* router solicitation */ + struct icmp6_hdr nd_rs_hdr; + /* could be followed by options */ +} __attribute__((__packed__)); + +#define nd_rs_type nd_rs_hdr.icmp6_type +#define nd_rs_code nd_rs_hdr.icmp6_code +#define nd_rs_cksum nd_rs_hdr.icmp6_cksum +#define nd_rs_reserved nd_rs_hdr.icmp6_data32[0] + +struct nd_router_advert { /* router advertisement */ + struct icmp6_hdr nd_ra_hdr; + u_int32_t nd_ra_reachable; /* reachable time */ + u_int32_t nd_ra_retransmit; /* retransmit timer */ + /* could be followed by options */ +} __attribute__((__packed__)); + +#define nd_ra_type nd_ra_hdr.icmp6_type +#define nd_ra_code nd_ra_hdr.icmp6_code +#define nd_ra_cksum nd_ra_hdr.icmp6_cksum +#define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0] +#define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1] +#define 
ND_RA_FLAG_MANAGED 0x80 +#define ND_RA_FLAG_OTHER 0x40 +#define ND_RA_FLAG_HA 0x20 + +/* + * Router preference values based on draft-draves-ipngwg-router-selection-01. + * These are non-standard definitions. + */ +#define ND_RA_FLAG_RTPREF_MASK 0x18 /* 00011000 */ + +#define ND_RA_FLAG_RTPREF_HIGH 0x08 /* 00001000 */ +#define ND_RA_FLAG_RTPREF_MEDIUM 0x00 /* 00000000 */ +#define ND_RA_FLAG_RTPREF_LOW 0x18 /* 00011000 */ +#define ND_RA_FLAG_RTPREF_RSV 0x10 /* 00010000 */ + +#define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1] + +struct nd_neighbor_solicit { /* neighbor solicitation */ + struct icmp6_hdr nd_ns_hdr; + struct in6_addr nd_ns_target; /*target address */ + /* could be followed by options */ +} __attribute__((__packed__)); + +#define nd_ns_type nd_ns_hdr.icmp6_type +#define nd_ns_code nd_ns_hdr.icmp6_code +#define nd_ns_cksum nd_ns_hdr.icmp6_cksum +#define nd_ns_reserved nd_ns_hdr.icmp6_data32[0] + +struct nd_neighbor_advert { /* neighbor advertisement */ + struct icmp6_hdr nd_na_hdr; + struct in6_addr nd_na_target; /* target address */ + /* could be followed by options */ +} __attribute__((__packed__)); + +#define nd_na_type nd_na_hdr.icmp6_type +#define nd_na_code nd_na_hdr.icmp6_code +#define nd_na_cksum nd_na_hdr.icmp6_cksum +#define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0] +#if BYTE_ORDER == BIG_ENDIAN +#define ND_NA_FLAG_ROUTER 0x80000000 +#define ND_NA_FLAG_SOLICITED 0x40000000 +#define ND_NA_FLAG_OVERRIDE 0x20000000 +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define ND_NA_FLAG_ROUTER 0x80 +#define ND_NA_FLAG_SOLICITED 0x40 +#define ND_NA_FLAG_OVERRIDE 0x20 +#endif +#endif + +struct nd_redirect { /* redirect */ + struct icmp6_hdr nd_rd_hdr; + struct in6_addr nd_rd_target; /* target address */ + struct in6_addr nd_rd_dst; /* destination address */ + /* could be followed by options */ +} __attribute__((__packed__)); + +#define nd_rd_type nd_rd_hdr.icmp6_type +#define nd_rd_code nd_rd_hdr.icmp6_code +#define nd_rd_cksum nd_rd_hdr.icmp6_cksum +#define nd_rd_reserved nd_rd_hdr.icmp6_data32[0] + +struct nd_opt_hdr { /* Neighbor discovery option header */ + u_int8_t nd_opt_type; + u_int8_t nd_opt_len; + /* followed by option specific data*/ +} __attribute__((__packed__)); + +#define ND_OPT_SOURCE_LINKADDR 1 +#define ND_OPT_TARGET_LINKADDR 2 +#define ND_OPT_PREFIX_INFORMATION 3 +#define ND_OPT_REDIRECTED_HEADER 4 +#define ND_OPT_MTU 5 + +#define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ + +struct nd_opt_prefix_info { /* prefix information */ + u_int8_t nd_opt_pi_type; + u_int8_t nd_opt_pi_len; + u_int8_t nd_opt_pi_prefix_len; + u_int8_t nd_opt_pi_flags_reserved; + u_int32_t nd_opt_pi_valid_time; + u_int32_t nd_opt_pi_preferred_time; + u_int32_t nd_opt_pi_reserved2; + struct in6_addr nd_opt_pi_prefix; +} __attribute__((__packed__)); + +#define ND_OPT_PI_FLAG_ONLINK 0x80 +#define ND_OPT_PI_FLAG_AUTO 0x40 + +struct nd_opt_rd_hdr { /* redirected header */ + u_int8_t nd_opt_rh_type; + u_int8_t nd_opt_rh_len; + u_int16_t nd_opt_rh_reserved1; + u_int32_t nd_opt_rh_reserved2; + /* followed by IP header and data */ +} __attribute__((__packed__)); + +struct nd_opt_mtu { /* MTU option */ + u_int8_t nd_opt_mtu_type; + u_int8_t nd_opt_mtu_len; + u_int16_t nd_opt_mtu_reserved; + u_int32_t nd_opt_mtu_mtu; +} __attribute__((__packed__)); + +struct nd_opt_route_info { /* route info */ + u_int8_t nd_opt_rti_type; + u_int8_t nd_opt_rti_len; + u_int8_t nd_opt_rti_prefixlen; + u_int8_t nd_opt_rti_flags; + u_int32_t nd_opt_rti_lifetime; + /* 
prefix follows */ +} __attribute__((__packed__)); + +/* + * icmp6 namelookup + */ + +struct icmp6_namelookup { + struct icmp6_hdr icmp6_nl_hdr; + u_int8_t icmp6_nl_nonce[8]; + int32_t icmp6_nl_ttl; +#if 0 + u_int8_t icmp6_nl_len; + u_int8_t icmp6_nl_name[3]; +#endif + /* could be followed by options */ +} __attribute__((__packed__)); + +/* + * icmp6 node information + */ +struct icmp6_nodeinfo { + struct icmp6_hdr icmp6_ni_hdr; + u_int8_t icmp6_ni_nonce[8]; + /* could be followed by reply data */ +} __attribute__((__packed__)); + +#define ni_type icmp6_ni_hdr.icmp6_type +#define ni_code icmp6_ni_hdr.icmp6_code +#define ni_cksum icmp6_ni_hdr.icmp6_cksum +#define ni_qtype icmp6_ni_hdr.icmp6_data16[0] +#define ni_flags icmp6_ni_hdr.icmp6_data16[1] + +#define NI_QTYPE_NOOP 0 /* NOOP */ +#define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */ +#define NI_QTYPE_FQDN 2 /* FQDN (draft 04) */ +#define NI_QTYPE_DNSNAME 2 /* DNS Name */ +#define NI_QTYPE_NODEADDR 3 /* Node Addresses */ +#define NI_QTYPE_IPV4ADDR 4 /* IPv4 Addresses */ + +#if BYTE_ORDER == BIG_ENDIAN +#define NI_SUPTYPE_FLAG_COMPRESS 0x1 +#define NI_FQDN_FLAG_VALIDTTL 0x1 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_SUPTYPE_FLAG_COMPRESS 0x0100 +#define NI_FQDN_FLAG_VALIDTTL 0x0100 +#endif + +#ifdef NAME_LOOKUPS_04 +#if BYTE_ORDER == BIG_ENDIAN +#define NI_NODEADDR_FLAG_LINKLOCAL 0x1 +#define NI_NODEADDR_FLAG_SITELOCAL 0x2 +#define NI_NODEADDR_FLAG_GLOBAL 0x4 +#define NI_NODEADDR_FLAG_ALL 0x8 +#define NI_NODEADDR_FLAG_TRUNCATE 0x10 +#define NI_NODEADDR_FLAG_ANYCAST 0x20 /* just experimental. not in spec */ +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_NODEADDR_FLAG_LINKLOCAL 0x0100 +#define NI_NODEADDR_FLAG_SITELOCAL 0x0200 +#define NI_NODEADDR_FLAG_GLOBAL 0x0400 +#define NI_NODEADDR_FLAG_ALL 0x0800 +#define NI_NODEADDR_FLAG_TRUNCATE 0x1000 +#define NI_NODEADDR_FLAG_ANYCAST 0x2000 /* just experimental. not in spec */ +#endif +#else /* draft-ietf-ipngwg-icmp-name-lookups-05 (and later?) */ +#if BYTE_ORDER == BIG_ENDIAN +#define NI_NODEADDR_FLAG_TRUNCATE 0x1 +#define NI_NODEADDR_FLAG_ALL 0x2 +#define NI_NODEADDR_FLAG_COMPAT 0x4 +#define NI_NODEADDR_FLAG_LINKLOCAL 0x8 +#define NI_NODEADDR_FLAG_SITELOCAL 0x10 +#define NI_NODEADDR_FLAG_GLOBAL 0x20 +#define NI_NODEADDR_FLAG_ANYCAST 0x40 /* just experimental. not in spec */ +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_NODEADDR_FLAG_TRUNCATE 0x0100 +#define NI_NODEADDR_FLAG_ALL 0x0200 +#define NI_NODEADDR_FLAG_COMPAT 0x0400 +#define NI_NODEADDR_FLAG_LINKLOCAL 0x0800 +#define NI_NODEADDR_FLAG_SITELOCAL 0x1000 +#define NI_NODEADDR_FLAG_GLOBAL 0x2000 +#define NI_NODEADDR_FLAG_ANYCAST 0x4000 /* just experimental. not in spec */ +#endif +#endif + +struct ni_reply_fqdn { + u_int32_t ni_fqdn_ttl; /* TTL */ + u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ + u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ +} __attribute__((__packed__)); + +/* + * Router Renumbering. 
as router-renum-08.txt + */ +struct icmp6_router_renum { /* router renumbering header */ + struct icmp6_hdr rr_hdr; + u_int8_t rr_segnum; + u_int8_t rr_flags; + u_int16_t rr_maxdelay; + u_int32_t rr_reserved; +} __attribute__((__packed__)); + +#define ICMP6_RR_FLAGS_TEST 0x80 +#define ICMP6_RR_FLAGS_REQRESULT 0x40 +#define ICMP6_RR_FLAGS_FORCEAPPLY 0x20 +#define ICMP6_RR_FLAGS_SPECSITE 0x10 +#define ICMP6_RR_FLAGS_PREVDONE 0x08 + +#define rr_type rr_hdr.icmp6_type +#define rr_code rr_hdr.icmp6_code +#define rr_cksum rr_hdr.icmp6_cksum +#define rr_seqnum rr_hdr.icmp6_data32[0] + +struct rr_pco_match { /* match prefix part */ + u_int8_t rpm_code; + u_int8_t rpm_len; + u_int8_t rpm_ordinal; + u_int8_t rpm_matchlen; + u_int8_t rpm_minlen; + u_int8_t rpm_maxlen; + u_int16_t rpm_reserved; + struct in6_addr rpm_prefix; +} __attribute__((__packed__)); + +#define RPM_PCO_ADD 1 +#define RPM_PCO_CHANGE 2 +#define RPM_PCO_SETGLOBAL 3 +#define RPM_PCO_MAX 4 + +struct rr_pco_use { /* use prefix part */ + u_int8_t rpu_uselen; + u_int8_t rpu_keeplen; + u_int8_t rpu_ramask; + u_int8_t rpu_raflags; + u_int32_t rpu_vltime; + u_int32_t rpu_pltime; + u_int32_t rpu_flags; + struct in6_addr rpu_prefix; +} __attribute__((__packed__)); +#define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 +#define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 + +#if BYTE_ORDER == BIG_ENDIAN +#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80000000 +#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40000000 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80 +#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40 +#endif + +struct rr_result { /* router renumbering result message */ + u_int16_t rrr_flags; + u_int8_t rrr_ordinal; + u_int8_t rrr_matchedlen; + u_int32_t rrr_ifid; + struct in6_addr rrr_prefix; +} __attribute__((__packed__)); +#if BYTE_ORDER == BIG_ENDIAN +#define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 +#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define ICMP6_RR_RESULT_FLAGS_OOB 0x0200 +#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0100 +#endif + +/* + * icmp6 filter structures. + */ + +struct icmp6_filter { + u_int32_t icmp6_filt[8]; +}; + +#ifdef _KERNEL +#define ICMP6_FILTER_SETPASSALL(filterp) \ +do { \ + int i; u_char *p; \ + p = (u_char *)filterp; \ + for (i = 0; i < sizeof(struct icmp6_filter); i++) \ + p[i] = 0xff; \ +} while (0) +#define ICMP6_FILTER_SETBLOCKALL(filterp) \ + bzero(filterp, sizeof(struct icmp6_filter)) +#else /* _KERNEL */ +#define ICMP6_FILTER_SETPASSALL(filterp) \ + memset(filterp, 0xff, sizeof(struct icmp6_filter)) +#define ICMP6_FILTER_SETBLOCKALL(filterp) \ + memset(filterp, 0x00, sizeof(struct icmp6_filter)) +#endif /* _KERNEL */ + +#define ICMP6_FILTER_SETPASS(type, filterp) \ + (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31))) +#define ICMP6_FILTER_SETBLOCK(type, filterp) \ + (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31))) +#define ICMP6_FILTER_WILLPASS(type, filterp) \ + ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0) +#define ICMP6_FILTER_WILLBLOCK(type, filterp) \ + ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) + +/* + * Variables related to this implementation + * of the internet control message protocol version 6. 
+ */ +struct icmp6errstat { + u_quad_t icp6errs_dst_unreach_noroute; + u_quad_t icp6errs_dst_unreach_admin; + u_quad_t icp6errs_dst_unreach_beyondscope; + u_quad_t icp6errs_dst_unreach_addr; + u_quad_t icp6errs_dst_unreach_noport; + u_quad_t icp6errs_packet_too_big; + u_quad_t icp6errs_time_exceed_transit; + u_quad_t icp6errs_time_exceed_reassembly; + u_quad_t icp6errs_paramprob_header; + u_quad_t icp6errs_paramprob_nextheader; + u_quad_t icp6errs_paramprob_option; + u_quad_t icp6errs_redirect; /* we regard redirect as an error here */ + u_quad_t icp6errs_unknown; +}; + +struct icmp6stat { +/* statistics related to icmp6 packets generated */ + u_quad_t icp6s_error; /* # of calls to icmp6_error */ + u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */ + u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */ + u_quad_t icp6s_outhist[256]; +/* statistics related to input message processed */ + u_quad_t icp6s_badcode; /* icmp6_code out of range */ + u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ + u_quad_t icp6s_checksum; /* bad checksum */ + u_quad_t icp6s_badlen; /* calculated bound mismatch */ + /* + * number of responses: this member is inherited from netinet code, but + * for netinet6 code, it is already available in icp6s_outhist[]. + */ + u_quad_t icp6s_reflect; + u_quad_t icp6s_inhist[256]; + u_quad_t icp6s_nd_toomanyopt; /* too many ND options */ + struct icmp6errstat icp6s_outerrhist; +#define icp6s_odst_unreach_noroute \ + icp6s_outerrhist.icp6errs_dst_unreach_noroute +#define icp6s_odst_unreach_admin icp6s_outerrhist.icp6errs_dst_unreach_admin +#define icp6s_odst_unreach_beyondscope \ + icp6s_outerrhist.icp6errs_dst_unreach_beyondscope +#define icp6s_odst_unreach_addr icp6s_outerrhist.icp6errs_dst_unreach_addr +#define icp6s_odst_unreach_noport icp6s_outerrhist.icp6errs_dst_unreach_noport +#define icp6s_opacket_too_big icp6s_outerrhist.icp6errs_packet_too_big +#define icp6s_otime_exceed_transit \ + icp6s_outerrhist.icp6errs_time_exceed_transit +#define icp6s_otime_exceed_reassembly \ + icp6s_outerrhist.icp6errs_time_exceed_reassembly +#define icp6s_oparamprob_header icp6s_outerrhist.icp6errs_paramprob_header +#define icp6s_oparamprob_nextheader \ + icp6s_outerrhist.icp6errs_paramprob_nextheader +#define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option +#define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect +#define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown + u_quad_t icp6s_pmtuchg; /* path MTU changes */ + u_quad_t icp6s_nd_badopt; /* bad ND options */ + u_quad_t icp6s_badns; /* bad neighbor solicitation */ + u_quad_t icp6s_badna; /* bad neighbor advertisement */ + u_quad_t icp6s_badrs; /* bad router advertisement */ + u_quad_t icp6s_badra; /* bad router advertisement */ + u_quad_t icp6s_badredirect; /* bad redirect message */ +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPV6CTL_STATS 1 +#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ +#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ +#if 0 /*obsoleted*/ +#define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ +#endif +#define ICMPV6CTL_ND6_PRUNE 6 +#define ICMPV6CTL_ND6_DELAY 8 +#define ICMPV6CTL_ND6_UMAXTRIES 9 +#define ICMPV6CTL_ND6_MMAXTRIES 10 +#define ICMPV6CTL_ND6_USELOOPBACK 11 +/*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ +#define ICMPV6CTL_NODEINFO 13 +#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ +#define ICMPV6CTL_ND6_MAXNUDHINT 15 +#define ICMPV6CTL_MTUDISC_HIWAT 16 
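/*
 * Usage sketch, not part of this diff: the icmp6_filter macros defined
 * above as typically used from userland on a raw ICMPv6 socket, passing
 * only echo replies.  Assumes the RFC 3542 ICMP6_FILTER socket option at
 * level IPPROTO_ICMPV6; pass_only_echo_replies() is a hypothetical helper.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

static int
pass_only_echo_replies(int s)	/* s: socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6) */
{
	struct icmp6_filter filt;

	ICMP6_FILTER_SETBLOCKALL(&filt);		/* block everything... */
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);	/* ...except echo replies */
	return (setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER,
	    &filt, sizeof(filt)));
}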
+#define ICMPV6CTL_MTUDISC_LOWAT 17 +#define ICMPV6CTL_ND6_DEBUG 18 +#define ICMPV6CTL_ND6_DRLIST 19 +#define ICMPV6CTL_ND6_PRLIST 20 +#define ICMPV6CTL_MAXID 21 + +#define ICMPV6CTL_NAMES { \ + { 0, 0 }, \ + { 0, 0 }, \ + { "rediraccept", CTLTYPE_INT }, \ + { "redirtimeout", CTLTYPE_INT }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "nd6_prune", CTLTYPE_INT }, \ + { 0, 0 }, \ + { "nd6_delay", CTLTYPE_INT }, \ + { "nd6_umaxtries", CTLTYPE_INT }, \ + { "nd6_mmaxtries", CTLTYPE_INT }, \ + { "nd6_useloopback", CTLTYPE_INT }, \ + { 0, 0 }, \ + { "nodeinfo", CTLTYPE_INT }, \ + { "errppslimit", CTLTYPE_INT }, \ + { "nd6_maxnudhint", CTLTYPE_INT }, \ + { "mtudisc_hiwat", CTLTYPE_INT }, \ + { "mtudisc_lowat", CTLTYPE_INT }, \ + { "nd6_debug", CTLTYPE_INT }, \ + { 0, 0 }, \ + { 0, 0 }, \ +} + +#define RTF_PROBEMTU RTF_PROTO1 + +#ifdef _KERNEL +# ifdef __STDC__ +struct rtentry; +struct rttimer; +struct in6_multi; +# endif +void icmp6_init(void); +void icmp6_paramerror(struct mbuf *, int); +void icmp6_error(struct mbuf *, int, int, int); +int icmp6_input(struct mbuf **, int *, int); +void icmp6_fasttimo(void); +void icmp6_reflect(struct mbuf *, size_t); +void icmp6_prepare(struct mbuf *); +void icmp6_redirect_input(struct mbuf *, int); +void icmp6_redirect_output(struct mbuf *, struct rtentry *); + +struct ip6ctlparam; +void icmp6_mtudisc_update(struct ip6ctlparam *, int); + +/* XXX: is this the right place for these macros? */ +#define icmp6_ifstat_inc(ifp, tag) \ +do { \ + if ((ifp) && (ifp)->if_index <= if_index \ + && (ifp)->if_index < icmp6_ifstatmax \ + && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ + icmp6_ifstat[(ifp)->if_index]->tag++; \ + } \ +} while (0) + +#define icmp6_ifoutstat_inc(ifp, type, code) \ +do { \ + icmp6_ifstat_inc(ifp, ifs6_out_msg); \ + if (type < ICMP6_INFOMSG_MASK) \ + icmp6_ifstat_inc(ifp, ifs6_out_error); \ + switch(type) { \ + case ICMP6_DST_UNREACH: \ + icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \ + if (code == ICMP6_DST_UNREACH_ADMIN) \ + icmp6_ifstat_inc(ifp, ifs6_out_adminprohib); \ + break; \ + case ICMP6_PACKET_TOO_BIG: \ + icmp6_ifstat_inc(ifp, ifs6_out_pkttoobig); \ + break; \ + case ICMP6_TIME_EXCEEDED: \ + icmp6_ifstat_inc(ifp, ifs6_out_timeexceed); \ + break; \ + case ICMP6_PARAM_PROB: \ + icmp6_ifstat_inc(ifp, ifs6_out_paramprob); \ + break; \ + case ICMP6_ECHO_REQUEST: \ + icmp6_ifstat_inc(ifp, ifs6_out_echo); \ + break; \ + case ICMP6_ECHO_REPLY: \ + icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \ + break; \ + case MLD_LISTENER_QUERY: \ + icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \ + break; \ + case MLD_LISTENER_REPORT: \ + icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \ + break; \ + case MLD_LISTENER_DONE: \ + icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \ + break; \ + case ND_ROUTER_SOLICIT: \ + icmp6_ifstat_inc(ifp, ifs6_out_routersolicit); \ + break; \ + case ND_ROUTER_ADVERT: \ + icmp6_ifstat_inc(ifp, ifs6_out_routeradvert); \ + break; \ + case ND_NEIGHBOR_SOLICIT: \ + icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); \ + break; \ + case ND_NEIGHBOR_ADVERT: \ + icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); \ + break; \ + case ND_REDIRECT: \ + icmp6_ifstat_inc(ifp, ifs6_out_redirect); \ + break; \ + } \ +} while (0) + +extern int icmp6_rediraccept; /* accept/process redirects */ +extern int icmp6_redirtimeout; /* cache time for redirect routes */ +#endif /* _KERNEL */ + +#endif /* not _NETINET_ICMP6_H_ */ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h new file mode 100644 index 0000000..92e23c4 --- /dev/null +++ b/sys/netinet/icmp_var.h @@ -0,0 
+1,91 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_ICMP_VAR_H_ +#define _NETINET_ICMP_VAR_H_ + + +/* + * Variables related to this implementation + * of the internet control message protocol. 
+ */ +struct icmpstat { +/* statistics related to icmp packets generated */ + u_long icps_error; /* # of calls to icmp_error */ + u_long icps_oldshort; /* no error 'cuz old ip too short */ + u_long icps_oldicmp; /* no error 'cuz old was icmp */ + u_long icps_outhist[ICMP_MAXTYPE + 1]; +/* statistics related to input messages processed */ + u_long icps_badcode; /* icmp_code out of range */ + u_long icps_tooshort; /* packet < ICMP_MINLEN */ + u_long icps_checksum; /* bad checksum */ + u_long icps_badlen; /* calculated bound mismatch */ + u_long icps_reflect; /* number of responses */ + u_long icps_inhist[ICMP_MAXTYPE + 1]; + u_long icps_bmcastecho; /* b/mcast echo requests dropped */ + u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */ + u_long icps_badaddr; /* bad return address */ + u_long icps_noroute; /* no route back */ +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_STATS 2 /* statistics (read-only) */ +#define ICMPCTL_ICMPLIM 3 +#define ICMPCTL_MAXID 4 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "icmplim", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_icmp); +extern int badport_bandlim(int); +#define BANDLIM_UNLIMITED -1 +#define BANDLIM_ICMP_UNREACH 0 +#define BANDLIM_ICMP_ECHO 1 +#define BANDLIM_ICMP_TSTAMP 2 +#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ +#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ +#define BANDLIM_MAX 4 +#endif + +#endif diff --git a/sys/netinet/if_atm.c b/sys/netinet/if_atm.c new file mode 100644 index 0000000..934309b --- /dev/null +++ b/sys/netinet/if_atm.c @@ -0,0 +1,278 @@ +/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
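/*
 * Usage sketch, not part of this diff: the ICMPCTL_ICMPLIM limit declared
 * above is exported as a sysctl; a userland program might read it like
 * this.  The MIB name "net.inet.icmp.icmplim" is assumed from the
 * ICMPCTL_NAMES table; show_icmplim() is a hypothetical helper name.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
show_icmplim(void)
{
	int lim;
	size_t len = sizeof(lim);

	if (sysctlbyname("net.inet.icmp.icmplim", &lim, &len, NULL, 0) == -1)
		return (-1);
	printf("ICMP bandwidth limit: %d packets/sec\n", lim);
	return (0);
}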
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IP <=> ATM address resolution. + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_natm.h" + +#if defined(INET) || defined(INET6) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/if_atm.h> + +#include <netinet/in.h> +#include <netinet/if_atm.h> + +#ifdef NATM +#include <netnatm/natm.h> +#endif + + +#define SDL(s) ((struct sockaddr_dl *)s) + +/* + * atm_rtrequest: handle ATM rt request (in support of generic code) + * inputs: "req" = request code + * "rt" = route entry + * "info" = rt_addrinfo + */ + +void +atm_rtrequest(req, rt, info) + int req; + register struct rtentry *rt; + struct rt_addrinfo *info; +{ + register struct sockaddr *gate = rt->rt_gateway; + struct atm_pseudoioctl api; +#ifdef NATM + struct sockaddr_in *sin; + struct natmpcb *npcb = NULL; + struct atm_pseudohdr *aph; +#endif + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ + return; + + switch (req) { + + case RTM_RESOLVE: /* resolve: only happens when cloning */ + printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); + break; + + case RTM_ADD: + + /* + * route added by a command (e.g. ifconfig, route, arp...). + * + * first check to see if this is not a host route, in which + * case we are being called via "ifconfig" to set the address. + */ + + if ((rt->rt_flags & RTF_HOST) == 0) { + rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + break; + } + + if ((rt->rt_flags & RTF_CLONING) != 0) { + printf("atm_rtrequest: cloning route detected?\n"); + break; + } + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); + break; + } + + KASSERT(rt->rt_ifp->if_ioctl != NULL, + ("atm_rtrequest: null ioctl")); +#ifdef NATM + /* + * let native ATM know we are using this VCI/VPI + * (i.e. 
reserve it) + */ + sin = (struct sockaddr_in *) rt_key(rt); + if (sin->sin_family != AF_INET) + goto failed; + aph = (struct atm_pseudohdr *) LLADDR(SDL(gate)); + npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph), + ATM_PH_VPI(aph)); + if (npcb == NULL) + goto failed; + npcb->npcb_flags |= NPCB_IP; + npcb->ipaddr.s_addr = sin->sin_addr.s_addr; + /* XXX: move npcb to llinfo when ATM ARP is ready */ + rt->rt_llinfo = (caddr_t) npcb; + rt->rt_flags |= RTF_LLINFO; +#endif + /* + * let the lower level know this circuit is active + */ + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMENA, + (caddr_t)&api) != 0) { + printf("atm: couldn't add VC\n"); + goto failed; + } + + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + + break; + +failed: +#ifdef NATM + if (npcb) { + npcb_free(npcb, NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, + rt_mask(rt), 0, (struct rtentry **) 0); + break; + + case RTM_DELETE: + +#ifdef NATM + /* + * tell native ATM we are done with this VC + */ + + if (rt->rt_flags & RTF_LLINFO) { + npcb_free((struct natmpcb *)rt->rt_llinfo, + NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + /* + * tell the lower layer to disable this circuit + */ + + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMDIS, + (caddr_t)&api); + + break; + } +} + +/* + * atmresolve: + * inputs: + * [1] "rt" = the link level route to use (or null if need to look one up) + * [2] "m" = mbuf containing the data to be sent + * [3] "dst" = sockaddr_in (IP) address of dest. + * output: + * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info + * return: + * 0 == resolve FAILED; note that "m" gets m_freem'd in this case + * 1 == resolve OK; desten contains result + * + * XXX: will need more work if we wish to support ATMARP in the kernel, + * but this is enough for PVCs entered via the "route" command. + */ + +int +atmresolve(rt, m, dst, desten) + +register struct rtentry *rt; +struct mbuf *m; +register struct sockaddr *dst; +register struct atm_pseudohdr *desten; /* OUT */ + +{ + struct sockaddr_dl *sdl; + + if (m->m_flags & (M_BCAST|M_MCAST)) { + log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped"); + goto bad; + } + + if (rt == NULL) { + rt = RTALLOC1(dst, 0); + if (rt == NULL) goto bad; /* failed */ + rt->rt_refcnt--; /* don't keep LL references */ + if ((rt->rt_flags & RTF_GATEWAY) != 0 || + (rt->rt_flags & RTF_LLINFO) == 0 || + /* XXX: are we using LLINFO? */ + rt->rt_gateway->sa_family != AF_LINK) { + goto bad; + } + } + + /* + * note that rt_gateway is a sockaddr_dl which contains the + * atm_pseudohdr data structure for this route. we currently + * don't need any rt_llinfo info (but will if we want to support + * ATM ARP [c.f. if_ether.c]). + */ + + sdl = SDL(rt->rt_gateway); + + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + + + if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return(1); /* ok, go for it! */ + } + + /* + * we got an entry, but it doesn't have valid link address + * info in it (it is prob. the interface route, which has + * sdl_alen == 0). dump packet. (fall through to "bad"). 
+ */ + +bad: + m_freem(m); + return(0); +} +#endif /* INET */ diff --git a/sys/netinet/if_atm.h b/sys/netinet/if_atm.h new file mode 100644 index 0000000..b8cddf6 --- /dev/null +++ b/sys/netinet/if_atm.h @@ -0,0 +1,47 @@ +/* $FreeBSD$ */ +/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * if_atm.h + */ + +struct atm_pseudohdr; +struct mbuf; +struct rtentry; +struct sockaddr; + +void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *); +int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, + struct atm_pseudohdr *); diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c new file mode 100644 index 0000000..77eee3c --- /dev/null +++ b/sys/netinet/if_ether.c @@ -0,0 +1,951 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Ethernet address resolution protocol. + * TODO: + * add "inuse/lock" bit (or ref. count) along with valid bit + */ + +#include "opt_inet.h" +#include "opt_bdg.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/route.h> +#include <net/netisr.h> +#include <net/if_llc.h> +#ifdef BRIDGE +#include <net/ethernet.h> +#include <net/bridge.h> +#endif + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/if_ether.h> + +#include <net/if_arc.h> +#include <net/iso88025.h> + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); + +/* timer values */ +static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +static int arpt_down = 20; /* once declared down, don't send for 20 sec */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, + &arpt_prune, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &arpt_keep, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, + &arpt_down, 0, ""); + +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + LIST_ENTRY(llinfo_arp) la_le; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* last time we QUERIED for this addr */ +#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ +}; + +static LIST_HEAD(, llinfo_arp) llinfo_arp; + +struct ifqueue arpintrq; +static int arp_inuse, arp_allocated, arpinit_done; + +static int arp_maxtries = 5; +static int useloopback = 1; /* use loopback interface for local traffic */ +static int arp_proxyall = 0; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &arp_maxtries, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, + &useloopback, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, + &arp_proxyall, 0, ""); + +static void arp_init(void); +static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *); +static void arprequest(struct ifnet *, + struct in_addr *, struct in_addr *, u_char *); +static void arpintr(void); +static void arptfree(struct llinfo_arp *); +static void arptimer(void *); +static struct llinfo_arp + *arplookup(u_long, int, int); +#ifdef INET +static void 
in_arpinput(struct mbuf *); +#endif + +/* + * Timeout routine. Age arp_tab entries periodically. + */ +/* ARGSUSED */ +static void +arptimer(ignored_arg) + void *ignored_arg; +{ + int s = splnet(); + register struct llinfo_arp *la = LIST_FIRST(&llinfo_arp); + struct llinfo_arp *ola; + + timeout(arptimer, (caddr_t)0, arpt_prune * hz); + while ((ola = la) != 0) { + register struct rtentry *rt = la->la_rt; + la = LIST_NEXT(la, la_le); + if (rt->rt_expire && rt->rt_expire <= time_second) + arptfree(ola); /* timer has expired, clear */ + } + splx(s); +} + +/* + * Parallel to llc_rtrequest. + */ +static void +arp_rtrequest(req, rt, info) + int req; + register struct rtentry *rt; + struct rt_addrinfo *info; +{ + register struct sockaddr *gate = rt->rt_gateway; + register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (!arpinit_done) { + arpinit_done = 1; + timeout(arptimer, (caddr_t)0, hz); + } + if (rt->rt_flags & RTF_GATEWAY) + return; + switch (req) { + + case RTM_ADD: + /* + * XXX: If this is a manually added route to interface + * such as older version of routed or gated might provide, + * restore cloning bit. + */ + if ((rt->rt_flags & RTF_HOST) == 0 && + SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) + rt->rt_flags |= RTF_CLONING; + if (rt->rt_flags & RTF_CLONING) { + /* + * Case 1: This route should come from a route to iface. + */ + rt_setgate(rt, rt_key(rt), + (struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + rt->rt_expire = time_second; + break; + } + /* Announce a new entry if requested. */ + if (rt->rt_flags & RTF_ANNOUNCE) + arprequest(rt->rt_ifp, + &SIN(rt_key(rt))->sin_addr, + &SIN(rt_key(rt))->sin_addr, + (u_char *)LLADDR(SDL(gate))); + /*FALLTHROUGH*/ + case RTM_RESOLVE: + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n"); + break; + } + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + if (la != 0) + break; /* This happens on a route change */ + /* + * Case 2: This route may come from cloning, or a manual route + * add with a LL address. + */ + R_Malloc(la, struct llinfo_arp *, sizeof(*la)); + rt->rt_llinfo = (caddr_t)la; + if (la == 0) { + log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); + break; + } + arp_inuse++, arp_allocated++; + Bzero(la, sizeof(*la)); + la->la_rt = rt; + rt->rt_flags |= RTF_LLINFO; + LIST_INSERT_HEAD(&llinfo_arp, la, la_le); + +#ifdef INET + /* + * This keeps the multicast addresses from showing up + * in `arp -a' listings as unresolved. It's not actually + * functional. Then the same for broadcast. + */ + if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr)) && + rt->rt_ifp->if_type != IFT_ARCNET) { + ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, + LLADDR(SDL(gate))); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } + if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { + memcpy(LLADDR(SDL(gate)), rt->rt_ifp->if_broadcastaddr, + rt->rt_ifp->if_addrlen); + SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen; + rt->rt_expire = 0; + } +#endif + + if (SIN(rt_key(rt))->sin_addr.s_addr == + (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced + * through the hardware by configuring the loopback down. 
+ * However, it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and remove + * the route to force traffic out to the hardware. + */ + rt->rt_expire = 0; + Bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)), + SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); + if (useloopback) + rt->rt_ifp = loif; + + } + break; + + case RTM_DELETE: + if (la == 0) + break; + arp_inuse--; + LIST_REMOVE(la, la_le); + rt->rt_llinfo = 0; + rt->rt_flags &= ~RTF_LLINFO; + if (la->la_hold) + m_freem(la->la_hold); + Free((caddr_t)la); + } +} + +/* + * Broadcast an ARP request. Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +static void +arprequest(ifp, sip, tip, enaddr) + register struct ifnet *ifp; + register struct in_addr *sip, *tip; + register u_char *enaddr; +{ + register struct mbuf *m; + register struct ether_header *eh; + register struct arc_header *arh; + register struct arphdr *ah; + struct sockaddr sa; + static u_char llcx[] = { 0x82, 0x40, LLC_SNAP_LSAP, LLC_SNAP_LSAP, + LLC_UI, 0x00, 0x00, 0x00, 0x08, 0x06 }; + u_short ar_hrd; + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_pkthdr.rcvif = (struct ifnet *)0; + switch (ifp->if_type) { + case IFT_ARCNET: + ar_hrd = htons(ARPHRD_ARCNET); + + m->m_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); + m->m_pkthdr.len = m->m_len; + MH_ALIGN(m, m->m_len); + + arh = (struct arc_header *)sa.sa_data; + arh->arc_dhost = *ifp->if_broadcastaddr; + arh->arc_type = ARCTYPE_ARP; + + ah = mtod(m, struct arphdr *); + break; + + case IFT_ISO88025: + ar_hrd = htons(ARPHRD_IEEE802); + + m->m_len = sizeof(llcx) + + arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); + m->m_pkthdr.len = m->m_len; + MH_ALIGN(m, m->m_len); + + (void)memcpy(mtod(m, caddr_t), llcx, sizeof(llcx)); + (void)memcpy(sa.sa_data, ifp->if_broadcastaddr, 6); + (void)memcpy(sa.sa_data + 6, enaddr, 6); + sa.sa_data[6] |= TR_RII; + sa.sa_data[12] = TR_AC; + sa.sa_data[13] = TR_LLC_FRAME; + + ah = (struct arphdr *)(mtod(m, char *) + sizeof(llcx)); + break; + case IFT_FDDI: + case IFT_ETHER: + /* + * This may not be correct for types not explicitly + * listed, but this is our best guess + */ + default: + ar_hrd = htons(ARPHRD_ETHER); + + m->m_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); + m->m_pkthdr.len = m->m_len; + MH_ALIGN(m, m->m_len); + + eh = (struct ether_header *)sa.sa_data; + /* if_output will not swap */ + eh->ether_type = htons(ETHERTYPE_ARP); + (void)memcpy(eh->ether_dhost, ifp->if_broadcastaddr, + sizeof(eh->ether_dhost)); + + ah = mtod(m, struct arphdr *); + break; + } + + ah->ar_hrd = ar_hrd; + ah->ar_pro = htons(ETHERTYPE_IP); + ah->ar_hln = ifp->if_addrlen; /* hardware address length */ + ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ + ah->ar_op = htons(ARPOP_REQUEST); + (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); + (void)memcpy(ar_spa(ah), sip, ah->ar_pln); + (void)memcpy(ar_tpa(ah), tip, ah->ar_pln); + + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); +} + +/* + * Resolve an IP address into an ethernet address. If success, + * desten is filled in. If there is no entry in arptab, + * set one up and broadcast a request for the IP address. + * Hold onto this mbuf and resend it once the address + * is finally resolved. 
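+ * Only the most recently queued packet is held per entry; any
+ * previously held mbuf is freed when a new one arrives.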
A return value of 1 indicates + * that desten has been filled in and the packet should be sent + * normally; a 0 return indicates that the packet has been + * taken over here, either now or for later transmission. + */ +int +arpresolve(ifp, rt, m, dst, desten, rt0) + register struct ifnet *ifp; + register struct rtentry *rt; + struct mbuf *m; + register struct sockaddr *dst; + register u_char *desten; + struct rtentry *rt0; +{ + struct llinfo_arp *la = 0; + struct sockaddr_dl *sdl; + + if (m->m_flags & M_BCAST) { /* broadcast */ + (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); + return (1); + } + if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {/* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return(1); + } + if (rt) + la = (struct llinfo_arp *)rt->rt_llinfo; + if (la == 0) { + la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0); + if (la) + rt = la->la_rt; + } + if (la == 0 || rt == 0) { + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s%s%s\n", + inet_ntoa(SIN(dst)->sin_addr), la ? "la" : "", + rt ? "rt" : ""); + m_freem(m); + return (0); + } + sdl = SDL(rt->rt_gateway); + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if ((rt->rt_expire == 0 || rt->rt_expire > time_second) && + sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { + /* + * If entry has an expiry time and it is approaching, + * see if we need to send an ARP request within this + * arpt_down interval. + */ + if ((rt->rt_expire != 0) && + (time_second + (arp_maxtries - la->la_asked) * arpt_down > + rt->rt_expire)) { + arprequest(ifp, + &SIN(rt->rt_ifa->ifa_addr)->sin_addr, + &SIN(dst)->sin_addr, + IF_LLADDR(ifp)); + la->la_asked++; + } + + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return 1; + } + /* + * If ARP is disabled on this interface, stop. + * XXX + * Probably should not allocate empty llinfo struct if we are + * not going to be sending out an arp request. + */ + if (ifp->if_flags & IFF_NOARP) { + m_freem(m); + return (0); + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. + */ + if (la->la_hold) + m_freem(la->la_hold); + la->la_hold = m; + if (rt->rt_expire) { + rt->rt_flags &= ~RTF_REJECT; + if (la->la_asked == 0 || rt->rt_expire != time_second) { + rt->rt_expire = time_second; + if (la->la_asked++ < arp_maxtries) + arprequest(ifp, + &SIN(rt->rt_ifa->ifa_addr)->sin_addr, + &SIN(dst)->sin_addr, + IF_LLADDR(ifp)); + else { + rt->rt_flags |= RTF_REJECT; + rt->rt_expire += arpt_down; + la->la_asked = 0; + } + + } + } + return (0); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. 
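+ * Only IP (ETHERTYPE_IP) packets are dispatched here, to
+ * in_arpinput(); ARP packets for other protocols are dropped.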
+ */ +static void +arpintr() +{ + register struct mbuf *m; + register struct arphdr *ar; + int s; + + if (!arpinit_done) { + arpinit_done = 1; + timeout(arptimer, (caddr_t)0, hz); + } + while (arpintrq.ifq_head) { + s = splimp(); + IF_DEQUEUE(&arpintrq, m); + splx(s); + if (m == 0 || (m->m_flags & M_PKTHDR) == 0) + panic("arpintr"); + + if (m->m_len < sizeof(struct arphdr) && + ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { + log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); + continue; + } + ar = mtod(m, struct arphdr *); + + if (ntohs(ar->ar_hrd) != ARPHRD_ETHER + && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 + && ntohs(ar->ar_hrd) != ARPHRD_ARCNET) { + log(LOG_ERR, + "arp: unknown hardware address format (0x%2D)\n", + (unsigned char *)&ar->ar_hrd, ""); + m_freem(m); + continue; + } + + if (m->m_pkthdr.len < arphdr_len(ar) && + (m = m_pullup(m, arphdr_len(ar))) == NULL) { + log(LOG_ERR, "arp: runt packet\n"); + m_freem(m); + continue; + } + + switch (ntohs(ar->ar_pro)) { +#ifdef INET + case ETHERTYPE_IP: + in_arpinput(m); + continue; +#endif + } + m_freem(m); + } +} + +#ifdef INET +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. + */ +static int log_arp_wrong_iface = 1; +static int log_arp_movements = 1; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, + &log_arp_wrong_iface, 0, + "log arp packets arriving on the wrong interface"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, + &log_arp_movements, 0, + "log arp replies from MACs different than the one in the cache"); + + +static void +in_arpinput(m) + struct mbuf *m; +{ + register struct arphdr *ah; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ether_header *eh; + struct arc_header *arh; + struct iso88025_header *th = (struct iso88025_header *)0; + struct iso88025_sockaddr_dl_data *trld; + register struct llinfo_arp *la = 0; + register struct rtentry *rt; + struct ifaddr *ifa; + struct in_ifaddr *ia; + struct sockaddr_dl *sdl; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + int op, rif_len; + int req_len; + + req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); + if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { + log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); + return; + } + + ah = mtod(m, struct arphdr *); + op = ntohs(ah->ar_op); + (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); + (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); +#ifdef BRIDGE +#define BRIDGE_TEST (do_bridge) +#else +#define BRIDGE_TEST (0) /* cc will optimise the test away */ +#endif + /* + * For a bridge, we want to check the address irrespective + * of the receive interface. (This will change slightly + * when we have clusters of interfaces). 
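+ * The lookup below tries the ARP target address first and then
+ * falls back to the sender address.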
+ */ + LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) + if ((BRIDGE_TEST || (ia->ia_ifp == ifp)) && + itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) + goto match; + LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) + if ((BRIDGE_TEST || (ia->ia_ifp == ifp)) && + isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) + goto match; + /* + * No match, use the first inet address on the receive interface + * as a dummy address for the rest of the function. + */ + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) { + ia = ifatoia(ifa); + goto match; + } + /* + * If bridging, fall back to using any inet address. + */ + if (!BRIDGE_TEST || + (ia = TAILQ_FIRST(&in_ifaddrhead)) == NULL) { + m_freem(m); + return; + } +match: + myaddr = ia->ia_addr.sin_addr; + if (!bcmp(ar_sha(ah), IF_LLADDR(ifp), ifp->if_addrlen)) { + m_freem(m); /* it's from me, ignore it. */ + return; + } + if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { + log(LOG_ERR, + "arp: link address is broadcast for IP address %s!\n", + inet_ntoa(isaddr)); + m_freem(m); + return; + } + if (isaddr.s_addr == myaddr.s_addr) { + log(LOG_ERR, + "arp: %*D is using my IP address %s!\n", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + inet_ntoa(isaddr)); + itaddr = myaddr; + goto reply; + } + la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); + if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { + /* the following is not an error when doing bridging */ + if (!BRIDGE_TEST && rt->rt_ifp != ifp) { + if (log_arp_wrong_iface) + log(LOG_ERR, "arp: %s is on %s%d but got reply from %*D on %s%d\n", + inet_ntoa(isaddr), + rt->rt_ifp->if_name, rt->rt_ifp->if_unit, + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_name, ifp->if_unit); + goto reply; + } + if (sdl->sdl_alen && + bcmp(ar_sha(ah), LLADDR(sdl), sdl->sdl_alen)) { + if (rt->rt_expire) { + if (log_arp_movements) + log(LOG_INFO, "arp: %s moved from %*D to %*D on %s%d\n", + inet_ntoa(isaddr), + ifp->if_addrlen, (u_char *)LLADDR(sdl), ":", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_name, ifp->if_unit); + } else { + log(LOG_ERR, + "arp: %*D attempts to modify permanent entry for %s on %s%d\n", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + inet_ntoa(isaddr), ifp->if_name, ifp->if_unit); + goto reply; + } + } + /* + * sanity check for the address length. + * XXX this does not work for protocols with variable address + * length. -is + */ + if (sdl->sdl_alen && + sdl->sdl_alen != ah->ar_hln) { + log(LOG_WARNING, + "arp from %*D: new addr len %d, was %d", + ifp->if_addrlen, (u_char *) ar_sha(ah), ":", + ah->ar_hln, sdl->sdl_alen); + } + if (ifp->if_addrlen != ah->ar_hln) { + log(LOG_WARNING, + "arp from %*D: addr len: new %d, i/f %d (ignored)", + ifp->if_addrlen, (u_char *) ar_sha(ah), ":", + ah->ar_hln, ifp->if_addrlen); + goto reply; + } + (void)memcpy(LLADDR(sdl), ar_sha(ah), + sdl->sdl_alen = ah->ar_hln); + /* + * If we receive an arp from a token-ring station over + * a token-ring nic then try to save the source + * routing info. 
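+ * The routing control field is stored with its direction bit
+ * flipped so that replies can be source-routed back to the sender.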
+ */ + if (ifp->if_type == IFT_ISO88025) { + th = (struct iso88025_header *)m->m_pkthdr.header; + trld = SDL_ISO88025(sdl); + rif_len = TR_RCF_RIFLEN(th->rcf); + if ((th->iso88025_shost[0] & TR_RII) && + (rif_len > 2)) { + trld->trld_rcf = th->rcf; + trld->trld_rcf ^= htons(TR_RCF_DIR); + memcpy(trld->trld_route, th->rd, rif_len - 2); + trld->trld_rcf &= ~htons(TR_RCF_BCST_MASK); + /* + * Set up source routing information for + * reply packet (XXX) + */ + m->m_data -= rif_len; + m->m_len += rif_len; + m->m_pkthdr.len += rif_len; + } else { + th->iso88025_shost[0] &= ~TR_RII; + trld->trld_rcf = 0; + } + m->m_data -= 8; + m->m_len += 8; + m->m_pkthdr.len += 8; + th->rcf = trld->trld_rcf; + } + if (rt->rt_expire) + rt->rt_expire = time_second + arpt_keep; + rt->rt_flags &= ~RTF_REJECT; + la->la_asked = 0; + if (la->la_hold) { + (*ifp->if_output)(ifp, la->la_hold, + rt_key(rt), rt); + la->la_hold = 0; + } + } +reply: + if (op != ARPOP_REQUEST) { + m_freem(m); + return; + } + if (itaddr.s_addr == myaddr.s_addr) { + /* I am the target */ + (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); + (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln); + } else { + la = arplookup(itaddr.s_addr, 0, SIN_PROXY); + if (la == NULL) { + struct sockaddr_in sin; + + if (!arp_proxyall) { + m_freem(m); + return; + } + + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = itaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + /* + * Don't send proxies for nodes on the same interface + * as this one came out of, or we'll get into a fight + * over who claims what Ether address. + */ + if (rt->rt_ifp == ifp) { + rtfree(rt); + m_freem(m); + return; + } + (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); + (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln); + rtfree(rt); + + /* + * Also check that the node which sent the ARP packet + * is on the the interface we expect it to be on. This + * avoids ARP chaos if an interface is connected to the + * wrong network. + */ + sin.sin_addr = isaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + if (rt->rt_ifp != ifp) { + log(LOG_INFO, "arp_proxy: ignoring request" + " from %s via %s%d, expecting %s%d\n", + inet_ntoa(isaddr), ifp->if_name, + ifp->if_unit, rt->rt_ifp->if_name, + rt->rt_ifp->if_unit); + rtfree(rt); + m_freem(m); + return; + } + rtfree(rt); + +#ifdef DEBUG_PROXY + printf("arp: proxying for %s\n", + inet_ntoa(itaddr)); +#endif + } else { + rt = la->la_rt; + (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); + sdl = SDL(rt->rt_gateway); + (void)memcpy(ar_sha(ah), LLADDR(sdl), ah->ar_hln); + } + } + + (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); + (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); + ah->ar_op = htons(ARPOP_REPLY); + ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! 
*/ + switch (ifp->if_type) { + case IFT_ARCNET: + arh = (struct arc_header *)sa.sa_data; + arh->arc_dhost = *ar_tha(ah); + arh->arc_type = ARCTYPE_ARP; + break; + + case IFT_ISO88025: + /* Re-arrange the source/dest address */ + memcpy(th->iso88025_dhost, th->iso88025_shost, + sizeof(th->iso88025_dhost)); + memcpy(th->iso88025_shost, IF_LLADDR(ifp), + sizeof(th->iso88025_shost)); + /* Set the source routing bit if neccesary */ + if (th->iso88025_dhost[0] & TR_RII) { + th->iso88025_dhost[0] &= ~TR_RII; + if (TR_RCF_RIFLEN(th->rcf) > 2) + th->iso88025_shost[0] |= TR_RII; + } + /* Copy the addresses, ac and fc into sa_data */ + memcpy(sa.sa_data, th->iso88025_dhost, + sizeof(th->iso88025_dhost) * 2); + sa.sa_data[(sizeof(th->iso88025_dhost) * 2)] = TR_AC; + sa.sa_data[(sizeof(th->iso88025_dhost) * 2) + 1] = TR_LLC_FRAME; + break; + case IFT_ETHER: + case IFT_FDDI: + /* + * May not be correct for types not explictly + * listed, but it is our best guess. + */ + default: + eh = (struct ether_header *)sa.sa_data; + (void)memcpy(eh->ether_dhost, ar_tha(ah), + sizeof(eh->ether_dhost)); + eh->ether_type = htons(ETHERTYPE_ARP); + break; + } + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0); + return; +} +#endif + +/* + * Free an arp entry. + */ +static void +arptfree(la) + register struct llinfo_arp *la; +{ + register struct rtentry *rt = la->la_rt; + register struct sockaddr_dl *sdl; + if (rt == 0) + panic("arptfree"); + if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && + sdl->sdl_family == AF_LINK) { + sdl->sdl_alen = 0; + la->la_asked = 0; + rt->rt_flags &= ~RTF_REJECT; + return; + } + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), + 0, (struct rtentry **)0); +} +/* + * Lookup or enter a new address in arptab. + */ +static struct llinfo_arp * +arplookup(addr, create, proxy) + u_long addr; + int create, proxy; +{ + register struct rtentry *rt; + static struct sockaddr_inarp sin = {sizeof(sin), AF_INET }; + const char *why = 0; + + sin.sin_addr.s_addr = addr; + sin.sin_other = proxy ? SIN_PROXY : 0; + rt = rtalloc1((struct sockaddr *)&sin, create, 0UL); + if (rt == 0) + return (0); + rt->rt_refcnt--; + + if (rt->rt_flags & RTF_GATEWAY) + why = "host is not on local network"; + else if ((rt->rt_flags & RTF_LLINFO) == 0) + why = "could not allocate llinfo"; + else if (rt->rt_gateway->sa_family != AF_LINK) + why = "gateway route is not ours"; + + if (why && create) { + log(LOG_DEBUG, "arplookup %s failed: %s\n", + inet_ntoa(sin.sin_addr), why); + return 0; + } else if (why) { + return 0; + } + return ((struct llinfo_arp *)rt->rt_llinfo); +} + +void +arp_ifinit(ifp, ifa) + struct ifnet *ifp; + struct ifaddr *ifa; +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ifp, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); + ifa->ifa_rtrequest = arp_rtrequest; + ifa->ifa_flags |= RTF_CLONING; +} + +static void +arp_init(void) +{ + + arpintrq.ifq_maxlen = 50; + mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF); + LIST_INIT(&llinfo_arp); + register_netisr(NETISR_ARP, arpintr); +} + +SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h new file mode 100644 index 0000000..6b31758 --- /dev/null +++ b/sys/netinet/if_ether.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.h 8.3 (Berkeley) 5/2/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_ETHER_H_ +#define _NETINET_IF_ETHER_H_ + +#include <net/ethernet.h> +#include <net/if_arp.h> + +/* + * Macro to map an IP multicast address to an Ethernet multicast address. + * The high-order 25 bits of the Ethernet address are statically assigned, + * and the low-order 23 bits are taken from the low end of the IP address. + */ +#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ + /* struct in_addr *ipaddr; */ \ + /* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x01; \ + (enaddr)[1] = 0x00; \ + (enaddr)[2] = 0x5e; \ + (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((u_char *)ipaddr)[3]; \ +} +/* + * Macro to map an IP6 multicast address to an Ethernet multicast address. + * The high-order 16 bits of the Ethernet address are statically assigned, + * and the low-order 32 bits are taken from the low end of the IP6 address. + */ +#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \ +/* struct in6_addr *ip6addr; */ \ +/* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x33; \ + (enaddr)[1] = 0x33; \ + (enaddr)[2] = ((u_char *)ip6addr)[12]; \ + (enaddr)[3] = ((u_char *)ip6addr)[13]; \ + (enaddr)[4] = ((u_char *)ip6addr)[14]; \ + (enaddr)[5] = ((u_char *)ip6addr)[15]; \ +} + +/* + * Ethernet Address Resolution Protocol. + * + * See RFC 826 for protocol description. Structure below is adapted + * to resolving internet addresses. Field names used correspond to + * RFC 826. 
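+ * The fixed layout assumes ar_hln == ETHER_ADDR_LEN and
+ * ar_pln == 4 (an IPv4 address).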
+ */ +struct ether_arp { + struct arphdr ea_hdr; /* fixed-size header */ + u_char arp_sha[ETHER_ADDR_LEN]; /* sender hardware address */ + u_char arp_spa[4]; /* sender protocol address */ + u_char arp_tha[ETHER_ADDR_LEN]; /* target hardware address */ + u_char arp_tpa[4]; /* target protocol address */ +}; +#define arp_hrd ea_hdr.ar_hrd +#define arp_pro ea_hdr.ar_pro +#define arp_hln ea_hdr.ar_hln +#define arp_pln ea_hdr.ar_pln +#define arp_op ea_hdr.ar_op + +struct sockaddr_inarp { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + struct in_addr sin_srcaddr; + u_short sin_tos; + u_short sin_other; +#define SIN_PROXY 1 +}; +/* + * IP and ethernet specific routing flags + */ +#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ +#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ + +#ifdef _KERNEL +extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; +extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; +extern struct ifqueue arpintrq; + +int arpresolve(struct ifnet *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *, struct rtentry *); +void arp_ifinit(struct ifnet *, struct ifaddr *); +#endif + +#endif diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c new file mode 100644 index 0000000..12a1552 --- /dev/null +++ b/sys/netinet/igmp.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +/* + * Internet Group Management Protocol (IGMP) routines. + * + * Written by Steve Deering, Stanford, May 1988. + * Modified by Rosen Sharma, Stanford, Aug 1994. 
+ * Modified by Bill Fenner, Xerox PARC, Feb 1995. + * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * + * MULTICAST Revision: 3.5.1.4 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/igmp.h> +#include <netinet/igmp_var.h> + +#include <machine/in_cksum.h> + +static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); + +static struct router_info * + find_rti(struct ifnet *ifp); + +static struct igmpstat igmpstat; + +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, + &igmpstat, igmpstat, ""); + +static int igmp_timers_are_running; +static u_long igmp_all_hosts_group; +static u_long igmp_all_rtrs_group; +static struct mbuf *router_alert; +static struct router_info *Head; + +static void igmp_sendpkt(struct in_multi *, int, unsigned long); + +void +igmp_init() +{ + struct ipoption *ra; + + /* + * To avoid byte-swapping the same value over and over again. + */ + igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); + igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); + + igmp_timers_are_running = 0; + + /* + * Construct a Router Alert option to use in outgoing packets + */ + MGET(router_alert, M_DONTWAIT, MT_DATA); + ra = mtod(router_alert, struct ipoption *); + ra->ipopt_dst.s_addr = 0; + ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + ra->ipopt_list[1] = 0x04; /* 4 bytes long */ + ra->ipopt_list[2] = 0x00; + ra->ipopt_list[3] = 0x00; + router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; + + Head = (struct router_info *) 0; +} + +static struct router_info * +find_rti(ifp) + struct ifnet *ifp; +{ + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> entering \n"); +#endif + while (rti) { + if (rti->rti_ifp == ifp) { +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> found old entry \n"); +#endif + return rti; + } + rti = rti->rti_next; + } + MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); + rti->rti_ifp = ifp; + rti->rti_type = IGMP_V2_ROUTER; + rti->rti_time = 0; + rti->rti_next = Head; + Head = rti; +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> created an entry \n"); +#endif + return rti; +} + +void +igmp_input(m, off) + register struct mbuf *m; + int off; +{ + register int iphlen = off; + register struct igmp *igmp; + register struct ip *ip; + register int igmplen; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + register int minlen; + register struct in_multi *inm; + register struct in_ifaddr *ia; + struct in_multistep step; + struct router_info *rti; + + int timer; /** timer value in the igmp query header **/ + + ++igmpstat.igps_rcv_total; + + ip = mtod(m, struct ip *); + igmplen = ip->ip_len; + + /* + * Validate lengths + */ + if (igmplen < IGMP_MINLEN) { + ++igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } + minlen = iphlen + IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + ++igmpstat.igps_rcv_tooshort; + return; + } + + /* + * Validate checksum + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + ++igmpstat.igps_rcv_badsum; + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += 
iphlen; + + ip = mtod(m, struct ip *); + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + rti = find_rti(ifp); + + /* + * In the IGMPv2 specification, there are 3 states and a flag. + * + * In Non-Member state, we simply don't have a membership record. + * In Delaying Member state, our timer is running (inm->inm_timer) + * In Idle Member state, our timer is not running (inm->inm_timer==0) + * + * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if + * we have heard a report from another member, or IGMP_IREPORTEDLAST + * if I sent the last report. + */ + switch (igmp->igmp_type) { + + case IGMP_MEMBERSHIP_QUERY: + ++igmpstat.igps_rcv_queries; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (igmp->igmp_code == 0) { + /* + * Old router. Remember that the querier on this + * interface is old, and set the timer to the + * value in RFC 1112. + */ + + rti->rti_type = IGMP_V1_ROUTER; + rti->rti_time = 0; + + timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + + if (ip->ip_dst.s_addr != igmp_all_hosts_group || + igmp->igmp_group.s_addr != 0) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } else { + /* + * New router. Simply do the new validity check. + */ + + if (igmp->igmp_group.s_addr != 0 && + !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to the "all-hosts" group (224.0.0.1). + * - Restart any timer that is already running but has + * a value longer than the requested timeout. + * - Use the value specified in the query message as + * the maximum timeout. + */ + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_ifp == ifp && + inm->inm_addr.s_addr != igmp_all_hosts_group && + (igmp->igmp_group.s_addr == 0 || + igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { + if (inm->inm_timer == 0 || + inm->inm_timer > timer) { + inm->inm_timer = + IGMP_RANDOM_DELAY(timer); + igmp_timers_are_running = 1; + } + } + IN_NEXT_MULTI(step, inm); + } + + break; + + case IGMP_V1_MEMBERSHIP_REPORT: + case IGMP_V2_MEMBERSHIP_REPORT: + /* + * For fast leave to work, we have to know that we are the + * last person to send a report for this group. Reports + * can potentially get looped back if we are a multicast + * router, so discard reports sourced by me. + */ + IFP_TO_IA(ifp, ia); + if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + + ++igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badreports; + m_freem(m); + return; + } + + /* + * KLUDGE: if the IP source address of the report has an + * unspecified (i.e., zero) subnet number, as is allowed for + * a booting host, replace it with the correct subnet number + * so that a process-level multicast routing daemon can + * determine which subnet it arrived from. This is necessary + * to compensate for the lack of any way for a process to + * determine the arrival interface of an incoming packet. + */ + if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) + if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); + + /* + * If we belong to the group being reported, stop + * our timer for that group. 
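+ * Also note that another member has reported, so that we do not
+ * send a leave message for this group ourselves later.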
+ */ + IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + + if (inm != NULL) { + inm->inm_timer = 0; + ++igmpstat.igps_rcv_ourreports; + + inm->inm_state = IGMP_OTHERMEMBER; + } + + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening + * on a raw IGMP socket. + */ + rip_input(m, off); +} + +void +igmp_joingroup(inm) + struct in_multi *inm; +{ + int s = splnet(); + + if (inm->inm_addr.s_addr == igmp_all_hosts_group + || inm->inm_ifp->if_flags & IFF_LOOPBACK) { + inm->inm_timer = 0; + inm->inm_state = IGMP_OTHERMEMBER; + } else { + inm->inm_rti = find_rti(inm->inm_ifp); + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); + inm->inm_state = IGMP_IREPORTEDLAST; + igmp_timers_are_running = 1; + } + splx(s); +} + +void +igmp_leavegroup(inm) + struct in_multi *inm; +{ + if (inm->inm_state == IGMP_IREPORTEDLAST && + inm->inm_addr.s_addr != igmp_all_hosts_group && + !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && + inm->inm_rti->rti_type != IGMP_V1_ROUTER) + igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); +} + +void +igmp_fasttimo() +{ + register struct in_multi *inm; + struct in_multistep step; + int s; + + /* + * Quick check to see if any work needs to be done, in order + * to minimize the overhead of fasttimo processing. + */ + + if (!igmp_timers_are_running) + return; + + s = splnet(); + igmp_timers_are_running = 0; + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_timer == 0) { + /* do nothing */ + } else if (--inm->inm_timer == 0) { + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_state = IGMP_IREPORTEDLAST; + } else { + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + splx(s); +} + +void +igmp_slowtimo() +{ + int s = splnet(); + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > entering \n"); +#endif + while (rti) { + if (rti->rti_type == IGMP_V1_ROUTER) { + rti->rti_time++; + if (rti->rti_time >= IGMP_AGE_THRESHOLD) { + rti->rti_type = IGMP_V2_ROUTER; + } + } + rti = rti->rti_next; + } +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > exiting \n"); +#endif + splx(s); +} + +static struct route igmprt; + +static void +igmp_sendpkt(inm, type, addr) + struct in_multi *inm; + int type; + unsigned long addr; +{ + struct mbuf *m; + struct igmp *igmp; + struct ip *ip; + struct ip_moptions imo; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = loif; + m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; + MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); + m->m_data += sizeof(struct ip); + m->m_len = IGMP_MINLEN; + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; + + imo.imo_multicast_ifp = inm->inm_ifp; + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; + /* + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing daemon can hear it. + */ + imo.imo_multicast_loop = (ip_mrouter != NULL); + + /* + * XXX + * Do we have to worry about reentrancy here? Don't think so. 
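+ * (The static route cache "igmprt" used below is shared
+ * across calls.)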
+ */ + ip_output(m, router_alert, &igmprt, 0, &imo); + + ++igmpstat.igps_snd_reports; +} diff --git a/sys/netinet/igmp.h b/sys/netinet/igmp.h new file mode 100644 index 0000000..7d943d6 --- /dev/null +++ b/sys/netinet/igmp.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_H_ +#define _NETINET_IGMP_H_ + +/* + * Internet Group Management Protocol (IGMP) definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.2 + */ + +/* + * IGMP packet format. + */ +struct igmp { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ +}; /* (zero for queries) */ + +#define IGMP_MINLEN 8 + +/* + * Message types, including version number. + */ +#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ +#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 
2 membership report */ +#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ + +#define IGMP_DVMRP 0x13 /* DVMRP routing message */ +#define IGMP_PIM 0x14 /* PIM routing message */ + +#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ +#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ + +#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ + /* query (in seconds) according */ + /* to RFC1112 */ + + +#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ + /* specifies time in 10th of seconds*/ + +/* + * The following four defininitions are for backwards compatibility. + * They should be removed as soon as all applications are updated to + * use the new constant names. + */ +#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY +#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT +#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP + +#endif /* _NETINET_IGMP_H_ */ diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h new file mode 100644 index 0000000..d6451a7 --- /dev/null +++ b/sys/netinet/igmp_var.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_VAR_H_ +#define _NETINET_IGMP_VAR_H_ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. 
+ * + * MULTICAST Revision: 3.5.1.3 + */ + +struct igmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ +}; + +#ifdef _KERNEL +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) + +/* + * States for IGMPv2's leave processing + */ +#define IGMP_OTHERMEMBER 0 +#define IGMP_IREPORTEDLAST 1 + +/* + * We must remember what version the subnet's querier is. + * We conveniently use the IGMP message type for the proper + * membership report to keep this state. + */ +#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT + +/* + * Revert to new router if we haven't heard from an old router in + * this amount of time. + */ +#define IGMP_AGE_THRESHOLD 540 + +void igmp_init(void); +void igmp_input(struct mbuf *, int); +void igmp_joingroup(struct in_multi *); +void igmp_leavegroup(struct in_multi *); +void igmp_fasttimo(void); +void igmp_slowtimo(void); + +SYSCTL_DECL(_net_inet_igmp); + +#endif + +/* + * Names for IGMP sysctl objects + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMPCTL_MAXID 2 + +#define IGMPCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT }, \ +} +#endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c new file mode 100644 index 0000000..5f4179d --- /dev/null +++ b/sys/netinet/in.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.c 8.4 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> + +#include <netinet/igmp_var.h> + +static MALLOC_DEFINE(M_IPMADDR, "in_multi", "internet multicast address"); + +static int in_mask2len(struct in_addr *); +static void in_len2mask(struct in_addr *, int); +static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, + struct ifnet *, struct thread *); + +static void in_socktrim(struct sockaddr_in *); +static int in_ifinit(struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int); + +static int subnetsarelocal = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, + &subnetsarelocal, 0, ""); + +struct in_multihead in_multihead; /* XXX BSS initialization */ + +extern struct inpcbinfo ripcbinfo; +extern struct inpcbinfo udbinfo; + +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +int +in_localaddr(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + if (subnetsarelocal) { + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if ((i & ia->ia_netmask) == ia->ia_net) + return (1); + } else { + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) + return (1); + } + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. 
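+ * Multicast and class E (experimental) addresses, net 0 and the
+ * loopback network are never considered forwardable.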
+ */ +int +in_canforward(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +static void +in_socktrim(ap) +struct sockaddr_in *ap; +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp >= cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +static int +in_mask2len(mask) + struct in_addr *mask; +{ + int x, y; + u_char *p; + + p = (u_char *)mask; + for (x = 0; x < sizeof(*mask); x++) { + if (p[x] != 0xff) + break; + } + y = 0; + if (x < sizeof(*mask)) { + for (y = 0; y < 8; y++) { + if ((p[x] & (0x80 >> y)) == 0) + break; + } + } + return x * 8 + y; +} + +static void +in_len2mask(mask, len) + struct in_addr *mask; + int len; +{ + int i; + u_char *p; + + p = (u_char *)mask; + bzero(mask, sizeof(*mask)); + for (i = 0; i < len / 8; i++) + p[i] = 0xff; + if (len % 8) + p[i] = (0xff00 >> (len % 8)) & 0xff; +} + +static int in_interfaces; /* number of external internet interfaces */ + +/* + * Generic internet control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +/* ARGSUSED */ +int +in_control(so, cmd, data, ifp, td) + struct socket *so; + u_long cmd; + caddr_t data; + register struct ifnet *ifp; + struct thread *td; +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia = 0, *iap; + register struct ifaddr *ifa; + struct in_addr dst; + struct in_ifaddr *oia; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, iaIsNew, maskIsNew, s; + + iaIsNew = 0; + + switch (cmd) { + case SIOCALIFADDR: + case SIOCDLIFADDR: + if (td && (error = suser(td)) != 0) + return error; + /*fall through*/ + case SIOCGLIFADDR: + if (!ifp) + return EINVAL; + return in_lifaddr_ioctl(so, cmd, data, ifp, td); + } + + /* + * Find address for this interface, if it exists. + * + * If an alias address was specified, find that one instead of + * the first one on the interface, if possible. 
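+ * The per-address hash is consulted first; failing that, the
+ * interface's address list is scanned for its first INET address.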
+ */ + if (ifp) { + dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr; + LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) + if (iap->ia_ifp == ifp && + iap->ia_addr.sin_addr.s_addr == dst.s_addr) { + ia = iap; + break; + } + if (ia == NULL) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + iap = ifatoia(ifa); + if (iap->ia_addr.sin_family == AF_INET) { + ia = iap; + break; + } + } + } + + switch (cmd) { + + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ifra->ifra_addr.sin_family == AF_INET) { + for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if ((ifp->if_flags & IFF_POINTOPOINT) + && (cmd == SIOCAIFADDR) + && (ifra->ifra_dstaddr.sin_addr.s_addr + == INADDR_ANY)) { + return EDESTADDRREQ; + } + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if (td && (error = suser(td)) != 0) + return error; + + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ia == (struct in_ifaddr *)0) { + ia = (struct in_ifaddr *) + malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO); + if (ia == (struct in_ifaddr *)NULL) + return (ENOBUFS); + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + ifa = &ia->ia_ifa; + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + + ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; + ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; + ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_sockmask.sin_len = 8; + ia->ia_sockmask.sin_family = AF_INET; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + if (!(ifp->if_flags & IFF_LOOPBACK)) + in_interfaces++; + splx(s); + iaIsNew = 1; + } + break; + + case SIOCSIFBRDADDR: + if (td && (error = suser(td)) != 0) + return error; + /* FALLTHROUGH */ + + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == (struct in_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + return (0); + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + return (0); + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + return (0); + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + return (0); + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl && (error = (*ifp->if_ioctl) + (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) { + ia->ia_dstaddr = oldaddr; + return (error); + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + return (0); + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + ia->ia_broadaddr = *(struct sockaddr_in 
*)&ifr->ifr_broadaddr; + return (0); + + case SIOCSIFADDR: + error = in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1); + if (error != 0 && iaIsNew) + break; + return (0); + + case SIOCSIFNETMASK: + ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); + return (0); + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_sockmask.sin_family = AF_INET; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if (error != 0 && iaIsNew) + break; + + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + return (error); + + case SIOCDIFADDR: + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ifp, ia); + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. + */ + in_ifadown(&ia->ia_ifa, 1); + /* + * XXX horrible hack to detect that we are being called + * from if_detach() + */ + if (ifaddr_byindex(ifp->if_index) != NULL) { + in_pcbpurgeif0(&ripcbinfo, ifp); + in_pcbpurgeif0(&udbinfo, ifp); + } + error = 0; + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + + /* + * Protect from ipintr() traversing address list while we're modifying + * it. + */ + s = splnet(); + TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + LIST_REMOVE(ia, ia_hash); + IFAFREE(&ia->ia_ifa); + splx(s); + + return (error); +} + +/* + * SIOC[GAD]LIFADDR. + * SIOCGLIFADDR: get first address. (?!?) + * SIOCGLIFADDR with IFLR_PREFIX: + * get first address that matches the specified prefix. + * SIOCALIFADDR: add the specified address. + * SIOCALIFADDR with IFLR_PREFIX: + * EINVAL since we can't deduce hostid part of the address. + * SIOCDLIFADDR: delete the specified address. + * SIOCDLIFADDR with IFLR_PREFIX: + * delete the first address that matches the specified prefix. 
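+ * SIOCALIFADDR and SIOCDLIFADDR are translated into in_aliasreq
+ * requests and handed to in_control(); SIOCGLIFADDR is answered
+ * directly from the matching in_ifaddr.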
+ * return values: + * EINVAL on invalid parameters + * EADDRNOTAVAIL on prefix match failed/specified address not found + * other values may be returned from in_ioctl() + */ +static int +in_lifaddr_ioctl(so, cmd, data, ifp, td) + struct socket *so; + u_long cmd; + caddr_t data; + struct ifnet *ifp; + struct thread *td; +{ + struct if_laddrreq *iflr = (struct if_laddrreq *)data; + struct ifaddr *ifa; + + /* sanity checks */ + if (!data || !ifp) { + panic("invalid argument to in_lifaddr_ioctl"); + /*NOTRECHED*/ + } + + switch (cmd) { + case SIOCGLIFADDR: + /* address must be specified on GET with IFLR_PREFIX */ + if ((iflr->flags & IFLR_PREFIX) == 0) + break; + /*FALLTHROUGH*/ + case SIOCALIFADDR: + case SIOCDLIFADDR: + /* address must be specified on ADD and DELETE */ + if (iflr->addr.ss_family != AF_INET) + return EINVAL; + if (iflr->addr.ss_len != sizeof(struct sockaddr_in)) + return EINVAL; + /* XXX need improvement */ + if (iflr->dstaddr.ss_family + && iflr->dstaddr.ss_family != AF_INET) + return EINVAL; + if (iflr->dstaddr.ss_family + && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in)) + return EINVAL; + break; + default: /*shouldn't happen*/ + return EOPNOTSUPP; + } + if (sizeof(struct in_addr) * 8 < iflr->prefixlen) + return EINVAL; + + switch (cmd) { + case SIOCALIFADDR: + { + struct in_aliasreq ifra; + + if (iflr->flags & IFLR_PREFIX) + return EINVAL; + + /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */ + bzero(&ifra, sizeof(ifra)); + bcopy(iflr->iflr_name, ifra.ifra_name, + sizeof(ifra.ifra_name)); + + bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len); + + if (iflr->dstaddr.ss_family) { /*XXX*/ + bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr, + iflr->dstaddr.ss_len); + } + + ifra.ifra_mask.sin_family = AF_INET; + ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in); + in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen); + + return in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td); + } + case SIOCGLIFADDR: + case SIOCDLIFADDR: + { + struct in_ifaddr *ia; + struct in_addr mask, candidate, match; + struct sockaddr_in *sin; + int cmp; + + bzero(&mask, sizeof(mask)); + if (iflr->flags & IFLR_PREFIX) { + /* lookup a prefix rather than address. 
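in_len2mask() expands iflr->prefixlen into an IPv4 netmask before the request is converted and handed to in_control(). The same conversion as a standalone sketch; prefix_to_mask() is my name, not a kernel function:

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Build an IPv4 netmask (network byte order) from a prefix length 0..32. */
static uint32_t
prefix_to_mask(int prefixlen)
{
	if (prefixlen <= 0)
		return (htonl(0));
	if (prefixlen >= 32)
		return (htonl(0xffffffffU));
	return (htonl(0xffffffffU << (32 - prefixlen)));
}

int
main(void)
{
	struct in_addr mask;

	mask.s_addr = prefix_to_mask(24);
	printf("/24 -> %s\n", inet_ntoa(mask));	/* prints 255.255.255.0 */
	return (0);
}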
*/ + in_len2mask(&mask, iflr->prefixlen); + + sin = (struct sockaddr_in *)&iflr->addr; + match.s_addr = sin->sin_addr.s_addr; + match.s_addr &= mask.s_addr; + + /* if you set extra bits, that's wrong */ + if (match.s_addr != sin->sin_addr.s_addr) + return EINVAL; + + cmp = 1; + } else { + if (cmd == SIOCGLIFADDR) { + /* on getting an address, take the 1st match */ + cmp = 0; /*XXX*/ + } else { + /* on deleting an address, do exact match */ + in_len2mask(&mask, 32); + sin = (struct sockaddr_in *)&iflr->addr; + match.s_addr = sin->sin_addr.s_addr; + + cmp = 1; + } + } + + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + if (!cmp) + break; + candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr; + candidate.s_addr &= mask.s_addr; + if (candidate.s_addr == match.s_addr) + break; + } + if (!ifa) + return EADDRNOTAVAIL; + ia = (struct in_ifaddr *)ifa; + + if (cmd == SIOCGLIFADDR) { + /* fill in the if_laddrreq structure */ + bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); + + if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { + bcopy(&ia->ia_dstaddr, &iflr->dstaddr, + ia->ia_dstaddr.sin_len); + } else + bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); + + iflr->prefixlen = + in_mask2len(&ia->ia_sockmask.sin_addr); + + iflr->flags = 0; /*XXX*/ + + return 0; + } else { + struct in_aliasreq ifra; + + /* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */ + bzero(&ifra, sizeof(ifra)); + bcopy(iflr->iflr_name, ifra.ifra_name, + sizeof(ifra.ifra_name)); + + bcopy(&ia->ia_addr, &ifra.ifra_addr, + ia->ia_addr.sin_len); + if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { + bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr, + ia->ia_dstaddr.sin_len); + } + bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, + ia->ia_sockmask.sin_len); + + return in_control(so, SIOCDIFADDR, (caddr_t)&ifra, + ifp, td); + } + } + } + + return EOPNOTSUPP; /*just for safety*/ +} + +/* + * Delete any existing route for an interface. + */ +void +in_ifscrub(ifp, ia) + register struct ifnet *ifp; + register struct in_ifaddr *ia; +{ + + if ((ia->ia_flags & IFA_ROUTE) == 0) + return; + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; +} + +/* + * Initialize an interface's internet address + * and routing table entry. + */ +static int +in_ifinit(ifp, ia, sin, scrub) + register struct ifnet *ifp; + register struct in_ifaddr *ia; + struct sockaddr_in *sin; + int scrub; +{ + register u_long i = ntohl(sin->sin_addr.s_addr); + struct sockaddr_in oldaddr; + int s = splimp(), flags = RTF_UP, error = 0; + + oldaddr = ia->ia_addr; + ia->ia_addr = *sin; + /* + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. 
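The prefix lookup above masks both the requested address and each candidate interface address and compares the results. The same test in isolation (the helper name is mine):

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Does addr fall inside prefix/mask?  All values in network byte order. */
static int
addr_matches_prefix(uint32_t addr, uint32_t prefix, uint32_t mask)
{
	return ((addr & mask) == (prefix & mask));
}

int
main(void)
{
	uint32_t addr = inet_addr("192.0.2.77");
	uint32_t prefix = inet_addr("192.0.2.0");
	uint32_t mask = inet_addr("255.255.255.0");

	printf("%d\n", addr_matches_prefix(addr, prefix, mask));	/* 1 */
	return (0);
}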
+ */ + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) { + splx(s); + ia->ia_addr = oldaddr; + return (error); + } + if (oldaddr.sin_family == AF_INET) + LIST_REMOVE(ia, ia_hash); + if (ia->ia_addr.sin_family == AF_INET) + LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + splx(s); + if (scrub) { + ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + in_ifscrub(ifp, ia); + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + } + if (IN_CLASSA(i)) + ia->ia_netmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_netmask = IN_CLASSB_NET; + else + ia->ia_netmask = IN_CLASSC_NET; + /* + * The subnet mask usually includes at least the standard network part, + * but may may be smaller in the case of supernetting. + * If it is set, we believe it. + */ + if (ia->ia_subnetmask == 0) { + ia->ia_subnetmask = ia->ia_netmask; + ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); + } else + ia->ia_netmask &= ia->ia_subnetmask; + ia->ia_net = i & ia->ia_netmask; + ia->ia_subnet = i & ia->ia_subnetmask; + in_socktrim(&ia->ia_sockmask); + /* + * Add route for the network. + */ + ia->ia_ifa.ifa_metric = ifp->if_metric; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); + ia->ia_netbroadcast.s_addr = + htonl(ia->ia_net | ~ ia->ia_netmask); + } else if (ifp->if_flags & IFF_LOOPBACK) { + ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr; + flags |= RTF_HOST; + } else if (ifp->if_flags & IFF_POINTOPOINT) { + if (ia->ia_dstaddr.sin_family != AF_INET) + return (0); + flags |= RTF_HOST; + } + + /*- + * Don't add host routes for interface addresses of + * 0.0.0.0 --> 0.255.255.255 netmask 255.0.0.0. This makes it + * possible to assign several such address pairs with consistent + * results (no host route) and is required by BOOTP. + * + * XXX: This is ugly ! There should be a way for the caller to + * say that they don't want a host route. + */ + if (ia->ia_addr.sin_addr.s_addr != INADDR_ANY || + ia->ia_netmask != IN_CLASSA_NET || + ia->ia_dstaddr.sin_addr.s_addr != htonl(IN_CLASSA_HOST)) { + if ((error = rtinit(&ia->ia_ifa, (int)RTM_ADD, flags)) != 0) { + ia->ia_addr = oldaddr; + return (error); + } + ia->ia_flags |= IFA_ROUTE; + } + + /* + * If the interface supports multicast, join the "all hosts" + * multicast group on that interface. + */ + if (ifp->if_flags & IFF_MULTICAST) { + struct in_addr addr; + + addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + in_addmulti(&addr, ifp); + } + return (error); +} + + +/* + * Return 1 if the address might be a local broadcast address. + */ +int +in_broadcast(in, ifp) + struct in_addr in; + struct ifnet *ifp; +{ + register struct ifaddr *ifa; + u_long t; + + if (in.s_addr == INADDR_BROADCAST || + in.s_addr == INADDR_ANY) + return 1; + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return 0; + t = ntohl(in.s_addr); + /* + * Look through the list of addresses for a match + * with a broadcast address. + */ +#define ia ((struct in_ifaddr *)ifa) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET && + (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || + in.s_addr == ia->ia_netbroadcast.s_addr || + /* + * Check for old-style (host 0) broadcast. + */ + t == ia->ia_subnet || t == ia->ia_net) && + /* + * Check for an all one subnetmask. These + * only exist when an interface gets a secondary + * address. 
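in_ifinit() above first falls back to a classful mask when no subnet mask has been configured, then computes the directed-broadcast address as the subnet bits with an all-ones host part. The same arithmetic in a standalone sketch; the /24 subnet mask is an assumed example:

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in_addr a, bcast;
	uint32_t i, netmask, subnetmask;

	a.s_addr = inet_addr("10.1.2.3");
	i = ntohl(a.s_addr);

	/* Default (classful) mask, as in in_ifinit(). */
	if (IN_CLASSA(i))
		netmask = IN_CLASSA_NET;	/* 255.0.0.0 */
	else if (IN_CLASSB(i))
		netmask = IN_CLASSB_NET;
	else
		netmask = IN_CLASSC_NET;

	/* Suppose the administrator configured a /24 subnet mask instead. */
	subnetmask = 0xffffff00U;

	/* Directed broadcast: subnet bits plus an all-ones host part. */
	bcast.s_addr = htonl((i & subnetmask) | ~subnetmask);
	printf("netmask 0x%08x  broadcast %s\n",
	    (unsigned int)netmask, inet_ntoa(bcast));
	/* prints: netmask 0xff000000  broadcast 10.1.2.255 */
	return (0);
}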
+ */ + ia->ia_subnetmask != (u_long)0xffffffff) + return 1; + return (0); +#undef ia +} +/* + * Add an address to the list of IP multicast addresses for a given interface. + */ +struct in_multi * +in_addmulti(ap, ifp) + register struct in_addr *ap; + register struct ifnet *ifp; +{ + register struct in_multi *inm; + int error; + struct sockaddr_in sin; + struct ifmultiaddr *ifma; + int s = splnet(); + + /* + * Call generic routine to add membership or increment + * refcount. It wants addresses in the form of a sockaddr, + * so we build one here (being careful to zero the unused bytes). + */ + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = *ap; + error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); + if (error) { + splx(s); + return 0; + } + + /* + * If ifma->ifma_protospec is null, then if_addmulti() created + * a new record. Otherwise, we are done. + */ + if (ifma->ifma_protospec != 0) { + splx(s); + return ifma->ifma_protospec; + } + + /* XXX - if_addmulti uses M_WAITOK. Can this really be called + at interrupt time? If so, need to fix if_addmulti. XXX */ + inm = (struct in_multi *)malloc(sizeof(*inm), M_IPMADDR, + M_NOWAIT | M_ZERO); + if (inm == NULL) { + splx(s); + return (NULL); + } + + inm->inm_addr = *ap; + inm->inm_ifp = ifp; + inm->inm_ifma = ifma; + ifma->ifma_protospec = inm; + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); + + /* + * Let IGMP know that we have joined a new IP multicast group. + */ + igmp_joingroup(inm); + splx(s); + return (inm); +} + +/* + * Delete a multicast address record. + */ +void +in_delmulti(inm) + register struct in_multi *inm; +{ + struct ifmultiaddr *ifma = inm->inm_ifma; + struct in_multi my_inm; + int s = splnet(); + + my_inm.inm_ifp = NULL ; /* don't send the leave msg */ + if (ifma->ifma_refcount == 1) { + /* + * No remaining claims to this record; let IGMP know that + * we are leaving the multicast group. + * But do it after the if_delmulti() which might reset + * the interface and nuke the packet. + */ + my_inm = *inm ; + ifma->ifma_protospec = 0; + LIST_REMOVE(inm, inm_link); + free(inm, M_IPMADDR); + } + /* XXX - should be separate API for when we have an ifma? */ + if_delmulti(ifma->ifma_ifp, ifma->ifma_addr); + if (my_inm.inm_ifp != NULL) + igmp_leavegroup(&my_inm); + splx(s); +} diff --git a/sys/netinet/in.h b/sys/netinet/in.h new file mode 100644 index 0000000..282415d --- /dev/null +++ b/sys/netinet/in.h @@ -0,0 +1,550 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
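in_addmulti() and in_delmulti() above are reached from userland through the IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP socket options (declared further down in in.h) together with struct ip_mreq. A minimal sketch, not part of this patch; 224.0.0.251 is just an example group:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ip_mreq mreq;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror("socket");
		return (1);
	}

	mreq.imr_multiaddr.s_addr = inet_addr("224.0.0.251");	/* example group */
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);		/* let the kernel pick */

	/* The join ends up in in_addmulti(), which speaks IGMP for us. */
	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
		perror("IP_ADD_MEMBERSHIP");

	/* ... receive traffic for the group ... */

	if (setsockopt(s, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
		perror("IP_DROP_MEMBERSHIP");
	close(s);
	return (0);
}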
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.h 8.3 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_H_ +#define _NETINET_IN_H_ + +#include <sys/cdefs.h> +#include <sys/_types.h> +#include <machine/endian.h> + +/* Protocols common to RFC 1700, POSIX, and X/Open. */ +#define IPPROTO_IP 0 /* dummy for IP */ +#define IPPROTO_ICMP 1 /* control message protocol */ +#define IPPROTO_TCP 6 /* tcp */ +#define IPPROTO_UDP 17 /* user datagram protocol */ + +#define INADDR_ANY (u_int32_t)0x00000000 +#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ + +#ifndef _UINT8_T_DECLARED +typedef __uint8_t uint8_t; +#define _UINT8_T_DECLARED +#endif + +#ifndef _UINT16_T_DECLARED +typedef __uint16_t uint16_t; +#define _UINT16_T_DECLARED +#endif + +#ifndef _UINT32_T_DECLARED +typedef __uint32_t uint32_t; +#define _UINT32_T_DECLARED +#endif + +#ifndef _IN_ADDR_T_DECLARED +typedef uint32_t in_addr_t; +#define _IN_ADDR_T_DECLARED +#endif + +#ifndef _IN_PORT_T_DECLARED +typedef uint16_t in_port_t; +#define _IN_PORT_T_DECLARED +#endif + +#ifdef _BSD_SA_FAMILY_T_ +typedef _BSD_SA_FAMILY_T_ sa_family_t; +#undef _BSD_SA_FAMILY_T_ +#endif + +/* Internet address (a structure for historical reasons). */ +#ifndef _STRUCT_IN_ADDR_DECLARED +struct in_addr { + in_addr_t s_addr; +}; +#define _STRUCT_IN_ADDR_DECLARED +#endif + +/* Socket address, internet style. */ +struct sockaddr_in { + uint8_t sin_len; + sa_family_t sin_family; + in_port_t sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; + +#ifndef _KERNEL + +#ifndef _BYTEORDER_PROTOTYPED +#define _BYTEORDER_PROTOTYPED +__BEGIN_DECLS +uint32_t htonl(uint32_t); +uint16_t htons(uint16_t); +uint32_t ntohl(uint32_t); +uint16_t ntohs(uint16_t); +__END_DECLS +#endif + +#ifndef _BYTEORDER_FUNC_DEFINED +#define _BYTEORDER_FUNC_DEFINED +#define htonl(x) __htonl(x) +#define htons(x) __htons(x) +#define ntohl(x) __ntohl(x) +#define ntohs(x) __ntohs(x) +#endif + +#endif /* !_KERNEL */ + +#if __POSIX_VISIBLE >= 200112 +#define IPPROTO_RAW 255 /* raw IP packet */ +#define INET_ADDRSTRLEN 16 +#endif + +#if __BSD_VISIBLE +/* + * Constants and structures defined by the internet system, + * Per RFC 790, September 1981, and numerous additions. 
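struct sockaddr_in and the htonl()/htons() prototypes above are the userland view of these addresses; every multi-byte field travels in network byte order. A small illustrative sketch:

#include <string.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));		/* clears sin_zero as required */
	sin.sin_len = sizeof(sin);		/* BSD-style length field */
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);		/* 16-bit port, network order */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);	/* 127.0.0.1 */

	printf("%s:%d\n", inet_ntoa(sin.sin_addr), ntohs(sin.sin_port));
	return (0);
}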
+ */ + +/* + * Protocols (RFC 1700) + */ +#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#define IPPROTO_IGMP 2 /* group mgmt protocol */ +#define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ +#define IPPROTO_IPV4 4 /* IPv4 encapsulation */ +#define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ +#define IPPROTO_ST 7 /* Stream protocol II */ +#define IPPROTO_EGP 8 /* exterior gateway protocol */ +#define IPPROTO_PIGP 9 /* private interior gateway */ +#define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ +#define IPPROTO_NVPII 11 /* network voice protocol*/ +#define IPPROTO_PUP 12 /* pup */ +#define IPPROTO_ARGUS 13 /* Argus */ +#define IPPROTO_EMCON 14 /* EMCON */ +#define IPPROTO_XNET 15 /* Cross Net Debugger */ +#define IPPROTO_CHAOS 16 /* Chaos*/ +#define IPPROTO_MUX 18 /* Multiplexing */ +#define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ +#define IPPROTO_HMP 20 /* Host Monitoring */ +#define IPPROTO_PRM 21 /* Packet Radio Measurement */ +#define IPPROTO_IDP 22 /* xns idp */ +#define IPPROTO_TRUNK1 23 /* Trunk-1 */ +#define IPPROTO_TRUNK2 24 /* Trunk-2 */ +#define IPPROTO_LEAF1 25 /* Leaf-1 */ +#define IPPROTO_LEAF2 26 /* Leaf-2 */ +#define IPPROTO_RDP 27 /* Reliable Data */ +#define IPPROTO_IRTP 28 /* Reliable Transaction */ +#define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ +#define IPPROTO_BLT 30 /* Bulk Data Transfer */ +#define IPPROTO_NSP 31 /* Network Services */ +#define IPPROTO_INP 32 /* Merit Internodal */ +#define IPPROTO_SEP 33 /* Sequential Exchange */ +#define IPPROTO_3PC 34 /* Third Party Connect */ +#define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ +#define IPPROTO_XTP 36 /* XTP */ +#define IPPROTO_DDP 37 /* Datagram Delivery */ +#define IPPROTO_CMTP 38 /* Control Message Transport */ +#define IPPROTO_TPXX 39 /* TP++ Transport */ +#define IPPROTO_IL 40 /* IL transport protocol */ +#define IPPROTO_IPV6 41 /* IP6 header */ +#define IPPROTO_SDRP 42 /* Source Demand Routing */ +#define IPPROTO_ROUTING 43 /* IP6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ +#define IPPROTO_IDRP 45 /* InterDomain Routing*/ +#define IPPROTO_RSVP 46 /* resource reservation */ +#define IPPROTO_GRE 47 /* General Routing Encap. */ +#define IPPROTO_MHRP 48 /* Mobile Host Routing */ +#define IPPROTO_BHA 49 /* BHA */ +#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ +#define IPPROTO_AH 51 /* IP6 Auth Header */ +#define IPPROTO_INLSP 52 /* Integ. Net Layer Security */ +#define IPPROTO_SWIPE 53 /* IP with encryption */ +#define IPPROTO_NHRP 54 /* Next Hop Resolution */ +#define IPPROTO_MOBILE 55 /* IP Mobility */ +#define IPPROTO_TLSP 56 /* Transport Layer Security */ +#define IPPROTO_SKIP 57 /* SKIP */ +#define IPPROTO_ICMPV6 58 /* ICMP6 */ +#define IPPROTO_NONE 59 /* IP6 no next header */ +#define IPPROTO_DSTOPTS 60 /* IP6 destination option */ +#define IPPROTO_AHIP 61 /* any host internal protocol */ +#define IPPROTO_CFTP 62 /* CFTP */ +#define IPPROTO_HELLO 63 /* "hello" routing protocol */ +#define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ +#define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ +#define IPPROTO_RVD 66 /* Remote Virtual Disk */ +#define IPPROTO_IPPC 67 /* Pluribus Packet Core */ +#define IPPROTO_ADFS 68 /* Any distributed FS */ +#define IPPROTO_SATMON 69 /* Satnet Monitoring */ +#define IPPROTO_VISA 70 /* VISA Protocol */ +#define IPPROTO_IPCV 71 /* Packet Core Utility */ +#define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ +#define IPPROTO_CPHB 73 /* Comp. Prot. 
HeartBeat */ +#define IPPROTO_WSN 74 /* Wang Span Network */ +#define IPPROTO_PVP 75 /* Packet Video Protocol */ +#define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ +#define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ +#define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ +#define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ +#define IPPROTO_EON 80 /* ISO cnlp */ +#define IPPROTO_VMTP 81 /* VMTP */ +#define IPPROTO_SVMTP 82 /* Secure VMTP */ +#define IPPROTO_VINES 83 /* Banyon VINES */ +#define IPPROTO_TTP 84 /* TTP */ +#define IPPROTO_IGP 85 /* NSFNET-IGP */ +#define IPPROTO_DGP 86 /* dissimilar gateway prot. */ +#define IPPROTO_TCF 87 /* TCF */ +#define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_SRPC 90 /* Strite RPC protocol */ +#define IPPROTO_LARP 91 /* Locus Address Resoloution */ +#define IPPROTO_MTP 92 /* Multicast Transport */ +#define IPPROTO_AX25 93 /* AX.25 Frames */ +#define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ +#define IPPROTO_MICP 95 /* Mobile Int.ing control */ +#define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ +#define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ +#define IPPROTO_ENCAP 98 /* encapsulation header */ +#define IPPROTO_APES 99 /* any private encr. scheme */ +#define IPPROTO_GMTP 100 /* GMTP*/ +#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ +/* 101-254: Partly Unassigned */ +#define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_PGM 113 /* PGM */ +/* 255: Reserved */ +/* BSD Private, local use, namespace incursion */ +#define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ +#define IPPROTO_MAX 256 + +/* last return value of *_input(), meaning "all job for this pkt is done". */ +#define IPPROTO_DONE 257 + +/* + * Local port number conventions: + * + * When a user does a bind(2) or connect(2) with a port number of zero, + * a non-conflicting local port address is chosen. + * The default range is IPPORT_HIFIRSTAUTO through + * IPPORT_HILASTAUTO, although that is settable by sysctl. + * + * A user may set the IPPROTO_IP option IP_PORTRANGE to change this + * default assignment range. + * + * The value IP_PORTRANGE_DEFAULT causes the default behavior. + * + * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers + * into the "high" range. These are reserved for client outbound connections + * which do not want to be filtered by any firewalls. Note that by default + * this is the same as IP_PORTRANGE_DEFAULT. + * + * The value IP_PORTRANGE_LOW changes the range to the "low" are + * that is (by convention) restricted to privileged processes. This + * convention is based on "vouchsafe" principles only. It is only secure + * if you trust the remote host to restrict these ports. + * + * The default range of ports and the high range can be changed by + * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto) + * + * Changing those values has bad security implications if you are + * using a a stateless firewall that is allowing packets outside of that + * range in order to allow transparent outgoing connections. + * + * Such a firewall configuration will generally depend on the use of these + * default values. If you change them, you may find your Security + * Administrator looking for you with a heavy object. 
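A socket chooses among these ranges with the IP_PORTRANGE option described above (the option value itself is defined further down). A minimal sketch, not part of this patch:

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int s, range = IP_PORTRANGE_HIGH;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror("socket");
		return (1);
	}

	/*
	 * Ask for an ephemeral port from the "high" range when the port
	 * is left unspecified at bind()/connect() time.
	 */
	if (setsockopt(s, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
		perror("IP_PORTRANGE");
	close(s);
	return (0);
}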
+ * + * For a slightly more orthodox text view on this: + * + * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers + * + * port numbers are divided into three ranges: + * + * 0 - 1023 Well Known Ports + * 1024 - 49151 Registered Ports + * 49152 - 65535 Dynamic and/or Private Ports + * + */ + +/* + * Ports < IPPORT_RESERVED are reserved for + * privileged processes (e.g. root). (IP_PORTRANGE_LOW) + */ +#define IPPORT_RESERVED 1024 + +/* + * Default local port range, used by both IP_PORTRANGE_DEFAULT + * and IP_PORTRANGE_HIGH. + */ +#define IPPORT_HIFIRSTAUTO 49152 +#define IPPORT_HILASTAUTO 65535 + +/* + * Scanning for a free reserved port return a value below IPPORT_RESERVED, + * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was + * 512, but that conflicts with some well-known-services that firewalls may + * have a fit if we use. + */ +#define IPPORT_RESERVEDSTART 600 + +#define IPPORT_MAX 65535 + +/* + * Definitions of bits in internet address integers. + * On subnets, the decomposition of addresses to host and net parts + * is done according to subnet mask, not the masks here. + */ +#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) +#define IN_CLASSA_NET 0xff000000 +#define IN_CLASSA_NSHIFT 24 +#define IN_CLASSA_HOST 0x00ffffff +#define IN_CLASSA_MAX 128 + +#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000) +#define IN_CLASSB_NET 0xffff0000 +#define IN_CLASSB_NSHIFT 16 +#define IN_CLASSB_HOST 0x0000ffff +#define IN_CLASSB_MAX 65536 + +#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000) +#define IN_CLASSC_NET 0xffffff00 +#define IN_CLASSC_NSHIFT 8 +#define IN_CLASSC_HOST 0x000000ff + +#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ +#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ +#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ +#define IN_MULTICAST(i) IN_CLASSD(i) + +#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) +#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) + +#define INADDR_LOOPBACK (u_int32_t)0x7f000001 +#ifndef _KERNEL +#define INADDR_NONE 0xffffffff /* -1 return */ +#endif + +#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ + +#define IN_LOOPBACKNET 127 /* official! */ + +/* + * Options for use with [gs]etsockopt at the IP level. + * First word of comment is data type; bool is stored in int. + */ +#define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ +#define IP_HDRINCL 2 /* int; header is included with data */ +#define IP_TOS 3 /* int; IP type of service and preced. 
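The class macros above take the address in host byte order, hence the ntohl() at their call sites. A small classification sketch; classify() is my helper, not part of the header:

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static const char *
classify(const char *dotted)
{
	in_addr_t i = ntohl(inet_addr(dotted));

	if (IN_MULTICAST(i))
		return ("multicast (class D)");
	if (IN_EXPERIMENTAL(i))
		return ("experimental (class E)");
	if (IN_CLASSA(i))
		return ("class A");
	if (IN_CLASSB(i))
		return ("class B");
	return ("class C");
}

int
main(void)
{
	printf("%s\n", classify("224.0.0.1"));	/* multicast (class D) */
	printf("%s\n", classify("192.0.2.1"));	/* class C */
	return (0);
}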
*/ +#define IP_TTL 4 /* int; IP time to live */ +#define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ +#define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ +#define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ +#define IP_RETOPTS 8 /* ip_opts; set/get IP options */ +#define IP_MULTICAST_IF 9 /* u_char; set/get IP multicast i/f */ +#define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ +#define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ +#define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ +#define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ +#define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ +#define IP_RSVP_ON 15 /* enable RSVP in kernel */ +#define IP_RSVP_OFF 16 /* disable RSVP in kernel */ +#define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ +#define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ +#define IP_PORTRANGE 19 /* int; range to choose for unspec port */ +#define IP_RECVIF 20 /* bool; receive reception if w/dgram */ +/* for IPSEC */ +#define IP_IPSEC_POLICY 21 /* int; set/get security policy */ +#define IP_FAITH 22 /* bool; accept FAITH'ed connections */ + +#define IP_FW_ADD 50 /* add a firewall rule to chain */ +#define IP_FW_DEL 51 /* delete a firewall rule from chain */ +#define IP_FW_FLUSH 52 /* flush firewall rule chain */ +#define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ +#define IP_FW_GET 54 /* get entire firewall rule chain */ +#define IP_FW_RESETLOG 55 /* reset logging counters */ + +#define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ +#define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ +#define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ +#define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ + +/* + * Defaults and limits for options + */ +#define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ +#define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ +#define IP_MAX_MEMBERSHIPS 20 /* per socket */ + +/* + * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. + */ +struct ip_mreq { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +/* + * Argument for IP_PORTRANGE: + * - which range to search when port is unspecified at bind() or connect() + */ +#define IP_PORTRANGE_DEFAULT 0 /* default range */ +#define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ +#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ + +/* + * Definitions for inet sysctl operations. + * + * Third level is protocol number. + * Fourth level is desired variable within that protocol. 
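Following that naming scheme, the default TTL can be read with the numeric MIB { CTL_NET, PF_INET, IPPROTO_IP, IPCTL_DEFTTL }, i.e. net.inet.ip.ttl. A sketch, not part of this patch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	int mib[4] = { CTL_NET, PF_INET, IPPROTO_IP, IPCTL_DEFTTL };
	int ttl;
	size_t len = sizeof(ttl);

	/* Third level is the protocol number, fourth the variable. */
	if (sysctl(mib, 4, &ttl, &len, NULL, 0) == -1) {
		perror("sysctl");
		return (1);
	}
	printf("default IP TTL: %d\n", ttl);
	return (0);
}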
+ */ +#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */ + +#define CTL_IPPROTO_NAMES { \ + { "ip", CTLTYPE_NODE }, \ + { "icmp", CTLTYPE_NODE }, \ + { "igmp", CTLTYPE_NODE }, \ + { "ggp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "tcp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { "egp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "pup", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "udp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "idp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "ipsec", CTLTYPE_NODE }, \ +} + +/* + * Names for IP sysctl objects + */ +#define IPCTL_FORWARDING 1 /* act as router */ +#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ +#define IPCTL_DEFTTL 3 /* default TTL */ +#ifdef notyet +#define IPCTL_DEFMTU 4 /* default MTU */ +#endif +#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */ +#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */ +#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */ +#define IPCTL_SOURCEROUTE 8 /* may perform source routes */ +#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ +#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ +#define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ +#define IPCTL_STATS 12 /* ipstat structure */ +#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ +#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ +#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */ +#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ +#define IPCTL_MAXID 17 + +#define IPCTL_NAMES { \ + { 0, 0 }, \ + { "forwarding", CTLTYPE_INT }, \ + { "redirect", CTLTYPE_INT }, \ + { "ttl", CTLTYPE_INT }, \ + { "mtu", CTLTYPE_INT }, \ + { "rtexpire", CTLTYPE_INT }, \ + { "rtminexpire", CTLTYPE_INT }, \ + { "rtmaxcache", CTLTYPE_INT }, \ + { "sourceroute", CTLTYPE_INT }, \ + { "directed-broadcast", CTLTYPE_INT }, \ + { "intr-queue-maxlen", CTLTYPE_INT }, \ + { "intr-queue-drops", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "accept_sourceroute", CTLTYPE_INT }, \ + { "fastforwarding", CTLTYPE_INT }, \ +} + +#endif /* __BSD_VISIBLE */ + +#ifdef _KERNEL + +struct ifnet; struct mbuf; /* forward declarations for Standard C */ + +int in_broadcast(struct in_addr, struct ifnet *); +int in_canforward(struct in_addr); +int in_localaddr(struct in_addr); +char *inet_ntoa(struct in_addr); /* in libkern */ +char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ + +#define satosin(sa) ((struct sockaddr_in *)(sa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) + +#endif /* _KERNEL */ + +/* INET6 stuff */ +#if __POSIX_VISIBLE >= 200112 +#define __KAME_NETINET_IN_H_INCLUDED_ +#include <netinet6/in6.h> +#undef __KAME_NETINET_IN_H_INCLUDED_ +#endif + +#endif /* !_NETINET_IN_H_*/ diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c new file mode 100644 index 0000000..eaf1493 --- /dev/null +++ b/sys/netinet/in_cksum.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 1988, 1992, 1993 + * The 
Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/mbuf.h> + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int +in_cksum(m, len) + register struct mbuf *m; + register int len; +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (int) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. 
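For reference, a plain flat-buffer version of the same one's-complement sum (in the spirit of RFC 1071), without the mbuf walking, alignment fix-ups and loop unrolling that in_cksum() performs; an illustrative sketch only, which glosses over the byte-order handling of the final 16-bit value:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* One's-complement checksum over a contiguous buffer. */
static uint16_t
cksum_flat(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint16_t)((p[0] << 8) | p[1]);
		p += 2;
		len -= 2;
	}
	if (len == 1)			/* odd trailing byte, zero-padded */
		sum += (uint16_t)(p[0] << 8);
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x1c };	/* made-up bytes */

	printf("0x%04x\n", cksum_flat(hdr, sizeof(hdr)));
	return (0);
}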
+ */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c new file mode 100644 index 0000000..b7a1cec --- /dev/null +++ b/sys/netinet/in_gif.c @@ -0,0 +1,354 @@ +/* $FreeBSD$ */ +/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_mrouting.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <sys/malloc.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_gif.h> +#include <netinet/in_var.h> +#include <netinet/ip_encap.h> +#include <netinet/ip_ecn.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#ifdef MROUTING +#include <netinet/ip_mroute.h> +#endif /* MROUTING */ + +#include <net/if_gif.h> + +#include <net/net_osdep.h> + +static int ip_gif_ttl = GIF_TTL; +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, + &ip_gif_ttl, 0, ""); + +int +in_gif_output(ifp, family, m, rt) + struct ifnet *ifp; + int family; + struct mbuf *m; + struct rtentry *rt; +{ + struct gif_softc *sc = (struct gif_softc*)ifp; + struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; + struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; + struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; + struct ip iphdr; /* capsule IP header, host byte ordered */ + int proto, error; + u_int8_t tos; + + if (sin_src == NULL || sin_dst == NULL || + sin_src->sin_family != AF_INET || + sin_dst->sin_family != AF_INET) { + m_freem(m); + return EAFNOSUPPORT; + } + + switch (family) { +#ifdef INET + case AF_INET: + { + struct ip *ip; + + proto = IPPROTO_IPV4; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return ENOBUFS; + } + ip = mtod(m, struct ip *); + tos = ip->ip_tos; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct ip6_hdr *ip6; + proto = IPPROTO_IPV6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return ENOBUFS; + } + ip6 = mtod(m, struct ip6_hdr *); + tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; + } +#endif /* INET6 */ + default: +#ifdef DEBUG + printf("in_gif_output: warning: unknown family %d passed\n", + family); +#endif + m_freem(m); + return EAFNOSUPPORT; + } + + bzero(&iphdr, sizeof(iphdr)); + iphdr.ip_src = sin_src->sin_addr; + /* bidirectional configured tunnel mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else { + m_freem(m); + return ENETUNREACH; + } + iphdr.ip_p = proto; + /* version will be set in ip_output() */ + iphdr.ip_ttl = ip_gif_ttl; + iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); + if (ifp->if_flags & IFF_LINK1) + ip_ecn_ingress(ECN_ALLOWED, &iphdr.ip_tos, &tos); + else + ip_ecn_ingress(ECN_NOCARE, &iphdr.ip_tos, &tos); + + /* prepend new IP header */ + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m && m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) { + printf("ENOBUFS in in_gif_output %d\n", __LINE__); + return ENOBUFS; + } + bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); + + if (dst->sin_family != sin_dst->sin_family || + dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { + /* cache route doesn't match */ + dst->sin_family = sin_dst->sin_family; + dst->sin_len = sizeof(struct sockaddr_in); + dst->sin_addr = sin_dst->sin_addr; + if (sc->gif_ro.ro_rt) { + RTFREE(sc->gif_ro.ro_rt); + sc->gif_ro.ro_rt = NULL; + } +#if 0 + sc->gif_if.if_mtu = GIF_MTU; +#endif + } + + if (sc->gif_ro.ro_rt == NULL) { + rtalloc(&sc->gif_ro); + if 
(sc->gif_ro.ro_rt == NULL) { + m_freem(m); + return ENETUNREACH; + } + + /* if it constitutes infinite encapsulation, punt. */ + if (sc->gif_ro.ro_rt->rt_ifp == ifp) { + m_freem(m); + return ENETUNREACH; /* XXX */ + } +#if 0 + ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu + - sizeof(struct ip); +#endif + } + + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL); + return(error); +} + +void +in_gif_input(m, off) + struct mbuf *m; + int off; +{ + struct ifnet *gifp = NULL; + struct ip *ip; + int af; + u_int8_t otos; + int proto; + + ip = mtod(m, struct ip *); + proto = ip->ip_p; + + gifp = (struct ifnet *)encap_getarg(m); + + if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + m_freem(m); + ipstat.ips_nogif++; + return; + } + + otos = ip->ip_tos; + m_adj(m, off); + + switch (proto) { +#ifdef INET + case IPPROTO_IPV4: + { + struct ip *ip; + af = AF_INET; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return; + } + ip = mtod(m, struct ip *); + if (gifp->if_flags & IFF_LINK1) + ip_ecn_egress(ECN_ALLOWED, &otos, &ip->ip_tos); + else + ip_ecn_egress(ECN_NOCARE, &otos, &ip->ip_tos); + break; + } +#endif +#ifdef INET6 + case IPPROTO_IPV6: + { + struct ip6_hdr *ip6; + u_int8_t itos; + af = AF_INET6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return; + } + ip6 = mtod(m, struct ip6_hdr *); + itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + if (gifp->if_flags & IFF_LINK1) + ip_ecn_egress(ECN_ALLOWED, &otos, &itos); + else + ip_ecn_egress(ECN_NOCARE, &otos, &itos); + ip6->ip6_flow &= ~htonl(0xff << 20); + ip6->ip6_flow |= htonl((u_int32_t)itos << 20); + break; + } +#endif /* INET6 */ + default: + ipstat.ips_nogif++; + m_freem(m); + return; + } + gif_input(m, af, gifp); + return; +} + +/* + * we know that we are in IFF_UP, outer address available, and outer family + * matched the physical addr family. see gif_encapcheck(). + */ +int +gif_encapcheck4(m, off, proto, arg) + const struct mbuf *m; + int off; + int proto; + void *arg; +{ + struct ip ip; + struct gif_softc *sc; + struct sockaddr_in *src, *dst; + int addrmatch; + struct in_ifaddr *ia4; + + /* sanity check done in caller */ + sc = (struct gif_softc *)arg; + src = (struct sockaddr_in *)sc->gif_psrc; + dst = (struct sockaddr_in *)sc->gif_pdst; + + /* LINTED const cast */ + m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); + + /* check for address match */ + addrmatch = 0; + if (src->sin_addr.s_addr == ip.ip_dst.s_addr) + addrmatch |= 1; + if (dst->sin_addr.s_addr == ip.ip_src.s_addr) + addrmatch |= 2; + if (addrmatch != 3) + return 0; + + /* martian filters on outer source - NOT done in ip_input! 
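The martian filter that follows rejects multicast sources and first octets 0, 127 and 255 on the outer header. The same test as a standalone helper (the function name is mine):

#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int
outer_src_is_martian(struct in_addr src)
{
	uint32_t a = ntohl(src.s_addr);

	if (IN_MULTICAST(a))
		return (1);
	switch (a >> 24) {
	case 0:
	case 127:
	case 255:
		return (1);
	}
	return (0);
}

int
main(void)
{
	struct in_addr a;

	a.s_addr = inet_addr("127.0.0.1");
	printf("%d\n", outer_src_is_martian(a));	/* 1 */
	return (0);
}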
*/ + if (IN_MULTICAST(ntohl(ip.ip_src.s_addr))) + return 0; + switch ((ntohl(ip.ip_src.s_addr) & 0xff000000) >> 24) { + case 0: case 127: case 255: + return 0; + } + /* reject packets with broadcast on source */ + TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) + { + if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) + continue; + if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) + return 0; + } + + /* ingress filters on outer source */ + if ((sc->gif_if.if_flags & IFF_LINK2) == 0 && + (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { + struct sockaddr_in sin; + struct rtentry *rt; + + bzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr = ip.ip_src; + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { +#if 0 + log(LOG_WARNING, "%s: packet from 0x%x dropped " + "due to ingress filter\n", if_name(&sc->gif_if), + (u_int32_t)ntohl(sin.sin_addr.s_addr)); +#endif + if (rt) + rtfree(rt); + return 0; + } + rtfree(rt); + } + + return 32 * 2; +} diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h new file mode 100644 index 0000000..262d9ba --- /dev/null +++ b/sys/netinet/in_gif.h @@ -0,0 +1,42 @@ +/* $FreeBSD$ */ +/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_IN_GIF_H_ +#define _NETINET_IN_GIF_H_ + +#define GIF_TTL 30 + +void in_gif_input(struct mbuf *, int off); +int in_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *); +int gif_encapcheck4(const struct mbuf *, int, int, void *); + +#endif /*_NETINET_IN_GIF_H_*/ diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c new file mode 100644 index 0000000..744cfc2 --- /dev/null +++ b/sys/netinet/in_pcb.c @@ -0,0 +1,1072 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/proc.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif /* INET6 */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif /* IPSEC */ + +struct in_addr zeroin_addr; + +/* + * These configure the range of local port addresses assigned to + * "unspecified" outgoing connections/packets/whatever. 
+ */ +int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ +int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ +int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ +int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */ +int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ +int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static int +sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) { + RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); + } + return error; +} + +#undef RANGECHK + +SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); + +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, + &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); + +/* + * in_pcb.c: manage the Protocol Control Blocks. + * + * NOTE: It is assumed that most of these functions will be called at + * splnet(). XXX - There are, unfortunately, a few exceptions to this + * rule that should be fixed. + */ + +/* + * Allocate a PCB and associate it with the socket. 
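Each SYSCTL_PROC above exports one endpoint of a range under net.inet.ip.portrange, with sysctl_net_ipport_check() clamping whatever is written back. Reading the default ephemeral range from userland, as an illustrative sketch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int first, last;
	size_t len = sizeof(int);

	/* These OIDs are created by the SYSCTL_PROC() declarations above. */
	if (sysctlbyname("net.inet.ip.portrange.first", &first, &len, NULL, 0) == -1 ||
	    sysctlbyname("net.inet.ip.portrange.last", &last, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("ephemeral ports: %d-%d\n", first, last);
	return (0);
}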
+ */ +int +in_pcballoc(so, pcbinfo, td) + struct socket *so; + struct inpcbinfo *pcbinfo; + struct thread *td; +{ + register struct inpcb *inp; +#ifdef IPSEC + int error; +#endif + + inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); + if (inp == NULL) + return (ENOBUFS); + bzero((caddr_t)inp, sizeof(*inp)); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + inp->inp_pcbinfo = pcbinfo; + inp->inp_socket = so; +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error != 0) { + uma_zfree(pcbinfo->ipi_zone, inp); + return error; + } +#endif /*IPSEC*/ +#if defined(INET6) + if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on) + inp->inp_flags |= IN6P_IPV6_V6ONLY; +#endif + LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); + pcbinfo->ipi_count++; + so->so_pcb = (caddr_t)inp; + INP_LOCK_INIT(inp, "inp"); +#ifdef INET6 + if (ip6_auto_flowlabel) + inp->inp_flags |= IN6P_AUTOFLOWLABEL; +#endif + return (0); +} + +int +in_pcbbind(inp, nam, td) + register struct inpcb *inp; + struct sockaddr *nam; + struct thread *td; +{ + register struct socket *so = inp->inp_socket; + unsigned short *lastport; + struct sockaddr_in *sin; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + u_short lport = 0; + int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error, prison = 0; + + if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ + return (EADDRNOTAVAIL); + if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY) + return (EINVAL); + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + wild = 1; + if (nam) { + sin = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); +#ifdef notdef + /* + * We should check the family, but old programs + * incorrectly fail to initialize it. + */ + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); +#endif + if (sin->sin_addr.s_addr != INADDR_ANY) + if (prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr)) + return(EINVAL); + lport = sin->sin_port; + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + /* + * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; + * allow complete duplication of binding if + * SO_REUSEPORT is set, or if SO_REUSEADDR is set + * and a multicast address is bound on both + * new and duplicated sockets. + */ + if (so->so_options & SO_REUSEADDR) + reuseport = SO_REUSEADDR|SO_REUSEPORT; + } else if (sin->sin_addr.s_addr != INADDR_ANY) { + sin->sin_port = 0; /* yech... */ + bzero(&sin->sin_zero, sizeof(sin->sin_zero)); + if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) + return (EADDRNOTAVAIL); + } + if (lport) { + struct inpcb *t; + /* GROSS */ + if (ntohs(lport) < IPPORT_RESERVED && td && + suser_cred(td->td_ucred, PRISON_ROOT)) + return (EACCES); + if (td && jailed(td->td_ucred)) + prison = 1; + if (so->so_cred->cr_uid != 0 && + !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + t = in_pcblookup_local(inp->inp_pcbinfo, + sin->sin_addr, lport, + prison ? 0 : INPLOOKUP_WILDCARD); + if (t && + (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != INADDR_ANY || + (t->inp_socket->so_options & + SO_REUSEPORT) == 0) && + (so->so_cred->cr_uid != + t->inp_socket->so_cred->cr_uid)) { +#if defined(INET6) + if (ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + if (prison && + prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr)) + return (EADDRNOTAVAIL); + t = in_pcblookup_local(pcbinfo, sin->sin_addr, + lport, prison ? 
0 : wild); + if (t && + (reuseport & t->inp_socket->so_options) == 0) { +#if defined(INET6) + if (ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + inp->inp_laddr = sin->sin_addr; + } + if (lport == 0) { + ushort first, last; + int count; + + if (inp->inp_laddr.s_addr != INADDR_ANY) + if (prison_ip(td->td_ucred, 0, &inp->inp_laddr.s_addr )) { + inp->inp_laddr.s_addr = INADDR_ANY; + return (EINVAL); + } + inp->inp_flags |= INP_ANONPORT; + + if (inp->inp_flags & INP_HIGHPORT) { + first = ipport_hifirstauto; /* sysctl */ + last = ipport_hilastauto; + lastport = &pcbinfo->lasthi; + } else if (inp->inp_flags & INP_LOWPORT) { + if (td && (error = suser_cred(td->td_ucred, PRISON_ROOT))) { + inp->inp_laddr.s_addr = INADDR_ANY; + return error; + } + first = ipport_lowfirstauto; /* 1023 */ + last = ipport_lowlastauto; /* 600 */ + lastport = &pcbinfo->lastlow; + } else { + first = ipport_firstauto; /* sysctl */ + last = ipport_lastauto; + lastport = &pcbinfo->lastport; + } + /* + * Simple check to ensure all ports are not used up causing + * a deadlock here. + * + * We split the two cases (up and down) so that the direction + * is not being tested on each round of the loop. + */ + if (first > last) { + /* + * counting down + */ + count = first - last; + + do { + if (count-- < 0) { /* completely used? */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EADDRNOTAVAIL); + } + --*lastport; + if (*lastport > first || *lastport < last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } else { + /* + * counting up + */ + count = last - first; + + do { + if (count-- < 0) { /* completely used? */ + /* + * Undo any address bind that may have + * occurred above. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EADDRNOTAVAIL); + } + ++*lastport; + if (*lastport < first || *lastport > last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } + } + inp->inp_lport = lport; + if (prison_ip(td->td_ucred, 0, &inp->inp_laddr.s_addr)) { + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + return (EINVAL); + } + if (in_pcbinshash(inp) != 0) { + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + return (EAGAIN); + } + return (0); +} + +/* + * Transform old in_pcbconnect() into an inner subroutine for new + * in_pcbconnect(): Do some validity-checking on the remote + * address (in mbuf 'nam') and then determine local host address + * (i.e., which interface) to use to access that remote host. + * + * This preserves definition of in_pcbconnect(), while supporting a + * slightly different version for T/TCP. (This is more than + * a bit of a kludge, but cleaning up the internal interfaces would + * have forced minor changes in every protocol). + */ + +int +in_pcbladdr(inp, nam, plocal_sin) + register struct inpcb *inp; + struct sockaddr *nam; + struct sockaddr_in **plocal_sin; +{ + struct in_ifaddr *ia; + register struct sockaddr_in *sin = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); + if (sin->sin_port == 0) + return (EADDRNOTAVAIL); + if (!TAILQ_EMPTY(&in_ifaddrhead)) { + /* + * If the destination address is INADDR_ANY, + * use the primary local address. 
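The anonymous-port search above starts from the last port handed out, wraps around inside the configured range, and gives up once every port has been probed. A simplified userland rendering of the counting-up branch; port_in_use() stands in for in_pcblookup_local():

#include <stdio.h>

/* Stand-in for in_pcblookup_local(): is this port already taken? */
static int
port_in_use(unsigned short port)
{
	return (port < 49160);		/* pretend the first few are busy */
}

static int
pick_port(unsigned short first, unsigned short last, unsigned short *cursor)
{
	int count = last - first;

	do {
		if (count-- < 0)
			return (-1);	/* range exhausted */
		++*cursor;
		if (*cursor < first || *cursor > last)
			*cursor = first;
	} while (port_in_use(*cursor));
	return (*cursor);
}

int
main(void)
{
	unsigned short cursor = 49152;	/* like pcbinfo->lastport */
	int port = pick_port(49152, 65535, &cursor);

	printf("chose %d\n", port);	/* 49160 */
	return (0);
}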
+ * If the supplied address is INADDR_BROADCAST, + * and the primary interface supports broadcast, + * choose the broadcast address for that interface. + */ + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; + else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && + (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) + sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + register struct route *ro; + + ia = (struct in_ifaddr *)0; + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. + * Note that we should check the address family of the cached + * destination, in case of sharing the cache with IPv6. + */ + ro = &inp->inp_route; + if (ro->ro_rt && + (ro->ro_dst.sa_family != AF_INET || + satosin(&ro->ro_dst)->sin_addr.s_addr != + sin->sin_addr.s_addr || + inp->inp_socket->so_options & SO_DONTROUTE)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ + (ro->ro_rt == (struct rtentry *)0 || + ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + /* No route yet, so try to acquire one */ + bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + sin->sin_addr; + rtalloc(ro); + } + /* + * If we found a route, use the address + * corresponding to the outgoing interface + * unless it is the loopback (in case a route + * to our address on another net goes to loopback). + */ + if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia == 0) { + u_short fport = sin->sin_port; + + sin->sin_port = 0; + ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); + if (ia == 0) + ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + sin->sin_port = fport; + if (ia == 0) + ia = TAILQ_FIRST(&in_ifaddrhead); + if (ia == 0) + return (EADDRNOTAVAIL); + } + /* + * If the destination address is multicast and an outgoing + * interface has been set as a multicast option, use the + * address of that interface as our source address. + */ + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && + inp->inp_moptions != NULL) { + struct ip_moptions *imo; + struct ifnet *ifp; + + imo = inp->inp_moptions; + if (imo->imo_multicast_ifp != NULL) { + ifp = imo->imo_multicast_ifp; + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) + return (EADDRNOTAVAIL); + } + } + /* + * Don't do pcblookup call here; return interface in plocal_sin + * and exit to caller, that will do the lookup. + */ + *plocal_sin = &ia->ia_addr; + + } + return(0); +} + +/* + * Outer subroutine: + * Connect from a socket to a specified address. + * Both address and port must be specified in argument sin. + * If don't have a local address for this socket yet, + * then pick one. 
+ */ +int +in_pcbconnect(inp, nam, td) + register struct inpcb *inp; + struct sockaddr *nam; + struct thread *td; +{ + struct sockaddr_in *ifaddr; + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in sa; + struct ucred *cred; + int error; + + cred = inp->inp_socket->so_cred; + if (inp->inp_laddr.s_addr == INADDR_ANY && jailed(cred)) { + bzero(&sa, sizeof (sa)); + sa.sin_addr.s_addr = htonl(prison_getip(cred)); + sa.sin_len=sizeof (sa); + sa.sin_family = AF_INET; + error = in_pcbbind(inp, (struct sockaddr *)&sa, td); + if (error) + return (error); + } + /* + * Call inner routine, to assign local interface address. + */ + if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) + return(error); + + if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr, + inp->inp_lport, 0, NULL) != NULL) { + return (EADDRINUSE); + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, td); + if (error) + return (error); + } + inp->inp_laddr = ifaddr->sin_addr; + } + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + return (0); +} + +void +in_pcbdisconnect(inp) + struct inpcb *inp; +{ + + inp->inp_faddr.s_addr = INADDR_ANY; + inp->inp_fport = 0; + in_pcbrehash(inp); + if (inp->inp_socket->so_state & SS_NOFDREF) + in_pcbdetach(inp); +} + +void +in_pcbdetach(inp) + struct inpcb *inp; +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *ipi = inp->inp_pcbinfo; + +#ifdef IPSEC + ipsec4_delete_pcbpolicy(inp); +#endif /*IPSEC*/ + inp->inp_gencnt = ++ipi->ipi_gencnt; + in_pcbremlists(inp); + so->so_pcb = 0; + sotryfree(so); + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_route.ro_rt) + rtfree(inp->inp_route.ro_rt); + ip_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + INP_LOCK_DESTROY(inp); + uma_zfree(ipi->ipi_zone, inp); +} + +/* + * The wrapper function will pass down the pcbinfo for this function to lock. + * The socket must have a valid + * (i.e., non-nil) PCB, but it should be impossible to get an invalid one + * except through a kernel programming error, so it is acceptable to panic + * (or in this case trap) if the PCB is invalid. (Actually, we don't trap + * because there actually /is/ a programming error somewhere... XXX) + */ +int +in_setsockaddr(so, nam, pcbinfo) + struct socket *so; + struct sockaddr **nam; + struct inpcbinfo *pcbinfo; +{ + int s; + register struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + INP_INFO_RLOCK(pcbinfo); + inp = sotoinpcb(so); + if (!inp) { + INP_INFO_RUNLOCK(pcbinfo); + splx(s); + free(sin, M_SONAME); + return ECONNRESET; + } + INP_LOCK(inp); + sin->sin_port = inp->inp_lport; + sin->sin_addr = inp->inp_laddr; + INP_UNLOCK(inp); + INP_INFO_RUNLOCK(pcbinfo); + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +/* + * The wrapper function will pass down the pcbinfo for this function to lock. + */ +int +in_setpeeraddr(so, nam, pcbinfo) + struct socket *so; + struct sockaddr **nam; + struct inpcbinfo *pcbinfo; +{ + int s; + register struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. 
+ */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + INP_INFO_RLOCK(pcbinfo); + inp = sotoinpcb(so); + if (!inp) { + INP_INFO_RUNLOCK(pcbinfo); + splx(s); + free(sin, M_SONAME); + return ECONNRESET; + } + INP_LOCK(inp); + sin->sin_port = inp->inp_fport; + sin->sin_addr = inp->inp_faddr; + INP_UNLOCK(inp); + INP_INFO_RUNLOCK(pcbinfo); + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +void +in_pcbnotifyall(pcbinfo, faddr, errno, notify) + struct inpcbinfo *pcbinfo; + struct in_addr faddr; + int errno; + struct inpcb *(*notify)(struct inpcb *, int); +{ + struct inpcb *inp, *ninp; + struct inpcbhead *head; + int s; + + s = splnet(); + INP_INFO_RLOCK(pcbinfo); + head = pcbinfo->listhead; + for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) { + INP_LOCK(inp); + ninp = LIST_NEXT(inp, inp_list); +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) { + INP_UNLOCK(inp); + continue; + } +#endif + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_socket == NULL) { + INP_UNLOCK(inp); + continue; + } + (*notify)(inp, errno); + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(pcbinfo); + splx(s); +} + +void +in_pcbpurgeif0(pcbinfo, ifp) + struct inpcbinfo *pcbinfo; + struct ifnet *ifp; +{ + struct inpcb *inp; + struct ip_moptions *imo; + int i, gap; + + /* why no splnet here? XXX */ + INP_INFO_RLOCK(pcbinfo); + LIST_FOREACH(inp, pcbinfo->listhead, inp_list) { + INP_LOCK(inp); + imo = inp->inp_moptions; + if ((inp->inp_vflag & INP_IPV4) && + imo != NULL) { + /* + * Unselect the outgoing interface if it is being + * detached. + */ + if (imo->imo_multicast_ifp == ifp) + imo->imo_multicast_ifp = NULL; + + /* + * Drop multicast group membership if we joined + * through the interface being detached. + */ + for (i = 0, gap = 0; i < imo->imo_num_memberships; + i++) { + if (imo->imo_membership[i]->inm_ifp == ifp) { + in_delmulti(imo->imo_membership[i]); + gap++; + } else if (gap != 0) + imo->imo_membership[i - gap] = + imo->imo_membership[i]; + } + imo->imo_num_memberships -= gap; + } + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(pcbinfo); +} + +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(inp) + struct inpcb *inp; +{ + register struct rtentry *rt; + struct rt_addrinfo info; + + if ((rt = inp->inp_route.ro_rt)) { + bzero((caddr_t)&info, sizeof(info)); + info.rti_flags = rt->rt_flags; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); + if (rt->rt_flags & RTF_DYNAMIC) + (void) rtrequest1(RTM_DELETE, &info, NULL); + inp->inp_route.ro_rt = NULL; + rtfree(rt); + /* + * A new route can be allocated + * the next time output is attempted. + */ + } +} + +/* + * After a routing change, flush old routing + * and allocate a (hopefully) better one. + */ +struct inpcb * +in_rtchange(inp, errno) + register struct inpcb *inp; + int errno; +{ + if (inp->inp_route.ro_rt) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = 0; + /* + * A new route can be allocated the next time + * output is attempted. + */ + } + return inp; +} + +/* + * Lookup a PCB based on the local address and port. 
+ */ +struct inpcb * +in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) + struct inpcbinfo *pcbinfo; + struct in_addr laddr; + u_int lport_arg; + int wild_okay; +{ + register struct inpcb *inp; + int matchwild = 3, wildcard; + u_short lport = lport_arg; + + if (!wild_okay) { + struct inpcbhead *head; + /* + * Look for an unconnected (wildcard foreign addr) PCB that + * matches the local address and port we're looking for. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_lport == lport) { + /* + * Found. + */ + return (inp); + } + } + /* + * Not found. + */ + return (NULL); + } else { + struct inpcbporthead *porthash; + struct inpcbport *phd; + struct inpcb *match = NULL; + /* + * Best fit PCB lookup. + * + * First see if this local port is in use by looking on the + * port hash list. + */ + porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport, + pcbinfo->porthashmask)]; + LIST_FOREACH(phd, porthash, phd_hash) { + if (phd->phd_port == lport) + break; + } + if (phd != NULL) { + /* + * Port is in use by one or more PCBs. Look for best + * fit. + */ + LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { + wildcard = 0; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) + wildcard++; + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) { + break; + } + } + } + } + return (match); + } +} + +/* + * Lookup PCB in hash list. + */ +struct inpcb * +in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, + ifp) + struct inpcbinfo *pcbinfo; + struct in_addr faddr, laddr; + u_int fport_arg, lport_arg; + int wildcard; + struct ifnet *ifp; +{ + struct inpcbhead *head; + register struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * Found. 
+ */ + return (inp); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; +#if defined(INET6) + struct inpcb *local_wild_mapped = NULL; +#endif /* defined(INET6) */ + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_lport == lport) { + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + if (inp->inp_laddr.s_addr == laddr.s_addr) + return (inp); + else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#if defined(INET6) + if (INP_CHECK_SOCKAF(inp->inp_socket, + AF_INET6)) + local_wild_mapped = inp; + else +#endif /* defined(INET6) */ + local_wild = inp; + } + } + } +#if defined(INET6) + if (local_wild == NULL) + return (local_wild_mapped); +#endif /* defined(INET6) */ + return (local_wild); + } + + /* + * Not found. + */ + return (NULL); +} + +/* + * Insert PCB onto various hash lists. + */ +int +in_pcbinshash(inp) + struct inpcb *inp; +{ + struct inpcbhead *pcbhash; + struct inpcbporthead *pcbporthash; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbport *phd; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)]; + + pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport, + pcbinfo->porthashmask)]; + + /* + * Go through port list and look for a head for this lport. + */ + LIST_FOREACH(phd, pcbporthash, phd_hash) { + if (phd->phd_port == inp->inp_lport) + break; + } + /* + * If none exists, malloc one and tack it on. + */ + if (phd == NULL) { + MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT); + if (phd == NULL) { + return (ENOBUFS); /* XXX */ + } + phd->phd_port = inp->inp_lport; + LIST_INIT(&phd->phd_pcblist); + LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + } + inp->inp_phd = phd; + LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + return (0); +} + +/* + * Move PCB to the proper hash bucket when { faddr, fport } have been + * changed. NOTE: This does not handle the case of the lport changing (the + * hashed port list would have to be updated as well), so the lport must + * not change after in_pcbinshash() has been called. + */ +void +in_pcbrehash(inp) + struct inpcb *inp; +{ + struct inpcbhead *head; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + + LIST_REMOVE(inp, inp_hash); + LIST_INSERT_HEAD(head, inp, inp_hash); +} + +/* + * Remove PCB from various lists. 
+ */ +void +in_pcbremlists(inp) + struct inpcb *inp; +{ + inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; + if (inp->inp_lport) { + struct inpcbport *phd = inp->inp_phd; + + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_portlist); + if (LIST_FIRST(&phd->phd_pcblist) == NULL) { + LIST_REMOVE(phd, phd_hash); + free(phd, M_PCB); + } + } + LIST_REMOVE(inp, inp_list); + inp->inp_pcbinfo->ipi_count--; +} + +int +prison_xinpcb(struct thread *td, struct inpcb *inp) +{ + if (!jailed(td->td_ucred)) + return (0); + if (ntohl(inp->inp_laddr.s_addr) == prison_getip(td->td_ucred)) + return (0); + return (1); +} diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h new file mode 100644 index 0000000..1d9a21d --- /dev/null +++ b/sys/netinet/in_pcb.h @@ -0,0 +1,352 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_PCB_H_ +#define _NETINET_IN_PCB_H_ + +#include <sys/queue.h> + +#include <net/route.h> +#include <netinet6/ipsec.h> /* for IPSEC */ +#include <vm/uma.h> + +#define in6pcb inpcb /* for KAME src sync over BSD*'s */ +#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ + +/* + * Common structure pcb for internet protocol implementation. + * Here are stored pointers to local and foreign host table + * entries, local and foreign socket numbers, and pointers + * up (to a socket structure) and down (to a protocol-specific) + * control block. + */ +LIST_HEAD(inpcbhead, inpcb); +LIST_HEAD(inpcbporthead, inpcbport); +typedef u_quad_t inp_gen_t; + +/* + * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. + * So, AF_INET6 null laddr is also used as AF_INET null laddr, + * by utilize following structure. 
(At last, same as INRIA) + */ +struct in_addr_4in6 { + u_int32_t ia46_pad32[3]; + struct in_addr ia46_addr4; +}; + +/* + * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. + * in_conninfo has some extra padding to accomplish this. + */ +struct in_endpoints { + u_int16_t ie_fport; /* foreign port */ + u_int16_t ie_lport; /* local port */ + /* protocol dependent part, local and foreign addr */ + union { + /* foreign host table entry */ + struct in_addr_4in6 ie46_foreign; + struct in6_addr ie6_foreign; + } ie_dependfaddr; + union { + /* local host table entry */ + struct in_addr_4in6 ie46_local; + struct in6_addr ie6_local; + } ie_dependladdr; +#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 +#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 +#define ie6_faddr ie_dependfaddr.ie6_foreign +#define ie6_laddr ie_dependladdr.ie6_local +}; + +/* + * XXX + * At some point struct route should possibly change to: + * struct rtentry *rt + * struct in_endpoints *ie; + */ +struct in_conninfo { + u_int8_t inc_flags; + u_int8_t inc_len; + u_int16_t inc_pad; /* XXX alignment for in_endpoints */ + /* protocol dependent part; cached route */ + struct in_endpoints inc_ie; + union { + /* placeholder for routing entry */ + struct route inc4_route; + struct route_in6 inc6_route; + } inc_dependroute; +}; +#define inc_isipv6 inc_flags /* temp compatability */ +#define inc_fport inc_ie.ie_fport +#define inc_lport inc_ie.ie_lport +#define inc_faddr inc_ie.ie_faddr +#define inc_laddr inc_ie.ie_laddr +#define inc_route inc_dependroute.inc4_route +#define inc6_faddr inc_ie.ie6_faddr +#define inc6_laddr inc_ie.ie6_laddr +#define inc6_route inc_dependroute.inc6_route + +struct icmp6_filter; + +struct inpcb { + LIST_ENTRY(inpcb) inp_hash; /* hash list */ + LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ + u_int32_t inp_flow; + + /* local and foreign ports, local and foreign addr */ + struct in_conninfo inp_inc; + + caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + struct inpcbinfo *inp_pcbinfo; /* PCB list info */ + struct socket *inp_socket; /* back pointer to socket */ + /* list for this PCB's local port */ + int inp_flags; /* generic IP/datagram flags */ + + struct inpcbpolicy *inp_sp; /* for IPSEC */ + u_char inp_vflag; /* IP version flag (v4/v6) */ +#define INP_IPV4 0x1 +#define INP_IPV6 0x2 + u_char inp_ip_ttl; /* time to live proto */ + u_char inp_ip_p; /* protocol proto */ + + /* protocol dependent part; options */ + struct { + u_char inp4_ip_tos; /* type of service proto */ + struct mbuf *inp4_options; /* IP options */ + struct ip_moptions *inp4_moptions; /* IP multicast options */ + } inp_depend4; +#define inp_fport inp_inc.inc_fport +#define inp_lport inp_inc.inc_lport +#define inp_faddr inp_inc.inc_faddr +#define inp_laddr inp_inc.inc_laddr +#define inp_route inp_inc.inc_route +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions + struct { + /* IP options */ + struct mbuf *inp6_options; + /* IP6 options for outgoing packets */ + struct ip6_pktopts *inp6_outputopts; + /* IP multicast options */ + struct ip6_moptions *inp6_moptions; + /* ICMPv6 code type filter */ + struct icmp6_filter *inp6_icmp6filt; + /* IPV6_CHECKSUM setsockopt */ + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + u_int8_t inp6_hlim; + } inp_depend6; + LIST_ENTRY(inpcb) inp_portlist; + struct inpcbport *inp_phd; /* head of this list */ + inp_gen_t inp_gencnt; /* generation count of this instance */ + 
struct mtx inp_mtx; + +#define in6p_faddr inp_inc.inc6_faddr +#define in6p_laddr inp_inc.inc6_laddr +#define in6p_route inp_inc.inc6_route +#define in6p_ip6_hlim inp_depend6.inp6_hlim +#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ +#define in6p_ip6_nxt inp_ip_p +#define in6p_flowinfo inp_flow +#define in6p_vflag inp_vflag +#define in6p_options inp_depend6.inp6_options +#define in6p_outputopts inp_depend6.inp6_outputopts +#define in6p_moptions inp_depend6.inp6_moptions +#define in6p_icmp6filt inp_depend6.inp6_icmp6filt +#define in6p_cksum inp_depend6.inp6_cksum +#define inp6_ifindex inp_depend6.inp6_ifindex +#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ +#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ +#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ +#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ +#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ +}; +/* + * The range of the generation count, as used in this implementation, + * is 9e19. We would have to create 300 billion connections per + * second for this number to roll over in a year. This seems sufficiently + * unlikely that we simply don't concern ourselves with that possibility. + */ + +/* + * Interface exported to userland by various protocols which use + * inpcbs. Hack alert -- only define if struct xsocket is in scope. + */ +#ifdef _SYS_SOCKETVAR_H_ +struct xinpcb { + size_t xi_len; /* length of this structure */ + struct inpcb xi_inp; + struct xsocket xi_socket; + u_quad_t xi_alignment_hack; +}; + +struct xinpgen { + size_t xig_len; /* length of this structure */ + u_int xig_count; /* number of PCBs at this time */ + inp_gen_t xig_gen; /* generation count at this time */ + so_gen_t xig_sogen; /* socket generation count at this time */ +}; +#endif /* _SYS_SOCKETVAR_H_ */ + +struct inpcbport { + LIST_ENTRY(inpcbport) phd_hash; + struct inpcbhead phd_pcblist; + u_short phd_port; +}; + +struct inpcbinfo { /* XXX documentation, prefixes */ + struct inpcbhead *hashbase; + u_long hashmask; + struct inpcbporthead *porthashbase; + u_long porthashmask; + struct inpcbhead *listhead; + u_short lastport; + u_short lastlow; + u_short lasthi; + uma_zone_t ipi_zone; /* zone to allocate pcbs from */ + u_int ipi_count; /* number of pcbs in this list */ + u_quad_t ipi_gencnt; /* current generation count */ + struct mtx ipi_mtx; +}; + +#define INP_LOCK_INIT(inp, d) \ + mtx_init(&(inp)->inp_mtx, (d), NULL, MTX_DEF | MTX_RECURSE) +#define INP_LOCK_DESTROY(inp) mtx_destroy(&(inp)->inp_mtx) +#define INP_LOCK(inp) mtx_lock(&(inp)->inp_mtx) +#define INP_UNLOCK(inp) mtx_unlock(&(inp)->inp_mtx) + +#define INP_INFO_LOCK_INIT(ipi, d) \ + mtx_init(&(ipi)->ipi_mtx, (d), NULL, MTX_DEF | MTX_RECURSE) +#define INP_INFO_RLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx) +#define INP_INFO_RUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) +#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx) + +#define INP_PCBHASH(faddr, lport, fport, mask) \ + (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) +#define INP_PCBPORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) + +/* flags in inp_flags: */ +#define INP_RECVOPTS 0x01 /* receive incoming IP options */ +#define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ +#define INP_RECVDSTADDR 0x04 /* receive IP dst address */ +#define INP_HDRINCL 0x08 /* user supplies entire IP header */ +#define INP_HIGHPORT 0x10 /* user wants "high" port binding */ +#define 
INP_LOWPORT 0x20 /* user wants "low" port binding */ +#define INP_ANONPORT 0x40 /* port chosen for user */ +#define INP_RECVIF 0x80 /* receive incoming interface */ +#define INP_MTUDISC 0x100 /* user can do MTU discovery */ +#define INP_FAITH 0x200 /* accept FAITH'ed connections */ + +#define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ + +#define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ +#define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ +#define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ +#define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ +#define IN6P_RTHDR 0x100000 /* receive routing header */ +#define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ +#define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ +#define IN6P_BINDV6ONLY 0x10000000 /* do not grab IPv4 traffic */ + +#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ + INP_RECVIF|\ + IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ + IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ + IN6P_AUTOFLOWLABEL) +#define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ + IN6P_AUTOFLOWLABEL) + + /* for KAME src sync over BSD*'s */ +#define IN6P_HIGHPORT INP_HIGHPORT +#define IN6P_LOWPORT INP_LOWPORT +#define IN6P_ANONPORT INP_ANONPORT +#define IN6P_RECVIF INP_RECVIF +#define IN6P_MTUDISC INP_MTUDISC +#define IN6P_FAITH INP_FAITH +#define IN6P_CONTROLOPTS INP_CONTROLOPTS + /* + * socket AF version is {newer than,or include} + * actual datagram AF version + */ + +#define INPLOOKUP_WILDCARD 1 +#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) +#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ + +#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family + +#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) + +#ifdef _KERNEL +extern int ipport_lowfirstauto; +extern int ipport_lowlastauto; +extern int ipport_firstauto; +extern int ipport_lastauto; +extern int ipport_hifirstauto; +extern int ipport_hilastauto; + +void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); +void in_losing(struct inpcb *); +struct inpcb * + in_rtchange(struct inpcb *, int); +int in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *); +int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *); +int in_pcbconnect(struct inpcb *, struct sockaddr *, struct thread *); +void in_pcbdetach(struct inpcb *); +void in_pcbdisconnect(struct inpcb *); +int in_pcbinshash(struct inpcb *); +int in_pcbladdr(struct inpcb *, struct sockaddr *, + struct sockaddr_in **); +struct inpcb * + in_pcblookup_local(struct inpcbinfo *, + struct in_addr, u_int, int); +struct inpcb * + in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *); +void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, + int, struct inpcb *(*)(struct inpcb *, int)); +void in_pcbrehash(struct inpcb *); +int in_setpeeraddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo); +int in_setsockaddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo);; +void in_pcbremlists(struct inpcb *inp); +int prison_xinpcb(struct thread *td, struct inpcb *inp); +#endif /* _KERNEL */ + +#endif /* !_NETINET_IN_PCB_H_ */ diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c new file mode 100644 index 0000000..b522652 --- /dev/null +++ b/sys/netinet/in_proto.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 + * $FreeBSD$ + */ + +#include "opt_ipdivert.h" +#include "opt_ipx.h" +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/queue.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/igmp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <netinet/ip_encap.h> + +/* + * TCP/IP protocol family: IP, ICMP, UDP, TCP. 
+ */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netinet6/ah.h> +#ifdef IPSEC_ESP +#include <netinet6/esp.h> +#endif +#include <netinet6/ipcomp.h> +#endif /* IPSEC */ + +#ifdef IPXIP +#include <netipx/ipx_ip.h> +#endif + +#ifdef NSIP +#include <netns/ns.h> +#include <netns/ns_if.h> +#endif + +extern struct domain inetdomain; +static struct pr_usrreqs nousrreqs; + +struct protosw inetsw[] = { +{ 0, &inetdomain, 0, 0, + 0, 0, 0, 0, + 0, + ip_init, 0, ip_slowtimo, ip_drain, + &nousrreqs +}, +{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, + udp_input, 0, udp_ctlinput, ip_ctloutput, + 0, + udp_init, 0, 0, 0, + &udp_usrreqs +}, +{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, + PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + tcp_input, 0, tcp_ctlinput, tcp_ctloutput, + 0, + tcp_init, 0, tcp_slowtimo, tcp_drain, + &tcp_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, + rip_input, 0, rip_ctlinput, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + icmp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + igmp_input, 0, 0, rip_ctloutput, + 0, + igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + rsvp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#ifdef IPSEC +{ SOCK_RAW, &inetdomain, IPPROTO_AH, PR_ATOMIC|PR_ADDR, + ah4_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#ifdef IPSEC_ESP +{ SOCK_RAW, &inetdomain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR, + esp4_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#endif +{ SOCK_RAW, &inetdomain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR, + ipcomp4_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#endif /* IPSEC */ +{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + encap4_input, 0, 0, rip_ctloutput, + 0, + encap_init, 0, 0, 0, + &rip_usrreqs +}, +# ifdef INET6 +{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + encap4_input, 0, 0, rip_ctloutput, + 0, + encap_init, 0, 0, 0, + &rip_usrreqs +}, +#endif +#ifdef IPDIVERT +{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, + div_input, 0, 0, ip_ctloutput, + 0, + div_init, 0, 0, 0, + &div_usrreqs, +}, +#endif +#ifdef IPXIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + ipxip_input, 0, ipxip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif +#ifdef NSIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, + idpip_input, 0, nsip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif + /* raw wildcard */ +{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR, + rip_input, 0, 0, rip_ctloutput, + 0, + rip_init, 0, 0, 0, + &rip_usrreqs +}, +}; + +extern int in_inithead(void **, int); + +struct domain inetdomain = + { AF_INET, "internet", 0, 0, 0, + inetsw, + &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0, + in_inithead, 32, sizeof(struct sockaddr_in) + }; + +DOMAIN_SET(inet); + +SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, + "Internet Family"); + +SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); +SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); +SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +#ifdef IPSEC +SYSCTL_NODE(_net_inet, IPPROTO_AH, ipsec, CTLFLAG_RW, 0, 
"IPSEC"); +#endif /* IPSEC */ +SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); +#ifdef IPDIVERT +SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT"); +#endif + diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c new file mode 100644 index 0000000..08052fb --- /dev/null +++ b/sys/netinet/in_rmx.c @@ -0,0 +1,426 @@ +/* + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This code does two things necessary for the enhanced TCP metrics to + * function in a useful manner: + * 1) It marks all non-host routes as `cloning', thus ensuring that + * every actual reference to such a route actually gets turned + * into a reference to a host route to the specific destination + * requested. + * 2) When such routes lose all their references, it arranges for them + * to be deleted in some random collection of circumstances, so that + * a large quantity of stale routing data is not kept in kernel memory + * indefinitely. See in_rtqtimo() below for the exact mechanism. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +extern int in_inithead(void **head, int off); + +#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ + +/* + * Do what we need to do when inserting a route. + */ +static struct radix_node * +in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node *treenodes) +{ + struct rtentry *rt = (struct rtentry *)treenodes; + struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + struct radix_node *ret; + + /* + * For IP, all unicast non-host routes are automatically cloning. 
+ */ + if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + rt->rt_flags |= RTF_MULTICAST; + + if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) { + rt->rt_flags |= RTF_PRCLONING; + } + + /* + * A little bit of help for both IP output and input: + * For host routes, we make sure that RTF_BROADCAST + * is set for anything that looks like a broadcast address. + * This way, we can avoid an expensive call to in_broadcast() + * in ip_output() most of the time (because the route passed + * to ip_output() is almost always a host route). + * + * We also do the same for local addresses, with the thought + * that this might one day be used to speed up ip_input(). + * + * We also mark routes to multicast addresses as such, because + * it's easy to do and might be useful (but this is much more + * dubious since it's so easy to inspect the address). (This + * is done above.) + */ + if (rt->rt_flags & RTF_HOST) { + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + rt->rt_flags |= RTF_BROADCAST; + } else { + if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + rt->rt_flags |= RTF_LOCAL; + } + } + + if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) + && rt->rt_ifp) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + ret = rn_addroute(v_arg, n_arg, head, treenodes); + if (ret == NULL && rt->rt_flags & RTF_HOST) { + struct rtentry *rt2; + /* + * We are trying to add a host route, but can't. + * Find out if it is because of an + * ARP entry and delete it if so. + */ + rt2 = rtalloc1((struct sockaddr *)sin, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt2) { + if (rt2->rt_flags & RTF_LLINFO && + rt2->rt_flags & RTF_HOST && + rt2->rt_gateway && + rt2->rt_gateway->sa_family == AF_LINK) { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt2), + rt2->rt_gateway, + rt_mask(rt2), rt2->rt_flags, 0); + ret = rn_addroute(v_arg, n_arg, head, + treenodes); + } + RTFREE(rt2); + } + } + + /* + * If the new route created successfully, and we are forwarding, + * and there is a cached route, free it. Otherwise, we may end + * up using the wrong route. + */ + if (ret != NULL && ipforwarding && ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + + return ret; +} + +/* + * This code is the inverse of in_clsroute: on first reference, if we + * were managing the route, stop doing so and set the expiration timer + * back off again. + */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + struct radix_node *rn = rn_match(v_arg, head); + struct rtentry *rt = (struct rtentry *)rn; + + if(rt && rt->rt_refcnt == 0) { /* this is first reference */ + if(rt->rt_flags & RTPRF_OURS) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + } + return rn; +} + +static int rtq_reallyold = 60*60; + /* one hour is ``really old'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, + &rtq_reallyold , 0, + "Default expiration time on dynamically learned routes"); + +static int rtq_minreallyold = 10; + /* never automatically crank down to less */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, + &rtq_minreallyold , 0, + "Minimum time to attempt to hold onto dynamically learned routes"); + +static int rtq_toomany = 128; + /* 128 cached routes is ``too many'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, + &rtq_toomany , 0, "Upper limit on dynamically learned routes"); + +/* + * On last reference drop, mark the route as belong to us so that it can be + * timed out. 
+ */ +static void +in_clsroute(struct radix_node *rn, struct radix_node_head *head) +{ + struct rtentry *rt = (struct rtentry *)rn; + + if(!(rt->rt_flags & RTF_UP)) + return; /* prophylactic measures */ + + if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) + return; + + if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) + != RTF_WASCLONED) + return; + + /* + * As requested by David Greenman: + * If rtq_reallyold is 0, just delete the route without + * waiting for a timeout cycle to kill it. + */ + if(rtq_reallyold != 0) { + rt->rt_flags |= RTPRF_OURS; + rt->rt_rmx.rmx_expire = time_second + rtq_reallyold; + } else { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + } +} + +struct rtqk_arg { + struct radix_node_head *rnh; + int draining; + int killed; + int found; + int updating; + time_t nextstop; +}; + +/* + * Get rid of old routes. When draining, this deletes everything, even when + * the timeout is not expired yet. When updating, this makes sure that + * nothing has a timeout longer than the current value of rtq_reallyold. + */ +static int +in_rtqkill(struct radix_node *rn, void *rock) +{ + struct rtqk_arg *ap = rock; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if(rt->rt_flags & RTPRF_OURS) { + ap->found++; + + if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) { + if(rt->rt_refcnt > 0) + panic("rtqkill route really not free"); + + err = rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + if(err) { + log(LOG_WARNING, "in_rtqkill: error %d\n", err); + } else { + ap->killed++; + } + } else { + if(ap->updating + && (rt->rt_rmx.rmx_expire - time_second + > rtq_reallyold)) { + rt->rt_rmx.rmx_expire = time_second + + rtq_reallyold; + } + ap->nextstop = lmin(ap->nextstop, + rt->rt_rmx.rmx_expire); + } + } + + return 0; +} + +#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ +static int rtq_timeout = RTQ_TIMEOUT; + +static void +in_rtqtimo(void *rock) +{ + struct radix_node_head *rnh = rock; + struct rtqk_arg arg; + struct timeval atv; + static time_t last_adjusted_timeout = 0; + int s; + + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = time_second + rtq_timeout; + arg.draining = arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + + /* + * Attempt to be somewhat dynamic about this: + * If there are ``too many'' routes sitting around taking up space, + * then crank down the timeout, and see if we can't make some more + * go away. However, we make sure that we will never adjust more + * than once in rtq_timeout seconds, to keep from cranking down too + * hard. 
+ */ + if((arg.found - arg.killed > rtq_toomany) + && (time_second - last_adjusted_timeout >= rtq_timeout) + && rtq_reallyold > rtq_minreallyold) { + rtq_reallyold = 2*rtq_reallyold / 3; + if(rtq_reallyold < rtq_minreallyold) { + rtq_reallyold = rtq_minreallyold; + } + + last_adjusted_timeout = time_second; +#ifdef DIAGNOSTIC + log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", + rtq_reallyold); +#endif + arg.found = arg.killed = 0; + arg.updating = 1; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + } + + atv.tv_usec = 0; + atv.tv_sec = arg.nextstop - time_second; + timeout(in_rtqtimo, rock, tvtohz(&atv)); +} + +void +in_rtqdrain(void) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + struct rtqk_arg arg; + int s; + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = 0; + arg.draining = 1; + arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); +} + +/* + * Initialize our routing tree. + */ +int +in_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + if(!rn_inithead(head, off)) + return 0; + + if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + return 1; /* only do this for the real routing table */ + + rnh = *head; + rnh->rnh_addaddr = in_addroute; + rnh->rnh_matchaddr = in_matroute; + rnh->rnh_close = in_clsroute; + in_rtqtimo(rnh); /* kick off timeout first time */ + return 1; +} + + +/* + * This zaps old routes when the interface goes down or interface + * address is deleted. In the latter case, it deletes static routes + * that point to this address. If we don't do this, we may end up + * using the old address in the future. The ones we always want to + * get rid of are things like ARP entries, since the user might down + * the interface, walk over to a completely different network, and + * plug back in. + */ +struct in_ifadown_arg { + struct radix_node_head *rnh; + struct ifaddr *ifa; + int del; +}; + +static int +in_ifadownkill(struct radix_node *rn, void *xap) +{ + struct in_ifadown_arg *ap = xap; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if (rt->rt_ifa == ap->ifa && + (ap->del || !(rt->rt_flags & RTF_STATIC))) { + /* + * We need to disable the automatic prune that happens + * in this case in rtrequest() because it will blow + * away the pointers that rn_walktree() needs in order + * continue our descent. We will end up deleting all + * the routes that rtrequest() would have in any case, + * so that behavior is not needed there. + */ + rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING); + err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); + if (err) { + log(LOG_WARNING, "in_ifadownkill: error %d\n", err); + } + } + return 0; +} + +int +in_ifadown(struct ifaddr *ifa, int delete) +{ + struct in_ifadown_arg arg; + struct radix_node_head *rnh; + + if (ifa->ifa_addr->sa_family != AF_INET) + return 1; + + arg.rnh = rnh = rt_tables[AF_INET]; + arg.ifa = ifa; + arg.del = delete; + rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + ifa->ifa_flags &= ~IFA_ROUTE; + return 0; +} diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h new file mode 100644 index 0000000..4282752 --- /dev/null +++ b/sys/netinet/in_systm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_systm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_SYSTM_H_ +#define _NETINET_IN_SYSTM_H_ + +/* + * Miscellaneous internetwork + * definitions for kernel. + */ + +/* + * Network types. + * + * Internally the system keeps counters in the headers with the bytes + * swapped so that VAX instructions will work on them. It reverses + * the bytes before transmission at each protocol level. The n_ types + * represent the types with the bytes in ``high-ender'' order. + */ +typedef u_int16_t n_short; /* short as received from the net */ +typedef u_int32_t n_long; /* long as received from the net */ + +typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ + +#ifdef _KERNEL +n_time iptime(void); +#endif + +#endif diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h new file mode 100644 index 0000000..4ccc4cd --- /dev/null +++ b/sys/netinet/in_var.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_VAR_H_ +#define _NETINET_IN_VAR_H_ + +#include <sys/queue.h> +#include <sys/fnv_hash.h> + +/* + * Interface address, Internet version. One of these structures + * is allocated for each Internet address on an interface. + * The ifaddr structure contains the protocol-independent part + * of the structure and is assumed to be first. + */ +struct in_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + /* ia_{,sub}net{,mask} in host order */ + u_long ia_net; /* network number of interface */ + u_long ia_netmask; /* mask of net part */ + u_long ia_subnet; /* subnet number, including net */ + u_long ia_subnetmask; /* mask of subnet part */ + struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ + TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ + struct sockaddr_in ia_addr; /* reserve space for interface name */ + struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ +#define ia_broadaddr ia_dstaddr + struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ +}; + +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; +}; +/* + * Given a pointer to an in_ifaddr (ifaddr), + * return a pointer to the addr as a sockaddr_in. + */ +#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) +#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) + +#define IN_LNAOF(in, ifa) \ + ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) + + +#ifdef _KERNEL +extern struct ifqueue ipintrq; /* ip packet input queue */ +extern struct in_addr zeroin_addr; +extern u_char inetctlerrmap[]; + +/* + * Hash table for IP addresses. + */ +extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl; +extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +extern u_long in_ifaddrhmask; /* mask for hash table */ + +#define INADDR_NHASH_LOG2 9 +#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) +#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) +#define INADDR_HASH(x) \ + (&in_ifaddrhashtbl[INADDR_HASHVAL(x) & in_ifaddrhmask]) + + +/* + * Macro for finding the interface (ifnet structure) corresponding to one + * of our IP addresses. 
+ */ +#define INADDR_TO_IFP(addr, ifp) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ +{ \ + struct in_ifaddr *ia; \ +\ + LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ + if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + break; \ + (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ +} + +/* + * Macro for finding the internet address structure (in_ifaddr) corresponding + * to a given interface (ifnet structure). + */ +#define IFP_TO_IA(ifp, ia) \ + /* struct ifnet *ifp; */ \ + /* struct in_ifaddr *ia; */ \ +{ \ + for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \ + (ia) != NULL && (ia)->ia_ifp != (ifp); \ + (ia) = TAILQ_NEXT((ia), ia_link)) \ + continue; \ +} +#endif + +/* + * This information should be part of the ifnet structure but we don't wish + * to change that - as it might break a number of things + */ + +struct router_info { + struct ifnet *rti_ifp; + int rti_type; /* type of router which is querier on this interface */ + int rti_time; /* # of slow timeouts since last old query */ + struct router_info *rti_next; +}; + +/* + * Internet multicast address structure. There is one of these for each IP + * multicast group to which this host belongs on a given network interface. + * For every entry on the interface's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. They are also + * kept on a system-wide list to make it easier to keep our legacy IGMP code + * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + */ +struct in_multi { + LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ + struct in_addr inm_addr; /* IP multicast address, convenience */ + struct ifnet *inm_ifp; /* back pointer to ifnet */ + struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ + u_int inm_timer; /* IGMP membership report timer */ + u_int inm_state; /* state of the membership */ + struct router_info *inm_rti; /* router info*/ +}; + +#ifdef _KERNEL + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_raw); +#endif + +extern LIST_HEAD(in_multihead, in_multi) in_multihead; + +/* + * Structure used by macros below to remember position when stepping through + * all of the in_multi records. + */ +struct in_multistep { + struct in_multi *i_inm; +}; + +/* + * Macro for looking up the in_multi record for a given IP multicast address + * on a given interface. If no matching record is found, "inm" is set null. + */ +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +do { \ + struct ifmultiaddr *ifma; \ +\ + TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ + if (ifma->ifma_addr->sa_family == AF_INET \ + && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ + (addr).s_addr) \ + break; \ + } \ + (inm) = ifma ? ifma->ifma_protospec : 0; \ +} while(0) + +/* + * Macro to step through all of the in_multi records, one at a time. + * The current position is remembered in "step", which the caller must + * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "inm" when there + * are no remaining records. 
+ */ +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ +} while(0) + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + (step).i_inm = LIST_FIRST(&in_multihead); \ + IN_NEXT_MULTI((step), (inm)); \ +} while(0) + +struct route; +struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); +void in_delmulti(struct in_multi *); +int in_control(struct socket *, u_long, caddr_t, struct ifnet *, + struct thread *); +void in_rtqdrain(void); +void ip_input(struct mbuf *); +int in_ifadown(struct ifaddr *ifa, int); +void in_ifscrub(struct ifnet *, struct in_ifaddr *); +int ipflow_fastforward(struct mbuf *); +void ipflow_create(const struct route *, struct mbuf *); +void ipflow_slowtimo(void); + +#endif /* _KERNEL */ + +/* INET6 stuff */ +#include <netinet6/in6_var.h> + +#endif /* _NETINET_IN_VAR_H_ */ diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h new file mode 100644 index 0000000..7bb0988 --- /dev/null +++ b/sys/netinet/ip.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.2 (Berkeley) 6/1/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +/* + * Definitions for internet protocol version 4. + * Per RFC 791, September 1981. + */ +#define IPVERSION 4 + +/* + * Structure of an internet header, naked of options. 
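+ *
+ * Note that ip_hl counts 32-bit words, so the header size in bytes is
+ * ip_hl << 2 (or IP_VHL_HL(ip_vhl) << 2 when _IP_VHL is defined); an
+ * optionless header is 20 bytes, i.e. ip_hl == 5.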
+ */ +struct ip { +#ifdef _IP_VHL + u_char ip_vhl; /* version << 4 | header length >> 2 */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif +#endif /* not _IP_VHL */ + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +}; + +#ifdef _IP_VHL +#define IP_MAKE_VHL(v, hl) ((v) << 4 | (hl)) +#define IP_VHL_HL(vhl) ((vhl) & 0x0f) +#define IP_VHL_V(vhl) ((vhl) >> 4) +#define IP_VHL_BORING 0x45 +#endif + +#define IP_MAXPACKET 65535 /* maximum packet size */ + +/* + * Definitions for IP type of service (ip_tos) + */ +#define IPTOS_LOWDELAY 0x10 +#define IPTOS_THROUGHPUT 0x08 +#define IPTOS_RELIABILITY 0x04 +#define IPTOS_MINCOST 0x02 +/* ECN bits proposed by Sally Floyd */ +#define IPTOS_CE 0x01 /* congestion experienced */ +#define IPTOS_ECT 0x02 /* ECN-capable transport */ + + +/* + * Definitions for IP precedence (also in ip_tos) (hopefully unused) + */ +#define IPTOS_PREC_NETCONTROL 0xe0 +#define IPTOS_PREC_INTERNETCONTROL 0xc0 +#define IPTOS_PREC_CRITIC_ECP 0xa0 +#define IPTOS_PREC_FLASHOVERRIDE 0x80 +#define IPTOS_PREC_FLASH 0x60 +#define IPTOS_PREC_IMMEDIATE 0x40 +#define IPTOS_PREC_PRIORITY 0x20 +#define IPTOS_PREC_ROUTINE 0x00 + +/* + * Definitions for options. + */ +#define IPOPT_COPIED(o) ((o)&0x80) +#define IPOPT_CLASS(o) ((o)&0x60) +#define IPOPT_NUMBER(o) ((o)&0x1f) + +#define IPOPT_CONTROL 0x00 +#define IPOPT_RESERVED1 0x20 +#define IPOPT_DEBMEAS 0x40 +#define IPOPT_RESERVED2 0x60 + +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ + +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ +#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_ESO 133 /* extended security */ +#define IPOPT_CIPSO 134 /* commerical security */ +#define IPOPT_SATID 136 /* satnet id */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RA 148 /* router alert */ + +/* + * Offsets to fields in options other than EOL and NOP. + */ +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_OFFSET 2 /* offset within option */ +#define IPOPT_MINOFF 4 /* min value of above */ + +/* + * Time stamp option structure. 
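+ *
+ * ipt_ptr is one-origin: its smallest legal value is 5, it indexes the
+ * octet where the next timestamp would be stored, and the option is
+ * full once ipt_ptr exceeds ipt_len (RFC 791).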
+ */ +struct ip_timestamp { + u_char ipt_code; /* IPOPT_TS */ + u_char ipt_len; /* size of structure (variable) */ + u_char ipt_ptr; /* index of current entry */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ipt_flg:4, /* flags, see below */ + ipt_oflw:4; /* overflow counter */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ipt_oflw:4, /* overflow counter */ + ipt_flg:4; /* flags, see below */ +#endif + union ipt_timestamp { + n_long ipt_time[1]; + struct ipt_ta { + struct in_addr ipt_addr; + n_long ipt_time; + } ipt_ta[1]; + } ipt_timestamp; +}; + +/* flag bits for ipt_flg */ +#define IPOPT_TS_TSONLY 0 /* timestamps only */ +#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ +#define IPOPT_TS_PRESPEC 3 /* specified modules only */ + +/* bits for security (not byte swapped) */ +#define IPOPT_SECUR_UNCLASS 0x0000 +#define IPOPT_SECUR_CONFID 0xf135 +#define IPOPT_SECUR_EFTO 0x789a +#define IPOPT_SECUR_MMMM 0xbc4d +#define IPOPT_SECUR_RESTR 0xaf13 +#define IPOPT_SECUR_SECRET 0xd788 +#define IPOPT_SECUR_TOPSECRET 0x6bc5 + +/* + * Internet implementation parameters. + */ +#define MAXTTL 255 /* maximum time to live (seconds) */ +#define IPDEFTTL 64 /* default ttl, from RFC 1340 */ +#define IPFRAGTTL 60 /* time to live for frags, slowhz */ +#define IPTTLDEC 1 /* subtracted when forwarding */ + +#define IP_MSS 576 /* default maximum segment size */ + +#endif diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h new file mode 100644 index 0000000..ec2c216 --- /dev/null +++ b/sys/netinet/ip6.h @@ -0,0 +1,308 @@ +/* $FreeBSD$ */ +/* $KAME: ip6.h,v 1.18 2001/03/29 05:34:30 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_IP6_H_ +#define _NETINET_IP6_H_ + +/* + * Definition for internet protocol version 6. + * RFC 2460 + */ + +struct ip6_hdr { + union { + struct ip6_hdrctl { + u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ + u_int16_t ip6_un1_plen; /* payload length */ + u_int8_t ip6_un1_nxt; /* next header */ + u_int8_t ip6_un1_hlim; /* hop limit */ + } ip6_un1; + u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ + } ip6_ctlun; + struct in6_addr ip6_src; /* source address */ + struct in6_addr ip6_dst; /* destination address */ +} __attribute__((__packed__)); + +#define ip6_vfc ip6_ctlun.ip6_un2_vfc +#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow +#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen +#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt +#define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim +#define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim + +#define IPV6_VERSION 0x60 +#define IPV6_VERSION_MASK 0xf0 + +#if BYTE_ORDER == BIG_ENDIAN +#define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */ +#define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ +#define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ +#endif /* LITTLE_ENDIAN */ +#endif +#if 1 +/* ECN bits proposed by Sally Floyd */ +#define IP6TOS_CE 0x01 /* congestion experienced */ +#define IP6TOS_ECT 0x02 /* ECN-capable transport */ +#endif + +/* + * Extension Headers + */ + +struct ip6_ext { + u_int8_t ip6e_nxt; + u_int8_t ip6e_len; +} __attribute__((__packed__)); + +/* Hop-by-Hop options header */ +/* XXX should we pad it to force alignment on an 8-byte boundary? 
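+ * Note that ip6h_len counts 8-octet units and does not include the
+ * first 8 octets, so the full header size in bytes is
+ * (ip6h_len + 1) << 3.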
*/ +struct ip6_hbh { + u_int8_t ip6h_nxt; /* next header */ + u_int8_t ip6h_len; /* length in units of 8 octets */ + /* followed by options */ +} __attribute__((__packed__)); + +/* Destination options header */ +/* XXX should we pad it to force alignment on an 8-byte boundary? */ +struct ip6_dest { + u_int8_t ip6d_nxt; /* next header */ + u_int8_t ip6d_len; /* length in units of 8 octets */ + /* followed by options */ +} __attribute__((__packed__)); + +/* Option types and related macros */ +#define IP6OPT_PAD1 0x00 /* 00 0 00000 */ +#define IP6OPT_PADN 0x01 /* 00 0 00001 */ +#define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */ +#define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */ +#define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */ +#define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */ + +#define IP6OPT_RTALERT_LEN 4 +#define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */ +#define IP6OPT_RTALERT_RSVP 1 /* Datagram contains an RSVP message */ +#define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */ +#define IP6OPT_MINLEN 2 + +#define IP6OPT_BINDING_UPDATE 0xc6 /* 11 0 00110 */ +#define IP6OPT_BINDING_ACK 0x07 /* 00 0 00111 */ +#define IP6OPT_BINDING_REQ 0x08 /* 00 0 01000 */ +#define IP6OPT_HOME_ADDRESS 0xc9 /* 11 0 01001 */ +#define IP6OPT_EID 0x8a /* 10 0 01010 */ + +#define IP6OPT_TYPE(o) ((o) & 0xC0) +#define IP6OPT_TYPE_SKIP 0x00 +#define IP6OPT_TYPE_DISCARD 0x40 +#define IP6OPT_TYPE_FORCEICMP 0x80 +#define IP6OPT_TYPE_ICMP 0xC0 + +#define IP6OPT_MUTABLE 0x20 + +#define IP6OPT_JUMBO_LEN 6 + +/* Routing header */ +struct ip6_rthdr { + u_int8_t ip6r_nxt; /* next header */ + u_int8_t ip6r_len; /* length in units of 8 octets */ + u_int8_t ip6r_type; /* routing type */ + u_int8_t ip6r_segleft; /* segments left */ + /* followed by routing type specific data */ +} __attribute__((__packed__)); + +/* Type 0 Routing header */ +struct ip6_rthdr0 { + u_int8_t ip6r0_nxt; /* next header */ + u_int8_t ip6r0_len; /* length in units of 8 octets */ + u_int8_t ip6r0_type; /* always zero */ + u_int8_t ip6r0_segleft; /* segments left */ + u_int8_t ip6r0_reserved; /* reserved field */ + u_int8_t ip6r0_slmap[3]; /* strict/loose bit map */ + struct in6_addr ip6r0_addr[1]; /* up to 23 addresses */ +} __attribute__((__packed__)); + +/* Fragment header */ +struct ip6_frag { + u_int8_t ip6f_nxt; /* next header */ + u_int8_t ip6f_reserved; /* reserved field */ + u_int16_t ip6f_offlg; /* offset, reserved, and flag */ + u_int32_t ip6f_ident; /* identification */ +} __attribute__((__packed__)); + +#if BYTE_ORDER == BIG_ENDIAN +#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ +#define IP6F_RESERVED_MASK 0x0006 /* reserved bits in ip6f_offlg */ +#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ +#else /* BYTE_ORDER == LITTLE_ENDIAN */ +#define IP6F_OFF_MASK 0xf8ff /* mask out offset from _offlg */ +#define IP6F_RESERVED_MASK 0x0600 /* reserved bits in ip6f_offlg */ +#define IP6F_MORE_FRAG 0x0100 /* more-fragments flag */ +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ + +/* + * Internet implementation parameters. + */ +#define IPV6_MAXHLIM 255 /* maximun hoplimit */ +#define IPV6_DEFHLIM 64 /* default hlim */ +#define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */ +#define IPV6_HLIMDEC 1 /* subtracted when forwaeding */ + +#define IPV6_MMTU 1280 /* minimal MTU and reassembly. 
1024 + 256 */ +#define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ + +#ifdef _KERNEL +/* + * IP6_EXTHDR_CHECK ensures that region between the IP6 header and the + * target header (including IPv6 itself, extension headers and + * TCP/UDP/ICMP6 headers) are continuous. KAME requires drivers + * to store incoming data into one internal mbuf or one or more external + * mbufs(never into two or more internal mbufs). Thus, the third case is + * supposed to never be matched but is prepared just in case. + */ + +#define IP6_EXTHDR_CHECK(m, off, hlen, ret) \ +do { \ + if ((m)->m_next != NULL) { \ + if (((m)->m_flags & M_LOOP) && \ + ((m)->m_len < (off) + (hlen)) && \ + (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + return ret; \ + } else if ((m)->m_flags & M_EXT) { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + m_freem(m); \ + return ret; \ + } \ + } else { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + m_freem(m); \ + return ret; \ + } \ + } \ + } else { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_tooshort++; \ + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ + m_freem(m); \ + return ret; \ + } \ + } \ +} while (0) + +/* + * IP6_EXTHDR_GET ensures that intermediate protocol header (from "off" to + * "len") is located in single mbuf, on contiguous memory region. + * The pointer to the region will be returned to pointer variable "val", + * with type "typ". + * IP6_EXTHDR_GET0 does the same, except that it aligns the structure at the + * very top of mbuf. GET0 is likely to make memory copy than GET. + * + * XXX we're now testing this, needs m_pulldown() + */ +#define IP6_EXTHDR_GET(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + int tmp; \ + if ((m)->m_len >= (off) + (len)) \ + (val) = (typ)(mtod((m), caddr_t) + (off)); \ + else { \ + t = m_pulldown((m), (off), (len), &tmp); \ + if (t) { \ + if (t->m_len < tmp + (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)(mtod(t, caddr_t) + tmp); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + if ((off) == 0) \ + (val) = (typ)mtod(m, caddr_t); \ + else { \ + t = m_pulldown((m), (off), (len), NULL); \ + if (t) { \ + if (t->m_len < (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)mtod(t, caddr_t); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) +#endif /*_KERNEL*/ + +#endif /* not _NETINET_IP6_H_ */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c new file mode 100644 index 0000000..50e939b --- /dev/null +++ b/sys/netinet/ip_divert.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipsec.h" + +#ifndef INET +#error "IPDIVERT requires INET." +#endif + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> + +/* + * Divert sockets + */ + +/* + * Allocate enough space to hold a full IP packet + */ +#define DIVSNDQ (65536 + 100) +#define DIVRCVQ (65536 + 100) + +/* + * Divert sockets work in conjunction with ipfw, see the divert(4) + * manpage for features. + * Internally, packets selected by ipfw in ip_input() or ip_output(), + * and never diverted before, are passed to the input queue of the + * divert socket with a given 'divert_port' number (as specified in + * the matching ipfw rule), and they are tagged with a 16 bit cookie + * (representing the rule number of the matching ipfw rule), which + * is passed to process reading from the socket. + * + * Packets written to the divert socket are again tagged with a cookie + * (usually the same as above) and a destination address. + * If the destination address is INADDR_ANY then the packet is + * treated as outgoing and sent to ip_output(), otherwise it is + * treated as incoming and sent to ip_input(). + * In both cases, the packet is tagged with the cookie. + * + * On reinjection, processing in ip_input() and ip_output() + * will be exactly the same as for the original packet, except that + * ipfw processing will start at the rule number after the one + * written in the cookie (so, tagging a packet with a cookie of 0 + * will cause it to be effectively considered as a standard packet). + */ + +/* Internal variables */ +static struct inpcbhead divcb; +static struct inpcbinfo divcbinfo; + +static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ +static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? 
*/ + +/* Optimization: have this preinitialized */ +static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; + +/* + * Initialize divert connection block queue. + */ +void +div_init(void) +{ + INP_INFO_LOCK_INIT(&divcbinfo, "div"); + LIST_INIT(&divcb); + divcbinfo.listhead = &divcb; + /* + * XXX We don't use the hash list for divert IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); + divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); + divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); +} + +/* + * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets + * with that protocol number to enter the system from the outside. + */ +void +div_input(struct mbuf *m, int off) +{ + ipstat.ips_noproto++; + m_freem(m); +} + +/* + * Divert a packet by passing it up to the divert socket at port 'port'. + * + * Setup generic address and protocol structures for div_input routine, + * then pass them along with mbuf chain. + */ +void +divert_packet(struct mbuf *m, int incoming, int port, int rule) +{ + struct ip *ip; + struct inpcb *inp; + struct socket *sa; + u_int16_t nport; + + /* Sanity check */ + KASSERT(port != 0, ("%s: port=0", __func__)); + + divsrc.sin_port = rule; /* record matching rule */ + + /* Assure header */ + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == 0) + return; + ip = mtod(m, struct ip *); + + /* + * Record receive interface address, if any. + * But only for incoming packets. + */ + divsrc.sin_addr.s_addr = 0; + if (incoming) { + struct ifaddr *ifa; + + /* Sanity check */ + KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __func__)); + + /* Find IP address for receive interface */ + TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + divsrc.sin_addr = + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + break; + } + } + /* + * Record the incoming interface name whenever we have one. + */ + bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero)); + if (m->m_pkthdr.rcvif) { + /* + * Hide the actual interface name in there in the + * sin_zero array. XXX This needs to be moved to a + * different sockaddr type for divert, e.g. + * sockaddr_div with multiple fields like + * sockaddr_dl. Presently we have only 7 bytes + * but that will do for now as most interfaces + * are 4 or less + 2 or less bytes for unit. + * There is probably a faster way of doing this, + * possibly taking it from the sockaddr_dl on the iface. + * This solves the problem of a P2P link and a LAN interface + * having the same address, which can result in the wrong + * interface being assigned to the packet when fed back + * into the divert socket. Theoretically if the daemon saves + * and re-uses the sockaddr_in as suggested in the man pages, + * this iface name will come along for the ride. + * (see div_output for the other half of this.) 
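+		 * For instance (purely illustrative), a packet received
+		 * on fxp0 ends up with the string "fxp0" in
+		 * divsrc.sin_zero, and div_output() later hands that
+		 * string to ifunit() to recover the receive interface.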
+ */ + snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero), + "%s%d", m->m_pkthdr.rcvif->if_name, + m->m_pkthdr.rcvif->if_unit); + } + + /* Put packet on socket queue, if any */ + sa = NULL; + nport = htons((u_int16_t)port); + LIST_FOREACH(inp, &divcb, inp_list) { + if (inp->inp_lport == nport) + sa = inp->inp_socket; + } + if (sa) { + if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(sa); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Deliver packet back into the IP processing machinery. + * + * If no address specified, or address is 0.0.0.0, send to ip_output(); + * otherwise, send to ip_input() and mark as having been received on + * the interface with that address. + */ +static int +div_output(struct socket *so, struct mbuf *m, + struct sockaddr_in *sin, struct mbuf *control) +{ + int error = 0; + struct m_hdr divert_tag; + + /* + * Prepare the tag for divert info. Note that a packet + * with a 0 tag in mh_data is effectively untagged, + * so we could optimize that case. + */ + divert_tag.mh_type = MT_TAG; + divert_tag.mh_flags = PACKET_TAG_DIVERT; + divert_tag.mh_next = m; + divert_tag.mh_data = 0; /* the matching rule # */ + m->m_pkthdr.rcvif = NULL; /* XXX is it necessary ? */ + + if (control) + m_freem(control); /* XXX */ + + /* Loopback avoidance and state recovery */ + if (sin) { + int i; + + divert_tag.mh_data = (caddr_t)(int)sin->sin_port; + /* + * Find receive interface with the given name, stuffed + * (if it exists) in the sin_zero[] field. + * The name is user supplied data so don't trust its size + * or that it is zero terminated. + */ + for (i = 0; sin->sin_zero[i] && i < sizeof(sin->sin_zero); i++) + ; + if ( i > 0 && i < sizeof(sin->sin_zero)) + m->m_pkthdr.rcvif = ifunit(sin->sin_zero); + } + + /* Reinject packet into the system as incoming or outgoing */ + if (!sin || sin->sin_addr.s_addr == 0) { + struct inpcb *const inp = sotoinpcb(so); + struct ip *const ip = mtod(m, struct ip *); + + /* + * Don't allow both user specified and setsockopt options, + * and don't allow packet length sizes that will crash + */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + goto cantsend; + } + + /* Convert fields to host order for ip_output() */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + /* Send packet to output processing */ + ipstat.ips_rawout++; /* XXX */ + error = ip_output((struct mbuf *)&divert_tag, + inp->inp_options, &inp->inp_route, + (so->so_options & SO_DONTROUTE) | + IP_ALLOWBROADCAST | IP_RAWOUTPUT, + inp->inp_moptions); + } else { + if (m->m_pkthdr.rcvif == NULL) { + /* + * No luck with the name, check by IP address. + * Clear the port and the ifname to make sure + * there are no distractions for ifa_ifwithaddr. 
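+			 * (ifa_ifwithaddr() matches on the entire sockaddr
+			 * contents, so stale bytes in sin_port or sin_zero
+			 * would make an otherwise matching address fail.)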
+ */ + struct ifaddr *ifa; + + bzero(sin->sin_zero, sizeof(sin->sin_zero)); + sin->sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *) sin); + if (ifa == NULL) { + error = EADDRNOTAVAIL; + goto cantsend; + } + m->m_pkthdr.rcvif = ifa->ifa_ifp; + } + /* Send packet to input processing */ + ip_input((struct mbuf *)&divert_tag); + } + + return error; + +cantsend: + m_freem(m); + return error; +} + +static int +div_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("div_attach"); + if (td && (error = suser(td)) != 0) + return error; + + error = soreserve(so, div_sendspace, div_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &divcbinfo, td); + splx(s); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = proto; + inp->inp_vflag |= INP_IPV4; + inp->inp_flags |= INP_HDRINCL; + /* The socket is always "connected" because + we always know "where" to send the packet */ + so->so_state |= SS_ISCONNECTED; + return 0; +} + +static int +div_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("div_detach"); + in_pcbdetach(inp); + return 0; +} + +static int +div_abort(struct socket *so) +{ + soisdisconnected(so); + return div_detach(so); +} + +static int +div_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return div_abort(so); +} + +static int +div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int s; + int error; + + s = splnet(); + inp = sotoinpcb(so); + /* in_pcbbind assumes that nam is a sockaddr_in + * and in_pcbbind requires a valid address. Since divert + * sockets don't we need to make sure the address is + * filled in properly. + * XXX -- divert should not be abusing in_pcbind + * and should probably have its own family. + */ + if (nam->sa_family != AF_INET) + error = EAFNOSUPPORT; + else { + ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; + error = in_pcbbind(inp, nam, td); + } + splx(s); + return error; +} + +static int +div_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + /* Packet must have a header (but that's about it) */ + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + m_freem(m); + return EINVAL; + } + + /* Send packet */ + return div_output(so, m, (struct sockaddr_in *)nam, control); +} + +static int +div_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = divcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. 
+ */ + s = splnet(); + gencnt = divcbinfo.ipi_gencnt; + n = divcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->td, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = divcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = divcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +/* + * This is the wrapper function for in_setsockaddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +div_sockaddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setsockaddr(so, nam, &divcbinfo)); +} + +/* + * This is the wrapper function for in_setpeeraddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +div_peeraddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setpeeraddr(so, nam, &divcbinfo)); +} + + +SYSCTL_DECL(_net_inet_divert); +SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, + div_pcblist, "S,xinpcb", "List of active divert sockets"); + +struct pr_usrreqs div_usrreqs = { + div_abort, pru_accept_notsupp, div_attach, div_bind, + pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, + div_disconnect, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, + div_sockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c new file mode 100644 index 0000000..6006b65 --- /dev/null +++ b/sys/netinet/ip_dummynet.c @@ -0,0 +1,1952 @@ +/* + * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define DEB(x) +#define DDB(x) x + +/* + * This module implements IP dummynet, a bandwidth limiter/delay emulator + * used in conjunction with the ipfw package. + * Description of the data structures used is in ip_dummynet.h + * Here you mainly find the following blocks of code: + * + variable declarations; + * + heap management functions; + * + scheduler and dummynet functions; + * + configuration and initialization. + * + * NOTA BENE: critical sections are protected by splimp()/splx() + * pairs. One would think that splnet() is enough as for most of + * the netinet code, but it is not so because when used with + * bridging, dummynet is invoked at splimp(). + * + * Most important Changes: + * + * 011004: KLDable + * 010124: Fixed WF2Q behaviour + * 010122: Fixed spl protection. + * 000601: WF2Q support + * 000106: large rewrite, use heaps to handle very many pipes. + * 980513: initial release + * + * include files marked with XXX are probably not needed + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/queue.h> /* XXX */ +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/ip_var.h> + +#include <netinet/if_ether.h> /* for struct arpcom */ +#include <net/bridge.h> + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timer.c) + */ +static dn_key curr_time = 0 ; /* current simulation time */ + +static int dn_hash_size = 64 ; /* default hash size */ + +/* statistics on number of queue searches and search steps */ +static int searches, search_steps ; +static int pipe_expire = 1 ; /* expire queue if empty */ +static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ + +static int red_lookup_depth = 256; /* RED - default lookup table depth */ +static int red_avg_pkt_size = 512; /* RED - default medium packet size */ +static int red_max_pkt_size = 1500; /* RED - default max packet size */ + +/* + * Three heaps contain queues and pipes that the scheduler handles: + * + * ready_heap contains all dn_flow_queue related to fixed-rate pipes. + * + * wfq_ready_heap contains the pipes associated with WF2Q flows + * + * extract_heap contains pipes associated with delay lines. 
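+ *
+ * All three heaps are keyed on times measured in ticks: ready_heap and
+ * wfq_ready_heap on the tick at which the queue (or pipe) will have
+ * enough credit to transmit its head packet, extract_heap on the
+ * delivery time of the packet at the head of the delay line.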
+ * + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; + +static int heap_init(struct dn_heap *h, int size) ; +static int heap_insert (struct dn_heap *h, dn_key key1, void *p); +static void heap_extract(struct dn_heap *h, void *obj); + +static void transmit_event(struct dn_pipe *pipe); +static void ready_event(struct dn_flow_queue *q); + +static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ +static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ + +static struct callout_handle dn_timeout; + +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, + CTLFLAG_RW, 0, "Dummynet"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, + CTLFLAG_RD, &curr_time, 0, "Current tick"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, + CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, + CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, + CTLFLAG_RD, &searches, 0, "Number of queue searches"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, + CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, + CTLFLAG_RW, &dn_max_ratio, 0, + "Max ratio between dynamic queues and buckets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); +#endif + +static int config_pipe(struct dn_pipe *p); +static int ip_dn_ctl(struct sockopt *sopt); + +static void rt_unref(struct rtentry *); +static void dummynet(void *); +static void dummynet_flush(void); +void dummynet_drain(void); +static ip_dn_io_t dummynet_io; +static void dn_rule_delete(void *); + +int if_tx_rdy(struct ifnet *ifp); + +/* + * ip_fw_chain_head is used when deleting a pipe, because ipfw rules can + * hold references to the pipe. + */ +extern LIST_HEAD (ip_fw_head, ip_fw) ip_fw_chain_head; + +static void +rt_unref(struct rtentry *rt) +{ + if (rt == NULL) + return ; + if (rt->rt_refcnt <= 0) + printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); + RTFREE(rt); +} + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. + * XXX failure to allocate a new element is a pretty bad failure + * as we basically stall a whole queue forever!! 
+ * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( 2*(x) + 1 ) +#define HEAP_IS_LEFT(x) ( (x) & 1 ) +#define HEAP_RIGHT(x) ( 2*(x) + 2 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_init(struct dn_heap *h, int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) { + printf("heap_init, Bogus call, have %d want %d\n", + h->size, new_size); + return 0 ; + } + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; + p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT ); + if (p == NULL) { + printf(" heap_init, resize %d failed\n", new_size ); + return 1 ; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DUMMYNET); + } + h->p = p ; + h->size = new_size ; + return 0 ; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If offset > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(heap, node) \ + if (heap->offset > 0) \ + *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; +/* + * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. + */ +#define RESET_OFFSET(heap, node) \ + if (heap->offset > 0) \ + *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; +static int +heap_insert(struct dn_heap *h, dn_key key1, void *p) +{ + int son = h->elements ; + + if (p == NULL) /* data already there, set starting point */ + son = key1 ; + else { /* insert new element at the end, possibly resize */ + son = h->elements ; + if (son == h->size) /* need resize... */ + if (heap_init(h, h->elements+1) ) + return 1 ; /* failure... 
*/ + h->p[son].object = p ; + h->p[son].key = key1 ; + h->elements++ ; + } + while (son > 0) { /* bubble up */ + int father = HEAP_FATHER(son) ; + struct dn_heap_entry tmp ; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break ; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp) ; + SET_OFFSET(h, son); + son = father ; + } + SET_OFFSET(h, son); + return 0 ; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +static void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1 ; + + if (max < 0) { + printf("warning, extract from empty heap 0x%p\n", h); + return ; + } + father = 0 ; /* default: move up smallest child */ + if (obj != NULL) { /* extract specific element, index is at offset */ + if (h->offset <= 0) + panic("*** heap_extract from middle not supported on this heap!!!\n"); + father = *((int *)((char *)obj + h->offset)) ; + if (father < 0 || father >= h->elements) { + printf("dummynet: heap_extract, father %d out of bound 0..%d\n", + father, h->elements); + panic("heap_extract"); + } + } + RESET_OFFSET(h, father); + child = HEAP_LEFT(father) ; /* left child */ + while (child <= max) { /* valid entry */ + if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child = child+1 ; /* take right child, otherwise left */ + h->p[father] = h->p[child] ; + SET_OFFSET(h, father); + father = child ; + child = HEAP_LEFT(child) ; /* left child for next loop */ + } + h->elements-- ; + if (father != max) { + /* + * Fill hole with last entry and bubble up, reusing the insert code + */ + h->p[father] = h->p[max] ; + heap_insert(h, father, NULL); /* this one cannot fail */ + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, dn_key new_key, void *object) +{ + int temp; + int i ; + int max = h->elements-1 ; + struct dn_heap_entry buf ; + + if (h->offset <= 0) + panic("cannot move items on this heap"); + + i = *((int *)((char *)object + h->offset)); + if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ + h->p[i].key = new_key ; + for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; + i = temp ) { /* bubble up */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } + } else { /* must move down */ + h->p[i].key = new_key ; + while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ + if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) + temp++ ; /* select child with min key */ + if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } else + break ; + i = temp ; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i ; + + for (i = 0 ; i < h->elements ; i++ ) + heap_insert(h, i , NULL) ; +} + +/* + * cleanup the heap and free data structure + */ +static void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DUMMYNET); + bzero(h, sizeof(*h) ); +} + +/* + * --- end of heap management functions --- + */ + +/* + * Scheduler functions: + * + * transmit_event() is called when the delay-line needs to enter + * the scheduler, either because of existing pkts getting ready, + * or new packets entering the queue. 
The event handled is the delivery + * time of the packet. + * + * ready_event() does something similar with fixed-rate queues, and the + * event handled is the finish time of the head pkt. + * + * wfq_ready_event() does something similar with WF2Q queues, and the + * event handled is the start time of the head pkt. + * + * In all cases, we make sure that the data structures are consistent + * before passing pkts out, because this might trigger recursive + * invocations of the procedures. + */ +static void +transmit_event(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { + /* + * first unlink, then call procedures, since ip_input() can invoke + * ip_output() and viceversa, thus causing nested calls + */ + pipe->head = DN_NEXT(pkt) ; + + /* + * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf + * (NOT A REAL one, just a small block of malloc'ed memory) with + * m_type = MT_TAG, m_flags = PACKET_TAG_DUMMYNET + * dn_m (m_next) = actual mbuf to be processed by ip_input/output + * and some other fields. + * The block IS FREED HERE because it contains parameters passed + * to the called routine. + */ + switch (pkt->dn_dir) { + case DN_TO_IP_OUT: + (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); + rt_unref (pkt->ro.ro_rt) ; + break ; + + case DN_TO_IP_IN : + ip_input((struct mbuf *)pkt) ; + break ; + + case DN_TO_BDG_FWD : + if (!BDG_LOADED) { + /* somebody unloaded the bridge module. Drop pkt */ + printf("-- dropping bridged packet trapped in pipe--\n"); + m_freem(pkt->dn_m); + break; + } /* fallthrough */ + case DN_TO_ETH_DEMUX: + { + struct mbuf *m = (struct mbuf *)pkt ; + struct ether_header *eh; + + if (pkt->dn_m->m_len < ETHER_HDR_LEN && + (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/bridge: pullup fail, dropping pkt\n"); + break; + } + /* + * same as ether_input, make eh be a pointer into the mbuf + */ + eh = mtod(pkt->dn_m, struct ether_header *); + m_adj(pkt->dn_m, ETHER_HDR_LEN); + /* + * bdg_forward() wants a pointer to the pseudo-mbuf-header, but + * on return it will supply the pointer to the actual packet + * (originally pkt->dn_m, but could be something else now) if + * it has not consumed it. + */ + if (pkt->dn_dir == DN_TO_BDG_FWD) { + m = bdg_forward_ptr(m, eh, pkt->ifp); + if (m) + m_freem(m); + } else + ether_demux(NULL, eh, m); /* which consumes the mbuf */ + } + break ; + case DN_TO_ETH_OUT: + ether_output_frame(pkt->ifp, (struct mbuf *)pkt); + break; + + default: + printf("dummynet: bad switch %d!\n", pkt->dn_dir); + m_freem(pkt->dn_m); + break ; + } + free(pkt, M_DUMMYNET); + } + /* if there are leftover packets, put into the heap for next event */ + if ( (pkt = pipe->head) ) + heap_insert(&extract_heap, pkt->output_time, pipe ) ; + /* XXX should check errors on heap_insert, by draining the + * whole pipe p and hoping in the future we are more successful + */ +} + +/* + * the following macro computes how many ticks we have to wait + * before being able to transmit a packet. 
The credit is taken from + * either a pipe (WF2Q) or a flow_queue (per-flow queueing) + */ +#define SET_TICKS(pkt, q, p) \ + (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ + p->bandwidth ; + +/* + * extract pkt from queue, compute output time (could be now) + * and put into delay line (p_queue) + */ +static void +move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, + struct dn_pipe *p, int len) +{ + q->head = DN_NEXT(pkt) ; + q->len-- ; + q->len_bytes -= len ; + + pkt->output_time = curr_time + p->delay ; + + if (p->head == NULL) + p->head = pkt; + else + DN_NEXT(p->tail) = pkt; + p->tail = pkt; + DN_NEXT(p->tail) = NULL; +} + +/* + * ready_event() is invoked every time the queue must enter the + * scheduler, either because the first packet arrives, or because + * a previously scheduled event fired. + * On invokation, drain as many pkts as possible (could be 0) and then + * if there are leftover packets reinsert the pkt in the scheduler. + */ +static void +ready_event(struct dn_flow_queue *q) +{ + struct dn_pkt *pkt; + struct dn_pipe *p = q->fs->pipe ; + int p_was_empty ; + + if (p == NULL) { + printf("ready_event- pipe is gone\n"); + return ; + } + p_was_empty = (p->head == NULL) ; + + /* + * schedule fixed-rate queues linked to this pipe: + * Account for the bw accumulated since last scheduling, then + * drain as many pkts as allowed by q->numbytes and move to + * the delay line (in p) computing output time. + * bandwidth==0 (no limit) means we can drain the whole queue, + * setting len_scaled = 0 does the job. + */ + q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; + while ( (pkt = q->head) != NULL ) { + int len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + if (len_scaled > q->numbytes ) + break ; + q->numbytes -= len_scaled ; + move_pkt(pkt, q, p, len); + } + /* + * If we have more packets queued, schedule next ready event + * (can only occur when bandwidth != 0, otherwise we would have + * flushed the whole queue in the previous loop). + * To this purpose we record the current time and compute how many + * ticks to go for the finish time of the packet. + */ + if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ + dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ + q->sched_time = curr_time ; + heap_insert(&ready_heap, curr_time + t, (void *)q ); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ + } else /* RED needs to know when the queue becomes empty */ + q->q_time = curr_time; + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. + */ + if (p_was_empty) + transmit_event(p); +} + +/* + * Called when we can transmit packets on WF2Q queues. Take pkts out of + * the queues at their start time, and enqueue into the delay line. + * Packets are drained until p->numbytes < 0. As long as + * len_scaled >= p->numbytes, the packet goes into the delay line + * with a deadline p->delay. For the last packet, if p->numbytes<0, + * there is an additional delay. 
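+ *
+ * A worked example of the credit arithmetic (numbers are hypothetical,
+ * assuming hz = 1000 and a bandwidth of 1000000 bits/s): a 1500 byte
+ * packet has len_scaled = 1500*8*1000 = 12000000, so with no accumulated
+ * credit it must wait 12000000/1000000 = 12 ticks, i.e. the 12ms needed
+ * to clock 1500 bytes out of a 1Mbit/s link; SET_TICKS() above computes
+ * exactly this, rounding up.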
+ */ +static void +ready_event_wfq(struct dn_pipe *p) +{ + int p_was_empty = (p->head == NULL) ; + struct dn_heap *sch = &(p->scheduler_heap); + struct dn_heap *neh = &(p->not_eligible_heap) ; + + if (p->if_name[0] == 0) /* tx clock is simulated */ + p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; + else { /* tx clock is for real, the ifq must be empty or this is a NOP */ + if (p->ifp && p->ifp->if_snd.ifq_head != NULL) + return ; + else { + DEB(printf("pipe %d ready from %s --\n", + p->pipe_nr, p->if_name);) + } + } + + /* + * While we have backlogged traffic AND credit, we need to do + * something on the queue. + */ + while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { + if (sch->elements > 0) { /* have some eligible pkts to send out */ + struct dn_flow_queue *q = sch->p[0].object ; + struct dn_pkt *pkt = q->head; + struct dn_flow_set *fs = q->fs; + u_int64_t len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + + heap_extract(sch, NULL); /* remove queue from heap */ + p->numbytes -= len_scaled ; + move_pkt(pkt, q, p, len); + + p->V += (len<<MY_M) / p->sum ; /* update V */ + q->S = q->F ; /* update start time */ + if (q->len == 0) { /* Flow not backlogged any more */ + fs->backlogged-- ; + heap_insert(&(p->idle_heap), q->F, q); + } else { /* still backlogged */ + /* + * update F and position in backlogged queue, then + * put flow in not_eligible_heap (we will fix this later). + */ + len = (q->head)->dn_m->m_pkthdr.len; + q->F += (len<<MY_M)/(u_int64_t) fs->weight ; + if (DN_KEY_LEQ(q->S, p->V)) + heap_insert(neh, q->S, q); + else + heap_insert(sch, q->F, q); + } + } + /* + * now compute V = max(V, min(S_i)). Remember that all elements in sch + * have by definition S_i <= V so if sch is not empty, V is surely + * the max and we must not update it. Conversely, if sch is empty + * we only need to look at neh. + */ + if (sch->elements == 0 && neh->elements > 0) + p->V = MAX64 ( p->V, neh->p[0].key ); + /* move from neh to sch any packets that have become eligible */ + while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { + struct dn_flow_queue *q = neh->p[0].object ; + heap_extract(neh, NULL); + heap_insert(sch, q->F, q); + } + + if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ + p->numbytes = -1 ; /* mark not ready for I/O */ + break ; + } + } + if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 + && p->idle_heap.elements > 0) { + /* + * no traffic and no events scheduled. We can get rid of idle-heap. + */ + int i ; + + for (i = 0 ; i < p->idle_heap.elements ; i++) { + struct dn_flow_queue *q = p->idle_heap.p[i].object ; + + q->F = 0 ; + q->S = q->F + 1 ; + } + p->sum = 0 ; + p->V = 0 ; + p->idle_heap.elements = 0 ; + } + /* + * If we are getting clocks from dummynet (not a real interface) and + * If we are under credit, schedule the next ready event. + * Also fix the delivery time of the last packet. + */ + if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ + dn_key t=0 ; /* number of ticks i have to wait */ + + if (p->bandwidth > 0) + t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; + p->tail->output_time += t ; + p->sched_time = curr_time ; + heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ + } + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. 
+ */ + if (p_was_empty) + transmit_event(p); +} + +/* + * This is called once per tick, or HZ times per second. It is used to + * increment the current tick counter and schedule expired events. + */ +static void +dummynet(void * __unused unused) +{ + void *p ; /* generic parameter to handler */ + struct dn_heap *h ; + int s ; + struct dn_heap *heaps[3]; + int i; + struct dn_pipe *pe ; + + heaps[0] = &ready_heap ; /* fixed-rate queues */ + heaps[1] = &wfq_ready_heap ; /* wfq queues */ + heaps[2] = &extract_heap ; /* delay line */ + s = splimp(); /* see note on top, splnet() is not enough */ + curr_time++ ; + for (i=0; i < 3 ; i++) { + h = heaps[i]; + while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { + DDB(if (h->p[0].key > curr_time) + printf("-- dummynet: warning, heap %d is %d ticks late\n", + i, (int)(curr_time - h->p[0].key));) + p = h->p[0].object ; /* store a copy before heap_extract */ + heap_extract(h, NULL); /* need to extract before processing */ + if (i == 0) + ready_event(p) ; + else if (i == 1) { + struct dn_pipe *pipe = p; + if (pipe->if_name[0] != '\0') + printf("*** bad ready_event_wfq for pipe %s\n", + pipe->if_name); + else + ready_event_wfq(p) ; + } else + transmit_event(p); + } + } + /* sweep pipes trying to expire idle flow_queues */ + for (pe = all_pipes; pe ; pe = pe->next ) + if (pe->idle_heap.elements > 0 && + DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { + struct dn_flow_queue *q = pe->idle_heap.p[0].object ; + + heap_extract(&(pe->idle_heap), NULL); + q->S = q->F + 1 ; /* mark timestamp as invalid */ + pe->sum -= q->fs->weight ; + } + splx(s); + dn_timeout = timeout(dummynet, NULL, 1); +} + +/* + * called by an interface when tx_rdy occurs. + */ +int +if_tx_rdy(struct ifnet *ifp) +{ + struct dn_pipe *p; + + for (p = all_pipes; p ; p = p->next ) + if (p->ifp == ifp) + break ; + if (p == NULL) { + char buf[32]; + sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); + for (p = all_pipes; p ; p = p->next ) + if (!strcmp(p->if_name, buf) ) { + p->ifp = ifp ; + DEB(printf("++ tx rdy from %s (now found)\n", buf);) + break ; + } + } + if (p != NULL) { + DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, + ifp->if_unit, ifp->if_snd.ifq_len);) + p->numbytes = 0 ; /* mark ready for I/O */ + ready_event_wfq(p); + } + return 0; +} + +/* + * Unconditionally expire empty queues in case of shortage. + * Returns the number of queues freed. + */ +static int +expire_queues(struct dn_flow_set *fs) +{ + struct dn_flow_queue *q, *prev ; + int i, initial_elements = fs->rq_elements ; + + if (fs->last_expired == time_second) + return 0 ; + fs->last_expired = time_second ; + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ + for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) + if (q->head != NULL || q->S != q->F+1) { + prev = q ; + q = q->next ; + } else { /* entry is idle, expire it */ + struct dn_flow_queue *old_q = q ; + + if (prev != NULL) + prev->next = q = q->next ; + else + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; + free(old_q, M_DUMMYNET); + } + return initial_elements - fs->rq_elements ; +} + +/* + * If room, create a new queue and put at head of slot i; + * otherwise, create or use the default queue. + */ +static struct dn_flow_queue * +create_queue(struct dn_flow_set *fs, int i) +{ + struct dn_flow_queue *q ; + + if (fs->rq_elements > fs->rq_size * dn_max_ratio && + expire_queues(fs) == 0) { + /* + * No way to get room, use or create overflow queue. 
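The ready_heap, wfq_ready_heap and extract_heap drained above are keyed min-heaps; heap_insert()/heap_extract() are defined earlier in this file, outside this hunk. Purely as an illustration of the per-tick pattern, a toy min-heap and the "drain everything whose key is <= curr_time" loop, with error handling omitted:

/*
 * Toy stand-in for dn_heap (illustrative only; the real heap also keeps
 * a back-pointer inside the object via the 'offset' field).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t hkey;

struct ent { hkey key; void *obj; };
struct heap { int elements; int size; struct ent *p; };

static void
hpush(struct heap *h, hkey key, void *obj)
{
	int i;

	if (h->elements == h->size) {
		h->size = h->size ? 2 * h->size : 16;
		h->p = realloc(h->p, h->size * sizeof(*h->p));
	}
	i = h->elements++;
	while (i > 0 && h->p[(i - 1) / 2].key > key) {	/* sift up */
		h->p[i] = h->p[(i - 1) / 2];
		i = (i - 1) / 2;
	}
	h->p[i].key = key;
	h->p[i].obj = obj;
}

static void *
hpop(struct heap *h)	/* extract the smallest key */
{
	void *obj = h->p[0].obj;
	struct ent last = h->p[--h->elements];
	int i = 0, c;

	while ((c = 2 * i + 1) < h->elements) {		/* sift down */
		if (c + 1 < h->elements && h->p[c + 1].key < h->p[c].key)
			c++;
		if (last.key <= h->p[c].key)
			break;
		h->p[i] = h->p[c];
		i = c;
	}
	h->p[i] = last;
	return obj;
}

int
main(void)
{
	struct heap h = { 0, 0, NULL };
	hkey curr_time = 5;

	hpush(&h, 7, "a");
	hpush(&h, 3, "b");
	hpush(&h, 5, "c");
	/* drain everything due by curr_time, as dummynet() does per tick */
	while (h.elements > 0 && h.p[0].key <= curr_time)
		printf("due: %s\n", (char *)hpop(&h));
	free(h.p);
	return 0;
}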
+ */ + i = fs->rq_size ; + if ( fs->rq[i] != NULL ) + return fs->rq[i] ; + } + q = malloc(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO); + if (q == NULL) { + printf("sorry, cannot allocate queue for new flow\n"); + return NULL ; + } + q->fs = fs ; + q->hash_slot = i ; + q->next = fs->rq[i] ; + q->S = q->F + 1; /* hack - mark timestamp as invalid */ + fs->rq[i] = q ; + fs->rq_elements++ ; + return q ; +} + +/* + * Given a flow_set and a pkt in last_pkt, find a matching queue + * after appropriate masking. The queue is moved to front + * so that further searches take less time. + */ +static struct dn_flow_queue * +find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) +{ + int i = 0 ; /* we need i and q for new allocations */ + struct dn_flow_queue *q, *prev; + + if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) + q = fs->rq[0] ; + else { + /* first, do the masking */ + id->dst_ip &= fs->flow_mask.dst_ip ; + id->src_ip &= fs->flow_mask.src_ip ; + id->dst_port &= fs->flow_mask.dst_port ; + id->src_port &= fs->flow_mask.src_port ; + id->proto &= fs->flow_mask.proto ; + id->flags = 0 ; /* we don't care about this one */ + /* then, hash function */ + i = ( (id->dst_ip) & 0xffff ) ^ + ( (id->dst_ip >> 15) & 0xffff ) ^ + ( (id->src_ip << 1) & 0xffff ) ^ + ( (id->src_ip >> 16 ) & 0xffff ) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->proto ); + i = i % fs->rq_size ; + /* finally, scan the current list for a match */ + searches++ ; + for (prev=NULL, q = fs->rq[i] ; q ; ) { + search_steps++; + if (bcmp(id, &(q->id), sizeof(q->id) ) == 0) + break ; /* found */ + else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { + /* entry is idle and not in any heap, expire it */ + struct dn_flow_queue *old_q = q ; + + if (prev != NULL) + prev->next = q = q->next ; + else + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; + free(old_q, M_DUMMYNET); + continue ; + } + prev = q ; + q = q->next ; + } + if (q && prev != NULL) { /* found and not in front */ + prev->next = q->next ; + q->next = fs->rq[i] ; + fs->rq[i] = q ; + } + } + if (q == NULL) { /* no match, need to allocate a new entry */ + q = create_queue(fs, i); + if (q != NULL) + q->id = *id ; + } + return q ; +} + +static int +red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + * + */ + + int64_t p_b = 0; + /* queue in bytes or packets ? */ + u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; + + DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) + + /* average queue size estimation */ + if (q_size != 0) { + /* + * queue is not empty, avg <- avg + (q_size - avg) * w_q + */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); + + q->avg += (int) v; + } else { + /* + * queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... 
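The estimator described above is a fixed-point exponentially weighted moving average. A small standalone sketch using the same SCALE()/SCALE_MUL() convention as ip_dummynet.h (illustrative values only):

/*
 * Sketch of the RED average-queue estimator: avg += (q_size - avg) * w_q,
 * in 16-bit fractional fixed point.
 */
#include <stdint.h>
#include <stdio.h>

#define SCALE_RED	16
#define SCALE(x)	((int64_t)(x) << SCALE_RED)
#define SCALE_VAL(x)	((x) >> SCALE_RED)
#define SCALE_MUL(x, y)	(((x) * (y)) >> SCALE_RED)

int
main(void)
{
	int64_t w_q = SCALE(1) / 512;	/* filter weight, here 1/512 */
	int64_t avg = 0;		/* scaled average queue length */
	int i, q_size = 10;		/* instantaneous queue length */

	for (i = 0; i < 5; i++) {
		int64_t diff = SCALE(q_size) - avg;

		avg += SCALE_MUL(diff, w_q);	/* avg += (q - avg) * w_q */
		printf("after pkt %d: avg = %lld/65536 packets\n",
		    i + 1, (long long)avg);
	}
	return 0;
}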
+ */ + if (q->avg) { + u_int t = (curr_time - q->q_time) / fs->lookup_step; + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + DEB(printf("avg: %u ", SCALE_VAL(q->avg));) + + /* should i drop ? */ + + if (q->avg < fs->min_th) { + q->count = -1; + return 0; /* accept packet ; */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->flags_fs & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than max_th the + * packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; + } else { + q->count = -1; + printf("- drop"); + return 1 ; + } + } else if (q->avg > fs->min_th) { + /* + * we compute p_b using the linear dropping function p_b = c_1 * + * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = + * max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; + } + if (fs->flags_fs & DN_QSIZE_IS_BYTES) + p_b = (p_b * len) / fs->max_pkt_size; + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { + q->count = 0; + DEB(printf("- red drop");) + /* after a drop we calculate a new random value */ + q->random = random() & 0xffff; + return 1; /* drop */ + } + } + /* end of RED algorithm */ + return 0 ; /* accept */ +} + +static __inline +struct dn_flow_set * +locate_flowset(int pipe_nr, struct ip_fw *rule) +{ + struct dn_flow_set *fs = NULL ; + + if ( (rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) + for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) + ; + else { + struct dn_pipe *p1; + for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) + ; + if (p1 != NULL) + fs = &(p1->fs) ; + } + if (fs != NULL) + rule->pipe_ptr = fs ; /* record for the future */ + return fs ; +} + +/* + * dummynet hook for packets. Below 'pipe' is a pipe or a queue + * depending on whether WF2Q or fixed bw is used. + * + * pipe_nr pipe or queue the packet is destined for. + * dir where shall we send the packet after dummynet. + * m the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + * real_dst in bdg_forward + * ro route parameter (only used in ip_output, NULL otherwise) + * dst destination address, only used by ip_output + * rule matching rule, in case of multiple passes + * flags flags from the caller, only used in ip_output + * + */ +static int +dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt *pkt; + struct dn_flow_set *fs; + struct dn_pipe *pipe ; + u_int64_t len = m->m_pkthdr.len ; + struct dn_flow_queue *q = NULL ; + int s ; + + s = splimp(); + + pipe_nr &= 0xffff ; + + if ( (fs = fwa->rule->pipe_ptr) == NULL ) { + fs = locate_flowset(pipe_nr, fwa->rule); + if (fs == NULL) + goto dropit ; /* this queue/pipe does not exist! 
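Continuing the RED logic above, the actual drop decision compares p_b scaled by the arrival count against a stored uniform random value, which spreads drops out over time. A simplified floating-point sketch (hypothetical parameters, non-gentle RED):

/*
 * Sketch of the red_drops() decision: between min_th and max_th the base
 * probability grows linearly, and q->count makes a drop more likely the
 * more packets have arrived since the last drop.
 */
#include <stdio.h>
#include <stdlib.h>

static int
red_should_drop(double avg, double min_th, double max_th, double max_p,
    int *count, unsigned *rnd)
{
	double p_b;

	if (avg < min_th) {		/* accept, reset counter */
		*count = -1;
		return 0;
	}
	if (avg >= max_th) {		/* hard drop (non-gentle RED) */
		*count = -1;
		return 1;
	}
	p_b = max_p * (avg - min_th) / (max_th - min_th);
	if (++(*count) == 0) {		/* first arrival after a reset */
		*rnd = rand() & 0xffff;
		return 0;
	}
	/* drop when p_b * count exceeds the stored uniform random value */
	if (p_b * *count * 0x10000 > *rnd) {
		*count = 0;
		*rnd = rand() & 0xffff;
		return 1;
	}
	return 0;
}

int
main(void)
{
	int count = -1, i, drops = 0;
	unsigned rnd = rand() & 0xffff;

	for (i = 0; i < 1000; i++)
		drops += red_should_drop(15.0, 5.0, 30.0, 0.1, &count, &rnd);
	printf("dropped %d of 1000 packets at avg=15\n", drops);
	return 0;
}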
*/ + } + pipe = fs->pipe ; + if (pipe == NULL) { /* must be a queue, try find a matching pipe */ + for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; + pipe = pipe->next) + ; + if (pipe != NULL) + fs->pipe = pipe ; + else { + printf("No pipe %d for queue %d, drop pkt\n", + fs->parent_nr, fs->fs_nr); + goto dropit ; + } + } + q = find_queue(fs, &(fwa->f_id)); + if ( q == NULL ) + goto dropit ; /* cannot allocate queue */ + /* + * update statistics, then check reasons to drop pkt + */ + q->tot_bytes += len ; + q->tot_pkts++ ; + if ( fs->plr && random() < fs->plr ) + goto dropit ; /* random pkt drop */ + if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (q->len_bytes > fs->qsize) + goto dropit ; /* queue size overflow */ + } else { + if (q->len >= fs->qsize) + goto dropit ; /* queue count overflow */ + } + if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) + goto dropit ; + + /* XXX expensive to zero, see if we can remove it*/ + pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_DUMMYNET, M_NOWAIT|M_ZERO); + if ( pkt == NULL ) + goto dropit ; /* cannot allocate packet header */ + /* ok, i can handle the pkt now... */ + /* build and enqueue packet + parameters */ + pkt->hdr.mh_type = MT_TAG; + pkt->hdr.mh_flags = PACKET_TAG_DUMMYNET; + pkt->rule = fwa->rule ; + DN_NEXT(pkt) = NULL; + pkt->dn_m = m; + pkt->dn_dir = dir ; + + pkt->ifp = fwa->oif; + if (dir == DN_TO_IP_OUT) { + /* + * We need to copy *ro because for ICMP pkts (and maybe others) + * the caller passed a pointer into the stack; dst might also be + * a pointer into *ro so it needs to be updated. + */ + pkt->ro = *(fwa->ro); + if (fwa->ro->ro_rt) + fwa->ro->ro_rt->rt_refcnt++ ; + if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ + fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; + + pkt->dn_dst = fwa->dst; + pkt->flags = fwa->flags; + } + if (q->head == NULL) + q->head = pkt; + else + DN_NEXT(q->tail) = pkt; + q->tail = pkt; + q->len++; + q->len_bytes += len ; + + if ( q->head != pkt ) /* flow was not idle, we are done */ + goto done; + /* + * If we reach this point the flow was previously idle, so we need + * to schedule it. This involves different actions for fixed-rate or + * WF2Q queues. + */ + if ( (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) { + /* + * Fixed-rate queue: just insert into the ready_heap. + */ + dn_key t = 0 ; + if (pipe->bandwidth) + t = SET_TICKS(pkt, q, pipe); + q->sched_time = curr_time ; + if (t == 0) /* must process it now */ + ready_event( q ); + else + heap_insert(&ready_heap, curr_time + t , q ); + } else { + /* + * WF2Q. First, compute start time S: if the flow was idle (S=F+1) + * set S to the virtual time V for the controlling pipe, and update + * the sum of weights for the pipe; otherwise, remove flow from + * idle_heap and set S to max(F,V). + * Second, compute finish time F = S + len/weight. + * Third, if pipe was idle, update V=max(S, V). + * Fourth, count one more backlogged flow. + */ + if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */ + q->S = pipe->V ; + pipe->sum += fs->weight ; /* add weight of new queue */ + } else { + heap_extract(&(pipe->idle_heap), q); + q->S = MAX64(q->F, pipe->V ) ; + } + q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight; + + if (pipe->not_eligible_heap.elements == 0 && + pipe->scheduler_heap.elements == 0) + pipe->V = MAX64 ( q->S, pipe->V ); + fs->backlogged++ ; + /* + * Look at eligibility. 
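A side note on the plr test above: the flow_set's plr field is the loss rate scaled so that 2^31-1 means 100% (see ip_dummynet.h below), so random() < plr fires with the configured probability. A trivial sketch of the conversion a configuration tool might perform (hypothetical):

/* Illustrative only: 1% packet loss expressed on dummynet's plr scale. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	double loss = 0.01;			/* 1% packet loss */
	int32_t plr = (int32_t)(loss * 0x7fffffff);

	printf("plr = %d (drop when random() < plr)\n", plr);
	return 0;
}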
A flow is not eligibile if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the scheduler_heap cannot be + * empty). If the flow is not eligible we just store it in the + * not_eligible_heap. Otherwise, we store in the scheduler_heap + * and possibly invoke ready_event_wfq() right now if there is + * leftover credit. + * Note that for all flows in scheduler_heap (SCH), S_i <= V, + * and for all flows in not_eligible_heap (NEH), S_i > V . + * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, + * we only need to look into NEH. + */ + if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ + if (pipe->scheduler_heap.elements == 0) + printf("++ ouch! not eligible but empty scheduler!\n"); + heap_insert(&(pipe->not_eligible_heap), q->S, q); + } else { + heap_insert(&(pipe->scheduler_heap), q->F, q); + if (pipe->numbytes >= 0) { /* pipe is idle */ + if (pipe->scheduler_heap.elements != 1) + printf("*** OUCH! pipe should have been idle!\n"); + DEB(printf("Waking up pipe %d at %d\n", + pipe->pipe_nr, (int)(q->F >> MY_M)); ) + pipe->sched_time = curr_time ; + ready_event_wfq(pipe); + } + } + } +done: + splx(s); + return 0; + +dropit: + splx(s); + if (q) + q->drops++ ; + m_freem(m); + return ENOBUFS ; +} + +/* + * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) + * Doing this would probably save us the initial bzero of dn_pkt + */ +#define DN_FREE_PKT(pkt) { \ + struct dn_pkt *n = pkt ; \ + rt_unref ( n->ro.ro_rt ) ; \ + m_freem(n->dn_m); \ + pkt = DN_NEXT(n) ; \ + free(n, M_DUMMYNET) ; } + +/* + * Dispose all packets and flow_queues on a flow_set. + * If all=1, also remove red lookup table and other storage, + * including the descriptor itself. + * For the one in dn_pipe MUST also cleanup ready_heap... + */ +static void +purge_flow_set(struct dn_flow_set *fs, int all) +{ + struct dn_pkt *pkt ; + struct dn_flow_queue *q, *qn ; + int i ; + + for (i = 0 ; i <= fs->rq_size ; i++ ) { + for (q = fs->rq[i] ; q ; q = qn ) { + for (pkt = q->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + qn = q->next ; + free(q, M_DUMMYNET); + } + fs->rq[i] = NULL ; + } + fs->rq_elements = 0 ; + if (all) { + /* RED - free lookup table */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_DUMMYNET); + if (fs->rq) + free(fs->rq, M_DUMMYNET); + /* if this fs is not part of a pipe, free it */ + if (fs->pipe && fs != &(fs->pipe->fs) ) + free(fs, M_DUMMYNET); + } +} + +/* + * Dispose all packets queued on a pipe (not a flow_set). + * Also free all resources associated to a pipe, which is about + * to be deleted. + */ +static void +purge_pipe(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + purge_flow_set( &(pipe->fs), 1 ); + + for (pkt = pipe->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + + heap_free( &(pipe->scheduler_heap) ); + heap_free( &(pipe->not_eligible_heap) ); + heap_free( &(pipe->idle_heap) ); +} + +/* + * Delete all pipes and heaps returning memory. Must also + * remove references from all ipfw rules to all pipes. + */ +static void +dummynet_flush() +{ + struct dn_pipe *curr_p, *p ; + struct ip_fw *rule ; + struct dn_flow_set *fs, *curr_fs; + int s ; + + s = splimp() ; + + /* remove all references to pipes ...*/ + LIST_FOREACH(rule, &ip_fw_chain_head, next) + rule->pipe_ptr = NULL ; + /* prevent future matches... 
*/ + p = all_pipes ; + all_pipes = NULL ; + fs = all_flow_sets ; + all_flow_sets = NULL ; + /* and free heaps so we don't have unwanted events */ + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); + splx(s) ; + /* + * Now purge all queued pkts and delete all pipes + */ + /* scan and purge all flow_sets. */ + for ( ; fs ; ) { + curr_fs = fs ; + fs = fs->next ; + purge_flow_set(curr_fs, 1); + } + for ( ; p ; ) { + purge_pipe(p); + curr_p = p ; + p = p->next ; + free(curr_p, M_DUMMYNET); + } +} + + +extern struct ip_fw *ip_fw_default_rule ; +static void +dn_rule_delete_fs(struct dn_flow_set *fs, void *r) +{ + int i ; + struct dn_flow_queue *q ; + struct dn_pkt *pkt ; + + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ + for (q = fs->rq[i] ; q ; q = q->next ) + for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->rule == r) + pkt->rule = ip_fw_default_rule ; +} +/* + * when a firewall rule is deleted, scan all queues and remove the flow-id + * from packets matching this rule. + */ +void +dn_rule_delete(void *r) +{ + struct dn_pipe *p ; + struct dn_pkt *pkt ; + struct dn_flow_set *fs ; + + /* + * If the rule references a queue (dn_flow_set), then scan + * the flow set, otherwise scan pipes. Should do either, but doing + * both does not harm. + */ + for ( fs = all_flow_sets ; fs ; fs = fs->next ) + dn_rule_delete_fs(fs, r); + for ( p = all_pipes ; p ; p = p->next ) { + fs = &(p->fs) ; + dn_rule_delete_fs(fs, r); + for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->rule == r) + pkt->rule = ip_fw_default_rule ; + } +} + +/* + * setup RED parameters + */ +static int +config_red(struct dn_flow_set *p, struct dn_flow_set * x) +{ + int i; + + x->w_q = p->w_q; + x->min_th = SCALE(p->min_th); + x->max_th = SCALE(p->max_th); + x->max_p = p->max_p; + + x->c_1 = p->max_p / (p->max_th - p->min_th); + x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); + if (x->flags_fs & DN_IS_GENTLE_RED) { + x->c_3 = (SCALE(1) - p->max_p) / p->max_th; + x->c_4 = (SCALE(1) - 2 * p->max_p); + } + + /* if the lookup table already exist, free and create it again */ + if (x->w_q_lookup) { + free(x->w_q_lookup, M_DUMMYNET); + x->w_q_lookup = NULL ; + } + if (red_lookup_depth == 0) { + printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); + free(x, M_DUMMYNET); + return EINVAL; + } + x->lookup_depth = red_lookup_depth; + x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), + M_DUMMYNET, M_DONTWAIT); + if (x->w_q_lookup == NULL) { + printf("sorry, cannot allocate red lookup table\n"); + free(x, M_DUMMYNET); + return ENOSPC; + } + + /* fill the lookup table with (1 - w_q)^x */ + x->lookup_step = p->lookup_step ; + x->lookup_weight = p->lookup_weight ; + x->w_q_lookup[0] = SCALE(1) - x->w_q; + for (i = 1; i < x->lookup_depth; i++) + x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); + if (red_avg_pkt_size < 1) + red_avg_pkt_size = 512 ; + x->avg_pkt_size = red_avg_pkt_size ; + if (red_max_pkt_size < 1) + red_max_pkt_size = 1500 ; + x->max_pkt_size = red_max_pkt_size ; + return 0 ; +} + +static int +alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) +{ + if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ + int l = pfs->rq_size; + + if (l == 0) + l = dn_hash_size; + if (l < 4) + l = 4; + else if (l > 1024) + l = 1024; + x->rq_size = l; + } else /* one is enough for null mask */ + x->rq_size = 1; + x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), + M_DUMMYNET, M_DONTWAIT | M_ZERO); + if (x->rq == NULL) { 
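The idle-decay lookup table built in config_red() above stores successive powers of (1 - w_q); when a queue has been idle, red_drops() multiplies the stale average by the appropriate entry. A floating-point sketch, assuming lookup_weight equals (1 - w_q) and lookup_step equals the time to send one medium packet, as the comments suggest:

/*
 * Illustrative only: build (1 - w_q)^t and decay a stale average after
 * an idle period, mirroring config_red()/red_drops().
 */
#include <stdio.h>

#define DEPTH 256

int
main(void)
{
	double w_q = 1.0 / 512, table[DEPTH];
	double avg = 20.0;		/* average queue before going idle */
	int i, idle_steps = 100;	/* idle time / lookup_step */

	table[0] = 1.0 - w_q;
	for (i = 1; i < DEPTH; i++)	/* table[i] = (1 - w_q)^(i + 1) */
		table[i] = table[i - 1] * (1.0 - w_q);

	/* decay the average as if idle_steps "medium packets" went by */
	avg = (idle_steps < DEPTH) ? avg * table[idle_steps] : 0.0;
	printf("decayed avg = %f\n", avg);
	return 0;
}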
+ printf("sorry, cannot allocate queue\n"); + return ENOSPC; + } + x->rq_elements = 0; + return 0 ; +} + +static void +set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) +{ + x->flags_fs = src->flags_fs; + x->qsize = src->qsize; + x->plr = src->plr; + x->flow_mask = src->flow_mask; + if (x->flags_fs & DN_QSIZE_IS_BYTES) { + if (x->qsize > 1024*1024) + x->qsize = 1024*1024 ; + } else { + if (x->qsize == 0) + x->qsize = 50 ; + if (x->qsize > 100) + x->qsize = 50 ; + } + /* configuring RED */ + if ( x->flags_fs & DN_IS_RED ) + config_red(src, x) ; /* XXX should check errors */ +} + +/* + * setup pipe or queue parameters. + */ + +static int +config_pipe(struct dn_pipe *p) +{ + int s ; + struct dn_flow_set *pfs = &(p->fs); + + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. + * qsize = slots/bytes + */ + p->delay = ( p->delay * hz ) / 1000 ; + /* We need either a pipe number or a flow_set number */ + if (p->pipe_nr == 0 && pfs->fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && pfs->fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is a pipe */ + struct dn_pipe *x, *a, *b; + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ + x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO); + if (x == NULL) { + printf("ip_dummynet.c: no memory for new pipe\n"); + return ENOSPC; + } + x->pipe_nr = p->pipe_nr; + x->fs.pipe = x ; + /* idle_heap is the only one from which we extract from the middle. + */ + x->idle_heap.size = x->idle_heap.elements = 0 ; + x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); + } else + x = b; + + x->bandwidth = p->bandwidth ; + x->numbytes = 0; /* just in case... */ + bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); + x->ifp = NULL ; /* reset interface ptr */ + x->delay = p->delay ; + set_fs_parms(&(x->fs), pfs); + + + if ( x->fs.rq == NULL ) { /* a new pipe */ + s = alloc_hash(&(x->fs), pfs) ; + if (s) { + free(x, M_DUMMYNET); + return s ; + } + s = splimp() ; + x->next = b ; + if (a == NULL) + all_pipes = x ; + else + a->next = x ; + splx(s); + } + } else { /* config queue */ + struct dn_flow_set *x, *a, *b ; + + /* locate flow_set */ + for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ + if (pfs->parent_nr == 0) /* need link to a pipe */ + return EINVAL ; + x = malloc(sizeof(struct dn_flow_set),M_DUMMYNET,M_DONTWAIT|M_ZERO); + if (x == NULL) { + printf("ip_dummynet.c: no memory for new flow_set\n"); + return ENOSPC; + } + x->fs_nr = pfs->fs_nr; + x->parent_nr = pfs->parent_nr; + x->weight = pfs->weight ; + if (x->weight == 0) + x->weight = 1 ; + else if (x->weight > 100) + x->weight = 100 ; + } else { + /* Change parent pipe not allowed; must delete and recreate */ + if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) + return EINVAL ; + x = b; + } + set_fs_parms(x, pfs); + + if ( x->rq == NULL ) { /* a new flow_set */ + s = alloc_hash(x, pfs) ; + if (s) { + free(x, M_DUMMYNET); + return s ; + } + s = splimp() ; + x->next = b; + if (a == NULL) + all_flow_sets = x; + else + a->next = x; + splx(s); + } + } + return 0 ; +} + +/* + * Helper function to remove from a heap queues which are linked to + * a flow_set about to be deleted. 
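Note that config_pipe() above stores the delay in ticks while dummynet_get() converts it back to milliseconds, so configured delays are quantized to the timer resolution. A small illustration (hz value hypothetical):

/* Illustrative only: ms -> ticks -> ms round trip, as done by the code. */
#include <stdio.h>

int
main(void)
{
	int hz = 100;				/* e.g. HZ = 100 */
	int delay_ms = 15;			/* requested one-way delay */
	int ticks = delay_ms * hz / 1000;	/* as in config_pipe() */
	int back_ms = ticks * 1000 / hz;	/* as in dummynet_get() */

	printf("%d ms -> %d ticks -> reported as %d ms\n",
	    delay_ms, ticks, back_ms);
	return 0;
}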
+ */ +static void +fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) +{ + int i = 0, found = 0 ; + for (; i < h->elements ;) + if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (found) + heapify(h); +} + +/* + * helper function to remove a pipe from a heap (can be there at most once) + */ +static void +pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) +{ + if (h->elements > 0) { + int i = 0 ; + for (i=0; i < h->elements ; i++ ) { + if (h->p[i].object == p) { /* found it */ + h->elements-- ; + h->p[i] = h->p[h->elements] ; + heapify(h); + break ; + } + } + } +} + +/* + * drain all queues. Called in case of severe mbuf shortage. + */ +void +dummynet_drain() +{ + struct dn_flow_set *fs; + struct dn_pipe *p; + struct dn_pkt *pkt; + + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + purge_flow_set(fs, 0); + + for (p = all_pipes; p; p= p->next ) { + purge_flow_set(&(p->fs), 0); + for (pkt = p->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + p->head = p->tail = NULL ; + } +} + +/* + * Fully delete a pipe or a queue, cleaning up associated info. + */ +static int +delete_pipe(struct dn_pipe *p) +{ + int s ; + struct ip_fw *rule ; + + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is an old-style pipe */ + struct dn_pipe *a, *b; + struct dn_flow_set *fs; + + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + if (b == NULL || (b->pipe_nr != p->pipe_nr) ) + return EINVAL ; /* not found */ + + s = splimp() ; + + /* unlink from list of pipes */ + if (a == NULL) + all_pipes = b->next ; + else + a->next = b->next ; + /* remove references to this pipe from the ip_fw rules. */ + LIST_FOREACH(rule, &ip_fw_chain_head, next) + if (rule->pipe_ptr == &(b->fs)) + rule->pipe_ptr = NULL ; + + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + if (fs->pipe == b) { + printf("++ ref to pipe %d from fs %d\n", + p->pipe_nr, fs->fs_nr); + fs->pipe = NULL ; + purge_flow_set(fs, 0); + } + fs_remove_from_heap(&ready_heap, &(b->fs)); + purge_pipe(b); /* remove all data associated to this pipe */ + /* remove reference to here from extract_heap and wfq_ready_heap */ + pipe_remove_from_heap(&extract_heap, b); + pipe_remove_from_heap(&wfq_ready_heap, b); + splx(s); + free(b, M_DUMMYNET); + } else { /* this is a WF2Q queue (dn_flow_set) */ + struct dn_flow_set *a, *b; + + /* locate set */ + for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; + a = b , b = b->next) ; + if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) + return EINVAL ; /* not found */ + + s = splimp() ; + if (a == NULL) + all_flow_sets = b->next ; + else + a->next = b->next ; + /* remove references to this flow_set from the ip_fw rules. */ + LIST_FOREACH(rule, &ip_fw_chain_head, next) + if (rule->pipe_ptr == b) + rule->pipe_ptr = NULL ; + + if (b->pipe != NULL) { + /* Update total weight on parent pipe and cleanup parent heaps */ + b->pipe->sum -= b->weight * b->backlogged ; + fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); + fs_remove_from_heap(&(b->pipe->scheduler_heap), b); +#if 1 /* XXX should i remove from idle_heap as well ? 
*/ + fs_remove_from_heap(&(b->pipe->idle_heap), b); +#endif + } + purge_flow_set(b, 1); + splx(s); + } + return 0 ; +} + +/* + * helper function used to copy data from kernel in DUMMYNET_GET + */ +static char * +dn_copy_set(struct dn_flow_set *set, char *bp) +{ + int i, copied = 0 ; + struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; + + for (i = 0 ; i <= set->rq_size ; i++) + for (q = set->rq[i] ; q ; q = q->next, qp++ ) { + if (q->hash_slot != i) + printf("++ at %d: wrong slot (have %d, " + "should be %d)\n", copied, q->hash_slot, i); + if (q->fs != set) + printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", + i, q->fs, set); + copied++ ; + bcopy(q, qp, sizeof( *q ) ); + /* cleanup pointers */ + qp->next = NULL ; + qp->head = qp->tail = NULL ; + qp->fs = NULL ; + } + if (copied != set->rq_elements) + printf("++ wrong count, have %d should be %d\n", + copied, set->rq_elements); + return (char *)qp ; +} + +static int +dummynet_get(struct sockopt *sopt) +{ + char *buf, *bp ; /* bp is the "copy-pointer" */ + size_t size ; + struct dn_flow_set *set ; + struct dn_pipe *p ; + int s, error=0 ; + + s = splimp(); + /* + * compute size of data structures: list of pipes and flow_sets. + */ + for (p = all_pipes, size = 0 ; p ; p = p->next ) + size += sizeof( *p ) + + p->fs.rq_elements * sizeof(struct dn_flow_queue); + for (set = all_flow_sets ; set ; set = set->next ) + size += sizeof ( *set ) + + set->rq_elements * sizeof(struct dn_flow_queue); + buf = malloc(size, M_TEMP, M_DONTWAIT); + if (buf == 0) { + splx(s); + return ENOBUFS ; + } + for (p = all_pipes, bp = buf ; p ; p = p->next ) { + struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; + + /* + * copy pipe descriptor into *bp, convert delay back to ms, + * then copy the flow_set descriptor(s) one at a time. + * After each flow_set, copy the queue descriptor it owns. + */ + bcopy(p, bp, sizeof( *p ) ); + pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; + /* + * XXX the following is a hack based on ->next being the + * first field in dn_pipe and dn_flow_set. The correct + * solution would be to move the dn_flow_set to the beginning + * of struct dn_pipe. + */ + pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; + /* clean pointers */ + pipe_bp->head = pipe_bp->tail = NULL ; + pipe_bp->fs.next = NULL ; + pipe_bp->fs.pipe = NULL ; + pipe_bp->fs.rq = NULL ; + + bp += sizeof( *p ) ; + bp = dn_copy_set( &(p->fs), bp ); + } + for (set = all_flow_sets ; set ; set = set->next ) { + struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; + bcopy(set, bp, sizeof( *set ) ); + /* XXX same hack as above */ + fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; + fs_bp->pipe = NULL ; + fs_bp->rq = NULL ; + bp += sizeof( *set ) ; + bp = dn_copy_set( set, bp ); + } + splx(s); + error = sooptcopyout(sopt, buf, size); + free(buf, M_TEMP); + return error ; +} + +/* + * Handler for the various dummynet socket options (get, flush, config, del) + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + int error = 0 ; + struct dn_pipe *p, tmp_pipe; + + /* Disallow sets in really-really secure mode. 
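For reference, a hypothetical userland consumer could walk the IP_DUMMYNET_GET buffer assembled above as sketched below; the record layout and the DN_IS_PIPE/DN_IS_QUEUE tags come from dummynet_get() and ip_dummynet.h, while the function name is invented and the getsockopt() call that fills buf, the headers and error handling are omitted:

/*
 * Hypothetical sketch: each pipe record is followed by its dn_flow_queue
 * array, then come the standalone flow_sets, each followed by its queues.
 * Assumes the dummynet structure definitions are in scope.
 */
static void
print_dummynet_config(char *buf, int len)
{
	char *bp = buf, *end = buf + len;

	while (bp < end) {
		struct dn_pipe *p = (struct dn_pipe *)bp;

		if (p->next == (struct dn_pipe *)DN_IS_PIPE) {
			/* pipe record, followed by its flow queues */
			printf("pipe %d: bw %d, delay %d ms, %d flows\n",
			    p->pipe_nr, p->bandwidth, p->delay,
			    p->fs.rq_elements);
			bp += sizeof(*p) +
			    p->fs.rq_elements * sizeof(struct dn_flow_queue);
		} else {
			/* flow_set (queue) record, followed by its flows */
			struct dn_flow_set *fs = (struct dn_flow_set *)bp;

			printf("queue %d: weight %d, pipe %d, %d flows\n",
			    fs->fs_nr, fs->weight, fs->parent_nr,
			    fs->rq_elements);
			bp += sizeof(*fs) +
			    fs->rq_elements * sizeof(struct dn_flow_queue);
		}
	}
}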
*/ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + switch (sopt->sopt_name) { + default : + printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); + return EINVAL ; + + case IP_DUMMYNET_GET : + error = dummynet_get(sopt); + break ; + + case IP_DUMMYNET_FLUSH : + dummynet_flush() ; + break ; + + case IP_DUMMYNET_CONFIGURE : + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + error = config_pipe(p); + break ; + + case IP_DUMMYNET_DEL : /* remove a pipe or queue */ + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + + error = delete_pipe(p); + break ; + } + return error ; +} + +static void +ip_dn_init(void) +{ + printf("DUMMYNET initialized (011031)\n"); + all_pipes = NULL ; + all_flow_sets = NULL ; + ready_heap.size = ready_heap.elements = 0 ; + ready_heap.offset = 0 ; + + wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; + wfq_ready_heap.offset = 0 ; + + extract_heap.size = extract_heap.elements = 0 ; + extract_heap.offset = 0 ; + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + ip_dn_ruledel_ptr = dn_rule_delete; + bzero(&dn_timeout, sizeof(struct callout_handle)); + dn_timeout = timeout(dummynet, NULL, 1); +} + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + int s; + switch (type) { + case MOD_LOAD: + s = splimp(); + if (DUMMYNET_LOADED) { + splx(s); + printf("DUMMYNET already loaded\n"); + return EEXIST ; + } + ip_dn_init(); + splx(s); + break; + + case MOD_UNLOAD: +#if !defined(KLD_MODULE) + printf("dummynet statically compiled, cannot unload\n"); + return EINVAL ; +#else + s = splimp(); + untimeout(dummynet, NULL, dn_timeout); + dummynet_flush(); + ip_dn_ctl_ptr = NULL; + ip_dn_io_ptr = NULL; + ip_dn_ruledel_ptr = NULL; + splx(s); +#endif + break ; + default: + break ; + } + return 0 ; +} + +static moduledata_t dummynet_mod = { + "dummynet", + dummynet_modevent, + NULL +}; +DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_DEPEND(dummynet, ipfw, 1, 1, 1); +MODULE_VERSION(dummynet, 1); diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h new file mode 100644 index 0000000..30c5f6f --- /dev/null +++ b/sys/netinet/ip_dummynet.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of dummynet data structures. In the structures, I decided + * not to use the macros in <sys/queue.h> in the hope of making the code + * easier to port to other architectures. The type of lists and queue we + * use here is pretty simple anyways. + */ + +/* + * We start with a heap, which is used in the scheduler to decide when + * to transmit packets etc. + * + * The key for the heap is used for two different values: + * + * 1. timer ticks- max 10K/second, so 32 bits are enough; + * + * 2. virtual times. These increase in steps of len/x, where len is the + * packet length, and x is either the weight of the flow, or the + * sum of all weights. + * If we limit to max 1000 flows and a max weight of 100, then + * x needs 17 bits. The packet size is 16 bits, so we can easily + * overflow if we do not allow errors. + * So we use a key "dn_key" which is 64 bits. Some macros are used to + * compare key values and handle wraparounds. + * MAX64 returns the largest of two key values. + * MY_M is used as a shift count when doing fixed point arithmetic + * (a better name would be useful...). + */ +typedef u_int64_t dn_key ; /* sorting key */ +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) +#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) +#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#define MY_M 16 /* number of left shift to obtain a larger precision */ + +/* + * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the + * virtual time wraps every 15 days. + */ + +/* + * The OFFSET_OF macro is used to return the offset of a field within + * a structure. It is used by the heap management routines. + */ +#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) + +/* + * A heap entry is made of a key and a pointer to the actual + * object stored in the heap. + * The heap is an array of dn_heap_entry entries, dynamically allocated. + * Current size is "size", with "elements" actually in use. + * The heap normally supports only ordered insert and extract from the top. + * If we want to extract an object from the middle of the heap, we + * have to know where the object itself is located in the heap (or we + * need to scan the whole array). To this purpose, an object has a + * field (int) which contains the index of the object itself into the + * heap. When the object is moved, the field must also be updated. + * The offset of the index in the object is stored in the 'offset' + * field in the heap descriptor. The assumption is that this offset + * is non-zero if we want to support extract from the middle. + */ +struct dn_heap_entry { + dn_key key ; /* sorting key. 
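A small standalone example (not part of the patch) of why the DN_KEY_* macros above compare keys through a signed difference rather than directly: the comparison stays correct even across a 64-bit wraparound of the key.

/*
 * Illustrative only: a key just past the wrap still compares as "greater"
 * than one just before it, because the difference is taken as signed.
 */
#include <stdint.h>
#include <stdio.h>

#define DN_KEY_LT(a, b)	((int64_t)((a) - (b)) < 0)

int
main(void)
{
	uint64_t before = UINT64_MAX - 5;	/* just before wraparound */
	uint64_t after = 3;			/* just after wraparound */

	printf("naive:  before < after is %d\n", before < after);	    /* 0 */
	printf("DN_KEY: before < after is %d\n", DN_KEY_LT(before, after)); /* 1 */
	return 0;
}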
Topmost element is smallest one */ + void *object ; /* object pointer */ +} ; + +struct dn_heap { + int size ; + int elements ; + int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry *p ; /* really an array of "size" entries */ +} ; + +/* + * struct dn_pkt identifies a packet in the dummynet queue, but + * is also used to tag packets passed back to the various destinations + * (ip_input(), ip_output(), bdg_forward() and so on). + * As such the first part of the structure must be a struct m_hdr, + * followed by dummynet-specific parameters. The m_hdr must be + * initialized with + * mh_type = MT_TAG; + * mh_flags = PACKET_TYPE_DUMMYNET; + * mh_next = <pointer to the actual mbuf> + * + * mh_nextpkt, mh_data are free for dummynet use (mh_nextpkt is used to + * build a linked list of packets in a dummynet queue). + */ +struct dn_pkt { + struct m_hdr hdr ; +#define DN_NEXT(x) (struct dn_pkt *)(x)->hdr.mh_nextpkt +#define dn_m hdr.mh_next /* packet to be forwarded */ + + struct ip_fw *rule; /* matching rule */ + int dn_dir; /* action when packet comes out. */ +#define DN_TO_IP_OUT 1 +#define DN_TO_IP_IN 2 +#define DN_TO_BDG_FWD 3 +#define DN_TO_ETH_DEMUX 4 +#define DN_TO_ETH_OUT 5 + + dn_key output_time; /* when the pkt is due for delivery */ + struct ifnet *ifp; /* interface, for ip_output */ + struct sockaddr_in *dn_dst ; + struct route ro; /* route, for ip_output. MUST COPY */ + int flags ; /* flags, for ip_output (IPv6 ?) */ +}; + +/* + * Overall structure of dummynet (with WF2Q+): + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE. + +A QUEUE is just a queue with configurable size and queue management +policy. It is also associated with a mask (to discriminate among +different flows), a weight (used to give different shares of the +bandwidth to different flows) and a "pipe", which essentially +supplies the transmit clock for all queues associated with that +pipe. + +A PIPE emulates a fixed-bandwidth link, whose bandwidth is +configurable. The "clock" for a pipe can come from either an +internal timer, or from the transmit interrupt of an interface. +A pipe is also associated with one (or more, if masks are used) +queue, where all packets for that pipe are stored. + +The bandwidth available on the pipe is shared by the queues +associated with that pipe (only one in case the packet is sent +to a PIPE) according to the WF2Q+ scheduling algorithm and the +configured weights. + +In general, incoming packets are stored in the appropriate queue, +which is then placed into one of a few heaps managed by a scheduler +to decide when the packet should be extracted. +The scheduler (a function called dummynet()) is run at every timer +tick, and grabs queues from the head of the heaps when they are +ready for processing. + +There are three data structures definining a pipe and associated queues: + + + dn_pipe, which contains the main configuration parameters related + to delay and bandwidth; + + dn_flow_set, which contains WF2Q+ configuration, flow + masks, plr and RED configuration; + + dn_flow_queue, which is the per-flow queue (containing the packets) + +Multiple dn_flow_set can be linked to the same pipe, and multiple +dn_flow_queue can be linked to the same dn_flow_set. +All data structures are linked in a linear list which is used for +housekeeping purposes. + +During configuration, we create and initialize the dn_flow_set +and dn_pipe structures (a dn_pipe also contains a dn_flow_set). 
+ +At runtime: packets are sent to the appropriate dn_flow_set (either +WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows), +which in turn dispatches them to the appropriate dn_flow_queue +(created dynamically according to the masks). + +The transmit clock for fixed rate flows (ready_event()) selects the +dn_flow_queue to be used to transmit the next packet. For WF2Q, +wfq_ready_event() extract a pipe which in turn selects the right +flow using a number of heaps defined into the pipe itself. + + * + */ + +/* + * per flow queue. This contains the flow identifier, the queue + * of packets, counters, and parameters used to support both RED and + * WF2Q+. + * + * A dn_flow_queue is created and initialized whenever a packet for + * a new flow arrives. + */ +struct dn_flow_queue { + struct dn_flow_queue *next ; + struct ipfw_flow_id id ; + + struct dn_pkt *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + long numbytes ; /* credit for transmission (dynamic queues) */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time ; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + dn_key sched_time ; /* current time when queue enters ready_heap */ + + dn_key S,F ; /* start time, finish time */ + /* + * Setting F < S means the timestamp is invalid. We only need + * to test this when the queue is empty. + */ +} ; + +/* + * flow_set descriptor. Contains the "template" parameters for the + * queue configuration, and pointers to the hash table of dn_flow_queue's. + * + * The hash table is an array of lists -- we identify the slot by + * hashing the flow-id, then scan the list looking for a match. + * The size of the hash table (buckets) is configurable on a per-queue + * basis. + * + * A dn_flow_set is created whenever a new queue or pipe is created (in the + * latter case, the structure is located inside the struct dn_pipe). 
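To make the masking and hashing concrete, here is a sketch (plain C types, not the real ipfw_flow_id) of how find_queue() in ip_dummynet.c maps a packet to a hash slot of this table:

/*
 * Illustrative only: mask the 5-tuple with the flow_set's flow_mask,
 * then hash it into rq_size buckets using the same mixing as find_queue().
 */
#include <stdint.h>
#include <stdio.h>

struct flow {
	uint32_t dst_ip, src_ip;
	uint16_t dst_port, src_port;
	uint8_t proto;
};

static int
flow_hash(struct flow id, struct flow mask, int rq_size)
{
	int i;

	/* apply the per-flow_set mask, e.g. all-ones dst_ip keeps
	 * one queue per destination host */
	id.dst_ip &= mask.dst_ip;
	id.src_ip &= mask.src_ip;
	id.dst_port &= mask.dst_port;
	id.src_port &= mask.src_port;
	id.proto &= mask.proto;

	i = (id.dst_ip & 0xffff) ^ ((id.dst_ip >> 15) & 0xffff) ^
	    ((id.src_ip << 1) & 0xffff) ^ ((id.src_ip >> 16) & 0xffff) ^
	    (id.dst_port << 1) ^ id.src_port ^ id.proto;
	return i % rq_size;
}

int
main(void)
{
	struct flow mask = { 0xffffffff, 0, 0, 0, 0 };	/* per-dst queues */
	struct flow pkt = { 0x0a000001, 0xc0a80101, 80, 12345, 6 };

	printf("slot %d of 64\n", flow_hash(pkt, mask, 64));
	return 0;
}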
+ */ +struct dn_flow_set { + struct dn_flow_set *next; /* next flow set in all_flow_sets list */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DN_HAVE_FLOW_MASK 0x0001 +#define DN_IS_RED 0x0002 +#define DN_IS_GENTLE_RED 0x0004 +#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 + + struct dn_pipe *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +} ; + +/* + * Pipe descriptor. Contains global parameters, delay-line queue, + * and the flow_set used for fixed-rate queues. + * + * For WF2Q+ support it also has 3 heaps holding dn_flow_queue: + * not_eligible_heap, for queues whose start time is higher + * than the virtual time. Sorted by start time. + * scheduler_heap, for queues eligible for scheduling. Sorted by + * finish time. + * idle_heap, all flows that are idle and can be removed. We + * do that on each tick so we do not slow down too much + * operations during forwarding. + * + */ +struct dn_pipe { /* a pipe */ + struct dn_pipe *next ; + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct dn_pkt *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap not_eligible_heap; /* top extract- key Start time */ + struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ + + dn_key V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + int numbytes; /* bits I can transmit (more or less). */ + + dn_key sched_time ; /* time pipe was scheduled in ready_heap */ + + /* + * When the tx clock come from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured. 
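A one-line reminder of the fixed-point convention used by the SCALE macros above: values carry 16 fractional bits, so a product of two scaled values must be shifted back down, which is what SCALE_MUL() does. Minimal check:

/* Illustrative only: 6 * 0.5 = 3 in 16.16 fixed point. */
#include <stdint.h>
#include <stdio.h>

#define SCALE_RED	16
#define SCALE(x)	((int64_t)(x) << SCALE_RED)
#define SCALE_VAL(x)	((x) >> SCALE_RED)
#define SCALE_MUL(x, y)	(((x) * (y)) >> SCALE_RED)

int
main(void)
{
	int64_t half = SCALE(1) / 2;	/* 0.5 in fixed point */
	int64_t six = SCALE(6);

	printf("%lld\n", (long long)SCALE_VAL(SCALE_MUL(six, half)));
	return 0;
}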
+ */ + char if_name[16]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; + +#ifdef _KERNEL +typedef int ip_dn_ctl_t(struct sockopt *); /* raw_ip.c */ +typedef void ip_dn_ruledel_t(void *); /* ip_fw.c */ +typedef int ip_dn_io_t(struct mbuf *m, int pipe_nr, int dir, + struct ip_fw_args *fwa); +extern ip_dn_ctl_t *ip_dn_ctl_ptr; +extern ip_dn_ruledel_t *ip_dn_ruledel_ptr; +extern ip_dn_io_t *ip_dn_io_ptr; +#define DUMMYNET_LOADED (ip_dn_io_ptr != NULL) +#endif + +#endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_ecn.c b/sys/netinet/ip_ecn.c new file mode 100644 index 0000000..de3d38e --- /dev/null +++ b/sys/netinet/ip_ecn.c @@ -0,0 +1,142 @@ +/* $FreeBSD$ */ +/* $KAME: ip_ecn.c,v 1.11 2001/05/03 16:09:29 itojun Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. + * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt + */ + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/errno.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#include <netinet/ip_ecn.h> +#ifdef INET6 +#include <netinet6/ip6_ecn.h> +#endif + +/* + * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation). + */ +void +ip_ecn_ingress(mode, outer, inner) + int mode; + u_int8_t *outer; + const u_int8_t *inner; +{ + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_ingress"); + + *outer = *inner; + switch (mode) { + case ECN_ALLOWED: /* ECN allowed */ + *outer &= ~IPTOS_CE; + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + *outer &= ~(IPTOS_ECT | IPTOS_CE); + break; + case ECN_NOCARE: /* no consideration to ECN */ + break; + } +} + +/* + * modify inner ECN (TOS) field on egress operation (tunnel decapsulation). 
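A hypothetical caller of ip_ecn_ingress() above, deriving the outer header's TOS during IPv4-in-IPv4 encapsulation; struct ip from <netinet/ip.h> and the ip_ecn.h prototypes are assumed in scope, and the function is a placeholder, not taken from the tree:

/*
 * Hypothetical encapsulation helper: the outer TOS is copied from the
 * inner one, with the ECN bits adjusted per the configured mode
 * (ECN_ALLOWED keeps ECT but clears CE; ECN_FORBIDDEN clears both).
 */
static void
set_outer_tos(struct ip *outer, const struct ip *inner, int ecn_mode)
{
	ip_ecn_ingress(ecn_mode, &outer->ip_tos, &inner->ip_tos);
}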
+ */ +void +ip_ecn_egress(mode, outer, inner) + int mode; + const u_int8_t *outer; + u_int8_t *inner; +{ + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_egress"); + + switch (mode) { + case ECN_ALLOWED: + if (*outer & IPTOS_CE) + *inner |= IPTOS_CE; + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + case ECN_NOCARE: /* no consideration to ECN */ + break; + } +} + +#ifdef INET6 +void +ip6_ecn_ingress(mode, outer, inner) + int mode; + u_int32_t *outer; + const u_int32_t *inner; +{ + u_int8_t outer8, inner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_ingress"); + + outer8 = (ntohl(*outer) >> 20) & 0xff; + inner8 = (ntohl(*inner) >> 20) & 0xff; + ip_ecn_ingress(mode, &outer8, &inner8); + *outer &= ~htonl(0xff << 20); + *outer |= htonl((u_int32_t)outer8 << 20); +} + +void +ip6_ecn_egress(mode, outer, inner) + int mode; + const u_int32_t *outer; + u_int32_t *inner; +{ + u_int8_t outer8, inner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_egress"); + + outer8 = (ntohl(*outer) >> 20) & 0xff; + inner8 = (ntohl(*inner) >> 20) & 0xff; + ip_ecn_egress(mode, &outer8, &inner8); + *inner &= ~htonl(0xff << 20); + *inner |= htonl((u_int32_t)inner8 << 20); +} +#endif diff --git a/sys/netinet/ip_ecn.h b/sys/netinet/ip_ecn.h new file mode 100644 index 0000000..1a38a48 --- /dev/null +++ b/sys/netinet/ip_ecn.h @@ -0,0 +1,49 @@ +/* $FreeBSD$ */ +/* $KAME: ip_ecn.h,v 1.6 2001/05/03 14:51:48 itojun Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. 
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt + */ + +#if defined(_KERNEL) && !defined(_LKM) +#include "opt_inet.h" +#endif + +#define ECN_ALLOWED 1 /* ECN allowed */ +#define ECN_FORBIDDEN 0 /* ECN forbidden */ +#define ECN_NOCARE (-1) /* no consideration to ECN */ + +#ifdef _KERNEL +extern void ip_ecn_ingress(int, u_int8_t *, const u_int8_t *); +extern void ip_ecn_egress(int, const u_int8_t *, u_int8_t *); +#endif diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c new file mode 100644 index 0000000..e12f50a --- /dev/null +++ b/sys/netinet/ip_encap.c @@ -0,0 +1,522 @@ +/* $FreeBSD$ */ +/* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * My grandfather said that there's a devil inside tunnelling technology... + * + * We have surprisingly many protocols that want packets with IP protocol + * #4 or #41. Here's a list of protocols that want protocol #41: + * RFC1933 configured tunnel + * RFC1933 automatic tunnel + * RFC2401 IPsec tunnel + * RFC2473 IPv6 generic packet tunnelling + * RFC2529 6over4 tunnel + * mobile-ip6 (uses RFC2473) + * RFC3056 6to4 tunnel + * isatap tunnel + * Here's a list of protocol that want protocol #4: + * RFC1853 IPv4-in-IPv4 tunnelling + * RFC2003 IPv4 encapsulation within IPv4 + * RFC2344 reverse tunnelling for mobile-ip4 + * RFC2401 IPsec tunnel + * Well, what can I say. They impose different en/decapsulation mechanism + * from each other, so they need separate protocol handler. The only one + * we can easily determine by protocol # is IPsec, which always has + * AH/ESP/IPComp header right after outer IP header. + * + * So, clearly good old protosw does not work for protocol #4 and #41. + * The code will let you match protocol via src/dst address pair. + */ +/* XXX is M_NETADDR correct? 
*/ + +#include "opt_mrouting.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/errno.h> +#include <sys/protosw.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_encap.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/ip6protosw.h> +#endif + +#include <machine/stdarg.h> + +#include <net/net_osdep.h> + +#include <sys/kernel.h> +#include <sys/malloc.h> +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void encap_add(struct encaptab *); +static int mask_match(const struct encaptab *, const struct sockaddr *, + const struct sockaddr *); +static void encap_fillarg(struct mbuf *, const struct encaptab *); + +#ifndef LIST_HEAD_INITIALIZER +/* rely upon BSS initialization */ +LIST_HEAD(, encaptab) encaptab; +#else +LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab); +#endif + +void +encap_init() +{ + static int initialized = 0; + + if (initialized) + return; + initialized++; +#if 0 + /* + * we cannot use LIST_INIT() here, since drivers may want to call + * encap_attach(), on driver attach. encap_init() will be called + * on AF_INET{,6} initialization, which happens after driver + * initialization - using LIST_INIT() here can nuke encap_attach() + * from drivers. + */ + LIST_INIT(&encaptab); +#endif +} + +#ifdef INET +void +encap4_input(m, off) + struct mbuf *m; + int off; +{ + struct ip *ip; + int proto; + struct sockaddr_in s, d; + const struct protosw *psw; + struct encaptab *ep, *match; + int prio, matchprio; + + ip = mtod(m, struct ip *); + proto = ip->ip_p; + + bzero(&s, sizeof(s)); + s.sin_family = AF_INET; + s.sin_len = sizeof(struct sockaddr_in); + s.sin_addr = ip->ip_src; + bzero(&d, sizeof(d)); + d.sin_family = AF_INET; + d.sin_len = sizeof(struct sockaddr_in); + d.sin_addr = ip->ip_dst; + + match = NULL; + matchprio = 0; + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != AF_INET) + continue; + if (ep->proto >= 0 && ep->proto != proto) + continue; + if (ep->func) + prio = (*ep->func)(m, off, proto, ep->arg); + else { + /* + * it's inbound traffic, we need to match in reverse + * order + */ + prio = mask_match(ep, (struct sockaddr *)&d, + (struct sockaddr *)&s); + } + + /* + * We prioritize the matches by using bit length of the + * matches. mask_match() and user-supplied matching function + * should return the bit length of the matches (for example, + * if both src/dst are matched for IPv4, 64 should be returned). + * 0 or negative return value means "it did not match". + * + * The question is, since we have two "mask" portion, we + * cannot really define total order between entries. + * For example, which of these should be preferred? + * mask_match() returns 48 (32 + 16) for both of them. + * src=3ffe::/16, dst=3ffe:501::/32 + * src=3ffe:501::/32, dst=3ffe::/16 + * + * We need to loop through all the possible candidates + * to get the best match - the search takes O(n) for + * n attachments (i.e. interfaces). 
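To illustrate the priority scheme described above: with the 8-bits-per-nonzero-mask-byte estimate used by mask_match(), an exact IPv4 src/dst match scores 64, and more specific attachments always beat less specific ones. A standalone sketch:

/*
 * Illustrative only: match "priority" as the number of mask bits
 * involved, estimated per byte as mask_match() does.
 */
#include <stdio.h>
#include <string.h>

static int
prefix_prio(const unsigned char *srcmask, const unsigned char *dstmask,
    int alen)
{
	int i, prio = 0;

	for (i = 0; i < alen; i++) {
		prio += srcmask[i] ? 8 : 0;	/* same estimate as mask_match */
		prio += dstmask[i] ? 8 : 0;
	}
	return prio;
}

int
main(void)
{
	unsigned char host[4], net24[4], any[4];

	memset(host, 0xff, 4);				/* /32 */
	memset(net24, 0, 4); memset(net24, 0xff, 3);	/* /24 */
	memset(any, 0, 4);				/* /0  */

	/* a host-to-host tunnel beats a host-to-network one */
	printf("host/host:  %d\n", prefix_prio(host, host, 4));	/* 64 */
	printf("host/net24: %d\n", prefix_prio(host, net24, 4));/* 56 */
	printf("host/any:   %d\n", prefix_prio(host, any, 4));	/* 32 */
	return 0;
}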
+ */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + + if (match) { + /* found a match, "match" has the best one */ + psw = match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + (*psw->pr_input)(m, off); + } else + m_freem(m); + return; + } + + /* last resort: inject to raw socket */ + rip_input(m, off); +} +#endif + +#ifdef INET6 +int +encap6_input(mp, offp, proto) + struct mbuf **mp; + int *offp; + int proto; +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6; + struct sockaddr_in6 s, d; + const struct ip6protosw *psw; + struct encaptab *ep, *match; + int prio, matchprio; + + ip6 = mtod(m, struct ip6_hdr *); + + bzero(&s, sizeof(s)); + s.sin6_family = AF_INET6; + s.sin6_len = sizeof(struct sockaddr_in6); + s.sin6_addr = ip6->ip6_src; + bzero(&d, sizeof(d)); + d.sin6_family = AF_INET6; + d.sin6_len = sizeof(struct sockaddr_in6); + d.sin6_addr = ip6->ip6_dst; + + match = NULL; + matchprio = 0; + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != AF_INET6) + continue; + if (ep->proto >= 0 && ep->proto != proto) + continue; + if (ep->func) + prio = (*ep->func)(m, *offp, proto, ep->arg); + else { + /* + * it's inbound traffic, we need to match in reverse + * order + */ + prio = mask_match(ep, (struct sockaddr *)&d, + (struct sockaddr *)&s); + } + + /* see encap4_input() for issues here */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + + if (match) { + /* found a match */ + psw = (const struct ip6protosw *)match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + return (*psw->pr_input)(mp, offp, proto); + } else { + m_freem(m); + return IPPROTO_DONE; + } + } + + /* last resort: inject to raw socket */ + return rip6_input(mp, offp, proto); +} +#endif + +static void +encap_add(ep) + struct encaptab *ep; +{ + + LIST_INSERT_HEAD(&encaptab, ep, chain); +} + +/* + * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. + * length of mask (sm and dm) is assumed to be same as sp/dp. + * Return value will be necessary as input (cookie) for encap_detach(). 
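+ *
+ * Illustrative sketch (not part of the original code): a hypothetical
+ * IPv4-in-IPv4 tunnel driver could pair the two calls roughly as
+ * below.  The protosw (my_tunnel_protosw), softc (my_sc) and cookie
+ * names are made up for the example; only encap_attach() and
+ * encap_detach() are real.
+ *
+ *     static const struct encaptab *my_cookie;
+ *     struct sockaddr_in src, dst, mask;
+ *
+ *     bzero(&src, sizeof(src));
+ *     src.sin_family = AF_INET;
+ *     src.sin_len = sizeof(src);
+ *     src.sin_addr.s_addr = htonl(0xc0a80101);    (192.168.1.1, local)
+ *     dst = src;
+ *     dst.sin_addr.s_addr = htonl(0xc0a80201);    (192.168.2.1, remote)
+ *     mask = src;
+ *     mask.sin_addr.s_addr = htonl(0xffffffff);   (host match)
+ *
+ *     my_cookie = encap_attach(AF_INET, IPPROTO_IPV4,
+ *         (struct sockaddr *)&src, (struct sockaddr *)&mask,
+ *         (struct sockaddr *)&dst, (struct sockaddr *)&mask,
+ *         &my_tunnel_protosw, my_sc);
+ *     ...
+ *     if (my_cookie != NULL)
+ *         (void)encap_detach(my_cookie);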
+ */ +const struct encaptab * +encap_attach(af, proto, sp, sm, dp, dm, psw, arg) + int af; + int proto; + const struct sockaddr *sp, *sm; + const struct sockaddr *dp, *dm; + const struct protosw *psw; + void *arg; +{ + struct encaptab *ep; + int error; + int s; + + s = splnet(); + /* sanity check on args */ + if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) { + error = EINVAL; + goto fail; + } + if (sp->sa_len != dp->sa_len) { + error = EINVAL; + goto fail; + } + if (af != sp->sa_family || af != dp->sa_family) { + error = EINVAL; + goto fail; + } + + /* check if anyone have already attached with exactly same config */ + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != af) + continue; + if (ep->proto != proto) + continue; + if (ep->src.ss_len != sp->sa_len || + bcmp(&ep->src, sp, sp->sa_len) != 0 || + bcmp(&ep->srcmask, sm, sp->sa_len) != 0) + continue; + if (ep->dst.ss_len != dp->sa_len || + bcmp(&ep->dst, dp, dp->sa_len) != 0 || + bcmp(&ep->dstmask, dm, dp->sa_len) != 0) + continue; + + error = EEXIST; + goto fail; + } + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) { + error = ENOBUFS; + goto fail; + } + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + bcopy(sp, &ep->src, sp->sa_len); + bcopy(sm, &ep->srcmask, sp->sa_len); + bcopy(dp, &ep->dst, dp->sa_len); + bcopy(dm, &ep->dstmask, dp->sa_len); + ep->psw = psw; + ep->arg = arg; + + encap_add(ep); + + error = 0; + splx(s); + return ep; + +fail: + splx(s); + return NULL; +} + +const struct encaptab * +encap_attach_func(af, proto, func, psw, arg) + int af; + int proto; + int (*func)(const struct mbuf *, int, int, void *); + const struct protosw *psw; + void *arg; +{ + struct encaptab *ep; + int error; + int s; + + s = splnet(); + /* sanity check on args */ + if (!func) { + error = EINVAL; + goto fail; + } + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) { + error = ENOBUFS; + goto fail; + } + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + ep->func = func; + ep->psw = psw; + ep->arg = arg; + + encap_add(ep); + + error = 0; + splx(s); + return ep; + +fail: + splx(s); + return NULL; +} + +int +encap_detach(cookie) + const struct encaptab *cookie; +{ + const struct encaptab *ep = cookie; + struct encaptab *p; + + LIST_FOREACH(p, &encaptab, chain) { + if (p == ep) { + LIST_REMOVE(p, chain); + free(p, M_NETADDR); /*XXX*/ + return 0; + } + } + + return EINVAL; +} + +static int +mask_match(ep, sp, dp) + const struct encaptab *ep; + const struct sockaddr *sp; + const struct sockaddr *dp; +{ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + return 0; + if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)&ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } + + p = (const u_int8_t *)dp; + q = (const u_int8_t *)&ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 
8 : 0); + } + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && + bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { + return matchlen; + } else + return 0; +} + +static void +encap_fillarg(m, ep) + struct mbuf *m; + const struct encaptab *ep; +{ +#if 0 + m->m_pkthdr.aux = ep->arg; +#else + struct mbuf *n; + + n = m_aux_add(m, AF_INET, IPPROTO_IPV4); + if (n) { + *mtod(n, void **) = ep->arg; + n->m_len = sizeof(void *); + } +#endif +} + +void * +encap_getarg(m) + struct mbuf *m; +{ + void *p; +#if 0 + p = m->m_pkthdr.aux; + m->m_pkthdr.aux = NULL; + return p; +#else + struct mbuf *n; + + p = NULL; + n = m_aux_find(m, AF_INET, IPPROTO_IPV4); + if (n) { + if (n->m_len == sizeof(void *)) + p = *mtod(n, void **); + m_aux_delete(m, n); + } + return p; +#endif +} diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h new file mode 100644 index 0000000..518f502 --- /dev/null +++ b/sys/netinet/ip_encap.h @@ -0,0 +1,64 @@ +/* $FreeBSD$ */ +/* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETINET_IP_ENCAP_H_ +#define _NETINET_IP_ENCAP_H_ + +#ifdef _KERNEL + +struct encaptab { + LIST_ENTRY(encaptab) chain; + int af; + int proto; /* -1: don't care, I'll check myself */ + struct sockaddr_storage src; /* my addr */ + struct sockaddr_storage srcmask; + struct sockaddr_storage dst; /* remote addr */ + struct sockaddr_storage dstmask; + int (*func)(const struct mbuf *, int, int, void *); + const struct protosw *psw; /* only pr_input will be used */ + void *arg; /* passed via m->m_pkthdr.aux */ +}; + +void encap_init(void); +void encap4_input(struct mbuf *, int); +int encap6_input(struct mbuf **, int *, int); +const struct encaptab *encap_attach(int, int, const struct sockaddr *, + const struct sockaddr *, const struct sockaddr *, + const struct sockaddr *, const struct protosw *, void *); +const struct encaptab *encap_attach_func(int, int, + int (*)(const struct mbuf *, int, int, void *), + const struct protosw *, void *); +int encap_detach(const struct encaptab *); +void *encap_getarg(struct mbuf *); +#endif + +#endif /*_NETINET_IP_ENCAP_H_*/ diff --git a/sys/netinet/ip_flow.c b/sys/netinet/ip_flow.c new file mode 100644 index 0000000..a23f2bb --- /dev/null +++ b/sys/netinet/ip_flow.c @@ -0,0 +1,333 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/kernel.h> + +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/ip_flow.h> + +#define IPFLOW_TIMER (5 * PR_SLOWHZ) +#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ +#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) +static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; +static int ipflow_inuse; +#define IPFLOW_MAX 256 + +static int ipflow_active = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, + &ipflow_active, 0, "Enable flow-based IP forwarding"); + +static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); + +static unsigned +ipflow_hash( + struct in_addr dst, + struct in_addr src, + unsigned tos) +{ + unsigned hash = tos; + int idx; + for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) + hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); + return hash & (IPFLOW_HASHSIZE-1); +} + +static struct ipflow * +ipflow_lookup( + const struct ip *ip) +{ + unsigned hash; + struct ipflow *ipf; + + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + + ipf = LIST_FIRST(&ipflows[hash]); + while (ipf != NULL) { + if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr + && ip->ip_src.s_addr == ipf->ipf_src.s_addr + && ip->ip_tos == ipf->ipf_tos) + break; + ipf = LIST_NEXT(ipf, ipf_next); + } + return ipf; +} + +int +ipflow_fastforward( + struct mbuf *m) +{ + struct ip *ip; + struct ipflow *ipf; + struct rtentry *rt; + struct sockaddr *dst; + int error; + + /* + * Are we forwarding packets? Big enough for an IP packet? + */ + if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip)) + return 0; + /* + * IP header with no option and valid version and length + */ + ip = mtod(m, struct ip *); + if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) + || ntohs(ip->ip_len) > m->m_pkthdr.len) + return 0; + /* + * Find a flow. + */ + if ((ipf = ipflow_lookup(ip)) == NULL) + return 0; + + /* + * Route and interface still up? + */ + rt = ipf->ipf_ro.ro_rt; + if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) + return 0; + + /* + * Packet size OK? TTL? + */ + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) + return 0; + + /* + * Everything checks out and so we can forward this packet. + * Modify the TTL and incrementally change the checksum. + */ + ip->ip_ttl -= IPTTLDEC; + if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) { + ip->ip_sum += htons(IPTTLDEC << 8) + 1; + } else { + ip->ip_sum += htons(IPTTLDEC << 8); + } + + /* + * Send the packet on its way. 
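+ * (Aside, not part of the original comment: the TTL/checksum update
+ * above is the usual RFC 1141-style incremental fix-up.  Taking
+ * IPTTLDEC as 1, decrementing the TTL lowers the 16-bit TTL/protocol
+ * word by 0x0100, so the same amount is added back into the
+ * ones-complement checksum; the "+ 1" branch supplies the end-around
+ * carry when the 16-bit addition would wrap.  Viewed on a big-endian
+ * machine, where htons() is the identity:
+ *     ip_sum = 0x1234  ->  0x1234 + 0x0100     = 0x1334
+ *     ip_sum = 0xff00  ->  0xff00 + 0x0100 + 1 = 0x0001  (carry folded)
+ * )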
All we can get back is ENOBUFS + */ + ipf->ipf_uses++; + ipf->ipf_timer = IPFLOW_TIMER; + + if (rt->rt_flags & RTF_GATEWAY) + dst = rt->rt_gateway; + else + dst = &ipf->ipf_ro.ro_dst; + if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) { + if (error == ENOBUFS) + ipf->ipf_dropped++; + else + ipf->ipf_errors++; + } + return 1; +} + +static void +ipflow_addstats( + struct ipflow *ipf) +{ + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; +} + +static void +ipflow_free( + struct ipflow *ipf) +{ + int s; + /* + * Remove the flow from the hash table (at elevated IPL). + * Once it's off the list, we can deal with it at normal + * network IPL. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipflow_inuse--; + free(ipf, M_IPFLOW); +} + +static struct ipflow * +ipflow_reap( + void) +{ + struct ipflow *ipf, *maybe_ipf = NULL; + int idx; + int s; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + /* + * If this no longer points to a valid route + * reclaim it. + */ + if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) + goto done; + /* + * choose the one that's been least recently used + * or has had the least uses in the last 1.5 + * intervals. + */ + if (maybe_ipf == NULL + || ipf->ipf_timer < maybe_ipf->ipf_timer + || (ipf->ipf_timer == maybe_ipf->ipf_timer + && ipf->ipf_last_uses + ipf->ipf_uses < + maybe_ipf->ipf_last_uses + + maybe_ipf->ipf_uses)) + maybe_ipf = ipf; + ipf = LIST_NEXT(ipf, ipf_next); + } + } + ipf = maybe_ipf; + done: + /* + * Remove the entry from the flow table. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + return ipf; +} + +void +ipflow_slowtimo( + void) +{ + struct ipflow *ipf; + int idx; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); + if (--ipf->ipf_timer == 0) { + ipflow_free(ipf); + } else { + ipf->ipf_last_uses = ipf->ipf_uses; + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; + ipf->ipf_uses = 0; + } + ipf = next_ipf; + } + } +} + +void +ipflow_create( + const struct route *ro, + struct mbuf *m) +{ + const struct ip *const ip = mtod(m, struct ip *); + struct ipflow *ipf; + unsigned hash; + int s; + + /* + * Don't create cache entries for ICMP messages. + */ + if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) + return; + /* + * See if an existing flow struct exists. If so remove it from it's + * list and free the old route. If not, try to malloc a new one + * (if we aren't at our limit). + */ + ipf = ipflow_lookup(ip); + if (ipf == NULL) { + if (ipflow_inuse == IPFLOW_MAX) { + ipf = ipflow_reap(); + } else { + ipf = (struct ipflow *) malloc(sizeof(*ipf), M_IPFLOW, + M_NOWAIT); + if (ipf == NULL) + return; + ipflow_inuse++; + } + bzero((caddr_t) ipf, sizeof(*ipf)); + } else { + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipf->ipf_uses = ipf->ipf_last_uses = 0; + ipf->ipf_errors = ipf->ipf_dropped = 0; + } + + /* + * Fill in the updated information. 
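+ * (Aside, not in the original comment: the flow caches the caller's
+ * route and takes its own reference on it, which the RTFREE() calls in
+ * ipflow_free(), ipflow_reap() and the re-use path above balance out:
+ *     ipf->ipf_ro = *ro;            (copy the route)
+ *     ro->ro_rt->rt_refcnt++;       (reference held by the flow)
+ *     ...
+ *     RTFREE(ipf->ipf_ro.ro_rt);    (dropped when the flow is retired)
+ * )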
+ */ + ipf->ipf_ro = *ro; + ro->ro_rt->rt_refcnt++; + ipf->ipf_dst = ip->ip_dst; + ipf->ipf_src = ip->ip_src; + ipf->ipf_tos = ip->ip_tos; + ipf->ipf_timer = IPFLOW_TIMER; + /* + * Insert into the approriate bucket of the flow table. + */ + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + s = splimp(); + LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); + splx(s); +} diff --git a/sys/netinet/ip_flow.h b/sys/netinet/ip_flow.h new file mode 100644 index 0000000..4675996 --- /dev/null +++ b/sys/netinet/ip_flow.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_FLOW_H +#define _NETINET_IP_FLOW_H + +struct ipflow { + LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ + struct in_addr ipf_dst; /* destination address */ + struct in_addr ipf_src; /* source address */ + + u_int8_t ipf_tos; /* type-of-service */ + struct route ipf_ro; /* associated route entry */ + u_long ipf_uses; /* number of uses in this period */ + + int ipf_timer; /* remaining lifetime of this entry */ + u_long ipf_dropped; /* ENOBUFS returned by if_output */ + u_long ipf_errors; /* other errors returned by if_output */ + u_long ipf_last_uses; /* number of uses in last period */ +}; + +#endif diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c new file mode 100644 index 0000000..d7ccad7 --- /dev/null +++ b/sys/netinet/ip_fw.c @@ -0,0 +1,2254 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * Copyright (c) 1996 Alex Nash + * Copyright (c) 2000-2002 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#define DEB(x) +#define DDB(x) x + +/* + * Implement IP packet firewall + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/ucred.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netinet/if_ether.h> /* XXX ethertype_ip */ + +static int fw_debug = 1; +#ifdef IPFIREWALL_VERBOSE +static int fw_verbose = 1; +#else +static int fw_verbose = 0; +#endif +int fw_one_pass = 1 ; +#ifdef IPFIREWALL_VERBOSE_LIMIT +static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#else +static int fw_verbose_limit = 0; +#endif +static int fw_permanent_rules = 0; + +/* + * Right now, two fields in the IP header are changed to host format + * by the IP layer before calling the firewall. Ideally, we would like + * to have them in network format so that the packet can be + * used as it comes from the device driver (and is thus readonly). + */ + +static u_int64_t counter; /* counter for ipfw_report(NULL...) 
*/ + +#define IPFW_DEFAULT_RULE ((u_int)(u_short)~0) + +LIST_HEAD (ip_fw_head, ip_fw) ip_fw_chain_head; + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW, + &fw_enable, 0, "Enable ipfw"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW, + &fw_one_pass, 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, + &fw_debug, 0, "Enable printing of debug ip_fw statements"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW, + &fw_verbose, 0, "Log matches to ipfw rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, + &fw_verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, permanent_rules, CTLFLAG_RW, + &fw_permanent_rules, 0, "Set rule number, below which rules are permanent"); + +/* + * Extension for stateful ipfw. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, it is first hashed, then matched + * against the entries in the corresponding list. + * Matching occurs according to the rule type. The default is to + * match the four fields and the protocol, and rules are bidirectional. + * + * For a busy proxy/web server we will have lots of connections to + * the server. We could decide for a rule type where we ignore + * ports (different hashing) and avoid special SYN/RST/FIN handling. + * + * XXX when we decide to support more than one rule type, we should + * repeat the hashing multiple times uing only the useful fields. + * Or, we could run the various tests in parallel, because the + * 'move to front' technique should shorten the average search. + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rules holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. + * XXX check the latter!!! + */ +static struct ipfw_dyn_rule **ipfw_dyn_v = NULL ; +static u_int32_t dyn_buckets = 256 ; /* must be power of 2 */ +static u_int32_t curr_dyn_buckets = 256 ; /* must be power of 2 */ + +/* + * timeouts for various events in handing dynamic rules. 
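+ * (Illustrative note, not part of the original comment: each value is
+ * in seconds; the TCP and UDP lifetimes are applied by
+ * lookup_dyn_rule() below according to the state of the session --
+ * syn for half-open, ack once established, fin/rst while closing, udp
+ * and "short" for everything else.  They are exported under
+ * net.inet.ip.fw, so a sketch of tuning one from user land would be:
+ *
+ *     #include <sys/types.h>
+ *     #include <sys/sysctl.h>
+ *
+ *     int v = 3600;    (one hour for established sessions)
+ *     sysctlbyname("net.inet.ip.fw.dyn_ack_lifetime", NULL, NULL,
+ *         &v, sizeof(v));
+ * )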
+ */ +static u_int32_t dyn_ack_lifetime = 300 ; +static u_int32_t dyn_syn_lifetime = 20 ; +static u_int32_t dyn_fin_lifetime = 1 ; +static u_int32_t dyn_rst_lifetime = 1 ; +static u_int32_t dyn_udp_lifetime = 10 ; +static u_int32_t dyn_short_lifetime = 5 ; + +/* + * after reaching 0, dynamic rules are considered still valid for + * an additional grace time, unless there is lack of resources. + */ +static u_int32_t dyn_grace_time = 10 ; + +static u_int32_t static_count = 0 ; /* # of static rules */ +static u_int32_t dyn_count = 0 ; /* # of dynamic rules */ +static u_int32_t dyn_max = 1000 ; /* max # of dynamic rules */ + +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, + &dyn_buckets, 0, "Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, + &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, + &dyn_count, 0, "Number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, + &dyn_max, 0, "Max number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, + &static_count, 0, "Number of static rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, + &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, + &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, + &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, + &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, + &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, + &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_grace_time, CTLFLAG_RD, + &dyn_grace_time, 0, "Grace time for dyn. rules"); + +#endif /* SYSCTL_NODE */ + +#define dprintf(a) do { \ + if (fw_debug) \ + printf a; \ + } while (0) +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 + +static int add_entry (struct ip_fw_head *chainptr, struct ip_fw *frwl); +static int del_entry (struct ip_fw_head *chainptr, u_short number); +static int zero_entry (struct ip_fw *, int); +static int check_ipfw_struct (struct ip_fw *m); +static int iface_match (struct ifnet *ifp, union ip_fw_if *ifu, + int byname); +static int ipopts_match (struct ip *ip, struct ip_fw *f); +static int iptos_match (struct ip *ip, struct ip_fw *f); +static __inline int + port_match (u_short *portptr, int nports, u_short port, + int range_flag, int mask); +static int tcpflg_match (struct tcphdr *tcp, struct ip_fw *f); +static int icmptype_match (struct icmp * icmp, struct ip_fw * f); +static void ipfw_report (struct ip_fw *f, struct ip *ip, int ip_off, + int ip_len, struct ifnet *rif, + struct ifnet *oif); + +static void flush_rule_ptrs(void); + +static ip_fw_chk_t ip_fw_chk; +static int ip_fw_ctl (struct sockopt *sopt); + +ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL; + +static char err_prefix[] = "ip_fw_ctl:"; + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ +static __inline int +port_match(u_short *portptr, int nports, u_short port, int range_flag, int mask) +{ + if (!nports) + return 1; + if (mask) { + if ( 0 == ((portptr[0] ^ port) & portptr[1]) ) + return 1; + nports -= 2; + portptr += 2; + } + if (range_flag) { + if (portptr[0] <= port && port <= portptr[1]) + return 1; + nports -= 2; + portptr += 2; + } + while (nports-- > 0) + if (*portptr++ == port) + return 1; + return 0; +} + +static int +tcpflg_match(struct tcphdr *tcp, struct ip_fw *f) +{ + u_char flg_set, flg_clr; + + /* + * If an established connection is required, reject packets that + * have only SYN of RST|ACK|SYN set. Otherwise, fall through to + * other flag requirements. 
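+ *
+ * (Worked example, not part of the original comment: fw_tcpf holds the
+ * flags that must all be set, fw_tcpnf the flags that must all be
+ * clear.  A hypothetical "setup"-style rule carrying
+ *     f->fw_tcpf  = TH_SYN;
+ *     f->fw_tcpnf = TH_ACK;
+ * matches a bare SYN, while SYN|ACK fails the fw_tcpnf test below and
+ * a bare ACK fails the fw_tcpf test.)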
+ */ + if ((f->fw_ipflg & IP_FW_IF_TCPEST) && + ((tcp->th_flags & (TH_RST | TH_ACK | TH_SYN)) == TH_SYN)) + return 0; + + flg_set = tcp->th_flags & f->fw_tcpf; + flg_clr = tcp->th_flags & f->fw_tcpnf; + + if (flg_set != f->fw_tcpf) + return 0; + if (flg_clr) + return 0; + + return 1; +} + +static int +icmptype_match(struct icmp *icmp, struct ip_fw *f) +{ + int type; + + if (!(f->fw_flg & IP_FW_F_ICMPBIT)) + return(1); + + type = icmp->icmp_type; + + /* check for matching type in the bitmap */ + if (type < IP_FW_ICMPTYPES_MAX && + (f->fw_uar.fw_icmptypes[type / (sizeof(unsigned) * NBBY)] & + (1U << (type % (sizeof(unsigned) * NBBY))))) + return(1); + + return(0); /* no match */ +} + +static int +is_icmp_query(struct ip *ip) +{ + const struct icmp *icmp; + int icmp_type; + + icmp = (struct icmp *)((u_int32_t *)ip + ip->ip_hl); + icmp_type = icmp->icmp_type; + + if (icmp_type == ICMP_ECHO || icmp_type == ICMP_ROUTERSOLICIT || + icmp_type == ICMP_TSTAMP || icmp_type == ICMP_IREQ || + icmp_type == ICMP_MASKREQ) + return(1); + + return(0); +} + +static int +ipopts_match(struct ip *ip, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + opts = f->fw_ipopt; + nopts = nopts_sve = f->fw_ipnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + return 0; /*XXX*/ + } + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + opts &= ~IP_FW_IPOPT_LSRR; + nopts &= ~IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + opts &= ~IP_FW_IPOPT_SSRR; + nopts &= ~IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + opts &= ~IP_FW_IPOPT_RR; + nopts &= ~IP_FW_IPOPT_RR; + break; + case IPOPT_TS: + opts &= ~IP_FW_IPOPT_TS; + nopts &= ~IP_FW_IPOPT_TS; + break; + } + if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static int +iptos_match(struct ip *ip, struct ip_fw *f) +{ + + u_int flags = (ip->ip_tos & 0x1f); + u_char opts, nopts, nopts_sve; + + opts = (f->fw_iptos & 0x1f); + nopts = nopts_sve = f->fw_ipntos; + + while (flags != 0) { + u_int flag; + + flag = 1 << (ffs(flags) -1); + opts &= ~flag; + nopts &= ~flag; + flags &= ~flag; + } + + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; + +} + + +static int +tcpopts_match(struct tcphdr *tcp, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(tcp + 1); + cnt = (tcp->th_off << 2) - sizeof (struct tcphdr); + opts = f->fw_tcpopt; + nopts = nopts_sve = f->fw_tcpnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + opts &= ~IP_FW_TCPOPT_MSS; + nopts &= ~IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + opts &= ~IP_FW_TCPOPT_WINDOW; + nopts &= ~IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + opts &= ~IP_FW_TCPOPT_SACK; + nopts &= ~IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + opts &= ~IP_FW_TCPOPT_TS; + nopts &= ~IP_FW_TCPOPT_TS; + break; + + case TCPOPT_CC: + case TCPOPT_CCNEW: + case TCPOPT_CCECHO: + opts &= ~IP_FW_TCPOPT_CC; + nopts &= ~IP_FW_TCPOPT_CC; + break; + } + 
if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static int +iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname) +{ + /* Check by name or by IP address */ + if (byname) { + /* Check unit number (-1 is wildcard) */ + if (ifu->fu_via_if.unit != -1 + && ifp->if_unit != ifu->fu_via_if.unit) + return(0); + /* Check name */ + if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN)) + return(0); + return(1); + } else if (ifu->fu_via_ip.s_addr != 0) { /* Zero == wildcard */ + struct ifaddr *ia; + + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (ifu->fu_via_ip.s_addr != ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) + continue; + return(1); + } + return(0); + } + return(1); +} + +static void +ipfw_report(struct ip_fw *f, struct ip *ip, int ip_off, int ip_len, + struct ifnet *rif, struct ifnet *oif) +{ + struct tcphdr *const tcp = (struct tcphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct udphdr *const udp = (struct udphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct icmp *const icmp = (struct icmp *) ((u_int32_t *) ip + ip->ip_hl); + u_int64_t count; + char *action; + char action2[32], proto[47], name[18], fragment[27]; + int len; + int offset = ip_off & IP_OFFMASK; + + count = f ? f->fw_pcnt : ++counter; + if ((f == NULL && fw_verbose_limit != 0 && count > fw_verbose_limit) || + (f && f->fw_logamount != 0 && count > f->fw_loghighest)) + return; + + /* Print command name */ + snprintf(SNPARGS(name, 0), "ipfw: %d", f ? f->fw_number : -1); + + action = action2; + if (!f) + action = "Refuse"; + else { + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_DENY: + action = "Deny"; + break; + case IP_FW_F_REJECT: + if (f->fw_reject_code == IP_FW_REJECT_RST) + action = "Reset"; + else + action = "Unreach"; + break; + case IP_FW_F_ACCEPT: + action = "Accept"; + break; + case IP_FW_F_COUNT: + action = "Count"; + break; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + f->fw_divert_port); + break; + case IP_FW_F_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + f->fw_divert_port); + break; +#endif + case IP_FW_F_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + f->fw_skipto_rule); + break; + case IP_FW_F_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + f->fw_skipto_rule); + break; + case IP_FW_F_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + f->fw_skipto_rule); + break; + + case IP_FW_F_FWD: + if (f->fw_fwd_ip.sin_port) + snprintf(SNPARGS(action2, 0), + "Forward to %s:%d", + inet_ntoa(f->fw_fwd_ip.sin_addr), + f->fw_fwd_ip.sin_port); + else + snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(f->fw_fwd_ip.sin_addr)); + break; + + default: + action = "UNKNOWN"; + break; + } + } + + switch (ip->ip_p) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", + inet_ntoa(ip->ip_src)); + if (offset == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(tcp->th_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_dst)); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(tcp->th_dport)); + break; + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", + inet_ntoa(ip->ip_src)); + if (offset == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(udp->uh_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + 
inet_ntoa(ip->ip_dst)); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(udp->uh_dport)); + break; + case IPPROTO_ICMP: + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_src)); + snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); + break; + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p, + inet_ntoa(ip->ip_src)); + snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); + break; + } + + if (ip_off & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), + offset << 3, + (ip_off & IP_MF) ? "+" : ""); + else + fragment[0] = '\0'; + if (oif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s out via %s%d%s\n", + name, action, proto, oif->if_name, oif->if_unit, fragment); + else if (rif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s in via %s%d%s\n", name, + action, proto, rif->if_name, rif->if_unit, fragment); + else + log(LOG_SECURITY | LOG_INFO, "%s %s %s%s\n", name, action, + proto, fragment); + if ((f ? f->fw_logamount != 0 : 1) && + count == (f ? f->fw_loghighest : fw_verbose_limit)) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + f ? f->fw_logamount : fw_verbose_limit, + f ? f->fw_number : -1); +} + +static __inline int +hash_packet(struct ipfw_flow_id *id) +{ + u_int32_t i ; + + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (curr_dyn_buckets - 1) ; + return i ; +} + +/** + * unlink a dynamic rule from a chain. prev is a pointer to + * the previous one, q is a pointer to the rule to delete, + * head is a pointer to the head of the queue. + * Modifies q and potentially also head. + */ +#define UNLINK_DYN_RULE(prev, head, q) { \ + struct ipfw_dyn_rule *old_q = q; \ + \ + /* remove a refcount to the parent */ \ + if (q->dyn_type == DYN_LIMIT) \ + q->parent->count--; \ + DEB(printf("-- unlink entry 0x%08x %d -> 0x%08x %d, %d left\n", \ + (q->id.src_ip), (q->id.src_port), \ + (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \ + if (prev != NULL) \ + prev->next = q = q->next ; \ + else \ + ipfw_dyn_v[i] = q = q->next ; \ + dyn_count-- ; \ + free(old_q, M_IPFW); } + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) +/** + * Remove all dynamic rules pointing to a given rule, or all + * rules if rule == NULL. Second parameter is 1 if we want to + * delete unconditionally, otherwise only expired rules are removed. + */ +static void +remove_dyn_rule(struct ip_fw *rule, int force) +{ + struct ipfw_dyn_rule *prev, *q; + int i, pass, max_pass ; + static u_int32_t last_remove = 0 ; + + if (ipfw_dyn_v == NULL || dyn_count == 0) + return ; + /* do not expire more than once per second, it is useless */ + if (force == 0 && last_remove == time_second) + return ; + last_remove = time_second ; + + /* + * because DYN_LIMIT refer to parent rules, during the first pass only + * remove child and mark any pending LIMIT_PARENT, and remove + * them in a second pass. + */ + for (pass = max_pass = 0; pass <= max_pass ; pass++ ) { + for (i = 0 ; i < curr_dyn_buckets ; i++) { + for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { + /* + * logic can become complex here, so we split tests. + * First, test if we match any rule, + * then make sure the rule is expired or we want to kill it, + * and possibly more in the future. 
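+ * (Aside, not part of the original comment: the expiry test uses the
+ * TIME_LEQ() macro defined above, which subtracts before comparing and
+ * casts to int, in the style of the TCP sequence-number macros, so the
+ * comparison stays well behaved even if the counter ever wraps.
+ * For example, with a 32-bit int:
+ *     TIME_LEQ(0xfffffffeU, 3U)  ->  (int)0xfffffffb <= 0  ->  true
+ * i.e. 0xfffffffe is treated as "not later than" 3.)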
+ */ + int zap = ( rule == NULL || rule == q->rule); + if (zap) + zap = force || TIME_LEQ( q->expire , time_second ); + /* do not zap parent in first pass, record we need a second pass */ + if (q->dyn_type == DYN_LIMIT_PARENT) { + max_pass = 1; /* we need a second pass */ + if (zap == 1 && (pass == 0 || q->count != 0) ) { + zap = 0 ; + if (pass == 1) /* should not happen */ + printf("OUCH! cannot remove rule, count %d\n", + q->count); + } + } + if (zap) { + UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + } else { + prev = q ; + q = q->next ; + } + } + } + } +} + +#define EXPIRE_DYN_CHAIN(rule) remove_dyn_rule(rule, 0 /* expired ones */) +#define EXPIRE_DYN_CHAINS() remove_dyn_rule(NULL, 0 /* expired ones */) +#define DELETE_DYN_CHAIN(rule) remove_dyn_rule(rule, 1 /* force removal */) +#define DELETE_DYN_CHAINS() remove_dyn_rule(NULL, 1 /* force removal */) + +/** + * lookup a dynamic rule. + */ +static struct ipfw_dyn_rule * +lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction) +{ + /* + * stateful ipfw extensions. + * Lookup into dynamic session queue + */ + struct ipfw_dyn_rule *prev, *q ; + int i, dir = 0; +#define MATCH_FORWARD 1 + + if (ipfw_dyn_v == NULL) + return NULL ; + i = hash_packet( pkt ); + for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { + if (q->dyn_type == DYN_LIMIT_PARENT) + goto next; + if (TIME_LEQ( q->expire , time_second ) ) { /* expire entry */ + UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q); + continue; + } + if ( pkt->proto == q->id.proto) { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD ; + goto found ; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = 0 ; /* reverse match */ + goto found ; + } + } +next: + prev = q ; + q = q->next ; + } + return NULL ; /* clearly not found */ +found: + if ( prev != NULL) { /* found and not in front */ + prev->next = q->next ; + q->next = ipfw_dyn_v[i] ; + ipfw_dyn_v[i] = q ; + } + if (pkt->proto == IPPROTO_TCP) { + /* update state according to flags */ + u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); + q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); + switch (q->state) { + case TH_SYN : + /* opening */ + q->expire = time_second + dyn_syn_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) : + /* move to established */ + q->expire = time_second + dyn_ack_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) | TH_FIN : + case TH_SYN | (TH_SYN << 8) | (TH_FIN << 8) : + /* one side tries to close */ + q->expire = time_second + dyn_ack_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) | TH_FIN | (TH_FIN << 8) : + /* both sides closed */ + q->expire = time_second + dyn_fin_lifetime ; + break ; + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. + */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + q->expire = time_second + dyn_rst_lifetime ; + break ; + } + } else if (pkt->proto == IPPROTO_UDP) { + q->expire = time_second + dyn_udp_lifetime ; + } else { + /* other protocols */ + q->expire = time_second + dyn_short_lifetime ; + } + if (match_direction) + *match_direction = dir ; + return q ; +} + +/** + * Install state of type 'type' for a dynamic session. 
+ * The hash table contains two type of rules: + * - regular rules (DYN_KEEP_STATE) + * - rules for sessions with limited number of sess per user + * (DYN_LIMIT). When they are created, the parent is + * increased by 1, and decreased on delete. In this case, + * the third parameter is the parent rule and not the chain. + * - "parent" rules for the above (DYN_LIMIT_PARENT). + */ + +static struct ipfw_dyn_rule * +add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) +{ + struct ipfw_dyn_rule *r ; + + int i ; + if (ipfw_dyn_v == NULL || + (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { + /* try reallocation, make sure we have a power of 2 */ + u_int32_t i = dyn_buckets ; + while ( i > 0 && (i & 1) == 0 ) + i >>= 1 ; + if (i != 1) /* not a power of 2 */ + dyn_buckets = curr_dyn_buckets ; /* reset */ + else { + curr_dyn_buckets = dyn_buckets ; + if (ipfw_dyn_v != NULL) + free(ipfw_dyn_v, M_IPFW); + ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof r, + M_IPFW, M_DONTWAIT | M_ZERO); + if (ipfw_dyn_v == NULL) + return NULL; /* failed ! */ + } + } + i = hash_packet(id); + + r = malloc(sizeof *r, M_IPFW, M_DONTWAIT | M_ZERO); + if (r == NULL) { + printf ("sorry cannot allocate state\n"); + return NULL ; + } + + /* increase refcount on parent, and set pointer */ + if (dyn_type == DYN_LIMIT) { + struct ipfw_dyn_rule *parent = (struct ipfw_dyn_rule *)rule; + if ( parent->dyn_type != DYN_LIMIT_PARENT) + panic("invalid parent"); + parent->count++ ; + r->parent = parent ; + rule = parent->rule; + } + + r->id = *id ; + r->expire = time_second + dyn_syn_lifetime ; + r->rule = rule ; + r->dyn_type = dyn_type ; + r->pcnt = r->bcnt = 0 ; + r->count = 0 ; + + r->bucket = i ; + r->next = ipfw_dyn_v[i] ; + ipfw_dyn_v[i] = r ; + dyn_count++ ; + DEB(printf("-- add entry 0x%08x %d -> 0x%08x %d, total %d\n", + (r->id.src_ip), (r->id.src_port), + (r->id.dst_ip), (r->id.dst_port), + dyn_count ); ) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. + */ +static struct ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) +{ + struct ipfw_dyn_rule *q; + int i; + + if (ipfw_dyn_v) { + i = hash_packet( pkt ); + for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next) + if (q->dyn_type == DYN_LIMIT_PARENT && rule == q->rule && + pkt->proto == q->id.proto && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port) { + q->expire = time_second + dyn_short_lifetime ; + DEB(printf("lookup_dyn_parent found 0x%p\n", q);) + return q; + } + } + return add_dyn_rule(pkt, DYN_LIMIT_PARENT, rule); +} + +/* + * Install dynamic state. + * There are different types of dynamic rules which can be installed. + * The type is in rule->dyn_type. + * Type 0 (default) is a bidirectional rule + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. 
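+ *
+ * Illustrative note (not part of the original comment): the caller is
+ * expected to treat a non-zero return as "drop the packet", e.g.
+ *     if (q == NULL && (f->fw_flg & IP_FW_F_KEEP_S)) {
+ *         if (install_state(f, args))
+ *             goto dropit;
+ *     }
+ * which is how ip_fw_chk() uses it further down.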
+ */ +static int +install_state(struct ip_fw *rule, struct ip_fw_args *args) +{ + struct ipfw_dyn_rule *q ; + static int last_log ; + + u_int8_t type = rule->dyn_type ; + + DEB(printf("-- install state type %d 0x%08x %u -> 0x%08x %u\n", + type, + (args->f_id.src_ip), (args->f_id.src_port), + (args->f_id.dst_ip), (args->f_id.dst_port) );) + + q = lookup_dyn_rule(&args->f_id, NULL) ; + if (q != NULL) { /* should never occur */ + if (last_log != time_second) { + last_log = time_second ; + printf(" entry already present, done\n"); + } + return 0 ; + } + if (dyn_count >= dyn_max) /* try remove old ones... */ + EXPIRE_DYN_CHAINS(); + if (dyn_count >= dyn_max) { + if (last_log != time_second) { + last_log = time_second ; + printf(" Too many dynamic rules, sorry\n"); + } + return 1; /* cannot install, notify caller */ + } + + switch (type) { + case DYN_KEEP_STATE: /* bidir rule */ + add_dyn_rule(&args->f_id, DYN_KEEP_STATE, rule); + break ; + case DYN_LIMIT: /* limit number of sessions */ + { + u_int16_t limit_mask = rule->limit_mask ; + u_int16_t conn_limit = rule->conn_limit ; + struct ipfw_flow_id id; + struct ipfw_dyn_rule *parent; + + DEB(printf("installing dyn-limit rule %d\n", conn_limit);) + + id.dst_ip = id.src_ip = 0; + id.dst_port = id.src_port = 0 ; + id.proto = args->f_id.proto ; + + if (limit_mask & DYN_SRC_ADDR) + id.src_ip = args->f_id.src_ip; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip = args->f_id.dst_ip; + if (limit_mask & DYN_SRC_PORT) + id.src_port = args->f_id.src_port; + if (limit_mask & DYN_DST_PORT) + id.dst_port = args->f_id.dst_port; + parent = lookup_dyn_parent(&id, rule); + if (parent == NULL) { + printf("add parent failed\n"); + return 1; + } + if (parent->count >= conn_limit) { + EXPIRE_DYN_CHAIN(rule); /* try to expire some */ + if (parent->count >= conn_limit) { + printf("drop session, too many entries\n"); + return 1; + } + } + add_dyn_rule(&args->f_id, DYN_LIMIT, (struct ip_fw *)parent); + } + break ; + default: + printf("unknown dynamic rule type %u\n", type); + return 1 ; + } + lookup_dyn_rule(&args->f_id, NULL) ; /* XXX just set the lifetime */ + return 0; +} + +/* + * given an ip_fw *, lookup_next_rule will return a pointer + * of the same type to the next one. This can be either the jump + * target (for skipto instructions) or the next one in the list (in + * all other cases including a missing jump target). + * Backward jumps are not allowed, so start looking from the next + * rule... + */ +static struct ip_fw * lookup_next_rule(struct ip_fw *me); + +static struct ip_fw * +lookup_next_rule(struct ip_fw *me) +{ + struct ip_fw *rule ; + int rulenum = me->fw_skipto_rule ; /* guess... */ + + if ( (me->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_SKIPTO ) + for (rule = LIST_NEXT(me,next); rule ; rule = LIST_NEXT(rule,next)) + if (rule->fw_number >= rulenum) + return rule ; + return LIST_NEXT(me,next) ; /* failure or not a skipto */ +} + +/* + * Parameters: + * + * *m The packet; we set to NULL when/if we nuke it. + * oif Outgoing interface, or NULL if packet is incoming + * *cookie Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * Special case: cookie == NULL on input for bridging. + * *flow_id pointer to the last matching rule (in/out) + * *next_hop socket we are forwarding to (in/out). + * For bridged packets, this is a pointer to the MAC header. + * + * Return value: + * + * IP_FW_PORT_DENY_FLAG the packet must be dropped. 
+ * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port, with these caveats: + * + * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead + * of diverting it (ie, 'ipfw tee'). + * + * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower + * 16 bits as a dummynet pipe number instead of diverting + */ + +static int +ip_fw_chk(struct ip_fw_args *args) +#if 0 /* the old interface was this: */ + struct mbuf **m, struct ifnet *oif, u_int16_t *cookie, + struct ip_fw **flow_id, struct sockaddr_in **next_hop) +#endif +{ + /* + * grab things into variables to minimize diffs. + * XXX this has to be cleaned up later. + */ + struct mbuf **m = &(args->m); + struct ifnet *oif = args->oif; + u_int16_t *cookie = &(args->divert_rule); + struct ip_fw **flow_id = &(args->rule); + struct sockaddr_in **next_hop = &(args->next_hop); + + struct ip_fw *f = NULL; /* matching rule */ + struct ip *ip = mtod(*m, struct ip *); + struct ifnet *const rif = (*m)->m_pkthdr.rcvif; + struct ifnet *tif; + u_int hlen = ip->ip_hl << 2; + struct ether_header * eh = NULL; + + u_short ip_off=0, offset = 0 ; + /* local copy of addresses for faster matching */ + u_short src_port = 0, dst_port = 0; + struct in_addr src_ip, dst_ip; + u_int8_t proto= 0, flags = 0; + + u_int16_t skipto; + u_int16_t ip_len=0; + + int dyn_checked = 0 ; /* set after dyn.rules have been checked. */ + int direction = MATCH_FORWARD ; /* dirty trick... */ + struct ipfw_dyn_rule *q = NULL ; + +#define BRIDGED (args->eh != NULL) + if (BRIDGED) { /* this is a bridged packet */ + eh = args->eh; + if ( (*m)->m_pkthdr.len >= sizeof(struct ip) && + ntohs(eh->ether_type) == ETHERTYPE_IP) + hlen = ip->ip_hl << 2; + } else + hlen = ip->ip_hl << 2; + + /* Grab and reset cookie */ + skipto = *cookie; + *cookie = 0; + + /* + * Collect parameters into local variables for faster matching. + */ + if (hlen > 0) { /* this is an IP packet */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + if (BRIDGED) { /* bridged packets are as on the wire */ + ip_off = ntohs(ip->ip_off); + ip_len = ntohs(ip->ip_len); + } else { + ip_off = ip->ip_off; + ip_len = ip->ip_len; + } + offset = ip_off & IP_OFFMASK; + if (offset == 0) { + +#define PULLUP_TO(len) \ + do { \ + if ((*m)->m_len < (len)) { \ + *m = m_pullup(*m, (len)); \ + if (*m == 0) \ + goto bogusfrag; \ + ip = mtod(*m, struct ip *); \ + } \ + } while (0) + + switch (proto) { + case IPPROTO_TCP : { + struct tcphdr *tcp; + + PULLUP_TO(hlen + sizeof(struct tcphdr)); + tcp =(struct tcphdr *)((u_int32_t *)ip + ip->ip_hl); + dst_port = tcp->th_dport ; + src_port = tcp->th_sport ; + flags = tcp->th_flags ; + } + break ; + + case IPPROTO_UDP : { + struct udphdr *udp; + + PULLUP_TO(hlen + sizeof(struct udphdr)); + udp =(struct udphdr *)((u_int32_t *)ip + ip->ip_hl); + dst_port = udp->uh_dport ; + src_port = udp->uh_sport ; + } + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen + 4); /* type, code and checksum. */ + flags = ((struct icmp *) + ((u_int32_t *)ip + ip->ip_hl))->icmp_type ; + break ; + + default : + break; + } +#undef PULLUP_TO + } + } + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + args->f_id.proto = proto; + args->f_id.src_port = ntohs(src_port); + args->f_id.dst_port = ntohs(dst_port); + args->f_id.flags = flags; + + if (*flow_id) { + /* + * Packet has already been tagged. 
Look for the next rule + * to restart processing. + */ + if (fw_one_pass) /* just accept if fw_one_pass is set */ + return 0; + + f = (*flow_id)->next_rule_ptr ; + if (f == NULL) + f = (*flow_id)->next_rule_ptr = lookup_next_rule(*flow_id); + if (f == NULL) + goto dropit; + } else { + /* + * Go down the list, looking for enlightment. + * If we've been asked to start at a given rule, do so. + */ + f = LIST_FIRST(&ip_fw_chain_head); + if (skipto != 0) { + if (skipto >= IPFW_DEFAULT_RULE) + goto dropit; + while (f && f->fw_number <= skipto) + f = LIST_NEXT(f, next); + if (f == NULL) + goto dropit; + } + } + + for (; f; f = LIST_NEXT(f, next)) { +again: + if (f->fw_number == IPFW_DEFAULT_RULE) + goto got_match ; + + /* Check if rule only valid for bridged packets */ + if ((f->fw_flg & IP_FW_BRIDGED) != 0 && !(BRIDGED)) + continue; +#undef BRIDGED + + if (oif) { + /* Check direction outbound */ + if (!(f->fw_flg & IP_FW_F_OUT)) + continue; + } else { + /* Check direction inbound */ + if (!(f->fw_flg & IP_FW_F_IN)) + continue; + } + + if (f->fw_flg & IP_FW_F_MAC) { + u_int32_t *want, *mask, *hdr; + + if (eh == NULL) /* header not available */ + continue; + + want = (void *)&(f->fw_mac_hdr); + mask = (void *)&(f->fw_mac_mask); + hdr = (void *)eh; + + if ( want[0] != (hdr[0] & mask[0]) ) + continue; + if ( want[1] != (hdr[1] & mask[1]) ) + continue; + if ( want[2] != (hdr[2] & mask[2]) ) + continue; + if (f->fw_flg & IP_FW_F_SRNG) { + u_int16_t type = ntohs(eh->ether_type); + if (type < (u_int16_t)(f->fw_mac_type) || + type > (u_int16_t)(f->fw_mac_mask_type) ) + continue; + } else { + if ((u_int16_t)(f->fw_mac_type) != (eh->ether_type & + (u_int16_t)(f->fw_mac_mask_type)) ) + continue; + } + } + + /* Interface check */ + if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + struct ifnet *const iface = oif ? oif : rif; + + /* Backwards compatibility hack for "via" */ + if (!iface || !iface_match(iface, + &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME)) + continue; + } else { + /* Check receive interface */ + if ((f->fw_flg & IP_FW_F_IIFACE) + && (!rif || !iface_match(rif, + &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME))) + continue; + /* Check outgoing interface */ + if ((f->fw_flg & IP_FW_F_OIFACE) + && (!oif || !iface_match(oif, + &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME))) + continue; + } + + /* + * For packets which matched the MAC check, we do not need + * to continue, this is a valid match. + * For not-ip packets, the rule does not apply. + */ + if (f->fw_flg & IP_FW_F_MAC) + goto rnd_then_got_match; + + if (hlen == 0) + continue; + + /* + * dynamic rules are checked at the first keep-state or + * check-state occurrence. + */ + if (f->fw_flg & (IP_FW_F_KEEP_S|IP_FW_F_CHECK_S) && + dyn_checked == 0 ) { + dyn_checked = 1 ; + q = lookup_dyn_rule(&args->f_id, &direction); + if (q != NULL) { + DEB(printf("-- dynamic match 0x%08x %d %s 0x%08x %d\n", + (q->id.src_ip), (q->id.src_port), + (direction == MATCH_FORWARD ? "-->" : "<--"), + (q->id.dst_ip), (q->id.dst_port) ); ) + f = q->rule ; + q->pcnt++ ; + q->bcnt += ip_len; + goto got_match ; /* random not allowed here */ + } + /* if this was a check-only rule, continue with next */ + if (f->fw_flg & IP_FW_F_CHECK_S) + continue ; + } + + /* Fragments */ + if ((f->fw_flg & IP_FW_F_FRAG) && offset == 0 ) + continue; + + /* + * For matching addresses, tif != NULL means we matched + * the address we requested (either "me" or addr/mask). + * Then the check for "xxx" or "not xxx" can be done + * with an XOR. 
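+ *
+ * (Worked truth table, not part of the original comment, for the
+ * source check below; the destination check is symmetrical:
+ *     plain rule, address matched  ->  0 ^ 0 = 0, keep checking
+ *     plain rule, address missed   ->  0 ^ 1 = 1, skip rule
+ *     "not" rule, address matched  ->  1 ^ 0 = 1, skip rule
+ *     "not" rule, address missed   ->  1 ^ 1 = 0, keep checking
+ * where the left operand is (fw_flg & IP_FW_F_INVSRC) != 0 and the
+ * right operand is tif == NULL.)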
+ */ + + /* source address -- mandatory */ + if (f->fw_flg & IP_FW_F_SME) { + INADDR_TO_IFP(src_ip, tif); + } else + (int)tif = f->fw_src.s_addr == + (src_ip.s_addr & f->fw_smsk.s_addr); + if ( ((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ (tif == NULL) ) + continue; + + /* dst address -- mandatory */ + if (f->fw_flg & IP_FW_F_DME) { + INADDR_TO_IFP(dst_ip, tif); + } else + (int)tif = f->fw_dst.s_addr == + (dst_ip.s_addr & f->fw_dmsk.s_addr); + if ( ((f->fw_flg & IP_FW_F_INVDST) != 0) ^ (tif == NULL) ) + continue; + + /* Check IP header values */ + if (f->fw_ipflg & IP_FW_IF_IPOPT && !ipopts_match(ip, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPLEN && f->fw_iplen != ip_len) + continue; + if (f->fw_ipflg & IP_FW_IF_IPID && f->fw_ipid != ntohs(ip->ip_id)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPPRE && + (f->fw_iptos & 0xe0) != (ip->ip_tos & 0xe0)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPTOS && !iptos_match(ip, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPTTL && f->fw_ipttl != ip->ip_ttl) + continue; + if (f->fw_ipflg & IP_FW_IF_IPVER && f->fw_ipver != ip->ip_v) + continue; + + /* Check protocol; if wildcard, and no [ug]id, match */ + if (f->fw_prot == IPPROTO_IP) { + if (!(f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID))) + goto rnd_then_got_match; + } else + /* If different, don't match */ + if (proto != f->fw_prot) + continue; + + /* Protocol specific checks for uid only */ + if (f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)) { + switch (proto) { + case IPPROTO_TCP: + { + struct inpcb *P; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) + continue; + + if (oif) + P = in_pcblookup_hash(&tcbinfo, dst_ip, + dst_port, src_ip, src_port, 0, + oif); + else + P = in_pcblookup_hash(&tcbinfo, src_ip, + src_port, dst_ip, dst_port, 0, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (socheckuid(P->inp_socket, f->fw_uid)) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + case IPPROTO_UDP: + { + struct inpcb *P; + + if (offset != 0) + continue; + + if (oif) + P = in_pcblookup_hash(&udbinfo, dst_ip, + dst_port, src_ip, src_port, 1, + oif); + else + P = in_pcblookup_hash(&udbinfo, src_ip, + src_port, dst_ip, dst_port, 1, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (socheckuid(P->inp_socket, f->fw_uid)) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + default: + continue; + } + } + + /* Protocol specific checks */ + switch (proto) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) { + /* + * TCP flags and ports aren't available in this + * packet -- if this rule specified either one, + * we consider the rule a non-match. 
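+ *
+ * For example, a hypothetical rule built with the macros from
+ * ip_fw.h in this commit:
+ *
+ *        IP_FW_SETNDSTP(rule, 1);
+ *        rule->fw_uar.fw_pts[0] = 80;        /* "to any 80" */
+ *
+ * makes IP_FW_HAVEPORTS(rule) non-zero, so it is skipped for every
+ * fragment with a non-zero offset, while a portless, flagless TCP
+ * rule can still match such fragments.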
+ */ + if (IP_FW_HAVEPORTS(f) != 0 || + f->fw_ipflg & IP_FW_IF_TCPMSK) + continue; + + break; + } + tcp = (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + + if (f->fw_ipflg & IP_FW_IF_TCPOPT && !tcpopts_match(tcp, f)) + continue; + if (((f->fw_ipflg & IP_FW_IF_TCPFLG) || + (f->fw_ipflg & IP_FW_IF_TCPEST)) && + !tcpflg_match(tcp, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPSEQ && tcp->th_seq != f->fw_tcpseq) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPACK && tcp->th_ack != f->fw_tcpack) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPWIN && tcp->th_win != f->fw_tcpwin) + continue; + goto check_ports; + } + + case IPPROTO_UDP: + if (offset != 0) { + /* + * Port specification is unavailable -- if this + * rule specifies a port, we consider the rule + * a non-match. + */ + if (IP_FW_HAVEPORTS(f) ) + continue; + + break; + } +check_ports: + if (!port_match(&f->fw_uar.fw_pts[0], + IP_FW_GETNSRCP(f), ntohs(src_port), + f->fw_flg & IP_FW_F_SRNG, + f->fw_flg & IP_FW_F_SMSK)) + continue; + if (!port_match(&f->fw_uar.fw_pts[IP_FW_GETNSRCP(f)], + IP_FW_GETNDSTP(f), ntohs(dst_port), + f->fw_flg & IP_FW_F_DRNG, + f->fw_flg & IP_FW_F_DMSK)) + continue; + break; + + case IPPROTO_ICMP: + { + struct icmp *icmp; + + if (offset != 0) /* Type isn't valid */ + break; + icmp = (struct icmp *) ((u_int32_t *)ip + ip->ip_hl); + if (!icmptype_match(icmp, f)) + continue; + break; + } + + default: + break; + +bogusfrag: + if (fw_verbose) { + if (*m != NULL) + ipfw_report(NULL, ip, ip_off, ip_len, rif, oif); + else + printf("pullup failed\n"); + } + goto dropit; + + } + +rnd_then_got_match: + if ( f->dont_match_prob && random() < f->dont_match_prob ) + continue ; +got_match: + /* + * If not a dynamic match (q == NULL) and keep-state, install + * a new dynamic entry. + */ + if (q == NULL && f->fw_flg & IP_FW_F_KEEP_S) { + if (install_state(f, args)) /* error or limit violation */ + goto dropit; + } + /* Update statistics */ + f->fw_pcnt += 1; + f->fw_bcnt += ip_len; + f->timestamp = time_second; + + /* Log to console if desired */ + if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose && hlen >0) + ipfw_report(f, ip, ip_off, ip_len, rif, oif); + + /* Take appropriate action */ + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_ACCEPT: + return(0); + case IP_FW_F_COUNT: + continue; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + *cookie = f->fw_number; + return(f->fw_divert_port); + case IP_FW_F_TEE: + *cookie = f->fw_number; + return(f->fw_divert_port | IP_FW_PORT_TEE_FLAG); +#endif + case IP_FW_F_SKIPTO: /* XXX check */ + if (f->next_rule_ptr == NULL) + f->next_rule_ptr = lookup_next_rule(f) ; + f = f->next_rule_ptr; + if (!f) + goto dropit; + goto again ; + + case IP_FW_F_PIPE: + case IP_FW_F_QUEUE: + *flow_id = f; /* XXX set flow id */ + return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG); + + case IP_FW_F_FWD: + /* Change the next-hop address for this packet. + * Initially we'll only worry about directly + * reachable next-hop's, but ultimately + * we will work out for next-hops that aren't + * direct the route we would take for it. We + * [cs]ould leave this latter problem to + * ip_output.c. We hope to high [name the abode of + * your favourite deity] that ip_output doesn't modify + * the new value of next_hop (which is dst there) + * XXX warning-- there is a dangerous reference here + * from next_hop to a field within the rule. If the + * rule is deleted, weird things might occur. + */ + if (next_hop != NULL /* Make sure, first... 
*/ + && (q == NULL || direction == MATCH_FORWARD) ) + *next_hop = &(f->fw_fwd_ip); + return(0); /* Allow the packet */ + + } + + /* Deny/reject this packet using this rule */ + break; + } + + /* Rule IPFW_DEFAULT_RULE should always be there and match */ + KASSERT(f != NULL, ("ip_fw: no chain")); + + /* + * At this point, we're going to drop the packet. + * Send a reject notice if all of the following are true: + * + * - The packet matched a reject rule + * - The packet is not an ICMP packet, or is an ICMP query packet + * - The packet is not a multicast or broadcast packet + */ + if ((f->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT + && (proto != IPPROTO_ICMP || is_icmp_query(ip)) + && !((*m)->m_flags & (M_BCAST|M_MCAST)) + && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + switch (f->fw_reject_code) { + case IP_FW_REJECT_RST: + { + /* XXX warning, this code writes into the mbuf */ + struct tcphdr *const tcp = + (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip; + + if (offset != 0 || (tcp->th_flags & TH_RST)) + break; + ti.ti_i = *((struct ipovly *) ip); + ti.ti_t = *tcp; + bcopy(&ti, ip, sizeof(ti)); + tip->ti_seq = ntohl(tip->ti_seq); + tip->ti_ack = ntohl(tip->ti_ack); + tip->ti_len = ip_len - hlen - (tip->ti_off << 2); + if (tcp->th_flags & TH_ACK) { + tcp_respond(NULL, (void *)ip, tcp, *m, + (tcp_seq)0, tcp->th_ack, TH_RST); + } else { + if (tcp->th_flags & TH_SYN) + tip->ti_len++; + tcp_respond(NULL, (void *)ip, tcp, *m, + tip->ti_seq + tip->ti_len, + (tcp_seq)0, TH_RST|TH_ACK); + } + *m = NULL; + break; + } + default: /* Send an ICMP unreachable using code */ + icmp_error(*m, ICMP_UNREACH, + f->fw_reject_code, 0L, 0); + *m = NULL; + break; + } + } + +dropit: + /* + * Finally, drop the packet. + */ + return(IP_FW_PORT_DENY_FLAG); +} + +/* + * when a rule is added/deleted, zero the direct pointers within + * all firewall rules. These will be reconstructed on the fly + * as packets are matched. + * Must be called at splimp(). 
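+ *
+ * The cleared pointers are rebuilt lazily; ip_fw_chk() above does the
+ * equivalent of
+ *
+ *        f = rule->next_rule_ptr;
+ *        if (f == NULL)
+ *                f = rule->next_rule_ptr = lookup_next_rule(rule);
+ *
+ * the first time a packet needs the pointer again, so zeroing it here
+ * is always safe.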
+ */ +static void +flush_rule_ptrs() +{ + struct ip_fw *fcp ; + + LIST_FOREACH(fcp, &ip_fw_chain_head, next) { + fcp->next_rule_ptr = NULL ; + } +} + +static int +add_entry(struct ip_fw_head *head, struct ip_fw *rule) +{ + struct ip_fw *ftmp, *fcp, *fcpl; + u_short nbr = 0; + int s; + + ftmp = malloc(sizeof *ftmp, M_IPFW, M_DONTWAIT | M_ZERO); + if (!ftmp) + return (ENOSPC); + bcopy(rule, ftmp, sizeof(*ftmp)); + + ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0'; + ftmp->fw_pcnt = 0L; + ftmp->fw_bcnt = 0L; + ftmp->next_rule_ptr = NULL ; + ftmp->pipe_ptr = NULL ; + + s = splimp(); + + if (LIST_FIRST(head) == 0) { + LIST_INSERT_HEAD(head, ftmp, next); + goto done; + } + + /* If entry number is 0, find highest numbered rule and add 100 */ + if (ftmp->fw_number == 0) { + LIST_FOREACH(fcp, head, next) { + if (fcp->fw_number != IPFW_DEFAULT_RULE) + nbr = fcp->fw_number; + else + break; + } + if (nbr < IPFW_DEFAULT_RULE - 100) + nbr += 100; + ftmp->fw_number = rule->fw_number = nbr; + } + + /* Got a valid number; now insert it, keeping the list ordered */ + fcpl = NULL ; + LIST_FOREACH(fcp, head, next) { + if (fcp->fw_number > ftmp->fw_number) { + if (fcpl) { + LIST_INSERT_AFTER(fcpl, ftmp, next); + } else { + LIST_INSERT_HEAD(head, ftmp, next); + } + break; + } else { + fcpl = fcp; + } + } + flush_rule_ptrs(); +done: + static_count++; + splx(s); + DEB(printf("++ installed rule %d, static count now %d\n", + ftmp->fw_number, static_count);) + return (0); +} + +/** + * free storage associated with a static rule entry (including + * dependent dynamic rules), and zeroes rule pointers to avoid + * dangling pointer dereferences. + * @return a pointer to the next entry. + * Must be called at splimp() and with a non-null argument. + */ +static struct ip_fw * +free_chain(struct ip_fw *fcp) +{ + struct ip_fw *n; + + n = LIST_NEXT(fcp, next); + DELETE_DYN_CHAIN(fcp); + LIST_REMOVE(fcp, next); + static_count--; + if (DUMMYNET_LOADED) + ip_dn_ruledel_ptr(fcp) ; + flush_rule_ptrs(); /* more efficient to do outside the loop */ + free(fcp, M_IPFW); + return n; +} + +/** + * remove all rules with given number. + */ +static int +del_entry(struct ip_fw_head *chainptr, u_short number) +{ + struct ip_fw *rule; + + if (number != IPFW_DEFAULT_RULE) { + LIST_FOREACH(rule, chainptr, next) { + if (rule->fw_number == number) { + int s ; + + s = splimp(); /* prevent access to rules while removing */ + while (rule && rule->fw_number == number) + rule = free_chain(rule); + /* XXX could move flush_rule_ptrs() here */ + splx(s); + return 0 ; + } + } + } + return (EINVAL); +} + +/** + * Reset some or all counters on firewall rules. + * @arg frwl is null to clear all entries, or contains a specific + * rule number. + * @arg log_only is 1 if we only want to reset logs, zero otherwise. + */ + +static int +zero_entry(struct ip_fw *frwl, int log_only) +{ + struct ip_fw *rule; + int s; + u_short number = 0 ; + char *msg ; + + if (frwl == 0) { + s = splimp(); + LIST_FOREACH(rule, &ip_fw_chain_head, next) { + if (log_only == 0) { + rule->fw_bcnt = rule->fw_pcnt = 0; + rule->timestamp = 0; + } + rule->fw_loghighest = rule->fw_pcnt+rule->fw_logamount; + } + splx(s); + msg = log_only ? "ipfw: All logging counts cleared.\n" : + "ipfw: Accounting cleared.\n"; + } else { + int cleared = 0; + number = frwl->fw_number ; + /* + * It is possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. 
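+ *
+ * Duplicates arise because add_entry() above only orders rules by
+ * fw_number and never refuses a number that is already in use; with
+ * three rules all numbered 1000 (a hypothetical ruleset), zeroing
+ * entry 1000 must reset all three, which is what the inner while
+ * loop below does.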
+ */ + LIST_FOREACH(rule, &ip_fw_chain_head, next) + if (number == rule->fw_number) { + s = splimp(); + while (rule && number == rule->fw_number) { + if (log_only == 0) { + rule->fw_bcnt = rule->fw_pcnt = 0; + rule->timestamp = 0; + } + rule->fw_loghighest = rule->fw_pcnt+ rule->fw_logamount; + rule = LIST_NEXT(rule, next); + } + splx(s); + cleared = 1; + break; + } + if (!cleared) /* we did not find any matching rules */ + return (EINVAL); + msg = log_only ? "ipfw: Entry %d logging count reset.\n" : + "ipfw: Entry %d cleared.\n"; + } + if (fw_verbose) + log(LOG_SECURITY | LOG_NOTICE, msg, number); + return (0); +} + +static int +check_ipfw_struct(struct ip_fw *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) { + dprintf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + return (EINVAL); + } + if ( (frwl->fw_flg & IP_FW_F_MAC) ) { /* match MAC address */ + return 0; + } + if (frwl->fw_flg == IP_FW_F_CHECK_S) { + /* check-state */ + return 0 ; + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT))) { + dprintf(("%s neither in nor out\n", err_prefix)); + return (EINVAL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IP_FW_F_IIFNAME) + && !*frwl->fw_in_if.fu_via_if.name) + || ((frwl->fw_flg & IP_FW_F_OIFNAME) + && !*frwl->fw_out_if.fu_via_if.name)) { + dprintf(("%s empty interface name\n", err_prefix)); + return (EINVAL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IP_FW_F_IN) + && (frwl->fw_flg & IP_FW_F_OIFACE)) { + dprintf(("%s outgoing interface check on incoming\n", + err_prefix)); + return (EINVAL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IP_FW_F_SRNG) && IP_FW_GETNSRCP(frwl) < 2) { + dprintf(("%s src range set but n_src_p=%d\n", + err_prefix, IP_FW_GETNSRCP(frwl))); + return (EINVAL); + } + if ((frwl->fw_flg & IP_FW_F_DRNG) && IP_FW_GETNDSTP(frwl) < 2) { + dprintf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + if (IP_FW_GETNSRCP(frwl) + IP_FW_GETNDSTP(frwl) > IP_FW_MAX_PORTS) { + dprintf(("%s too many ports (%d+%d)\n", + err_prefix, IP_FW_GETNSRCP(frwl), IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IP_FW_GETNSRCP(frwl) || IP_FW_GETNDSTP(frwl))) { + dprintf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + return (EINVAL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. 
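+ *
+ * Example of an entry rejected here (addresses are illustrative):
+ * fw_src = 192.168.1.7 with fw_smsk = 255.255.255.0 leaves host bits
+ * set outside the mask, so
+ *
+ *        frwl->fw_src.s_addr & ~frwl->fw_smsk.s_addr
+ *
+ * is non-zero and no packet could ever pass the match test
+ * "fw_src.s_addr == (src_ip.s_addr & fw_smsk.s_addr)" used in
+ * ip_fw_chk().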
+ */ + if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) || + (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) { + dprintf(("%s rule never matches\n", err_prefix)); + return (EINVAL); + } + + if ((frwl->fw_flg & IP_FW_F_FRAG) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (IP_FW_HAVEPORTS(frwl)) { + dprintf(("%s cannot mix 'frag' and ports\n", err_prefix)); + return (EINVAL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + dprintf(("%s cannot mix 'frag' and TCP flags\n", err_prefix)); + return (EINVAL); + } + } + + if (frwl->fw_flg & (IP_FW_F_UID | IP_FW_F_GID)) { + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (frwl->fw_prot != IPPROTO_IP)) { + dprintf(("%s cannot use uid/gid logic on non-TCP/UDP\n", err_prefix)); + return (EINVAL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_REJECT: + if (frwl->fw_reject_code >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code == IP_FW_REJECT_RST)) { + dprintf(("%s unknown reject code\n", err_prefix)); + return (EINVAL); + } + break; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: /* Diverting to port zero is invalid */ + case IP_FW_F_TEE: +#endif + case IP_FW_F_PIPE: /* pipe 0 is invalid */ + case IP_FW_F_QUEUE: /* queue 0 is invalid */ + if (frwl->fw_divert_port == 0) { + dprintf(("%s 0 is an invalid argument\n", err_prefix)); + return (EINVAL); + } + break; + case IP_FW_F_DENY: + case IP_FW_F_ACCEPT: + case IP_FW_F_COUNT: + case IP_FW_F_SKIPTO: + case IP_FW_F_FWD: + break; + default: + dprintf(("%s invalid command\n", err_prefix)); + return (EINVAL); + } + + return 0; +} + +static int +ip_fw_ctl(struct sockopt *sopt) +{ + int error, s; + size_t size; + struct ip_fw *fcp; + struct ip_fw frwl, *bp , *buf; + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number 65535), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + */ + s = splimp(); + /* size of static rules */ + size = static_count * sizeof(struct ip_fw) ; + if (ipfw_dyn_v) /* add size of dyn.rules */ + size += (dyn_count * sizeof(struct ipfw_dyn_rule)); + + /* + * XXX todo: if the user passes a short length to know how + * much room is needed, do not + * bother filling up the buffer, just jump to the + * sooptcopyout. + */ + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == 0) { + splx(s); + error = ENOBUFS; + break; + } + + bp = buf ; + LIST_FOREACH(fcp, &ip_fw_chain_head, next) { + bcopy(fcp, bp, sizeof *fcp); + bp++; + } + if (ipfw_dyn_v) { + int i ; + struct ipfw_dyn_rule *p, *dst, *last = NULL ; + + dst = (struct ipfw_dyn_rule *)bp ; + for (i = 0 ; i < curr_dyn_buckets ; i++ ) + for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) { + bcopy(p, dst, sizeof *p); + (int)dst->rule = p->rule->fw_number ; + /* + * store a non-null value in "next". The userland + * code will interpret a NULL here as a marker + * for the last dynamic rule. 
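+ *
+ * A userland consumer can then walk the dynamic portion of the
+ * returned buffer with something like the sketch below (first_dyn
+ * and use() are placeholders, not part of this interface):
+ *
+ *        struct ipfw_dyn_rule *d;
+ *
+ *        for (d = first_dyn; ; d++) {
+ *                use(d);                 /* d->rule holds a rule number */
+ *                if (d->next == NULL)
+ *                        break;          /* marker for the last entry */
+ *        }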
+ */ + dst->next = dst ; + last = dst ; + if (TIME_LEQ(dst->expire, time_second) ) + dst->expire = 0 ; + else + dst->expire -= time_second ; + } + if (last != NULL) + last->next = NULL ; /* mark last dynamic rule */ + } + splx(s); + + error = sooptcopyout(sopt, buf, size); + free(buf, M_TEMP); + break; + + case IP_FW_FLUSH: + /* + * Normally we cannot release the lock on each iteration. + * We could do it here only because we start from the head all + * the times so there is no risk of missing some entries. + * On the other hand, the risk is that we end up with + * a very inconsistent ruleset, so better keep the lock + * around the whole cycle. + * + * XXX this code can be improved by resetting the head of + * the list to point to the default rule, and then freeing + * the old list without the need for a lock. + */ + + s = splimp(); + while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) && + fcp->fw_number != IPFW_DEFAULT_RULE ) + free_chain(fcp); + splx(s); + break; + + case IP_FW_ADD: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error || (error = check_ipfw_struct(&frwl))) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't add rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = add_entry(&ip_fw_chain_head, &frwl); + if (!error && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, &frwl, sizeof frwl); + } + break; + + case IP_FW_DEL: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't delete rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = del_entry(&ip_fw_chain_head, frwl.fw_number); + } + break; + + case IP_FW_ZERO: + case IP_FW_RESETLOG: + { + int cmd = (sopt->sopt_name == IP_FW_RESETLOG ); + void *arg = NULL ; + + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error) + break; + arg = &frwl ; + } + error = zero_entry(arg, cmd); + } + break; + + default: + printf("ip_fw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL ; + } + + return (error); +} + +/** + * dummynet needs a reference to the default rule, because rules can + * be deleted while packets hold a reference to them (e.g. to resume + * processing at the next rule). When this happens, dummynet changes + * the reference to the default rule (probably it could well be a + * NULL pointer, but this way we do not need to check for the special + * case, plus here he have info on the default behaviour. 
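+ *
+ * Sketch of the consumer side (that code lives in dummynet and the
+ * variable names here are placeholders): when a rule still referenced
+ * by a queued packet is deleted, dummynet does the equivalent of
+ *
+ *        if (pkt_rule == deleted_rule)
+ *                pkt_rule = ip_fw_default_rule;
+ *
+ * so that processing can always resume from a rule that exists.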
+ */ +struct ip_fw *ip_fw_default_rule ; + +void +ip_fw_init(void) +{ + struct ip_fw default_rule; + + ip_fw_chk_ptr = ip_fw_chk; + ip_fw_ctl_ptr = ip_fw_ctl; + LIST_INIT(&ip_fw_chain_head); + + bzero(&default_rule, sizeof default_rule); + default_rule.fw_prot = IPPROTO_IP; + default_rule.fw_number = IPFW_DEFAULT_RULE; +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + default_rule.fw_flg |= IP_FW_F_ACCEPT; +#else + default_rule.fw_flg |= IP_FW_F_DENY; +#endif + default_rule.fw_flg |= IP_FW_F_IN | IP_FW_F_OUT; + if (check_ipfw_struct(&default_rule) != 0 || + add_entry(&ip_fw_chain_head, &default_rule)) + panic("ip_fw_init"); + + ip_fw_default_rule = LIST_FIRST(&ip_fw_chain_head) ; + printf("IP packet filtering initialized, " +#ifdef IPDIVERT + "divert enabled, " +#else + "divert disabled, " +#endif + "rule-based forwarding enabled, " +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "default to accept, "); +#else + "default to deny, " ); +#endif +#ifndef IPFIREWALL_VERBOSE + printf("logging disabled\n"); +#else + if (fw_verbose_limit == 0) + printf("unlimited logging\n"); + else + printf("logging limited to %d packets/entry by default\n", + fw_verbose_limit); +#endif +} + +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int s; + int err = 0 ; +#if defined(KLD_MODULE) + struct ip_fw *fcp; +#endif + + switch (type) { + case MOD_LOAD: + s = splimp(); + if (IPFW_LOADED) { + splx(s); + printf("IP firewall already loaded\n"); + err = EEXIST ; + } else { + ip_fw_init(); + splx(s); + } + break ; + case MOD_UNLOAD: +#if !defined(KLD_MODULE) + printf("ipfw statically compiled, cannot unload\n"); + err = EBUSY; +#else + s = splimp(); + ip_fw_chk_ptr = NULL ; + ip_fw_ctl_ptr = NULL ; + while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) != NULL) + free_chain(fcp); + splx(s); + printf("IP firewall unloaded\n"); +#endif + break; + default: + break; + } + return err; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; +DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(ipfw, 1); diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h new file mode 100644 index 0000000..dcb3bcf --- /dev/null +++ b/sys/netinet/ip_fw.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_H +#define _IP_FW_H + +#include <sys/queue.h> + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_fw_if { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define FW_IFNLEN 10 /* need room ! 
was IFNAMSIZ */ + char name[FW_IFNLEN]; + short unit; /* -1 means match any unit */ + } fu_via_if; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. + */ + +/* + * To match MAC headers: + * 12 bytes at fw_mac_hdr contain the dst-src MAC address after masking. + * 12 bytes at fw_mac_mask contain the mask to apply to dst-src + * 2 bytes at fw_mac_type contain the mac type after mask (in net format) + * 2 bytes at fw_mac_type_mask contain the mac type mask + * If IP_FW_F_SRNG, the two contain the low-high of a range of types. + * IP_FW_F_DRNG is used to indicare we want to match a vlan. + */ +#define fw_mac_hdr fw_src +#define fw_mac_mask fw_uar +#define fw_mac_type fw_iplen +#define fw_mac_mask_type fw_ipid + +struct ip_fw { + LIST_ENTRY(ip_fw) next; /* bidirectional list of rules */ + u_int fw_flg; /* Operational Flags word */ + u_int64_t fw_pcnt; /* Packet counters */ + u_int64_t fw_bcnt; /* Byte counters */ + + struct in_addr fw_src; /* Source IP address */ + struct in_addr fw_dst; /* Destination IP address */ + struct in_addr fw_smsk; /* Mask for source IP address */ + struct in_addr fw_dmsk; /* Mask for destination address */ + u_short fw_number; /* Rule number */ + u_char fw_prot; /* IP protocol */ +#if 1 + u_char fw_nports; /* # of src/dst port in array */ +#define IP_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f) +#define IP_FW_SETNSRCP(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4) +#define IP_FW_SETNDSTP(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) +#define IP_FW_HAVEPORTS(rule) ((rule)->fw_nports != 0) +#else + u_char __pad[1]; + u_int _nsrcp; + u_int _ndstp; +#define IP_FW_GETNSRCP(rule) (rule)->_nsrcp +#define IP_FW_SETNSRCP(rule,n) (rule)->_nsrcp = n +#define IP_FW_GETNDSTP(rule) (rule)->_ndstp +#define IP_FW_SETNDSTP(rule,n) (rule)->_ndstp = n +#define IP_FW_HAVEPORTS(rule) ((rule)->_ndstp + (rule)->_nsrcp != 0) +#endif +#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */ + union { + u_short fw_pts[IP_FW_MAX_PORTS]; /* port numbers to match */ +#define IP_FW_ICMPTYPES_MAX 128 +#define IP_FW_ICMPTYPES_DIM (IP_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /*ICMP types bitmap*/ + } fw_uar; + + u_int fw_ipflg; /* IP flags word */ + u_short fw_iplen; /* IP length */ + u_short fw_ipid; /* Identification */ + u_char fw_ipopt; /* IP options set */ + u_char fw_ipnopt; /* IP options unset */ + u_char fw_iptos; /* IP type of service set */ + u_char fw_ipntos; /* IP type of service unset */ + u_char fw_ipttl; /* IP time to live */ + u_int fw_ipver:4; /* IP version */ + u_char fw_tcpopt; /* TCP options set */ + u_char fw_tcpnopt; /* TCP options unset */ + u_char fw_tcpf; /* TCP flags set */ + u_char fw_tcpnf; /* TCP flags unset */ + u_short fw_tcpwin; /* TCP window size */ + u_int32_t fw_tcpseq; /* TCP sequence */ + u_int32_t fw_tcpack; /* TCP acknowledgement */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_fw_if fw_in_if; /* Incoming interfaces */ + union ip_fw_if fw_out_if; /* Outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_pipe_nr; /* queue number (option DUMMYNET) */ + u_short fu_skipto_rule; /* SKIPTO command rule 
number */ + u_short fu_reject_code; /* REJECT response code */ + struct sockaddr_in fu_fwd_ip; + } fw_un; + void *pipe_ptr; /* flow_set ptr for dummynet pipe */ + void *next_rule_ptr; /* next rule in case of match */ + uid_t fw_uid; /* uid to match */ + gid_t fw_gid; /* gid to match */ + int fw_logamount; /* amount to log */ + u_int64_t fw_loghighest; /* highest number packet to log */ + + long dont_match_prob; /* 0x7fffffff means 1.0, always fail */ + u_char dyn_type; /* type for dynamic rule */ + +#define DYN_KEEP_STATE 0 /* type for keep-state rules */ +#define DYN_LIMIT 1 /* type for limit connection rules */ +#define DYN_LIMIT_PARENT 2 /* parent entry for limit connection rules */ + + /* following two fields are used to limit number of connections + * basing on either src, srcport, dst, dstport. + */ + u_char limit_mask; /* mask type for limit rule, can + * have many. + */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_short conn_limit; /* # of connections for limit rule */ +}; + +#define fw_divert_port fw_un.fu_divert_port +#define fw_skipto_rule fw_un.fu_skipto_rule +#define fw_reject_code fw_un.fu_reject_code +#define fw_pipe_nr fw_un.fu_pipe_nr +#define fw_fwd_ip fw_un.fu_fwd_ip + +/* + * + * rule_ptr -------------+ + * V + * [ next.le_next ]---->[ next.le_next ]---- [ next.le_next ]---> + * [ next.le_prev ]<----[ next.le_prev ]<----[ next.le_prev ]<--- + * [ <ip_fw> body ] [ <ip_fw> body ] [ <ip_fw> body ] + * + */ + +/* + * Flow mask/flow id for each queue. + */ +struct ipfw_flow_id { + u_int32_t dst_ip; + u_int32_t src_ip; + u_int16_t dst_port; + u_int16_t src_port; + u_int8_t proto; + u_int8_t flags; /* protocol-specific flags */ +}; + +/* + * dynamic ipfw rule + */ +struct ipfw_dyn_rule { + struct ipfw_dyn_rule *next; + struct ipfw_flow_id id; /* (masked) flow id */ + struct ip_fw *rule; /* pointer to rule */ + struct ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int32_t expire; /* expire time */ + u_int64_t pcnt; /* packet match counters */ + u_int64_t bcnt; /* byte match counters */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Values for "flags" field . + */ +#define IP_FW_F_COMMAND 0x000000ff /* Mask for type of chain entry: */ +#define IP_FW_F_DENY 0x00000000 /* This is a deny rule */ +#define IP_FW_F_REJECT 0x00000001 /* Deny and send a response packet */ +#define IP_FW_F_ACCEPT 0x00000002 /* This is an accept rule */ +#define IP_FW_F_COUNT 0x00000003 /* This is a count rule */ +#define IP_FW_F_DIVERT 0x00000004 /* This is a divert rule */ +#define IP_FW_F_TEE 0x00000005 /* This is a tee rule */ +#define IP_FW_F_SKIPTO 0x00000006 /* This is a skipto rule */ +#define IP_FW_F_FWD 0x00000007 /* This is a "change forwarding + * address" rule + */ +#define IP_FW_F_PIPE 0x00000008 /* This is a dummynet rule */ +#define IP_FW_F_QUEUE 0x00000009 /* This is a dummynet queue */ + +#define IP_FW_F_IN 0x00000100 /* Check inbound packets */ +#define IP_FW_F_OUT 0x00000200 /* Check outbound packets */ +#define IP_FW_F_IIFACE 0x00000400 /* Apply inbound interface test */ +#define IP_FW_F_OIFACE 0x00000800 /* Apply outbound interface test */ +#define IP_FW_F_PRN 0x00001000 /* Print if this rule matches */ +#define IP_FW_F_SRNG 0x00002000 /* The first two src ports are a min + * and max range (stored in host byte + * order). 
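+ *
+ * For example, an illustrative rule matching
+ * source ports 1000-2000 would be encoded as:
+ *
+ *        IP_FW_SETNSRCP(rule, 2);
+ *        rule->fw_uar.fw_pts[0] = 1000;
+ *        rule->fw_uar.fw_pts[1] = 2000;
+ *        rule->fw_flg |= IP_FW_F_SRNG;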
+ */ +#define IP_FW_F_DRNG 0x00004000 /* The first two dst ports are a min + * and max range (stored in host byte + * order). + */ +#define IP_FW_F_FRAG 0x00008000 /* Fragment */ +#define IP_FW_F_IIFNAME 0x00010000 /* In interface by name/unit (not IP) */ +#define IP_FW_F_OIFNAME 0x00020000 /* Out interface by name/unit (not IP)*/ +#define IP_FW_F_INVSRC 0x00040000 /* Invert sense of src check */ +#define IP_FW_F_INVDST 0x00080000 /* Invert sense of dst check */ +#define IP_FW_F_ICMPBIT 0x00100000 /* ICMP type bitmap is valid */ +#define IP_FW_F_UID 0x00200000 /* filter by uid */ +#define IP_FW_F_GID 0x00400000 /* filter by gid */ +#define IP_FW_F_RND_MATCH 0x00800000 /* probabilistic rule match */ +#define IP_FW_F_SMSK 0x01000000 /* src-port + mask */ +#define IP_FW_F_DMSK 0x02000000 /* dst-port + mask */ +#define IP_FW_BRIDGED 0x04000000 /* only match bridged packets */ +#define IP_FW_F_KEEP_S 0x08000000 /* keep state */ +#define IP_FW_F_CHECK_S 0x10000000 /* check state */ +#define IP_FW_F_SME 0x20000000 /* source = me */ +#define IP_FW_F_DME 0x40000000 /* destination = me */ +#define IP_FW_F_MAC 0x80000000 /* match MAC header */ + +#define IP_FW_F_MASK 0xFFFFFFFF /* All possible flag bits mask */ + +/* + * Flags for the 'fw_ipflg' field, for comparing values + * of ip and its protocols. + */ +#define IP_FW_IF_TCPOPT 0x00000001 /* tcp options */ +#define IP_FW_IF_TCPFLG 0x00000002 /* tcp flags */ +#define IP_FW_IF_TCPSEQ 0x00000004 /* tcp sequence number */ +#define IP_FW_IF_TCPACK 0x00000008 /* tcp acknowledgement number */ +#define IP_FW_IF_TCPWIN 0x00000010 /* tcp window size */ +#define IP_FW_IF_TCPEST 0x00000020 /* established TCP connection */ +#define IP_FW_IF_TCPMSK 0x0000003f /* mask of all tcp values */ +#define IP_FW_IF_IPOPT 0x00000100 /* ip options */ +#define IP_FW_IF_IPLEN 0x00000200 /* ip length */ +#define IP_FW_IF_IPID 0x00000400 /* ip identification */ +#define IP_FW_IF_IPTOS 0x00000800 /* ip type of service */ +#define IP_FW_IF_IPTTL 0x00001000 /* ip time to live */ +#define IP_FW_IF_IPVER 0x00002000 /* ip version */ +#define IP_FW_IF_IPPRE 0x00004000 /* ip precedence */ +#define IP_FW_IF_IPMSK 0x00007f00 /* mask of all ip values */ +#define IP_FW_IF_MSK 0x0000ffff /* All possible bits mask */ + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_FW_F_VIAHACK (IP_FW_F_IN|IP_FW_F_OUT|IP_FW_F_IIFACE|IP_FW_F_OIFACE) + +/* + * Definitions for REJECT response codes. + * Values less than 256 correspond to ICMP unreachable codes. + */ +#define IP_FW_REJECT_RST 0x0100 /* TCP packets: send RST */ + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef _KERNEL + +#define IP_FW_PORT_DYNT_FLAG 0x10000 +#define IP_FW_PORT_TEE_FLAG 0x20000 +#define IP_FW_PORT_DENY_FLAG 0x40000 + +/* + * arguments for calling ip_fw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. 
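+ *
+ * A call-site sketch (the real hooks sit in the IP input/output
+ * paths; a caller only fills in the fields it actually has):
+ *
+ *        struct ip_fw_args args;
+ *        int off;
+ *
+ *        bzero(&args, sizeof(args));
+ *        args.m = m;                  /* packet under inspection */
+ *        args.oif = NULL;             /* NULL means an inbound check */
+ *        off = ip_fw_chk_ptr(&args);
+ *        m = args.m;                  /* may be NULL if dropped */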
+ */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + struct ip_fw *rule; /* matching rule */ + struct ether_header *eh; /* for bridged packets */ + + struct route *ro; /* for dummynet */ + struct sockaddr_in *dst; /* for dummynet */ + int flags; /* for dummynet */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + u_int16_t divert_rule; /* divert cookie */ + u_int32_t retval; +}; + +/* + * Function definitions. + */ +void ip_fw_init(void); + +/* Firewall hooks */ +struct ip; +struct sockopt; +typedef int ip_fw_chk_t (struct ip_fw_args *args); +typedef int ip_fw_ctl_t (struct sockopt *); +extern ip_fw_chk_t *ip_fw_chk_ptr; +extern ip_fw_ctl_t *ip_fw_ctl_ptr; +extern int fw_one_pass; +extern int fw_enable; +#define IPFW_LOADED (ip_fw_chk_ptr != NULL) +#endif /* _KERNEL */ + +#endif /* _IP_FW_H */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c new file mode 100644 index 0000000..7042dd8 --- /dev/null +++ b/sys/netinet/ip_icmp.c @@ -0,0 +1,871 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> +#include <netinet/icmp_var.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif + +#include <machine/in_cksum.h> + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ + +static struct icmpstat icmpstat; +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, + &icmpstat, icmpstat, ""); + +static int icmpmaskrepl = 0; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, + &icmpmaskrepl, 0, ""); + +static int drop_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, + &drop_redirect, 0, ""); + +static int log_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, + &log_redirect, 0, ""); + +static int icmplim = 200; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, + &icmplim, 0, ""); + +static int icmplim_output = 1; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, + &icmplim_output, 0, ""); + +/* + * ICMP broadcast echo sysctl + */ + +static int icmpbmcastecho = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, + &icmpbmcastecho, 0, ""); + + +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +static void icmp_reflect(struct mbuf *); +static void icmp_send(struct mbuf *, struct mbuf *, struct route *); +static int ip_next_mtu(int, int); + +extern struct protosw inetsw[]; + +/* + * Generate an error packet of type error + * in response to bad packet ip. + */ +void +icmp_error(n, type, code, dest, destifp) + struct mbuf *n; + int type, code; + n_long dest; + struct ifnet *destifp; +{ + register struct ip *oip = mtod(n, struct ip *), *nip; + register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + register struct icmp *icp; + register struct mbuf *m; + unsigned icmplen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_error(%p, %x, %d)\n", oip, type, code); +#endif + if (type != ICMP_REDIRECT) + icmpstat.icps_error++; + /* + * Don't send error if not the first fragment of message. + * Don't error if the old packet protocol was ICMP + * error message, only known informational types. 
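+ *
+ * For instance, an incoming ICMP_UNREACH never triggers another
+ * ICMP_UNREACH here, while an ICMP_ECHO that runs into trouble still
+ * can, because ICMP_INFOTYPE(ICMP_ECHO) is true (see the macro in
+ * ip_icmp.h).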
+ */ + if (oip->ip_off &~ (IP_MF|IP_DF)) + goto freeit; + if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && + n->m_len >= oiplen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { + icmpstat.icps_oldicmp++; + goto freeit; + } + /* Don't send error in response to a multicast or broadcast packet */ + if (n->m_flags & (M_BCAST|M_MCAST)) + goto freeit; + /* + * First, formulate icmp message + */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + goto freeit; + icmplen = min(oiplen + 8, oip->ip_len); + if (icmplen < sizeof(struct ip)) + panic("icmp_error: bad length"); + m->m_len = icmplen + ICMP_MINLEN; + MH_ALIGN(m, m->m_len); + icp = mtod(m, struct icmp *); + if ((u_int)type > ICMP_MAXTYPE) + panic("icmp_error"); + icmpstat.icps_outhist[type]++; + icp->icmp_type = type; + if (type == ICMP_REDIRECT) + icp->icmp_gwaddr.s_addr = dest; + else { + icp->icmp_void = 0; + /* + * The following assignments assume an overlay with the + * zeroed icmp_void field. + */ + if (type == ICMP_PARAMPROB) { + icp->icmp_pptr = code; + code = 0; + } else if (type == ICMP_UNREACH && + code == ICMP_UNREACH_NEEDFRAG && destifp) { + icp->icmp_nextmtu = htons(destifp->if_mtu); + } + } + + icp->icmp_code = code; + m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); + nip = &icp->icmp_ip; + + /* + * Convert fields to network representation. + */ + nip->ip_len = htons(nip->ip_len); + nip->ip_off = htons(nip->ip_off); + + /* + * Now, copy old ip header (without options) + * in front of icmp message. + */ + if (m->m_data - sizeof(struct ip) < m->m_pktdat) + panic("icmp len"); + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; + nip = mtod(m, struct ip *); + bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); + nip->ip_len = m->m_len; + nip->ip_vhl = IP_VHL_BORING; + nip->ip_p = IPPROTO_ICMP; + nip->ip_tos = 0; + icmp_reflect(m); + +freeit: + m_freem(n); +} + +static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; + +/* + * Process a received ICMP message. + */ +void +icmp_input(m, off) + register struct mbuf *m; + int off; +{ + int hlen = off; + register struct icmp *icp; + register struct ip *ip = mtod(m, struct ip *); + int icmplen = ip->ip_len; + register int i; + struct in_ifaddr *ia; + void (*ctlfunc)(int, struct sockaddr *, void *); + int code; + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. + */ +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_src)); + printf("icmp_input from %s to %s, len %d\n", + buf, inet_ntoa(ip->ip_dst), icmplen); + } +#endif + if (icmplen < ICMP_MINLEN) { + icmpstat.icps_tooshort++; + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == 0) { + icmpstat.icps_tooshort++; + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + icmpstat.icps_checksum++; + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { + /* + * Deliver very specific ICMP type only. 
+ */ + switch (icp->icmp_type) { + case ICMP_UNREACH: + case ICMP_TIMXCEED: + break; + default: + goto freeit; + } + } + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + + /* + * Message type specific processing. + */ + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + icmpstat.icps_inhist[icp->icmp_type]++; + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_SRCFAIL: + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_TOSNET: + case ICMP_UNREACH_TOSHOST: + case ICMP_UNREACH_HOST_PRECEDENCE: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + /* + * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. + * Treat subcodes 2,3 as immediate RST + */ + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + code = PRC_UNREACH_PORT; + break; + + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_FILTER_PROHIB: + code = PRC_UNREACH_ADMIN_PROHIB; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. + */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + goto freeit; + } + icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); + /* Discard ICMP's in response to multicast packets */ + if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) + goto badcode; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; +#if 1 + /* + * MTU discovery: + * If we got a needfrag and there is a host route to the + * original destination, and the MTU is not locked, then + * set the MTU in the route to the suggested new value + * (if given) and then notify as usual. The ULPs will + * notice that the MTU has changed and adapt accordingly. + * If no new MTU was suggested, then we guess a new one + * less than the current value. If the new MTU is + * unreasonably small (arbitrarily set at 296), then + * we reset the MTU to the interface value and enable the + * lock bit, indicating that we are no longer doing MTU + * discovery. + */ + if (code == PRC_MSGSIZE) { + struct rtentry *rt; + int mtu; + + rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt && (rt->rt_flags & RTF_HOST) + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + mtu = ntohs(icp->icmp_nextmtu); + if (!mtu) + mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, + 1); +#ifdef DEBUG_MTUDISC + printf("MTU for %s reduced to %d\n", + inet_ntoa(icmpsrc.sin_addr), mtu); +#endif + if (mtu < 296) { + /* rt->rt_rmx.rmx_mtu = + rt->rt_ifp->if_mtu; */ + rt->rt_rmx.rmx_locks |= RTV_MTU; + } else if (rt->rt_rmx.rmx_mtu > mtu) { + rt->rt_rmx.rmx_mtu = mtu; + } + } + if (rt) + RTFREE(rt); + } + +#endif + /* + * XXX if the packet contains [IPv4 AH TCP], we can't make a + * notification to TCP layer. 
+ */ + ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; + if (ctlfunc) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + (void *)&icp->icmp_ip); + break; + + badcode: + icmpstat.icps_badcode++; + break; + + case ICMP_ECHO: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcastecho++; + break; + } + icp->icmp_type = ICMP_ECHOREPLY; + if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) + goto freeit; + else + goto reflect; + + case ICMP_TSTAMP: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcasttstamp++; + break; + } + if (icmplen < ICMP_TSLEN) { + icmpstat.icps_badlen++; + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) + goto freeit; + else + goto reflect; + + case ICMP_MASKREQ: + if (icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. + */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == 0) + break; + if (ia->ia_ifp == 0) + break; + icp->icmp_type = ICMP_MASKREPLY; + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + icmpstat.icps_reflect++; + icmpstat.icps_outhist[icp->icmp_type]++; + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (log_redirect) { + u_long src, dst, gw; + + src = ntohl(ip->ip_src.s_addr); + dst = ntohl(icp->icmp_ip.ip_dst.s_addr); + gw = ntohl(icp->icmp_gwaddr.s_addr); + printf("icmp redirect from %d.%d.%d.%d: " + "%d.%d.%d.%d => %d.%d.%d.%d\n", + (int)(src >> 24), (int)((src >> 16) & 0xff), + (int)((src >> 8) & 0xff), (int)(src & 0xff), + (int)(dst >> 24), (int)((dst >> 16) & 0xff), + (int)((dst >> 8) & 0xff), (int)(dst & 0xff), + (int)(gw >> 24), (int)((gw >> 16) & 0xff), + (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); + } + if (drop_redirect) + break; + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). 
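+ *
+ * Worked example of the call below: a redirect received from gateway
+ * G (ip_src), saying destination D (icmp_ip.ip_dst) is better reached
+ * via G' (icmp_gwaddr), becomes, in effect,
+ *
+ *        rtredirect(D, G', NULL, RTF_GATEWAY | RTF_HOST, G, NULL);
+ *
+ * with D also handed to pfctlinput() so cached protocol routes are
+ * refreshed.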
+ */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + + printf("redirect dst %s to %s\n", + buf, inet_ntoa(icp->icmp_gwaddr)); + } +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, (struct rtentry **)0); + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); +#ifdef IPSEC + key_sa_routechange((struct sockaddr *)&icmpsrc); +#endif + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. + */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m, off); + return; + +freeit: + m_freem(m); +} + +/* + * Reflect the ip packet back to the source + */ +static void +icmp_reflect(m) + struct mbuf *m; +{ + struct ip *ip = mtod(m, struct ip *); + struct ifaddr *ifa; + struct in_ifaddr *ia; + struct in_addr t; + struct mbuf *opts = 0; + int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); + struct route *ro = NULL, rt; + + if (!in_canforward(ip->ip_src) && + ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != + (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { + m_freem(m); /* Bad return address */ + icmpstat.icps_badaddr++; + goto done; /* Ip_output() will check for broadcast */ + } + t = ip->ip_dst; + ip->ip_dst = ip->ip_src; + ro = &rt; + bzero(ro, sizeof(*ro)); + /* + * If the incoming packet was addressed directly to us, + * use dst as the src for the reply. Otherwise (broadcast + * or anonymous), use the address which corresponds + * to the incoming interface. + */ + LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + goto match; + if (m->m_pkthdr.rcvif != NULL && + m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { + TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + t.s_addr) + goto match; + } + } + ia = ip_rtaddr(ip->ip_dst, ro); + /* We need a route to do anything useful. */ + if (ia == NULL) { + m_freem(m); + icmpstat.icps_noroute++; + goto done; + } +match: + t = IA_SIN(ia)->sin_addr; + ip->ip_src = t; + ip->ip_ttl = ip_defttl; + + if (optlen > 0) { + register u_char *cp; + int opt, cnt; + u_int len; + + /* + * Retrieve any source routing from the incoming packet; + * add on any record-route or timestamp options. 
+ */ + cp = (u_char *) (ip + 1); + if ((opts = ip_srcroute()) == 0 && + (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { + opts->m_len = sizeof(struct in_addr); + mtod(opts, struct in_addr *)->s_addr = 0; + } + if (opts) { +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_reflect optlen %d rt %d => ", + optlen, opts->m_len); +#endif + for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + len = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + break; + len = cp[IPOPT_OLEN]; + if (len < IPOPT_OLEN + sizeof(*cp) || + len > cnt) + break; + } + /* + * Should check for overflow, but it "can't happen" + */ + if (opt == IPOPT_RR || opt == IPOPT_TS || + opt == IPOPT_SECURITY) { + bcopy((caddr_t)cp, + mtod(opts, caddr_t) + opts->m_len, len); + opts->m_len += len; + } + } + /* Terminate & pad, if necessary */ + cnt = opts->m_len % 4; + if (cnt) { + for (; cnt < 4; cnt++) { + *(mtod(opts, caddr_t) + opts->m_len) = + IPOPT_EOL; + opts->m_len++; + } + } +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("%d\n", opts->m_len); +#endif + } + /* + * Now strip out original options by copying rest of first + * mbuf's data back, and adjust the IP length. + */ + ip->ip_len -= optlen; + ip->ip_vhl = IP_VHL_BORING; + m->m_len -= optlen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= optlen; + optlen += sizeof(struct ip); + bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), + (unsigned)(m->m_len - sizeof(struct ip))); + } + m->m_flags &= ~(M_BCAST|M_MCAST); + icmp_send(m, opts, ro); +done: + if (opts) + (void)m_free(opts); + if (ro && ro->ro_rt) + RTFREE(ro->ro_rt); +} + +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum. + */ +static void +icmp_send(m, opts, rt) + register struct mbuf *m; + struct mbuf *opts; + struct route *rt; +{ + register struct ip *ip = mtod(m, struct ip *); + register int hlen; + register struct icmp *icp; + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + m->m_data += hlen; + m->m_len -= hlen; + icp = mtod(m, struct icmp *); + icp->icmp_cksum = 0; + icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + m->m_data -= hlen; + m->m_len += hlen; + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_dst)); + printf("icmp_send dst %s src %s\n", + buf, inet_ntoa(ip->ip_src)); + } +#endif + (void) ip_output(m, opts, rt, 0, NULL); +} + +n_time +iptime() +{ + struct timeval atv; + u_long t; + + getmicrotime(&atv); + t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; + return (htonl(t)); +} + +#if 1 +/* + * Return the next larger or smaller MTU plateau (table from RFC 1191) + * given current value MTU. If DIR is less than zero, a larger plateau + * is returned; otherwise, a smaller value is returned. + */ +static int +ip_next_mtu(mtu, dir) + int mtu; + int dir; +{ + static int mtutab[] = { + 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, + 68, 0 + }; + int i; + + for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { + if (mtu >= mtutab[i]) + break; + } + + if (dir < 0) { + if (i == 0) { + return 0; + } else { + return mtutab[i - 1]; + } + } else { + if (mtutab[i] == 0) { + return 0; + } else if(mtu > mtutab[i]) { + return mtutab[i]; + } else { + return mtutab[i + 1]; + } + } +} +#endif + + +/* + * badport_bandlim() - check for ICMP bandwidth limit + * + * Return 0 if it is ok to send an ICMP error response, -1 if we have + * hit our bandwidth limit and it is not ok. 
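+ *
+ * Callers treat it as a simple gate; icmp_input() above does, e.g.,
+ *
+ *        if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
+ *                goto freeit;         /* over the limit, drop */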
+ * + * If icmplim is <= 0, the feature is disabled and 0 is returned. + * + * For now we separate the TCP and UDP subsystems w/ different 'which' + * values. We may eventually remove this separation (and simplify the + * code further). + * + * Note that the printing of the error message is delayed so we can + * properly print the icmp error rate that the system was trying to do + * (i.e. 22000/100 pps, etc...). This can cause long delays in printing + * the 'final' error, but it doesn't make sense to solve the printing + * delay with more complex code. + */ + +int +badport_bandlim(int which) +{ + static int lticks[BANDLIM_MAX + 1]; + static int lpackets[BANDLIM_MAX + 1]; + int dticks; + const char *bandlimittype[] = { + "Limiting icmp unreach response", + "Limiting icmp ping response", + "Limiting icmp tstamp response", + "Limiting closed port RST response", + "Limiting open port RST response" + }; + + /* + * Return ok status if feature disabled or argument out of + * ranage. + */ + + if (icmplim <= 0 || which > BANDLIM_MAX || which < 0) + return(0); + dticks = ticks - lticks[which]; + + /* + * reset stats when cumulative dt exceeds one second. + */ + + if ((unsigned int)dticks > hz) { + if (lpackets[which] > icmplim && icmplim_output) { + printf("%s from %d to %d packets per second\n", + bandlimittype[which], + lpackets[which], + icmplim + ); + } + lticks[which] = ticks; + lpackets[which] = 0; + } + + /* + * bump packet count + */ + + if (++lpackets[which] > icmplim) { + return(-1); + } + return(0); +} + diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h new file mode 100644 index 0000000..927efd9 --- /dev/null +++ b/sys/netinet/ip_icmp.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +/* + * Interface Control Message Protocol Definitions. + * Per RFC 792, September 1981. + */ + +/* + * Internal of an ICMP Router Advertisement + */ +struct icmp_ra_addr { + u_int32_t ira_addr; + u_int32_t ira_preference; +}; + +/* + * Structure of an icmp header. + */ +struct icmp { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ + union { + u_char ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + n_short icd_id; + n_short icd_seq; + } ih_idseq; + int ih_void; + + /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ + struct ih_pmtu { + n_short ipm_void; + n_short ipm_nextmtu; + } ih_pmtu; + + struct ih_rtradv { + u_char irt_num_addrs; + u_char irt_wpa; + u_int16_t irt_lifetime; + } ih_rtradv; + } icmp_hun; +#define icmp_pptr icmp_hun.ih_pptr +#define icmp_gwaddr icmp_hun.ih_gwaddr +#define icmp_id icmp_hun.ih_idseq.icd_id +#define icmp_seq icmp_hun.ih_idseq.icd_seq +#define icmp_void icmp_hun.ih_void +#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void +#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu +#define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs +#define icmp_wpa icmp_hun.ih_rtradv.irt_wpa +#define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; + n_time its_ttime; + } id_ts; + struct id_ip { + struct ip idi_ip; + /* options and then 64 bits of data */ + } id_ip; + struct icmp_ra_addr id_radv; + u_int32_t id_mask; + char id_data[1]; + } icmp_dun; +#define icmp_otime icmp_dun.id_ts.its_otime +#define icmp_rtime icmp_dun.id_ts.its_rtime +#define icmp_ttime icmp_dun.id_ts.its_ttime +#define icmp_ip icmp_dun.id_ip.idi_ip +#define icmp_radv icmp_dun.id_radv +#define icmp_mask icmp_dun.id_mask +#define icmp_data icmp_dun.id_data +}; + +/* + * Lower bounds on packet lengths for various types. + * For the error advice packets must first insure that the + * packet is large enough to contain the returned ip header. + * Only then can we do the check to see if 64 bits of packet + * data have been returned, since we need to check the returned + * ip header length. + */ +#define ICMP_MINLEN 8 /* abs minimum */ +#define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */ +#define ICMP_MASKLEN 12 /* address mask */ +#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ +#ifndef _IP_VHL +#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) + /* N.B.: must separately check that ip_hl >= 5 */ +#else +#define ICMP_ADVLEN(p) (8 + (IP_VHL_HL((p)->icmp_ip.ip_vhl) << 2) + 8) + /* N.B.: must separately check that header length >= 5 */ +#endif + +/* + * Definition of type and code field values. 
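+ * For example, a "port unreachable" error carries icmp_type ICMP_UNREACH
+ * with icmp_code ICMP_UNREACH_PORT, while an echo request is simply
+ * ICMP_ECHO with code 0.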
+ */ +#define ICMP_ECHOREPLY 0 /* echo reply */ +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ +#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ +#define ICMP_REDIRECT 5 /* shorter route, codes: */ +#define ICMP_REDIRECT_NET 0 /* for network */ +#define ICMP_REDIRECT_HOST 1 /* for host */ +#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ +#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_ECHO 8 /* echo service */ +#define ICMP_ROUTERADVERT 9 /* router advertisement */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TIMXCEED 11 /* time exceeded, code: */ +#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ +#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */ +#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. absent */ +#define ICMP_PARAMPROB_LENGTH 2 /* bad length */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_TSTAMPREPLY 14 /* timestamp reply */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_IREQREPLY 16 /* information reply */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_MASKREPLY 18 /* address mask reply */ + +#define ICMP_MAXTYPE 18 + +#define ICMP_INFOTYPE(type) \ + ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ + (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ + (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ + (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ + (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef _KERNEL +void icmp_error(struct mbuf *, int, int, n_long, struct ifnet *); +void icmp_input(struct mbuf *, int); +#endif + +#endif diff --git a/sys/netinet/ip_id.c b/sys/netinet/ip_id.c new file mode 100644 index 0000000..664b4d1 --- /dev/null +++ b/sys/netinet/ip_id.c @@ -0,0 +1,210 @@ +/* $OpenBSD: ip_id.c,v 1.2 1999/08/26 13:37:01 provos Exp $ */ + +/* + * Copyright 1998 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Theo de Raadt <deraadt@openbsd.org> came up with the idea of using + * such a mathematical system to generate more random (yet non-repeating) + * ids to solve the resolver/named problem. But Niels designed the + * actual system based on the constraints. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Niels Provos. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * seed = random 15bit + * n = prime, g0 = generator to n, + * j = random so that gcd(j,n-1) == 1 + * g = g0^j mod n will be a generator again. + * + * X[0] = random seed. + * X[n] = a*X[n-1]+b mod m is a Linear Congruential Generator + * with a = 7^(even random) mod m, + * b = random with gcd(b,m) == 1 + * m = 31104 and a maximal period of m-1. + * + * The transaction id is determined by: + * id[n] = seed xor (g^X[n] mod n) + * + * Effectivly the id is restricted to the lower 15 bits, thus + * yielding two different cycles by toggling the msb on and off. + * This avoids reuse issues caused by reseeding. + */ + +#include "opt_random_ip_id.h" +#include <sys/param.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/random.h> + +#ifdef RANDOM_IP_ID +#define RU_OUT 180 /* Time after wich will be reseeded */ +#define RU_MAX 30000 /* Uniq cycle, avoid blackjack prediction */ +#define RU_GEN 2 /* Starting generator */ +#define RU_N 32749 /* RU_N-1 = 2*2*3*2729 */ +#define RU_AGEN 7 /* determine ru_a as RU_AGEN^(2*rand) */ +#define RU_M 31104 /* RU_M = 2^7*3^5 - don't change */ + +#define PFAC_N 3 +const static u_int16_t pfacts[PFAC_N] = { + 2, + 3, + 2729 +}; + +static u_int16_t ru_x; +static u_int16_t ru_seed, ru_seed2; +static u_int16_t ru_a, ru_b; +static u_int16_t ru_g; +static u_int16_t ru_counter = 0; +static u_int16_t ru_msb = 0; +static long ru_reseed; +static u_int32_t tmp; /* Storage for unused random */ + +static u_int16_t pmod(u_int16_t, u_int16_t, u_int16_t); +static void ip_initid(void); +u_int16_t ip_randomid(void); + +/* + * Do a fast modular exponation, returned value will be in the range + * of 0 - (mod-1) + */ + +#ifdef __STDC__ +static u_int16_t +pmod(u_int16_t gen, u_int16_t exp, u_int16_t mod) +#else +static u_int16_t +pmod(gen, exp, mod) + u_int16_t gen, exp, mod; +#endif +{ + u_int16_t s, t, u; + + s = 1; + t = gen; + u = exp; + + while (u) { + if (u & 1) + s = (s*t) % mod; + u >>= 1; + t = (t*t) % mod; + } + return (s); +} + +/* + * Initalizes the seed and chooses a suitable generator. Also toggles + * the msb flag. The msb flag is used to generate two distinct + * cycles of random numbers and thus avoiding reuse of ids. 
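+ *
+ * In short: ru_x follows the LCG X[n] = (ru_a * X[n-1] + ru_b) mod RU_M,
+ * and ip_randomid() emits ru_seed ^ (ru_g^(ru_seed2 ^ X[n]) mod RU_N)
+ * with ru_msb OR'ed in, reseeding after RU_MAX ids or RU_OUT seconds.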
+ * + * This function is called from id_randomid() when needed, an + * application does not have to worry about it. + */ +static void +ip_initid(void) +{ + u_int16_t j, i; + int noprime = 1; + struct timeval time; + + getmicrotime(&time); + read_random((void *) &tmp, sizeof(tmp)); + ru_x = (tmp & 0xFFFF) % RU_M; + + /* 15 bits of random seed */ + ru_seed = (tmp >> 16) & 0x7FFF; + read_random((void *) &tmp, sizeof(tmp)); + ru_seed2 = tmp & 0x7FFF; + + read_random((void *) &tmp, sizeof(tmp)); + + /* Determine the LCG we use */ + ru_b = (tmp & 0xfffe) | 1; + ru_a = pmod(RU_AGEN, (tmp >> 16) & 0xfffe, RU_M); + while (ru_b % 3 == 0) + ru_b += 2; + + read_random((void *) &tmp, sizeof(tmp)); + j = tmp % RU_N; + tmp = tmp >> 16; + + /* + * Do a fast gcd(j,RU_N-1), so we can find a j with + * gcd(j, RU_N-1) == 1, giving a new generator for + * RU_GEN^j mod RU_N + */ + + while (noprime) { + for (i=0; i<PFAC_N; i++) + if (j%pfacts[i] == 0) + break; + + if (i>=PFAC_N) + noprime = 0; + else + j = (j+1) % RU_N; + } + + ru_g = pmod(RU_GEN,j,RU_N); + ru_counter = 0; + + ru_reseed = time.tv_sec + RU_OUT; + ru_msb = ru_msb == 0x8000 ? 0 : 0x8000; +} + +u_int16_t +ip_randomid(void) +{ + int i, n; + struct timeval time; + + getmicrotime(&time); + if (ru_counter >= RU_MAX || time.tv_sec > ru_reseed) + ip_initid(); + + if (!tmp) + read_random((void *) &tmp, sizeof(tmp)); + + /* Skip a random number of ids */ + n = tmp & 0x3; tmp = tmp >> 2; + if (ru_counter + n >= RU_MAX) + ip_initid(); + + for (i = 0; i <= n; i++) + /* Linear Congruential Generator */ + ru_x = (ru_a*ru_x + ru_b) % RU_M; + + ru_counter += i; + + return (ru_seed ^ pmod(ru_g,ru_seed2 ^ ru_x,RU_N)) | ru_msb; +} + +#endif /* RANDOM_IP_ID */ diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c new file mode 100644 index 0000000..bec09ea --- /dev/null +++ b/sys/netinet/ip_input.c @@ -0,0 +1,1948 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_bootp.h" +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" +#include "opt_ipstealth.h" +#include "opt_ipsec.h" +#include "opt_pfil_hooks.h" +#include "opt_random_ip_id.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/pfil.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/netisr.h> +#include <net/intrq.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <machine/in_cksum.h> + +#include <sys/socketvar.h> + +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif + +int rsvp_on = 0; + +int ipforwarding = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, + &ipforwarding, 0, "Enable IP forwarding between interfaces"); + +static int ipsendredirects = 1; /* XXX */ +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, + &ipsendredirects, 0, "Enable sending IP redirects"); + +int ip_defttl = IPDEFTTL; +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &ip_defttl, 0, "Maximum TTL on IP packets"); + +static int ip_dosourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, + &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); + +static int ip_acceptsourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, + CTLFLAG_RW, &ip_acceptsourceroute, 0, + "Enable accepting source routed IP packets"); + +static int ip_keepfaith = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, + &ip_keepfaith, 0, + "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); + +static int ip_nfragpackets = 0; +static int ip_maxfragpackets; /* initialized in ip_init() */ +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, + &ip_maxfragpackets, 0, + "Maximum number of IPv4 fragment reassembly queue entries"); + +/* + * XXX - Setting ip_checkinterface mostly implements the receive side of + * the Strong ES model described in RFC 1122, but since the routing table + * and transmit implementation do not implement the Strong ES model, + * setting this to 1 results in an odd hybrid. + * + * XXX - ip_checkinterface currently must be disabled if you use ipnat + * to translate the destination address to another local interface. 
+ * + * XXX - ip_checkinterface must be disabled if you add IP aliases + * to the loopback interface instead of the interface where the + * packets for those addresses are received. + */ +static int ip_checkinterface = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, + &ip_checkinterface, 0, "Verify packet arrives on correct interface"); + +#ifdef DIAGNOSTIC +static int ipprintfs = 0; +#endif + +static int ipqmaxlen = IFQ_MAXLEN; + +extern struct domain inetdomain; +extern struct protosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +struct in_ifaddrhead in_ifaddrhead; /* first inet address */ +struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ +u_long in_ifaddrhmask; /* mask for hash table */ + +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, + &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, + &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); + +struct ipstat ipstat; +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, + &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); + +/* Packet reassembly stuff */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; +static int nipq = 0; /* total # of reass queues */ +static int maxnipq; + +#ifdef IPCTL_DEFMTU +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, + &ip_mtu, 0, "Default MTU"); +#endif + +#ifdef IPSTEALTH +static int ipstealth = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + &ipstealth, 0, ""); +#endif + + +/* Firewall hooks */ +ip_fw_chk_t *ip_fw_chk_ptr; +int fw_enable = 1 ; + +/* Dummynet hooks */ +ip_dn_io_t *ip_dn_io_ptr; + + +/* + * XXX this is ugly -- the following two global variables are + * used to store packet state while it travels through the stack. + * Note that the code even makes assumptions on the size and + * alignment of fields inside struct ip_srcrt so e.g. adding some + * fields will break the code. This needs to be fixed. + * + * We need to save the IP options in case a protocol wants to respond + * to an incoming packet over the same route if the packet got here + * using IP source routing. This allows connection establishment and + * maintenance when the remote end is on a network that is not known + * to us. + */ +static int ip_nhops = 0; +static struct ip_srcrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +} ip_srcrt; + +static void save_rte(u_char *, struct in_addr); +static int ip_dooptions(struct mbuf *m, int, + struct sockaddr_in *next_hop); +static void ip_forward(struct mbuf *m, int srcrt, + struct sockaddr_in *next_hop); +static void ip_freef(struct ipqhead *, struct ipq *); +static struct mbuf *ip_reass(struct mbuf *, struct ipqhead *, + struct ipq *, u_int32_t *, u_int16_t *); +static void ipintr(void); + +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. 
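+ * For example, after the loops below ip_protox[IPPROTO_ICMP] indexes the
+ * ICMP entry of inetsw[], while protocol numbers with no kernel handler
+ * keep the raw IP index that every slot is primed with first.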
+ */ +void +ip_init() +{ + register struct protosw *pr; + register int i; + + TAILQ_INIT(&in_ifaddrhead); + in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask); + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == 0) + panic("ip_init"); + for (i = 0; i < IPPROTO_MAX; i++) + ip_protox[i] = pr - inetsw; + for (pr = inetdomain.dom_protosw; + pr < inetdomain.dom_protoswNPROTOSW; pr++) + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) + ip_protox[pr->pr_protocol] = pr - inetsw; + + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&ipq[i]); + + maxnipq = nmbclusters / 4; + ip_maxfragpackets = nmbclusters / 4; + +#ifndef RANDOM_IP_ID + ip_id = time_second & 0xffff; +#endif + ipintrq.ifq_maxlen = ipqmaxlen; + mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF); + ipintrq_present = 1; + + register_netisr(NETISR_IP, ipintr); +} + +/* + * XXX watch out this one. It is perhaps used as a cache for + * the most recently used route ? it is cleared in in_addroute() + * when a new route is successfully created. + */ +struct route ipforward_rt; + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ip_input(struct mbuf *m) +{ + struct ip *ip; + struct ipq *fp; + struct in_ifaddr *ia = NULL; + struct ifaddr *ifa; + int i, hlen, checkif; + u_short sum; + struct in_addr pkt_dst; + u_int32_t divert_info = 0; /* packet divert/tee info */ + struct ip_fw_args args; +#ifdef PFIL_HOOKS + struct packet_filter_hook *pfh; + struct mbuf *m0; + int rv; +#endif /* PFIL_HOOKS */ + + args.eh = NULL; + args.oif = NULL; + args.rule = NULL; + args.divert_rule = 0; /* divert cookie */ + args.next_hop = NULL; + + /* Grab info from MT_TAG mbufs prepended to the chain. 
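+ * Dummynet rules, divert cookies and 'ipfw fwd' next hops arrive this
+ * way and are copied into 'args' before the tags are stepped over.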
*/ + for (; m && m->m_type == MT_TAG; m = m->m_next) { + switch(m->m_tag_id) { + default: + printf("ip_input: unrecognised MT_TAG tag %d\n", + m->m_tag_id); + break; + + case PACKET_TAG_DUMMYNET: + args.rule = ((struct dn_pkt *)m)->rule; + break; + + case PACKET_TAG_DIVERT: + args.divert_rule = (intptr_t)m->m_hdr.mh_data & 0xffff; + break; + + case PACKET_TAG_IPFORWARD: + args.next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; + break; + } + } + + KASSERT(m != NULL && (m->m_flags & M_PKTHDR) != 0, + ("ip_input: no HDR")); + + if (args.rule) { /* dummynet already filtered us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + goto iphack ; + } + + ipstat.ips_total++; + + if (m->m_pkthdr.len < sizeof(struct ip)) + goto tooshort; + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + return; + } + ip = mtod(m, struct ip *); + + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + return; + } + ip = mtod(m, struct ip *); + } + + /* 127/8 must not appear on wire - RFC1122 */ + if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { + ipstat.ips_badaddr++; + goto bad; + } + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* + * Convert fields to host representation. + */ + ip->ip_len = ntohs(ip->ip_len); + if (ip->ip_len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + ip->ip_off = ntohs(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { +tooshort: + ipstat.ips_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + +#ifdef IPSEC + if (ipsec_gethist(m, NULL)) + goto pass; +#endif + + /* + * IpHack's section. + * Right now when no processing on packet has done + * and it is still fresh out of network we do our black + * deals with it. + * - Firewall: deny/allow/divert + * - Xlate: translate packet's addr/port (NAT). + * - Pipe: pass pkt through dummynet. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ + +iphack: + +#ifdef PFIL_HOOKS + /* + * Run through list of hooks for input packets. If there are any + * filters which require that additional packets in the flow are + * not fast-forwarded, they must clear the M_CANFASTFWD flag. + * Note that filters must _never_ set this flag, as another filter + * in the list may have previously cleared it. 
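+ * A hook may also consume or replace the mbuf: a non-zero return or a
+ * NULL m0 below means the packet has been taken over and input
+ * processing stops here.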
+ */ + m0 = m; + pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) + if (pfh->pfil_func) { + rv = pfh->pfil_func(ip, hlen, + m->m_pkthdr.rcvif, 0, &m0); + if (rv) + return; + m = m0; + if (m == NULL) + return; + ip = mtod(m, struct ip *); + } +#endif /* PFIL_HOOKS */ + + if (fw_enable && IPFW_LOADED) { + /* + * If we've been forwarded from the output side, then + * skip the firewall a second time + */ + if (args.next_hop) + goto ours; + + args.m = m; + i = ip_fw_chk_ptr(&args); + m = args.m; + + if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */ + if (m) + m_freem(m); + return; + } + ip = mtod(m, struct ip *); /* just in case m changed */ + if (i == 0 && args.next_hop == NULL) /* common case */ + goto pass; + if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) { + /* Send packet to the appropriate pipe */ + ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args); + return; + } +#ifdef IPDIVERT + if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { + /* Divert or tee packet */ + divert_info = i; + goto ours; + } +#endif + if (i == 0 && args.next_hop != NULL) + goto pass; + /* + * if we get here, the packet must be dropped + */ + m_freem(m); + return; + } +pass: + + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop)) + return; + + /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no + * matter if it is destined to another node, or whether it is + * a multicast one, RSVP wants it! and prevents it from being forwarded + * anywhere else. Also checks if the rsvp daemon is running before + * grabbing the packet. + */ + if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + * If we don't have any addresses, assume any unicast packet + * we receive might be for us (and let the upper layers deal + * with it). + */ + if (TAILQ_EMPTY(&in_ifaddrhead) && + (m->m_flags & (M_MCAST|M_BCAST)) == 0) + goto ours; + + /* + * Cache the destination address of the packet; this may be + * changed by use of 'ipfw fwd'. + */ + pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; + + /* + * Enable a consistency check between the destination address + * and the arrival interface for a unicast packet (the RFC 1122 + * strong ES model) if IP forwarding is disabled and the packet + * is not locally generated and the packet is not subject to + * 'ipfw fwd'. + * + * XXX - Checking also should be disabled if the destination + * address is ipnat'ed to a different interface. + * + * XXX - Checking is incompatible with IP aliases added + * to the loopback interface instead of the interface where + * the packets are received. + */ + checkif = ip_checkinterface && (ipforwarding == 0) && + m->m_pkthdr.rcvif != NULL && + ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && + (args.next_hop == NULL); + + /* + * Check for exact addresses in the hash bucket. + */ + LIST_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) { + /* + * If the address matches, verify that the packet + * arrived via the correct interface if checking is + * enabled. + */ + if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && + (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) + goto ours; + } + /* + * Check for broadcast addresses. 
+ * + * Only accept broadcast packets that arrive via the matching + * interface. Reception of forwarded directed broadcasts would + * be handled via ip_forward() and ether_output() with the loopback + * into the stack for SIMPLEX interfaces handled by ether_output(). + */ + if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) { + TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + pkt_dst.s_addr) + goto ours; + if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) + goto ours; +#ifdef BOOTP_COMPAT + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) + goto ours; +#endif + } + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + if (ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + */ + if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + + /* + * The process-level routing daemon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + ipstat.ips_forward++; + } + /* + * See if we belong to the destination multicast group on the + * arrival interface. + */ + IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + if (inm == NULL) { + ipstat.ips_notmember++; + m_freem(m); + return; + } + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + + /* + * FAITH(Firewall Aided Internet Translator) + */ + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { + if (ip_keepfaith) { + if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) + goto ours; + } + m_freem(m); + return; + } + + /* + * Not for us; forward if possible and desirable. + */ + if (ipforwarding == 0) { + ipstat.ips_cantforward++; + m_freem(m); + } else { +#ifdef IPSEC + /* + * Enforce inbound IPsec SPD. + */ + if (ipsec4_in_reject(m, NULL)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /* IPSEC */ + ip_forward(m, 0, args.next_hop); + } + return; + +ours: +#ifdef IPSTEALTH + /* + * IPSTEALTH: Process non-routing options only + * if the packet is destined for us. + */ + if (ipstealth && hlen > sizeof (struct ip) && + ip_dooptions(m, 1, args.next_hop)) + return; +#endif /* IPSTEALTH */ + + /* Count the packet in the ip address stats */ + if (ia != NULL) { + ia->ia_ifa.if_ipackets++; + ia->ia_ifa.if_ibytes += m->m_pkthdr.len; + } + + /* + * If offset or IP_MF are set, must reassemble. + * Otherwise, nothing need be done. + * (We could look in the reassembly queue to see + * if the packet was previously fragmented, + * but it's not worth the time; just let them time out.) + */ + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { + + sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); + /* + * Look for queue of fragments + * of this datagram. 
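+ * The bucket is picked by IPREASS_HASH(ip_src, ip_id); within it a
+ * fragment joins a queue only if id, source, destination and protocol
+ * all match.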
+ */ + TAILQ_FOREACH(fp, &ipq[sum], ipq_list) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) + goto found; + + fp = 0; + + /* check if there's a place for the new queue */ + if (nipq > maxnipq) { + /* + * drop something from the tail of the current queue + * before proceeding further + */ + struct ipq *q = TAILQ_LAST(&ipq[sum], ipqhead); + if (q == NULL) { /* gak */ + for (i = 0; i < IPREASS_NHASH; i++) { + struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); + if (r) { + ip_freef(&ipq[i], r); + break; + } + } + } else + ip_freef(&ipq[sum], q); + } +found: + /* + * Adjust ip_len to not reflect header, + * convert offset of this to bytes. + */ + ip->ip_len -= hlen; + if (ip->ip_off & IP_MF) { + /* + * Make sure that fragments have a data length + * that's a non-zero multiple of 8 bytes. + */ + if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { + ipstat.ips_toosmall++; /* XXX */ + goto bad; + } + m->m_flags |= M_FRAG; + } + ip->ip_off <<= 3; + + /* + * Attempt reassembly; if it succeeds, proceed. + * ip_reass() will return a different mbuf, and update + * the divert info in divert_info and args.divert_rule. + */ + ipstat.ips_fragments++; + m->m_pkthdr.header = ip; + m = ip_reass(m, + &ipq[sum], fp, &divert_info, &args.divert_rule); + if (m == 0) + return; + ipstat.ips_reassembled++; + ip = mtod(m, struct ip *); + /* Get the header length of the reassembled packet */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#ifdef IPDIVERT + /* Restore original checksum before diverting packet */ + if (divert_info != 0) { + ip->ip_len += hlen; + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(m, hlen); + ip->ip_off = ntohs(ip->ip_off); + ip->ip_len = ntohs(ip->ip_len); + ip->ip_len -= hlen; + } +#endif + } else + ip->ip_len -= hlen; + +#ifdef IPDIVERT + /* + * Divert or tee packet to the divert protocol if required. + */ + if (divert_info != 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* Restore packet header fields to original values */ + ip->ip_len += hlen; + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + + /* Deliver packet to divert input routine */ + divert_packet(m, 1, divert_info & 0xffff, args.divert_rule); + ipstat.ips_delivered++; + + /* If 'tee', continue with original packet */ + if (clone == NULL) + return; + m = clone; + ip = mtod(m, struct ip *); + ip->ip_len += hlen; + /* + * Jump backwards to complete processing of the + * packet. But first clear divert_info to avoid + * entering this block again. + * We do not need to clear args.divert_rule + * or args.next_hop as they will not be used. + */ + divert_info = 0; + goto pass; + } +#endif + +#ifdef IPSEC + /* + * enforce IPsec policy checking if we are seeing last header. + * note that we do not visit this with protocols with pcb layer + * code - like udp/tcp/raw ip. + */ + if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 && + ipsec4_in_reject(m, NULL)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif + + /* + * Switch out to protocol's input routine. 
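+ * The handler is inetsw[ip_protox[ip->ip_p]].pr_input; for TCP with a
+ * pending 'ipfw fwd' next hop a fake MT_TAG mbuf header is prepended so
+ * the forwarding info survives into the TCP input path.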
+ */ + ipstat.ips_delivered++; + if (args.next_hop && ip->ip_p == IPPROTO_TCP) { + /* TCP needs IPFORWARD info if available */ + struct m_hdr tag; + + tag.mh_type = MT_TAG; + tag.mh_flags = PACKET_TAG_IPFORWARD; + tag.mh_data = (caddr_t)args.next_hop; + tag.mh_next = m; + + (*inetsw[ip_protox[ip->ip_p]].pr_input)( + (struct mbuf *)&tag, hlen); + } else + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + return; +bad: + m_freem(m); +} + +/* + * IP software interrupt routine - to go away sometime soon + */ +static void +ipintr(void) +{ + struct mbuf *m; + + while (1) { + IF_DEQUEUE(&ipintrq, m); + if (m == 0) + return; + ip_input(m); + } +} + +/* + * Take incoming datagram fragment and try to reassemble it into + * whole datagram. If a chain for reassembly of this datagram already + * exists, then it is given as fp; otherwise have to make a chain. + * + * When IPDIVERT enabled, keep additional state with each packet that + * tells us if we need to divert or tee the packet we're building. + * In particular, *divinfo includes the port and TEE flag, + * *divert_rule is the number of the matching rule. + */ + +static struct mbuf * +ip_reass(struct mbuf *m, struct ipqhead *head, struct ipq *fp, + u_int32_t *divinfo, u_int16_t *divert_rule) +{ + struct ip *ip = mtod(m, struct ip *); + register struct mbuf *p, *q, *nq; + struct mbuf *t; + int hlen = IP_VHL_HL(ip->ip_vhl) << 2; + int i, next; + + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == 0) { + /* + * Enforce upper bound on number of fragmented packets + * for which we attempt reassembly; + * If maxfrag is 0, never accept fragments. + * If maxfrag is -1, accept all fragments without limitation. + */ + if ((ip_maxfragpackets >= 0) && (ip_nfragpackets >= ip_maxfragpackets)) + goto dropfrag; + ip_nfragpackets++; + if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) + goto dropfrag; + fp = mtod(t, struct ipq *); + TAILQ_INSERT_HEAD(head, fp, ipq_list); + nipq++; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_src = ip->ip_src; + fp->ipq_dst = ip->ip_dst; + fp->ipq_frags = m; + m->m_nextpkt = NULL; +#ifdef IPDIVERT + fp->ipq_div_info = 0; + fp->ipq_div_cookie = 0; +#endif + goto inserted; + } + +#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) + + /* + * Find a segment which begins after this one does. + */ + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) + if (GETIP(q)->ip_off > ip->ip_off) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us, otherwise + * stick new segment in the proper place. + * + * If some of the data is dropped from the the preceding + * segment, then it's checksum is invalidated. + */ + if (p) { + i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; + if (i > 0) { + if (i >= ip->ip_len) + goto dropfrag; + m_adj(m, i); + m->m_pkthdr.csum_flags = 0; + ip->ip_off += i; + ip->ip_len -= i; + } + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } else { + m->m_nextpkt = fp->ipq_frags; + fp->ipq_frags = m; + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. 
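+ *
+ * Illustration: if this fragment covers bytes 0-15 and a queued one
+ * starts at offset 8 with 16 bytes of data, the queued fragment is
+ * trimmed by 8 bytes (its offset becomes 16); a queued fragment the new
+ * one covers completely is dequeued and freed instead.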
+ */ + for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; + q = nq) { + i = (ip->ip_off + ip->ip_len) - + GETIP(q)->ip_off; + if (i < GETIP(q)->ip_len) { + GETIP(q)->ip_len -= i; + GETIP(q)->ip_off += i; + m_adj(q, i); + q->m_pkthdr.csum_flags = 0; + break; + } + nq = q->m_nextpkt; + m->m_nextpkt = nq; + m_freem(q); + } + +inserted: + +#ifdef IPDIVERT + /* + * Transfer firewall instructions to the fragment structure. + * Only trust info in the fragment at offset 0. + */ + if (ip->ip_off == 0) { + fp->ipq_div_info = *divinfo; + fp->ipq_div_cookie = *divert_rule; + } + *divinfo = 0; + *divert_rule = 0; +#endif + + /* + * Check for complete reassembly. + */ + next = 0; + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { + if (GETIP(q)->ip_off != next) + return (0); + next += GETIP(q)->ip_len; + } + /* Make sure the last packet didn't have the IP_MF flag */ + if (p->m_flags & M_FRAG) + return (0); + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ + q = fp->ipq_frags; + ip = GETIP(q); + if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { + ipstat.ips_toolong++; + ip_freef(head, fp); + return (0); + } + + /* + * Concatenate fragments. + */ + m = q; + t = m->m_next; + m->m_next = 0; + m_cat(m, t); + nq = q->m_nextpkt; + q->m_nextpkt = 0; + for (q = nq; q != NULL; q = nq) { + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; + m_cat(m, q); + } + +#ifdef IPDIVERT + /* + * Extract firewall instructions from the fragment structure. + */ + *divinfo = fp->ipq_div_info; + *divert_rule = fp->ipq_div_cookie; +#endif + + /* + * Create header for new ip packet by + * modifying header of first packet; + * dequeue and discard fragment reassembly header. + * Make header visible. + */ + ip->ip_len = next; + ip->ip_src = fp->ipq_src; + ip->ip_dst = fp->ipq_dst; + TAILQ_REMOVE(head, fp, ipq_list); + nipq--; + (void) m_free(dtom(fp)); + ip_nfragpackets--; + m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); + m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ + register int plen = 0; + for (t = m; t; t = t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + return (m); + +dropfrag: +#ifdef IPDIVERT + *divinfo = 0; + *divert_rule = 0; +#endif + ipstat.ips_fragdropped++; + m_freem(m); + return (0); + +#undef GETIP +} + +/* + * Free a fragment reassembly header and all + * associated datagrams. + */ +static void +ip_freef(fhp, fp) + struct ipqhead *fhp; + struct ipq *fp; +{ + register struct mbuf *q; + + while (fp->ipq_frags) { + q = fp->ipq_frags; + fp->ipq_frags = q->m_nextpkt; + m_freem(q); + } + TAILQ_REMOVE(fhp, fp, ipq_list); + (void) m_free(dtom(fp)); + ip_nfragpackets--; + nipq--; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. + */ +void +ip_slowtimo() +{ + register struct ipq *fp; + int s = splnet(); + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + for(fp = TAILQ_FIRST(&ipq[i]); fp;) { + struct ipq *fpp; + + fpp = fp; + fp = TAILQ_NEXT(fp, ipq_list); + if(--fpp->ipq_ttl == 0) { + ipstat.ips_fragtimeout++; + ip_freef(&ipq[i], fpp); + } + } + } + /* + * If we are over the maximum number of fragments + * (due to the limit being lowered), drain off + * enough to get down to the new limit. 
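+ * Whole reassembly queues are freed from each hash bucket until
+ * ip_nfragpackets is back under ip_maxfragpackets.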
+ */ + for (i = 0; i < IPREASS_NHASH; i++) { + if (ip_maxfragpackets >= 0) { + while (ip_nfragpackets > ip_maxfragpackets && + !TAILQ_EMPTY(&ipq[i])) { + ipstat.ips_fragdropped++; + ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + } + } + } + ipflow_slowtimo(); + splx(s); +} + +/* + * Drain off all datagram fragments. + */ +void +ip_drain() +{ + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + while(!TAILQ_EMPTY(&ipq[i])) { + ipstat.ips_fragdropped++; + ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + } + } + in_rtqdrain(); +} + +/* + * Do option processing on a datagram, + * possibly discarding it if bad options are encountered, + * or forwarding it if source-routed. + * The pass argument is used when operating in the IPSTEALTH + * mode to tell what options to process: + * [LS]SRR (pass 0) or the others (pass 1). + * The reason for as many as two passes is that when doing IPSTEALTH, + * non-routing options should be processed only if the packet is for us. + * Returns 1 if packet has been forwarded/freed, + * 0 if the packet should be processed further. + */ +static int +ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop) +{ + struct ip *ip = mtod(m, struct ip *); + u_char *cp; + struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + n_time ntime; + struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. + * Find interface with current destination address. + * If none on this machine then drop if strictly routed, + * or do nothing if loosely routed. + * Record interface address and bring up next address + * component. If strictly routed make sure next + * address is on directly accessible net. + */ + case IPOPT_LSRR: + case IPOPT_SSRR: +#ifdef IPSTEALTH + if (ipstealth && pass > 0) + break; +#endif + if (optlen < IPOPT_OFFSET + sizeof(*cp)) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + ia = (struct in_ifaddr *) + ifa_ifwithaddr((struct sockaddr *)&ipaddr); + if (ia == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + if (!ip_dosourceroute) + goto nosourcerouting; + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. 
+ */ + if (!ip_acceptsourceroute) + goto nosourcerouting; + save_rte(cp, ip->ip_src); + break; + } +#ifdef IPSTEALTH + if (ipstealth) + goto dropit; +#endif + if (!ip_dosourceroute) { + if (ipforwarding) { + char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + /* + * Acting as a router, so generate ICMP + */ +nosourcerouting: + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_WARNING, + "attempted source route from %s to %s\n", + inet_ntoa(ip->ip_src), buf); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } else { + /* + * Not acting as a router, so silently drop. + */ +#ifdef IPSTEALTH +dropit: +#endif + ipstat.ips_cantforward++; + m_freem(m); + return (1); + } + } + + /* + * locate outgoing interface + */ + (void)memcpy(&ipaddr.sin_addr, cp + off, + sizeof(ipaddr.sin_addr)); + + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } else + ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt); + if (ia == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: +#ifdef IPSTEALTH + if (ipstealth && pass == 0) + break; +#endif + if (optlen < IPOPT_OFFSET + sizeof(*cp)) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) + break; + (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, + sizeof(ipaddr.sin_addr)); + /* + * locate outgoing interface; if we're the destination, + * use the incoming interface (should be same). 
+ */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && + (ia = ip_rtaddr(ipaddr.sin_addr, + &ipforward_rt)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: +#ifdef IPSTEALTH + if (ipstealth && pass == 0) + break; +#endif + code = cp - (u_char *)ip; + if (optlen < 4 || optlen > 40) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < 5) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if (off > optlen - (int)sizeof(int32_t)) { + cp[IPOPT_OFFSET + 1] += (1 << 4); + if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + break; + } + off--; /* 0 origin */ + sin = (struct in_addr *)(cp + off); + switch (cp[IPOPT_OFFSET + 1] & 0x0f) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (off + sizeof(n_time) + + sizeof(struct in_addr) > optlen) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == 0) + continue; + (void)memcpy(sin, &IA_SIN(ia)->sin_addr, + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (off + sizeof(n_time) + + sizeof(struct in_addr) > optlen) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + (void)memcpy(&ipaddr.sin_addr, sin, + sizeof(struct in_addr)); + if (ifa_ifwithaddr((SA)&ipaddr) == 0) + continue; + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + default: + code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; + goto bad; + } + ntime = iptime(); + (void)memcpy(cp + off, &ntime, sizeof(n_time)); + cp[IPOPT_OFFSET] += sizeof(n_time); + } + } + if (forward && ipforwarding) { + ip_forward(m, 1, next_hop); + return (1); + } + return (0); +bad: + icmp_error(m, type, code, 0, 0); + ipstat.ips_badoptions++; + return (1); +} + +/* + * Given address of next destination (final or next hop), + * return internet address info of interface to be used to get there. + */ +struct in_ifaddr * +ip_rtaddr(dst, rt) + struct in_addr dst; + struct route *rt; +{ + register struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&rt->ro_dst; + + if (rt->ro_rt == 0 || + !(rt->ro_rt->rt_flags & RTF_UP) || + dst.s_addr != sin->sin_addr.s_addr) { + if (rt->ro_rt) { + RTFREE(rt->ro_rt); + rt->ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + + rtalloc_ign(rt, RTF_PRCLONING); + } + if (rt->ro_rt == 0) + return ((struct in_ifaddr *)0); + return (ifatoia(rt->ro_rt->rt_ifa)); +} + +/* + * Save incoming source route for use in replies, + * to be picked up later by ip_srcroute if the receiver is interested. + */ +void +save_rte(option, dst) + u_char *option; + struct in_addr dst; +{ + unsigned olen; + + olen = option[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("save_rte: olen %d\n", olen); +#endif + if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) + return; + bcopy(option, ip_srcrt.srcopt, olen); + ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + ip_srcrt.dst = dst; +} + +/* + * Retrieve incoming source route for use in replies, + * in the same form used by setsockopt. + * The first hop is placed before the options, will be removed later. 
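+ *
+ * Resulting mbuf layout: the first hop address, then the NOP pad and the
+ * rebuilt option header, then the remaining hops in reverse order,
+ * ending with the recorded destination (the original packet's source).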
+ */ +struct mbuf * +ip_srcroute() +{ + register struct in_addr *p, *q; + register struct mbuf *m; + + if (ip_nhops == 0) + return ((struct mbuf *)0); + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == 0) + return ((struct mbuf *)0); + +#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) + + /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ + m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + + OPTSIZ; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); +#endif + + /* + * First save first hop for return route + */ + p = &ip_srcrt.route[ip_nhops - 1]; + *(mtod(m, struct in_addr *)) = *p--; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); +#endif + + /* + * Copy option fields and padding (nop) to mbuf. + */ + ip_srcrt.nop = IPOPT_NOP; + ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; + (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), + &ip_srcrt.nop, OPTSIZ); + q = (struct in_addr *)(mtod(m, caddr_t) + + sizeof(struct in_addr) + OPTSIZ); +#undef OPTSIZ + /* + * Record return path as an IP source route, + * reversing the path (pointers are now aligned). + */ + while (p >= ip_srcrt.route) { +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx", (u_long)ntohl(q->s_addr)); +#endif + *q++ = *p--; + } + /* + * Last hop goes to final destination. + */ + *q = ip_srcrt.dst; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx\n", (u_long)ntohl(q->s_addr)); +#endif + return (m); +} + +/* + * Strip out IP options, at higher + * level protocol in the kernel. + * Second argument is buffer to which options + * will be moved, and return value is their length. + * XXX should be deleted; last arg currently ignored. + */ +void +ip_stripoptions(m, mopt) + register struct mbuf *m; + struct mbuf *mopt; +{ + register int i; + struct ip *ip = mtod(m, struct ip *); + register caddr_t opts; + int olen; + + olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + opts = (caddr_t)(ip + 1); + i = m->m_len - (sizeof (struct ip) + olen); + bcopy(opts + olen, opts, (unsigned)i); + m->m_len -= olen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= olen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); +} + +u_char inetctlerrmap[PRC_NCMDS] = { + 0, 0, 0, 0, + 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, + EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, EHOSTUNREACH, 0, 0, + 0, 0, 0, 0, + ENOPROTOOPT, ECONNREFUSED +}; + +/* + * Forward a packet. If some error occurs return the sender + * an icmp packet. Note we can't always generate a meaningful + * icmp message because icmp doesn't have a large enough repertoire + * of codes and types. + * + * If not forwarding, just drop the packet. This could be confusing + * if ipforwarding was zero but some routing protocol was advancing + * us as a gateway to somewhere. However, we must let the routing + * protocol deal with that. + * + * The srcrt parameter indicates whether the packet is being forwarded + * via a source route. + */ +static void +ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) +{ + struct ip *ip = mtod(m, struct ip *); + struct rtentry *rt; + int error, type = 0, code = 0; + struct mbuf *mcopy; + n_long dest; + struct in_addr pkt_dst; + struct ifnet *destifp; +#ifdef IPSEC + struct ifnet dummyifp; +#endif + + dest = 0; + /* + * Cache the destination address of the packet; this may be + * changed by use of 'ipfw fwd'. + */ + pkt_dst = next_hop ? 
next_hop->sin_addr : ip->ip_dst; + +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("forward: src %lx dst %lx ttl %x\n", + (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr, + ip->ip_ttl); +#endif + + + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } +#ifdef IPSTEALTH + if (!ipstealth) { +#endif + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, + dest, 0); + return; + } +#ifdef IPSTEALTH + } +#endif + + if (ip_rtaddr(pkt_dst, &ipforward_rt) == 0) { + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); + return; + } else + rt = ipforward_rt.ro_rt; + + /* + * Save the IP header and at most 8 bytes of the payload, + * in case we need to generate an ICMP message to the src. + * + * XXX this can be optimized a lot by saving the data in a local + * buffer on the stack (72 bytes at most), and only allocating the + * mbuf if really necessary. The vast majority of the packets + * are forwarded without having to send an ICMP back (either + * because unnecessary, or because rate limited), so we are + * really we are wasting a lot of work here. + * + * We don't use m_copy() because it might return a reference + * to a shared cluster. Both this function and ip_output() + * assume exclusive access to the IP header in `m', so any + * data in a cluster may change before we reach icmp_error(). + */ + MGET(mcopy, M_DONTWAIT, m->m_type); + if (mcopy != NULL) { + M_COPY_PKTHDR(mcopy, m); + mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, + (int)ip->ip_len); + m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); + } + +#ifdef IPSTEALTH + if (!ipstealth) { +#endif + ip->ip_ttl -= IPTTLDEC; +#ifdef IPSTEALTH + } +#endif + + /* + * If forwarding packet using same interface that it came in on, + * perhaps should send a redirect to sender to shortcut a hop. + * Only send redirect if source is sending directly to us, + * and if packet was not source routed (or has any options). + * Also, don't send redirect if forwarding using a default route + * or a route modified by a redirect. 
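+ * A default route is recognized by its all-zeroes key, hence the
+ * rt_key() sin_addr != 0 test below; per the router requirements only
+ * host redirects (ICMP_REDIRECT_HOST) are generated.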
+ */ + if (rt->rt_ifp == m->m_pkthdr.rcvif && + (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && + satosin(rt_key(rt))->sin_addr.s_addr != 0 && + ipsendredirects && !srcrt && !next_hop) { +#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + u_long src = ntohl(ip->ip_src.s_addr); + + if (RTA(rt) && + (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = pkt_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_long)dest); +#endif + } + } + + error = ip_output(m, (struct mbuf *)0, &ipforward_rt, + IP_FORWARDING, 0); + if (error) + ipstat.ips_cantforward++; + else { + ipstat.ips_forward++; + if (type) + ipstat.ips_redirectsent++; + else { + if (mcopy) { + ipflow_create(&ipforward_rt, mcopy); + m_freem(mcopy); + } + return; + } + } + if (mcopy == NULL) + return; + destifp = NULL; + + switch (error) { + + case 0: /* forwarded, but need redirect */ + /* type, code set above */ + break; + + case ENETUNREACH: /* shouldn't happen, checked above */ + case EHOSTUNREACH: + case ENETDOWN: + case EHOSTDOWN: + default: + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + break; + + case EMSGSIZE: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; +#ifndef IPSEC + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; +#else + /* + * If the packet is routed over IPsec tunnel, tell the + * originator the tunnel MTU. + * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz + * XXX quickhack!!! + */ + if (ipforward_rt.ro_rt) { + struct secpolicy *sp = NULL; + int ipsecerror; + int ipsechdr; + struct route *ro; + + sp = ipsec4_getpolicybyaddr(mcopy, + IPSEC_DIR_OUTBOUND, + IP_FORWARDING, + &ipsecerror); + + if (sp == NULL) + destifp = ipforward_rt.ro_rt->rt_ifp; + else { + /* count IPsec header size */ + ipsechdr = ipsec4_hdrsiz(mcopy, + IPSEC_DIR_OUTBOUND, + NULL); + + /* + * find the correct route for outer IPv4 + * header, compute tunnel MTU. + * + * XXX BUG ALERT + * The "dummyifp" code relies upon the fact + * that icmp_error() touches only ifp->if_mtu. + */ + /*XXX*/ + destifp = NULL; + if (sp->req != NULL + && sp->req->sav != NULL + && sp->req->sav->sah != NULL) { + ro = &sp->req->sav->sah->sa_route; + if (ro->ro_rt && ro->ro_rt->rt_ifp) { + dummyifp.if_mtu = + ro->ro_rt->rt_ifp->if_mtu; + dummyifp.if_mtu -= ipsechdr; + destifp = &dummyifp; + } + } + + key_freesp(sp); + } + } +#endif /*IPSEC*/ + ipstat.ips_cantfrag++; + break; + + case ENOBUFS: + type = ICMP_SOURCEQUENCH; + code = 0; + break; + + case EACCES: /* ipfw denied packet */ + m_freem(mcopy); + return; + } + icmp_error(mcopy, type, code, dest, destifp); +} + +void +ip_savecontrol(inp, mp, ip, m) + register struct inpcb *inp; + register struct mbuf **mp; + register struct ip *ip; + register struct mbuf *m; +{ + if (inp->inp_socket->so_options & SO_TIMESTAMP) { + struct timeval tv; + + microtime(&tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* XXX + * Moving these out of udp_input() made them even more broken + * than they already were. 
+ */ + /* options were tossed already */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = sbcreatecontrol((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + if (inp->inp_flags & INP_RECVIF) { + struct ifnet *ifp; + struct sdlbuf { + struct sockaddr_dl sdl; + u_char pad[32]; + } sdlbuf; + struct sockaddr_dl *sdp; + struct sockaddr_dl *sdl2 = &sdlbuf.sdl; + + if (((ifp = m->m_pkthdr.rcvif)) + && ( ifp->if_index && (ifp->if_index <= if_index))) { + sdp = (struct sockaddr_dl *) + (ifaddr_byindex(ifp->if_index)->ifa_addr); + /* + * Change our mind and don't try copy. + */ + if ((sdp->sdl_family != AF_LINK) + || (sdp->sdl_len > sizeof(sdlbuf))) { + goto makedummy; + } + bcopy(sdp, sdl2, sdp->sdl_len); + } else { +makedummy: + sdl2->sdl_len + = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl2->sdl_family = AF_LINK; + sdl2->sdl_index = 0; + sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; + } + *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +} + +/* + * XXX these routines are called from the upper part of the kernel. + * They need to be locked when we remove Giant. + * + * They could also be moved to ip_mroute.c, since all the RSVP + * handling is done there already. + */ +static int ip_rsvp_on; +struct socket *ip_rsvpd; +int +ip_rsvp_init(struct socket *so) +{ + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + if (ip_rsvpd != NULL) + return EADDRINUSE; + + ip_rsvpd = so; + /* + * This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!ip_rsvp_on) { + ip_rsvp_on = 1; + rsvp_on++; + } + + return 0; +} + +int +ip_rsvp_done(void) +{ + ip_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (ip_rsvp_on) { + ip_rsvp_on = 0; + rsvp_on--; + } + return 0; +} diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c new file mode 100644 index 0000000..668038e --- /dev/null +++ b/sys/netinet/ip_mroute.c @@ -0,0 +1,2257 @@ +/* + * IP multicast forwarding procedures + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 + * Modified by Van Jacobson, LBL, January 1993 + * Modified by Ajit Thyagarajan, PARC, August 1993 + * Modified by Bill Fenner, PARC, April 1995 + * + * MROUTING Revision: 3.5 + * $FreeBSD$ + */ + +#include "opt_mrouting.h" +#include "opt_random_ip_id.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockio.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/igmp.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_encap.h> +#include <netinet/ip_mroute.h> +#include <netinet/ip_var.h> +#include <netinet/udp.h> +#include <machine/in_cksum.h> + +#ifndef MROUTING +extern u_long _ip_mcast_src(int vifi); +extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, + struct ip_moptions *imo); +extern int _ip_mrouter_done(void); +extern int _ip_mrouter_get(struct socket *so, struct sockopt *sopt); +extern int _ip_mrouter_set(struct socket *so, struct sockopt *sopt); +extern int _mrt_ioctl(int req, caddr_t data); + +/* + * Dummy routines and globals used when multicast routing is not compiled in. + */ + +struct socket *ip_mrouter = NULL; +u_int rsvpdebug = 0; + +int +_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set; + + +int +_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get; + +int +_ip_mrouter_done() +{ + return(0); +} + +int (*ip_mrouter_done)(void) = _ip_mrouter_done; + +int +_ip_mforward(ip, ifp, m, imo) + struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + return(0); +} + +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = _ip_mforward; + +int +_mrt_ioctl(int req, caddr_t data) +{ + return EOPNOTSUPP; +} + +int (*mrt_ioctl)(int, caddr_t) = _mrt_ioctl; + +void +rsvp_input(m, off) /* XXX must fixup manually */ + struct mbuf *m; + int off; +{ + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, off); + return; + } + /* Drop the packet */ + m_freem(m); +} + +int (*legal_vif_num)(int) = 0; + +/* + * This should never be called, since IP_MULTICAST_VIF should fail, but + * just in case it does get called, the code a little lower in ip_output + * will assign the packet a local address. 
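+ * Like the other stubs above it is only reached through the
+ * ip_mcast_src function pointer, so a multicast routing KLD can later
+ * hook in the real implementation without touching any callers.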
+ */ +u_long +_ip_mcast_src(int vifi) { return INADDR_ANY; } +u_long (*ip_mcast_src)(int) = _ip_mcast_src; + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + return; +} + +#else /* MROUTING */ + +#define M_HASCL(m) ((m)->m_flags & M_EXT) + +static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); + +#ifndef MROUTE_KLD +/* The socket used to communicate with the multicast routing daemon. */ +struct socket *ip_mrouter = NULL; +#endif + +#if defined(MROUTING) || defined(MROUTE_KLD) +static struct mrtstat mrtstat; +SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, + &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); +#endif + +static struct mfc *mfctable[MFCTBLSIZ]; +static u_char nexpire[MFCTBLSIZ]; +static struct vif viftable[MAXVIFS]; +static u_int mrtdebug = 0; /* debug level */ +#define DEBUG_MFC 0x02 +#define DEBUG_FORWARD 0x04 +#define DEBUG_EXPIRE 0x08 +#define DEBUG_XMIT 0x10 +static u_int tbfdebug = 0; /* tbf debug level */ +static u_int rsvpdebug = 0; /* rsvp debug level */ + +static struct callout_handle expire_upcalls_ch; + +#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ +#define UPCALL_EXPIRE 6 /* number of timeouts */ + +/* + * Define the token bucket filter structures + * tbftable -> each vif has one of these for storing info + */ + +static struct tbf tbftable[MAXVIFS]; +#define TBF_REPROCESS (hz / 100) /* 100x / second */ + +/* + * 'Interfaces' associated with decapsulator (so we can tell + * packets that went through it from ones that get reflected + * by a broken gateway). These interfaces are never linked into + * the system ifnet list & no routes point to them. I.e., packets + * can't be sent this way. They only exist as a placeholder for + * multicast source verification. + */ +static struct ifnet multicast_decap_if[MAXVIFS]; + +#define ENCAP_TTL 64 +#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ + +/* prototype IP hdr for encapsulated packets */ +static struct ip multicast_encap_iphdr = { +#if BYTE_ORDER == LITTLE_ENDIAN + sizeof(struct ip) >> 2, IPVERSION, +#else + IPVERSION, sizeof(struct ip) >> 2, +#endif + 0, /* tos */ + sizeof(struct ip), /* total length */ + 0, /* id */ + 0, /* frag offset */ + ENCAP_TTL, ENCAP_PROTO, + 0, /* checksum */ +}; + +/* + * Private variables. + */ +static vifi_t numvifs = 0; +static const struct encaptab *encap_cookie = NULL; + +/* + * one-back cache used by mroute_encapcheck to locate a tunnel's vif + * given a datagram's src ip address. 
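+ * A hit lets mroute_encapcheck() skip rescanning the vif table for each
+ * packet of a burst, which usually comes from a single tunnel peer; on
+ * a miss it falls back to a linear search of viftable and refreshes
+ * this cache.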
+ */ +static u_long last_encap_src; +static struct vif *last_encap_vif; + +static u_long X_ip_mcast_src(int vifi); +static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); +static int X_ip_mrouter_done(void); +static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); +static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); +static int X_legal_vif_num(int vif); +static int X_mrt_ioctl(int cmd, caddr_t data); + +static int get_sg_cnt(struct sioc_sg_req *); +static int get_vif_cnt(struct sioc_vif_req *); +static int ip_mrouter_init(struct socket *, int); +static int add_vif(struct vifctl *); +static int del_vif(vifi_t); +static int add_mfc(struct mfcctl *); +static int del_mfc(struct mfcctl *); +static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); +static int set_assert(int); +static void expire_upcalls(void *); +static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, + vifi_t); +static void phyint_send(struct ip *, struct vif *, struct mbuf *); +static void encap_send(struct ip *, struct vif *, struct mbuf *); +static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); +static void tbf_queue(struct vif *, struct mbuf *); +static void tbf_process_q(struct vif *); +static void tbf_reprocess_q(void *); +static int tbf_dq_sel(struct vif *, struct ip *); +static void tbf_send_packet(struct vif *, struct mbuf *); +static void tbf_update_tokens(struct vif *); +static int priority(struct vif *, struct ip *); + +/* + * whether or not special PIM assert processing is enabled. + */ +static int pim_assert; +/* + * Rate limit for assert notification messages, in usec + */ +#define ASSERT_MSG_TIME 3000000 + +/* + * Hash function for a source, group entry + */ +#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ + ((g) >> 20) ^ ((g) >> 10) ^ (g)) + +/* + * Find a route for a given origin IP address and Multicast group address + * Type of service parameter to be added in the future!!! + */ + +#define MFCFIND(o, g, rt) { \ + register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \ + rt = NULL; \ + ++mrtstat.mrts_mfc_lookups; \ + while (_rt) { \ + if ((_rt->mfc_origin.s_addr == o) && \ + (_rt->mfc_mcastgrp.s_addr == g) && \ + (_rt->mfc_stall == NULL)) { \ + rt = _rt; \ + break; \ + } \ + _rt = _rt->mfc_next; \ + } \ + if (rt == NULL) { \ + ++mrtstat.mrts_mfc_misses; \ + } \ +} + + +/* + * Macros to compute elapsed time efficiently + * Borrowed from Van Jacobson's scheduling code + */ +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a).tv_usec - (b).tv_usec; \ + if ((xxs = (a).tv_sec - (b).tv_sec)) { \ + switch (xxs) { \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + default: \ + delta += (1000000 * xxs); \ + } \ + } \ +} + +#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ + (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) + +#ifdef UPCALL_TIMING +u_long upcall_data[51]; +static void collate(struct timeval *); +#endif /* UPCALL_TIMING */ + + +/* + * Handle MRT setsockopt commands to modify the multicast routing tables. 
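+ * These are normally issued by mrouted over its raw IGMP socket, e.g.
+ * (user-level sketch, the socket name is illustrative only):
+ *
+ *	int v = 1;
+ *	setsockopt(igmp_sock, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
+ *
+ * followed by MRT_ADD_VIF/MRT_ADD_MFC calls as interfaces and
+ * forwarding entries are learned, and MRT_DONE at shutdown.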
+ */ +static int +X_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + vifi_t vifi; + struct vifctl vifc; + struct mfcctl mfc; + + if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) + return (EPERM); + + error = 0; + switch (sopt->sopt_name) { + case MRT_INIT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + error = ip_mrouter_init(so, optval); + break; + + case MRT_DONE: + error = ip_mrouter_done(); + break; + + case MRT_ADD_VIF: + error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); + if (error) + break; + error = add_vif(&vifc); + break; + + case MRT_DEL_VIF: + error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); + if (error) + break; + error = del_vif(vifi); + break; + + case MRT_ADD_MFC: + case MRT_DEL_MFC: + error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc); + if (error) + break; + if (sopt->sopt_name == MRT_ADD_MFC) + error = add_mfc(&mfc); + else + error = del_mfc(&mfc); + break; + + case MRT_ASSERT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + set_assert(optval); + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_KLD +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set; +#endif + +/* + * Handle MRT getsockopt commands + */ +static int +X_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error; + static int version = 0x0305; /* !!! why is this here? XXX */ + + switch (sopt->sopt_name) { + case MRT_VERSION: + error = sooptcopyout(sopt, &version, sizeof version); + break; + + case MRT_ASSERT: + error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_KLD +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; +#endif + +/* + * Handle ioctl commands to obtain information from the cache + */ +static int +X_mrt_ioctl(cmd, data) + int cmd; + caddr_t data; +{ + int error = 0; + + switch (cmd) { + case (SIOCGETVIFCNT): + return (get_vif_cnt((struct sioc_vif_req *)data)); + break; + case (SIOCGETSGCNT): + return (get_sg_cnt((struct sioc_sg_req *)data)); + break; + default: + return (EINVAL); + break; + } + return error; +} + +#ifndef MROUTE_KLD +int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +#endif + +/* + * returns the packet, byte, rpf-failure count for the source group provided + */ +static int +get_sg_cnt(req) + register struct sioc_sg_req *req; +{ + register struct mfc *rt; + int s; + + s = splnet(); + MFCFIND(req->src.s_addr, req->grp.s_addr, rt); + splx(s); + if (rt != NULL) { + req->pktcnt = rt->mfc_pkt_cnt; + req->bytecnt = rt->mfc_byte_cnt; + req->wrong_if = rt->mfc_wrong_if; + } else + req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; + + return 0; +} + +/* + * returns the input and output packet and byte counts on the vif provided + */ +static int +get_vif_cnt(req) + register struct sioc_vif_req *req; +{ + register vifi_t vifi = req->vifi; + + if (vifi >= numvifs) return EINVAL; + + req->icount = viftable[vifi].v_pkt_in; + req->ocount = viftable[vifi].v_pkt_out; + req->ibytes = viftable[vifi].v_bytes_in; + req->obytes = viftable[vifi].v_bytes_out; + + return 0; +} + +/* + * Enable multicast routing + */ +static int +ip_mrouter_init(so, version) + struct socket *so; + int version; +{ + if (mrtdebug) + log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", + so->so_type, 
so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; + + if (version != 1) + return ENOPROTOOPT; + + if (ip_mrouter != NULL) return EADDRINUSE; + + ip_mrouter = so; + + bzero((caddr_t)mfctable, sizeof(mfctable)); + bzero((caddr_t)nexpire, sizeof(nexpire)); + + pim_assert = 0; + + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_init\n"); + + return 0; +} + +/* + * Disable multicast routing + */ +static int +X_ip_mrouter_done() +{ + vifi_t vifi; + int i; + struct ifnet *ifp; + struct ifreq ifr; + struct mfc *rt; + struct rtdetq *rte; + int s; + + s = splnet(); + + /* + * For each phyint in use, disable promiscuous reception of all IP + * multicasts. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_lcl_addr.s_addr != 0 && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr + = INADDR_ANY; + ifp = viftable[vifi].v_ifp; + if_allmulti(ifp, 0); + } + } + bzero((caddr_t)tbftable, sizeof(tbftable)); + bzero((caddr_t)viftable, sizeof(viftable)); + numvifs = 0; + pim_assert = 0; + + untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch); + + /* + * Free all multicast forwarding cache entries. + */ + for (i = 0; i < MFCTBLSIZ; i++) { + for (rt = mfctable[i]; rt != NULL; ) { + struct mfc *nr = rt->mfc_next; + + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + free(rt, M_MRTABLE); + rt = nr; + } + } + + bzero((caddr_t)mfctable, sizeof(mfctable)); + + /* + * Reset de-encapsulation cache + */ + last_encap_src = 0; + last_encap_vif = NULL; + if (encap_cookie) { + encap_detach(encap_cookie); + encap_cookie = NULL; + } + + ip_mrouter = NULL; + + splx(s); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_done\n"); + + return 0; +} + +#ifndef MROUTE_KLD +int (*ip_mrouter_done)(void) = X_ip_mrouter_done; +#endif + +/* + * Set PIM assert processing global + */ +static int +set_assert(i) + int i; +{ + if ((i != 1) && (i != 0)) + return EINVAL; + + pim_assert = i; + + return 0; +} + +/* + * Decide if a packet is from a tunnelled peer. + * Return 0 if not, 64 if so. + */ +static int +mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +{ + struct ip *ip = mtod(m, struct ip *); + int hlen = ip->ip_hl << 2; + register struct vif *vifp; + + /* + * don't claim the packet if it's not to a multicast destination or if + * we don't have an encapsulating tunnel with the source. + * Note: This code assumes that the remote site IP address + * uniquely identifies the tunnel (i.e., that this site has + * at most one tunnel with the remote site). + */ + if (! 
IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { + return 0; + } + if (ip->ip_src.s_addr != last_encap_src) { + register struct vif *vife; + + vifp = viftable; + vife = vifp + numvifs; + last_encap_src = ip->ip_src.s_addr; + last_encap_vif = 0; + for ( ; vifp < vife; ++vifp) + if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { + if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) + == VIFF_TUNNEL) + last_encap_vif = vifp; + break; + } + } + if ((vifp = last_encap_vif) == 0) { + last_encap_src = 0; + return 0; + } + return 64; +} + +/* + * De-encapsulate a packet and feed it back through ip input (this + * routine is called whenever IP gets a packet that mroute_encap_func() + * claimed). + */ +static void +mroute_encap_input(struct mbuf *m, int off) +{ + struct ip *ip = mtod(m, struct ip *); + int hlen = ip->ip_hl << 2; + + if (hlen > sizeof(struct ip)) + ip_stripoptions(m, (struct mbuf *) 0); + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + m->m_pkthdr.len -= sizeof(struct ip); + + m->m_pkthdr.rcvif = last_encap_vif->v_ifp; + + (void) IF_HANDOFF(&ipintrq, m, NULL); + /* + * normally we would need a "schednetisr(NETISR_IP)" + * here but we were called by ip_input and it is going + * to loop back & try to dequeue the packet we just + * queued as soon as we return so we avoid the + * unnecessary software interrrupt. + */ +} + +extern struct domain inetdomain; +static struct protosw mroute_encap_protosw = +{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, + mroute_encap_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}; + +/* + * Add a vif to the vif table + */ +static int +add_vif(vifcp) + register struct vifctl *vifcp; +{ + register struct vif *vifp = viftable + vifcp->vifc_vifi; + static struct sockaddr_in sin = {sizeof sin, AF_INET}; + struct ifaddr *ifa; + struct ifnet *ifp; + int error, s; + struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; + + if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; + if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; + + /* Find the interface with an address in AF_INET family */ + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == 0) return EADDRNOTAVAIL; + ifp = ifa->ifa_ifp; + + if (vifcp->vifc_flags & VIFF_TUNNEL) { + if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { + /* + * An encapsulating tunnel is wanted. Tell + * mroute_encap_input() to start paying attention + * to encapsulated packets. 
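+			 * The encap_attach_func() registration below is a
+			 * wildcard (protocol -1) attach; mroute_encapcheck()
+			 * claims only packets whose inner destination is
+			 * multicast and whose outer source matches one of
+			 * our tunnel vifs, returning 64 rather than 0 when
+			 * it wants the packet.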
+ */ + if (encap_cookie == NULL) { + encap_cookie = encap_attach_func(AF_INET, -1, + mroute_encapcheck, + (struct protosw *)&mroute_encap_protosw, NULL); + + if (encap_cookie == NULL) { + printf("ip_mroute: unable to attach encap\n"); + return (EIO); /* XXX */ + } + for (s = 0; s < MAXVIFS; ++s) { + multicast_decap_if[s].if_name = "mdecap"; + multicast_decap_if[s].if_unit = s; + } + } + /* + * Set interface to fake encapsulator interface + */ + ifp = &multicast_decap_if[vifcp->vifc_vifi]; + /* + * Prepare cached route entry + */ + bzero(&vifp->v_route, sizeof(vifp->v_route)); + } else { + log(LOG_ERR, "source routed tunnels not supported\n"); + return EOPNOTSUPP; + } + } else { + /* Make sure the interface supports multicast */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return EOPNOTSUPP; + + /* Enable promiscuous reception of all IP multicasts from the if */ + s = splnet(); + error = if_allmulti(ifp, 1); + splx(s); + if (error) + return error; + } + + s = splnet(); + /* define parameters for the tbf structure */ + vifp->v_tbf = v_tbf; + GET_TIME(vifp->v_tbf->tbf_last_pkt_t); + vifp->v_tbf->tbf_n_tok = 0; + vifp->v_tbf->tbf_q_len = 0; + vifp->v_tbf->tbf_max_q_len = MAXQSIZE; + vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + vifp->v_ifp = ifp; + /* scaling up here allows division by 1024 in critical code */ + vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; + vifp->v_rsvp_on = 0; + vifp->v_rsvpd = NULL; + /* initialize per vif pkt counters */ + vifp->v_pkt_in = 0; + vifp->v_pkt_out = 0; + vifp->v_bytes_in = 0; + vifp->v_bytes_out = 0; + splx(s); + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; + + if (mrtdebug) + log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", + vifcp->vifc_vifi, + (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), + (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", + (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), + vifcp->vifc_threshold, + vifcp->vifc_rate_limit); + + return 0; +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif(vifi) + vifi_t vifi; +{ + register struct vif *vifp = &viftable[vifi]; + register struct mbuf *m; + struct ifnet *ifp; + struct ifreq ifr; + int s; + + if (vifi >= numvifs) return EINVAL; + if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; + + s = splnet(); + + if (!(vifp->v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; + ifp = vifp->v_ifp; + if_allmulti(ifp, 0); + } + + if (vifp == last_encap_vif) { + last_encap_vif = 0; + last_encap_src = 0; + } + + /* + * Free packets queued at the interface + */ + while (vifp->v_tbf->tbf_q) { + m = vifp->v_tbf->tbf_q; + vifp->v_tbf->tbf_q = m->m_act; + m_freem(m); + } + + bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); + bzero((caddr_t)vifp, sizeof (*vifp)); + + if (mrtdebug) + log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); + + /* Adjust numvifs down */ + for (vifi = numvifs; vifi > 0; vifi--) + if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; + numvifs = vifi; + + splx(s); + + return 0; +} + +/* + * Add an mfc entry + */ +static int +add_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct mfc *rt; + u_long hash; + struct rtdetq *rte; + register u_short nstl; + int s; + int i; + + MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); + + /* If an entry already exists, just update the fields */ + if (rt) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + s = splnet(); + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + splx(s); + return 0; + } + + /* + * Find the entry for which the upcall was made and update + */ + s = splnet(); + hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); + for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) { + + if (nstl++) + log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", + "multiple kernel entries", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + + rt->mfc_expire = 0; /* Don't clean this guy up */ + nexpire[hash]--; + + /* free packets Qed at the end of this entry */ + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + ip_mdq(rte->m, rte->ifp, rt, -1); + m_freem(rte->m); +#ifdef UPCALL_TIMING + collate(&(rte->t)); +#endif /* UPCALL_TIMING */ + free(rte, M_MRTABLE); + rte = n; + } + 
rt->mfc_stall = NULL; + } + } + + /* + * It is possible that an entry is being inserted without an upcall + */ + if (nstl == 0) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", + hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + if (rt->mfc_expire) + nexpire[hash]--; + rt->mfc_expire = 0; + } + } + if (rt == NULL) { + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + rt->mfc_expire = 0; + rt->mfc_stall = NULL; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + } + } + splx(s); + return 0; +} + +#ifdef UPCALL_TIMING +/* + * collect delay statistics on the upcalls + */ +static void collate(t) +register struct timeval *t; +{ + register u_long d; + register struct timeval tp; + register u_long delta; + + GET_TIME(tp); + + if (TV_LT(*t, tp)) + { + TV_DELTA(tp, *t, delta); + + d = delta >> 10; + if (d > 50) + d = 50; + + ++upcall_data[d]; + } +} +#endif /* UPCALL_TIMING */ + +/* + * Delete an mfc entry + */ +static int +del_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct in_addr origin; + struct in_addr mcastgrp; + struct mfc *rt; + struct mfc **nptr; + u_long hash; + int s; + + origin = mfccp->mfcc_origin; + mcastgrp = mfccp->mfcc_mcastgrp; + hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", + (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); + + s = splnet(); + + nptr = &mfctable[hash]; + while ((rt = *nptr) != NULL) { + if (origin.s_addr == rt->mfc_origin.s_addr && + mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && + rt->mfc_stall == NULL) + break; + + nptr = &rt->mfc_next; + } + if (rt == NULL) { + splx(s); + return EADDRNOTAVAIL; + } + + *nptr = rt->mfc_next; + free(rt, M_MRTABLE); + + splx(s); + + return 0; +} + +/* + * Send a message to mrouted on the multicast routing socket + */ +static int +socket_send(s, mm, src) + struct socket *s; + struct mbuf *mm; + struct sockaddr_in *src; +{ + if (s) { + if (sbappendaddr(&s->so_rcv, + (struct sockaddr *)src, + mm, (struct mbuf *)0) != 0) { + sorwakeup(s); + return 0; + } + } + m_freem(mm); + return -1; +} + +/* + * IP multicast forwarding function. 
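+ * It is invoked from the IP input path for arriving multicast datagrams
+ * and from ip_output() for locally originated ones.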
This function assumes that the packet + * pointed to by "ip" has arrived on (or is about to be sent to) the interface + * pointed to by "ifp", and the packet is to be relayed to other networks + * that have members of the packet's destination IP multicast group. + * + * The packet is returned unscathed to the caller, unless it is + * erroneous, in which case a non-zero return value tells the caller to + * discard it. + */ + +#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ + +static int +X_ip_mforward(ip, ifp, m, imo) + register struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + register struct mfc *rt; + register u_char *ipoptions; + static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + static int srctun = 0; + register struct mbuf *mm; + int s; + vifi_t vifi; + struct vif *vifp; + + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", + (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), + (void *)ifp); + + if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || + (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel. + */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported. + */ + if ((srctun++ % 1000) == 0) + log(LOG_ERR, + "ip_mforward: received source-routed packet from %lx\n", + (u_long)ntohl(ip->ip_src.s_addr)); + + return 1; + } + + if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + if (ip->ip_ttl < 255) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + vifp = viftable + vifi; + printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", + (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr), + vifi, + (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", + vifp->v_ifp->if_name, vifp->v_ifp->if_unit); + } + return (ip_mdq(m, ifp, NULL, vifi)); + } + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", + (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr)); + if(!imo) + printf("In fact, no options were specified at all\n"); + } + + /* + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. + */ + if (ip->ip_ttl <= 1 || + ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) + return 0; + + /* + * Determine forwarding vifs from the forwarding cache table + */ + s = splnet(); + MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + splx(s); + return (ip_mdq(m, ifp, rt, -1)); + } else { + /* + * If we don't have a route for packet's origin, + * Make a copy of the packet & + * send message to routing daemon + */ + + register struct mbuf *mb0; + register struct rtdetq *rte; + register u_long hash; + int hlen = ip->ip_hl << 2; +#ifdef UPCALL_TIMING + struct timeval tp; + + GET_TIME(tp); +#endif + + mrtstat.mrts_no_route++; + if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) + log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", + (u_long)ntohl(ip->ip_src.s_addr), + (u_long)ntohl(ip->ip_dst.s_addr)); + + /* + * Allocate mbufs early so that we don't do extra work if we are + * just going to fail anyway. Make sure to pullup the header so + * that other people can't step on it. 
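+	 * A further copy of just the header becomes the IGMPMSG_NOCACHE
+	 * upcall: its IP header is overlaid with a struct igmpmsg and
+	 * queued to mrouted via socket_send(), while mb0 itself waits on
+	 * the entry's mfc_stall queue until add_mfc() resolves it or
+	 * expire_upcalls() times the entry out.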
+ */ + rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); + if (rte == NULL) { + splx(s); + return ENOBUFS; + } + mb0 = m_copy(m, 0, M_COPYALL); + if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_pullup(mb0, hlen); + if (mb0 == NULL) { + free(rte, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* is there an upcall waiting for this packet? */ + hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); + for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { + if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && + (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) + break; + } + + if (rt == NULL) { + int i; + struct igmpmsg *im; + + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return ENOBUFS; + } + /* Make a copy of the header to send to the user level process */ + mm = m_copy(mb0, 0, hlen); + if (mm == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* + * Send message to routing daemon to install + * a route into the kernel table + */ + k_igmpsrc.sin_addr = ip->ip_src; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + + mrtstat.mrts_upcalls++; + + if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); + ++mrtstat.mrts_upq_sockfull; + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + nexpire[hash]++; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = 0; + rt->mfc_parent = -1; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + rt->mfc_stall = rte; + + } else { + /* determine if q has overflowed */ + int npkts = 0; + struct rtdetq **p; + + for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) + npkts++; + + if (npkts > MAX_UPQ) { + mrtstat.mrts_upq_ovflw++; + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return 0; + } + + /* Add this entry to the end of the queue */ + *p = rte; + } + + rte->m = mb0; + rte->ifp = ifp; +#ifdef UPCALL_TIMING + rte->t = tp; +#endif + rte->next = NULL; + + splx(s); + + return 0; + } +} + +#ifndef MROUTE_KLD +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = X_ip_mforward; +#endif + +/* + * Clean up the cache entry if upcall is not serviced + */ +static void +expire_upcalls(void *unused) +{ + struct rtdetq *rte; + struct mfc *mfc, **nptr; + int i; + int s; + + s = splnet(); + for (i = 0; i < MFCTBLSIZ; i++) { + if (nexpire[i] == 0) + continue; + nptr = &mfctable[i]; + for (mfc = *nptr; mfc != NULL; mfc = *nptr) { + /* + * Skip real cache entries + * Make sure it wasn't marked to not expire (shouldn't happen) + * If it expires now + */ + if (mfc->mfc_stall != NULL && + mfc->mfc_expire != 0 && + --mfc->mfc_expire == 0) { + if (mrtdebug & DEBUG_EXPIRE) + log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", + (u_long)ntohl(mfc->mfc_origin.s_addr), + (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); + /* + * drop all the packets + * free the mbuf with the pkt, if, timing info + */ + for (rte = mfc->mfc_stall; rte; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + ++mrtstat.mrts_cache_cleanups; + 
nexpire[i]--; + + *nptr = mfc->mfc_next; + free(mfc, M_MRTABLE); + } else { + nptr = &mfc->mfc_next; + } + } + } + splx(s); + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); +} + +/* + * Packet forwarding routine once entry in the cache is made + */ +static int +ip_mdq(m, ifp, rt, xmt_vif) + register struct mbuf *m; + register struct ifnet *ifp; + register struct mfc *rt; + register vifi_t xmt_vif; +{ + register struct ip *ip = mtod(m, struct ip *); + register vifi_t vifi; + register struct vif *vifp; + register int plen = ip->ip_len; + +/* + * Macro to send packet on vif. Since RSVP packets don't get counted on + * input, they shouldn't get counted on output, so statistics keeping is + * separate. + */ +#define MC_SEND(ip,vifp,m) { \ + if ((vifp)->v_flags & VIFF_TUNNEL) \ + encap_send((ip), (vifp), (m)); \ + else \ + phyint_send((ip), (vifp), (m)); \ +} + + /* + * If xmt_vif is not -1, send on only the requested vif. + * + * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) + */ + if (xmt_vif < numvifs) { + MC_SEND(ip, viftable + xmt_vif, m); + return 1; + } + + /* + * Don't forward if it didn't arrive from the parent vif for its origin. + */ + vifi = rt->mfc_parent; + if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { + /* came in the wrong interface */ + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", + (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); + ++mrtstat.mrts_wrong_if; + ++rt->mfc_wrong_if; + /* + * If we are doing PIM assert processing, and we are forwarding + * packets on this interface, and it is a broadcast medium + * interface (and not a tunnel), send a message to the routing daemon. + */ + if (pim_assert && rt->mfc_ttls[vifi] && + (ifp->if_flags & IFF_BROADCAST) && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + struct sockaddr_in k_igmpsrc; + struct mbuf *mm; + struct igmpmsg *im; + int hlen = ip->ip_hl << 2; + struct timeval now; + register u_long delta; + + GET_TIME(now); + + TV_DELTA(rt->mfc_last_assert, now, delta); + + if (delta > ASSERT_MSG_TIME) { + mm = m_copy(m, 0, hlen); + if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + mm = m_pullup(mm, hlen); + if (mm == NULL) { + return ENOBUFS; + } + + rt->mfc_last_assert = now; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WRONGVIF; + im->im_mbz = 0; + im->im_vif = vifi; + + k_igmpsrc.sin_addr = im->im_src; + + socket_send(ip_mrouter, mm, &k_igmpsrc); + } + } + return 0; + } + + /* If I sourced this packet, it counts as output, else it was input. */ + if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { + viftable[vifi].v_pkt_out++; + viftable[vifi].v_bytes_out += plen; + } else { + viftable[vifi].v_pkt_in++; + viftable[vifi].v_bytes_in += plen; + } + rt->mfc_pkt_cnt++; + rt->mfc_byte_cnt += plen; + + /* + * For each vif, decide if a copy of the packet should be forwarded. + * Forward if: + * - the ttl exceeds the vif's threshold + * - there are group members downstream on interface + */ + for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) + if ((rt->mfc_ttls[vifi] > 0) && + (ip->ip_ttl > rt->mfc_ttls[vifi])) { + vifp->v_pkt_out++; + vifp->v_bytes_out += plen; + MC_SEND(ip, vifp, m); + } + + return 0; +} + +/* + * check if a vif number is legal/ok. 
This is used by ip_output, to export + * numvifs there, + */ +static int +X_legal_vif_num(vif) + int vif; +{ + if (vif >= 0 && vif < numvifs) + return(1); + else + return(0); +} + +#ifndef MROUTE_KLD +int (*legal_vif_num)(int) = X_legal_vif_num; +#endif + +/* + * Return the local address used by this vif + */ +static u_long +X_ip_mcast_src(vifi) + int vifi; +{ + if (vifi >= 0 && vifi < numvifs) + return viftable[vifi].v_lcl_addr.s_addr; + else + return INADDR_ANY; +} + +#ifndef MROUTE_KLD +u_long (*ip_mcast_src)(int) = X_ip_mcast_src; +#endif + +static void +phyint_send(ip, vifp, m) + struct ip *ip; + struct vif *vifp; + struct mbuf *m; +{ + register struct mbuf *mb_copy; + register int hlen = ip->ip_hl << 2; + + /* + * Make a new reference to the packet; make sure that + * the IP header is actually copied, not just referenced, + * so that ip_output() only scribbles on the copy. + */ + mb_copy = m_copy(m, 0, M_COPYALL); + if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_pullup(mb_copy, hlen); + if (mb_copy == NULL) + return; + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); +} + +static void +encap_send(ip, vifp, m) + register struct ip *ip; + register struct vif *vifp; + register struct mbuf *m; +{ + register struct mbuf *mb_copy; + register struct ip *ip_copy; + register int i, len = ip->ip_len; + + /* + * copy the old packet & pullup its IP header into the + * new mbuf so we can modify it. Try to fill the new + * mbuf since if we don't the ethernet driver will. + */ + MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); + if (mb_copy == NULL) + return; + mb_copy->m_data += max_linkhdr; + mb_copy->m_len = sizeof(multicast_encap_iphdr); + + if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { + m_freem(mb_copy); + return; + } + i = MHLEN - M_LEADINGSPACE(mb_copy); + if (i > len) + i = len; + mb_copy = m_pullup(mb_copy, i); + if (mb_copy == NULL) + return; + mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); + + /* + * fill in the encapsulating IP header. + */ + ip_copy = mtod(mb_copy, struct ip *); + *ip_copy = multicast_encap_iphdr; +#ifdef RANDOM_IP_ID + ip_copy->ip_id = ip_randomid(); +#else + ip_copy->ip_id = htons(ip_id++); +#endif + ip_copy->ip_len += len; + ip_copy->ip_src = vifp->v_lcl_addr; + ip_copy->ip_dst = vifp->v_rmt_addr; + + /* + * turn the encapsulated IP header back into a valid one. 
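+	 * The inner header is still in host byte order at this point, so
+	 * ip_len and ip_off are swapped back to network order and its
+	 * checksum is recomputed over just the inner header (m_data is
+	 * temporarily advanced past the outer header for in_cksum());
+	 * ip_output() deals with the outer header later.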
+ */ + ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); + --ip->ip_ttl; + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + mb_copy->m_data += sizeof(multicast_encap_iphdr); + ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); + mb_copy->m_data -= sizeof(multicast_encap_iphdr); + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); +} + +/* + * Token bucket filter module + */ + +static void +tbf_control(vifp, m, ip, p_len) + register struct vif *vifp; + register struct mbuf *m; + register struct ip *ip; + register u_long p_len; +{ + register struct tbf *t = vifp->v_tbf; + + if (p_len > MAX_BKT_SIZE) { + /* drop if packet is too large */ + mrtstat.mrts_pkt2large++; + m_freem(m); + return; + } + + tbf_update_tokens(vifp); + + /* if there are enough tokens, + * and the queue is empty, + * send this packet out + */ + + if (t->tbf_q_len == 0) { + /* queue empty, send packet if enough tokens */ + if (p_len <= t->tbf_n_tok) { + t->tbf_n_tok -= p_len; + tbf_send_packet(vifp, m); + } else { + /* queue packet and timeout till later */ + tbf_queue(vifp, m); + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); + } + } else if (t->tbf_q_len < t->tbf_max_q_len) { + /* finite queue length, so queue pkts and process queue */ + tbf_queue(vifp, m); + tbf_process_q(vifp); + } else { + /* queue length too much, try to dq and queue and process */ + if (!tbf_dq_sel(vifp, ip)) { + mrtstat.mrts_q_overflow++; + m_freem(m); + return; + } else { + tbf_queue(vifp, m); + tbf_process_q(vifp); + } + } + return; +} + +/* + * adds a packet to the queue at the interface + */ +static void +tbf_queue(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + if (t->tbf_t == NULL) { + /* Queue was empty */ + t->tbf_q = m; + } else { + /* Insert at tail */ + t->tbf_t->m_act = m; + } + + /* Set new tail pointer */ + t->tbf_t = m; + +#ifdef DIAGNOSTIC + /* Make sure we didn't get fed a bogus mbuf */ + if (m->m_act) + panic("tbf_queue: m_act"); +#endif + m->m_act = NULL; + + t->tbf_q_len++; + + splx(s); +} + + +/* + * processes the queue at the interface + */ +static void +tbf_process_q(vifp) + register struct vif *vifp; +{ + register struct mbuf *m; + register int len; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + /* loop through the queue at the interface and send as many packets + * as possible + */ + while (t->tbf_q_len > 0) { + m = t->tbf_q; + + len = mtod(m, struct ip *)->ip_len; + + /* determine if the packet can be sent */ + if (len <= t->tbf_n_tok) { + /* if so, + * reduce no of tokens, dequeue the packet, + * send the packet. 
+ */ + t->tbf_n_tok -= len; + + t->tbf_q = m->m_act; + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + + m->m_act = NULL; + tbf_send_packet(vifp, m); + + } else break; + } + splx(s); +} + +static void +tbf_reprocess_q(xvifp) + void *xvifp; +{ + register struct vif *vifp = xvifp; + if (ip_mrouter == NULL) + return; + + tbf_update_tokens(vifp); + + tbf_process_q(vifp); + + if (vifp->v_tbf->tbf_q_len) + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); +} + +/* function that will selectively discard a member of the queue + * based on the precedence value and the priority + */ +static int +tbf_dq_sel(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int s = splnet(); + register u_int p; + register struct mbuf *m, *last; + register struct mbuf **np; + register struct tbf *t = vifp->v_tbf; + + p = priority(vifp, ip); + + np = &t->tbf_q; + last = NULL; + while ((m = *np) != NULL) { + if (p > priority(vifp, mtod(m, struct ip *))) { + *np = m->m_act; + /* If we're removing the last packet, fix the tail pointer */ + if (m == t->tbf_t) + t->tbf_t = last; + m_freem(m); + /* it's impossible for the queue to be empty, but + * we check anyway. */ + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + splx(s); + mrtstat.mrts_drop_sel++; + return(1); + } + np = &m->m_act; + last = m; + } + splx(s); + return(0); +} + +static void +tbf_send_packet(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + struct ip_moptions imo; + int error; + static struct route ro; + int s = splnet(); + + if (vifp->v_flags & VIFF_TUNNEL) { + /* If tunnel options */ + ip_output(m, (struct mbuf *)0, &vifp->v_route, + IP_FORWARDING, (struct ip_moptions *)0); + } else { + imo.imo_multicast_ifp = vifp->v_ifp; + imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo.imo_multicast_loop = 1; + imo.imo_multicast_vif = -1; + + /* + * Re-entrancy should not be a problem here, because + * the packets that we send out and are looped back at us + * should get rejected because they appear to come from + * the loopback interface, thus preventing looping. + */ + error = ip_output(m, (struct mbuf *)0, &ro, + IP_FORWARDING, &imo); + + if (mrtdebug & DEBUG_XMIT) + log(LOG_DEBUG, "phyint_send on vif %d err %d\n", + vifp - viftable, error); + } + splx(s); +} + +/* determine the current time and then + * the elapsed time (between the last time and time now) + * in milliseconds & update the no. of tokens in the bucket + */ +static void +tbf_update_tokens(vifp) + register struct vif *vifp; +{ + struct timeval tp; + register u_long tm; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + GET_TIME(tp); + + TV_DELTA(tp, t->tbf_last_pkt_t, tm); + + /* + * This formula is actually + * "time in seconds" * "bytes/second". + * + * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) + * + * The (1000/1024) was introduced in add_vif to optimize + * this divide into a shift. 
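+	 * For example, for a rate limit of 500 kbit/s add_vif stores
+	 * v_rate_limit = 500 * 1024 / 1000 = 512, and an elapsed time of
+	 * tm = 10000 usec credits 10000 * 512 / 1024 / 8 = 625 bytes,
+	 * i.e. 62500 bytes/sec, which is the requested 500 kbit/s.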
+ */ + t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; + t->tbf_last_pkt_t = tp; + + if (t->tbf_n_tok > MAX_BKT_SIZE) + t->tbf_n_tok = MAX_BKT_SIZE; + + splx(s); +} + +static int +priority(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int prio; + + /* temporary hack; may add general packet classifier some day */ + + /* + * The UDP port space is divided up into four priority ranges: + * [0, 16384) : unclassified - lowest priority + * [16384, 32768) : audio - highest priority + * [32768, 49152) : whiteboard - medium priority + * [49152, 65536) : video - low priority + */ + if (ip->ip_p == IPPROTO_UDP) { + struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); + switch (ntohs(udp->uh_dport) & 0xc000) { + case 0x4000: + prio = 70; + break; + case 0x8000: + prio = 60; + break; + case 0xc000: + prio = 55; + break; + default: + prio = 50; + break; + } + if (tbfdebug > 1) + log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); + } else { + prio = 50; + } + return prio; +} + +/* + * End of token bucket filter modifications + */ + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + /* Check mbuf. */ + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + if (rsvpdebug) + printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + /* Check if socket is available. */ + if (viftable[i].v_rsvpd != NULL) { + splx(s); + return EADDRINUSE; + } + + viftable[i].v_rsvpd = so; + /* This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 1; + rsvp_on++; + } + + splx(s); + return 0; +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + if (rsvpdebug) + printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", + viftable[i].v_rsvpd, so); + + /* + * XXX as an additional consistency check, one could make sure + * that viftable[i].v_rsvpd == so, otherwise passing so as + * first parameter is pretty useless. + */ + viftable[i].v_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 0; + rsvp_on--; + } + + splx(s); + return 0; +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + int vifi; + register int s; + + /* Don't bother if it is not the right type of socket. */ + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return; + + s = splnet(); + + /* The socket may be attached to more than one vif...this + * is perfectly legal. 
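+	 * So scan the whole vif table and detach it everywhere rather than
+	 * stopping at the first match.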
+ */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_rsvpd == so) { + viftable[vifi].v_rsvpd = NULL; + /* This may seem silly, but we need to be sure we don't + * over-decrement the RSVP counter, in case something slips up. + */ + if (viftable[vifi].v_rsvp_on) { + viftable[vifi].v_rsvp_on = 0; + rsvp_on--; + } + } + } + + splx(s); + return; +} + +void +rsvp_input(m, off) + struct mbuf *m; + int off; +{ + int vifi; + register struct ip *ip = mtod(m, struct ip *); + static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; + register int s; + struct ifnet *ifp; + + if (rsvpdebug) + printf("rsvp_input: rsvp_on %d\n",rsvp_on); + + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + s = splnet(); + + if (rsvpdebug) + printf("rsvp_input: check vifs\n"); + +#ifdef DIAGNOSTIC + if (!(m->m_flags & M_PKTHDR)) + panic("rsvp_input no hdr"); +#endif + + ifp = m->m_pkthdr.rcvif; + /* Find which vif the packet arrived on. */ + for (vifi = 0; vifi < numvifs; vifi++) + if (viftable[vifi].v_ifp == ifp) + break; + + if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { + /* + * If the old-style non-vif-associated socket is set, + * then use it. Otherwise, drop packet since there + * is no specific socket for this vif. + */ + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, off); /* xxx */ + } else { + if (rsvpdebug && vifi == numvifs) + printf("rsvp_input: Can't find vif for packet.\n"); + else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) + printf("rsvp_input: No socket defined for vif %d\n",vifi); + m_freem(m); + } + splx(s); + return; + } + rsvp_src.sin_addr = ip->ip_src; + + if (rsvpdebug && m) + printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", + m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); + + if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { + if (rsvpdebug) + printf("rsvp_input: Failed to append to socket\n"); + } else { + if (rsvpdebug) + printf("rsvp_input: send packet up\n"); + } + + splx(s); +} + +#ifdef MROUTE_KLD + +static int +ip_mroute_modevent(module_t mod, int type, void *unused) +{ + int s; + + switch (type) { + static u_long (*old_ip_mcast_src)(int); + static int (*old_ip_mrouter_set)(struct socket *, + struct sockopt *); + static int (*old_ip_mrouter_get)(struct socket *, + struct sockopt *); + static int (*old_ip_mrouter_done)(void); + static int (*old_ip_mforward)(struct ip *, struct ifnet *, + struct mbuf *, struct ip_moptions *); + static int (*old_mrt_ioctl)(int, caddr_t); + static int (*old_legal_vif_num)(int); + + case MOD_LOAD: + s = splnet(); + /* XXX Protect against multiple loading */ + old_ip_mcast_src = ip_mcast_src; + ip_mcast_src = X_ip_mcast_src; + old_ip_mrouter_get = ip_mrouter_get; + ip_mrouter_get = X_ip_mrouter_get; + old_ip_mrouter_set = ip_mrouter_set; + ip_mrouter_set = X_ip_mrouter_set; + old_ip_mrouter_done = ip_mrouter_done; + ip_mrouter_done = X_ip_mrouter_done; + old_ip_mforward = ip_mforward; + ip_mforward = X_ip_mforward; + old_mrt_ioctl = mrt_ioctl; + mrt_ioctl = X_mrt_ioctl; + old_legal_vif_num = legal_vif_num; + legal_vif_num = X_legal_vif_num; + + splx(s); + return 0; + + case MOD_UNLOAD: + if (ip_mrouter) + return EINVAL; + + s = splnet(); + ip_mrouter_get = old_ip_mrouter_get; + ip_mrouter_set = old_ip_mrouter_set; + ip_mrouter_done = 
old_ip_mrouter_done; + ip_mforward = old_ip_mforward; + mrt_ioctl = old_mrt_ioctl; + legal_vif_num = old_legal_vif_num; + splx(s); + return 0; + + default: + break; + } + return 0; +} + +static moduledata_t ip_mroutemod = { + "ip_mroute", + ip_mroute_modevent, + 0 +}; +DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); + +#endif /* MROUTE_KLD */ +#endif /* MROUTING */ diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h new file mode 100644 index 0000000..0e61652 --- /dev/null +++ b/sys/netinet/ip_mroute.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_MROUTE_H_ +#define _NETINET_IP_MROUTE_H_ + +/* + * Definitions for IP multicast forwarding. + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Ajit Thyagarajan, PARC, August 1993. + * Modified by Ajit Thyagarajan, PARC, August 1994. + * + * MROUTING Revision: 3.3.1.3 + */ + + +/* + * Multicast Routing set/getsockopt commands. 
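+ * These are socket options applied at the IPPROTO_IP level to the
+ * multicast router's raw IGMP socket; MRT_VERSION and MRT_ASSERT are
+ * the only ones also readable through getsockopt().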
+ */ +#define MRT_INIT 100 /* initialize forwarder */ +#define MRT_DONE 101 /* shut down forwarder */ +#define MRT_ADD_VIF 102 /* create virtual interface */ +#define MRT_DEL_VIF 103 /* delete virtual interface */ +#define MRT_ADD_MFC 104 /* insert forwarding cache entry */ +#define MRT_DEL_MFC 105 /* delete forwarding cache entry */ +#define MRT_VERSION 106 /* get kernel version number */ +#define MRT_ASSERT 107 /* enable PIM assert processing */ + + +#define GET_TIME(t) microtime(&t) + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ +#define ALL_VIFS (vifi_t)-1 + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + + +/* + * Argument structure for MRT_ADD_VIF. + * (MRT_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + u_int vifc_rate_limit; /* max rate */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */ +#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */ + +/* + * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC + * (mfcc_tos to be added at a future point) + */ +struct mfcctl { + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ +}; + +/* + * The kernel's multicast routing statistics. + */ +struct mrtstat { + u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ + u_long mrts_mfc_misses; /* # forw. cache hash table misses */ + u_long mrts_upcalls; /* # calls to mrouted */ + u_long mrts_no_route; /* no route for packet's origin */ + u_long mrts_bad_tunnel; /* malformed tunnel options */ + u_long mrts_cant_tunnel; /* no room for tunnel options */ + u_long mrts_wrong_if; /* arrived on wrong interface */ + u_long mrts_upq_ovflw; /* upcall Q overflow */ + u_long mrts_cache_cleanups; /* # entries with no upcalls */ + u_long mrts_drop_sel; /* pkts dropped selectively */ + u_long mrts_q_overflow; /* pkts dropped - Q overflow */ + u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ +}; + +/* + * Argument structure used by mrouted to get src-grp pkt counts + */ +struct sioc_sg_req { + struct in_addr src; + struct in_addr grp; + u_long pktcnt; + u_long bytecnt; + u_long wrong_if; +}; + +/* + * Argument structure used by mrouted to get vif pkt counts + */ +struct sioc_vif_req { + vifi_t vifi; /* vif number */ + u_long icount; /* Input packet count on vif */ + u_long ocount; /* Output packet count on vif */ + u_long ibytes; /* Input byte count on vif */ + u_long obytes; /* Output byte count on vif */ +}; + + +/* + * The kernel's virtual-interface structure. 
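+ * Each vif below is created from the vifctl supplied with MRT_ADD_VIF
+ * above; a sketch of the daemon side (error handling omitted, address
+ * purely illustrative):
+ *
+ *    struct vifctl vc;
+ *    bzero(&vc, sizeof(vc));
+ *    vc.vifc_vifi = 0;
+ *    vc.vifc_threshold = 1;
+ *    vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
+ *    setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
+ *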
+ */ +struct vif { + u_char v_flags; /* VIFF_ flags defined above */ + u_char v_threshold; /* min ttl required to forward on vif*/ + u_int v_rate_limit; /* max rate */ + struct tbf *v_tbf; /* token bucket structure at intf. */ + struct in_addr v_lcl_addr; /* local interface address */ + struct in_addr v_rmt_addr; /* remote address (tunnels only) */ + struct ifnet *v_ifp; /* pointer to interface */ + u_long v_pkt_in; /* # pkts in on interface */ + u_long v_pkt_out; /* # pkts out on interface */ + u_long v_bytes_in; /* # bytes in on interface */ + u_long v_bytes_out; /* # bytes out on interface */ + struct route v_route; /* cached route if this is a tunnel */ + u_int v_rsvp_on; /* RSVP listening on this vif */ + struct socket *v_rsvpd; /* RSVP daemon socket */ +}; + +/* + * The kernel's multicast forwarding cache entry structure + * (A field for the type of service (mfc_tos) is to be added + * at a future point) + */ +struct mfc { + struct in_addr mfc_origin; /* IP origin of mcasts */ + struct in_addr mfc_mcastgrp; /* multicast group associated*/ + vifi_t mfc_parent; /* incoming vif */ + u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + u_long mfc_pkt_cnt; /* pkt count for src-grp */ + u_long mfc_byte_cnt; /* byte count for src-grp */ + u_long mfc_wrong_if; /* wrong if for src-grp */ + int mfc_expire; /* time to clean entry up */ + struct timeval mfc_last_assert; /* last time I sent an assert*/ + struct rtdetq *mfc_stall; /* q of packets awaiting mfc */ + struct mfc *mfc_next; /* next mfc entry */ +}; + +/* + * Struct used to communicate from kernel to multicast router + * note the convenient similarity to an IP packet + */ +struct igmpmsg { + u_long unused1; + u_long unused2; + u_char im_msgtype; /* what type of message */ +#define IGMPMSG_NOCACHE 1 +#define IGMPMSG_WRONGVIF 2 + u_char im_mbz; /* must be zero */ + u_char im_vif; /* vif rec'd on */ + u_char unused3; + struct in_addr im_src, im_dst; +}; + +/* + * Argument structure used for pkt info. while upcall is made + */ +struct rtdetq { + struct mbuf *m; /* A copy of the packet */ + struct ifnet *ifp; /* Interface pkt came in on */ + vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */ +#ifdef UPCALL_TIMING + struct timeval t; /* Timestamp */ +#endif /* UPCALL_TIMING */ + struct rtdetq *next; /* Next in list of packets */ +}; + +#define MFCTBLSIZ 256 +#if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */ +#define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1)) +#else +#define MFCHASHMOD(h) ((h) % MFCTBLSIZ) +#endif + +#define MAX_UPQ 4 /* max. no of pkts in upcall Q */ + +/* + * Token Bucket filter code + */ +#define MAX_BKT_SIZE 10000 /* 10K bytes size */ +#define MAXQSIZE 10 /* max # of pkts in queue */ + +/* + * the token bucket filter at each vif + */ +struct tbf +{ + struct timeval tbf_last_pkt_t; /* arr. time of last pkt */ + u_long tbf_n_tok; /* no of tokens in bucket */ + u_long tbf_q_len; /* length of queue at this vif */ + u_long tbf_max_q_len; /* max. 
queue length */ + struct mbuf *tbf_q; /* Packet queue */ + struct mbuf *tbf_t; /* tail-insertion pointer */ +}; + +#ifdef _KERNEL + +struct sockopt; + +extern int (*ip_mrouter_set)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_get)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_done)(void); +extern int (*mrt_ioctl)(int, caddr_t); + +#endif /* _KERNEL */ + +#endif /* _NETINET_IP_MROUTE_H_ */ diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c new file mode 100644 index 0000000..3402a28 --- /dev/null +++ b/sys/netinet/ip_output.c @@ -0,0 +1,2050 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" +#include "opt_ipsec.h" +#include "opt_pfil_hooks.h" +#include "opt_random_ip_id.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +#include <machine/in_cksum.h> + +static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#ifdef IPSEC_DEBUG +#include <netkey/key_debug.h> +#else +#define KEYDEBUG(lev,arg) +#endif +#endif /*IPSEC*/ + +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> + +#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ + x, (ntohl(a.s_addr)>>24)&0xFF,\ + (ntohl(a.s_addr)>>16)&0xFF,\ + (ntohl(a.s_addr)>>8)&0xFF,\ + (ntohl(a.s_addr))&0xFF, y); + +u_short ip_id; + +static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +static struct ifnet *ip_multicast_if(struct in_addr *, int *); +static void ip_mloopback + (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); +static int ip_getmoptions + (struct sockopt *, struct ip_moptions *); +static int ip_pcbopts(int, struct mbuf **, struct mbuf *); +static int ip_setmoptions + (struct sockopt *, struct ip_moptions **); + +int ip_optcopy(struct ip *, struct ip *); + + +extern struct protosw inetsw[]; + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + */ +int +ip_output(m0, opt, ro, flags, imo) + struct mbuf *m0; + struct mbuf *opt; + struct route *ro; + int flags; + struct ip_moptions *imo; +{ + struct ip *ip, *mhip; + struct ifnet *ifp = NULL; /* keep compiler happy */ + struct mbuf *m; + int hlen = sizeof (struct ip); + int len, off, error = 0; + struct sockaddr_in *dst = NULL; /* keep compiler happy */ + struct in_ifaddr *ia; + int isbroadcast, sw_csum; + struct in_addr pkt_dst; +#ifdef IPSEC + struct route iproute; + struct socket *so = NULL; + struct secpolicy *sp = NULL; +#endif + struct ip_fw_args args; + int src_was_INADDR_ANY = 0; /* as the name says... */ +#ifdef PFIL_HOOKS + struct packet_filter_hook *pfh; + struct mbuf *m1; + int rv; +#endif /* PFIL_HOOKS */ + + args.eh = NULL; + args.rule = NULL; + args.next_hop = NULL; + args.divert_rule = 0; /* divert cookie */ + + /* Grab info from MT_TAG mbufs prepended to the chain. */ + for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) { + switch(m0->m_tag_id) { + default: + printf("ip_output: unrecognised MT_TAG tag %d\n", + m0->m_tag_id); + break; + + case PACKET_TAG_DUMMYNET: + /* + * the packet was already tagged, so part of the + * processing was already done, and we need to go down. + * Get parameters from the header. 
+ */ + args.rule = ((struct dn_pkt *)m0)->rule; + opt = NULL ; + ro = & ( ((struct dn_pkt *)m0)->ro ) ; + imo = NULL ; + dst = ((struct dn_pkt *)m0)->dn_dst ; + ifp = ((struct dn_pkt *)m0)->ifp ; + flags = ((struct dn_pkt *)m0)->flags ; + break; + + case PACKET_TAG_DIVERT: + args.divert_rule = (intptr_t)m0->m_data & 0xffff; + break; + + case PACKET_TAG_IPFORWARD: + args.next_hop = (struct sockaddr_in *)m0->m_data; + break; + } + } + m = m0; + + KASSERT(!m || (m->m_flags & M_PKTHDR) != 0, ("ip_output: no HDR")); + + KASSERT(ro != NULL, ("ip_output: no route, proto %d", + mtod(m, struct ip *)->ip_p)); + +#ifdef IPSEC + so = ipsec_getsocket(m); + (void)ipsec_setsocket(m, NULL); +#endif + if (args.rule != NULL) { /* dummynet already saw us */ + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + ia = ifatoia(ro->ro_rt->rt_ifa); + goto sendit; + } + + if (opt) { + m = ip_insertoptions(m, opt, &len); + hlen = len; + } + ip = mtod(m, struct ip *); + pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; + + /* + * Fill in IP header. + */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); + ip->ip_off &= IP_DF; +#ifdef RANDOM_IP_ID + ip->ip_id = ip_randomid(); +#else + ip->ip_id = htons(ip_id++); +#endif + ipstat.ips_localout++; + } else { + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } + + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. + */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != pkt_dst.s_addr)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if (ro->ro_rt == 0) { + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = pkt_dst; + } + /* + * If routing to interface only, + * short circuit routing lookup. + */ + if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + imo != NULL && imo->imo_multicast_ifp != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + ifp = imo->imo_multicast_ifp; + IFP_TO_IA(ifp, ia); + isbroadcast = 0; /* fool gcc */ + } else { + /* + * If this is the case, we probably don't want to allocate + * a protocol-cloned route since we didn't get one from the + * ULP. This lets TCP do its thing, while not burdening + * forwarding or ICMP with the overhead of cloning a route. + * Of course, we still want to do any cloning requested by + * the link layer, as this is probably required in all cases + * for correct operation (as it is for ARP). 
+ */ + if (ro->ro_rt == 0) + rtalloc_ign(ro, RTF_PRCLONING); + if (ro->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_HOST) + isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { + struct in_multi *inm; + + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_vif != -1) + ip->ip_src.s_addr = + ip_mcast_src(imo->imo_multicast_vif); + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + /* Interface may have no addresses. */ + if (ia != NULL) + ip->ip_src = IA_SIN(ia)->sin_addr; + } + + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * XXX + * delayed checksums are not currently + * compatible with IP multicast routing + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= + ~CSUM_DELAY_DATA; + } + } + IN_LOOKUP_MULTI(pkt_dst, ifp, inm); + if (inm != NULL && + (imo == NULL || imo->imo_multicast_loop)) { + /* + * If we belong to the destination multicast group + * on the outgoing interface, and the caller did not + * forbid loopback, loop back a copy. + */ + ip_mloopback(ifp, m, dst, hlen); + } + else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * Check if rsvp daemon is running. If not, don't + * set ip_moptions. This ensures that the packet + * is multicast and not just sent down one link + * as prescribed by rsvpd. + */ + if (!rsvp_on) + imo = NULL; + if (ip_mforward(ip, ifp, m, imo) != 0) { + m_freem(m); + goto done; + } + } + } + + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. + * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy if this host actually belongs to the + * destination group on the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { + m_freem(m); + goto done; + } + + goto sendit; + } +#ifndef notdef + /* + * If the source address is not specified yet, use the address + * of the outoing interface. 
In case, keep note we did that, so + * if the the firewall changes the next-hop causing the output + * interface to change, we can fix that. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + /* Interface may have no addresses. */ + if (ia != NULL) { + ip->ip_src = IA_SIN(ia)->sin_addr; + src_was_INADDR_ANY = 1; + } + } +#endif /* notdef */ + /* + * Verify that we have any chance at all of being able to queue + * the packet or packet fragments + */ + if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= + ifp->if_snd.ifq_maxlen) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto bad; + } + + /* + * Look for broadcast address and + * verify user is allowed to send + * such a packet. + */ + if (isbroadcast) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if ((u_short)ip->ip_len > ifp->if_mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else { + m->m_flags &= ~M_BCAST; + } + +sendit: +#ifdef IPSEC + /* get SP for this packet */ + if (so == NULL) + sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); + else + sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); + + if (sp == NULL) { + ipsecstat.out_inval++; + goto bad; + } + + error = 0; + + /* check policy */ + switch (sp->policy) { + case IPSEC_POLICY_DISCARD: + /* + * This packet is just discarded. + */ + ipsecstat.out_polvio++; + goto bad; + + case IPSEC_POLICY_BYPASS: + case IPSEC_POLICY_NONE: + /* no need to do IPsec. */ + goto skip_ipsec; + + case IPSEC_POLICY_IPSEC: + if (sp->req == NULL) { + /* acquire a policy */ + error = key_spdacquire(sp); + goto bad; + } + break; + + case IPSEC_POLICY_ENTRUST: + default: + printf("ip_output: Invalid policy found. %d\n", sp->policy); + } + { + struct ipsec_output_state state; + bzero(&state, sizeof(state)); + state.m = m; + if (flags & IP_ROUTETOIF) { + state.ro = &iproute; + bzero(&iproute, sizeof(iproute)); + } else + state.ro = ro; + state.dst = (struct sockaddr *)dst; + + ip->ip_sum = 0; + + /* + * XXX + * delayed checksums are not currently compatible with IPsec + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + + error = ipsec4_output(&state, sp, flags); + + m = state.m; + if (flags & IP_ROUTETOIF) { + /* + * if we have tunnel mode SA, we may need to ignore + * IP_ROUTETOIF. + */ + if (state.ro != &iproute || state.ro->ro_rt != NULL) { + flags &= ~IP_ROUTETOIF; + ro = state.ro; + } + } else + ro = state.ro; + dst = (struct sockaddr_in *)state.dst; + if (error) { + /* mbuf is already reclaimed in ipsec4_output. 
*/ + m0 = NULL; + switch (error) { + case EHOSTUNREACH: + case ENETUNREACH: + case EMSGSIZE: + case ENOBUFS: + case ENOMEM: + break; + default: + printf("ip4_output (ipsec): error code %d\n", error); + /*fall through*/ + case ENOENT: + /* don't show these error codes to the user */ + error = 0; + break; + } + goto bad; + } + } + + /* be sure to update variables that are affected by ipsec4_output() */ + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + if (ro->ro_rt == NULL) { + if ((flags & IP_ROUTETOIF) == 0) { + printf("ip_output: " + "can't update route after IPsec processing\n"); + error = EHOSTUNREACH; /*XXX*/ + goto bad; + } + } else { + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + } + + /* make it flipped, again. */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); +skip_ipsec: +#endif /*IPSEC*/ + + /* + * IpHack's section. + * - Xlate: translate packet's addr/port (NAT). + * - Firewall: deny/allow/etc. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ +#ifdef PFIL_HOOKS + /* + * Run through list of hooks for output packets. + */ + m1 = m; + pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) + if (pfh->pfil_func) { + rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1); + if (rv) { + error = EHOSTUNREACH; + goto done; + } + m = m1; + if (m == NULL) + goto done; + ip = mtod(m, struct ip *); + } +#endif /* PFIL_HOOKS */ + + /* + * Check with the firewall... + * but not if we are already being fwd'd from a firewall. + */ + if (fw_enable && IPFW_LOADED && !args.next_hop) { + struct sockaddr_in *old = dst; + + args.m = m; + args.next_hop = dst; + args.oif = ifp; + off = ip_fw_chk_ptr(&args); + m = args.m; + dst = args.next_hop; + + /* + * On return we must do the following: + * m == NULL -> drop the pkt (old interface, deprecated) + * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface) + * 1<=off<= 0xffff -> DIVERT + * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe + * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet + * dst != old -> IPFIREWALL_FORWARD + * off==0, dst==old -> accept + * If some of the above modules are not compiled in, then + * we should't have to check the corresponding condition + * (because the ipfw control socket should not accept + * unsupported rules), but better play safe and drop + * packets in case of doubt. + */ + if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) { + if (m) + m_freem(m); + error = EACCES; + goto done; + } + ip = mtod(m, struct ip *); + if (off == 0 && dst == old) /* common case */ + goto pass; + if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! 
+ */ + args.ro = ro; + args.dst = dst; + args.flags = flags; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args); + goto done; + } +#ifdef IPDIVERT + if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((off & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* + * XXX + * delayed checksums are not currently compatible + * with divert sockets. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + /* Restore packet header fields to original values */ + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + + /* Deliver packet to divert input routine */ + divert_packet(m, 0, off & 0xffff, args.divert_rule); + + /* If 'tee', continue with original packet */ + if (clone != NULL) { + m = clone; + ip = mtod(m, struct ip *); + goto pass; + } + goto done; + } +#endif + + /* IPFIREWALL_FORWARD */ + /* + * Check dst to make sure it is directly reachable on the + * interface we previously thought it was. + * If it isn't (which may be likely in some situations) we have + * to re-route it (ie, find a route for the next-hop and the + * associated interface) and set them here. This is nested + * forwarding which in most cases is undesirable, except where + * such control is nigh impossible. So we do it here. + * And I'm babbling. + */ + if (off == 0 && old != dst) { /* FORWARD, dst has changed */ +#if 0 + /* + * XXX To improve readability, this block should be + * changed into a function call as below: + */ + error = ip_ipforward(&m, &dst, &ifp); + if (error) + goto bad; + if (m == NULL) /* ip_input consumed the mbuf */ + goto done; +#else + struct in_ifaddr *ia; + + /* + * XXX sro_fwd below is static, and a pointer + * to it gets passed to routines downstream. + * This could have surprisingly bad results in + * practice, because its content is overwritten + * by subsequent packets. + */ + /* There must be a better way to do this next line... */ + static struct route sro_fwd; + struct route *ro_fwd = &sro_fwd; + +#if 0 + print_ip("IPFIREWALL_FORWARD: New dst ip: ", + dst->sin_addr, "\n"); +#endif + + /* + * We need to figure out if we have been forwarded + * to a local socket. If so, then we should somehow + * "loop back" to ip_input, and get directed to the + * PCB as if we had received this packet. This is + * because it may be dificult to identify the packets + * you want to forward until they are being output + * and have selected an interface. (e.g. locally + * initiated packets) If we used the loopback inteface, + * we would not be able to control what happens + * as the packet runs through ip_input() as + * it is done through a ISR. + */ + LIST_FOREACH(ia, + INADDR_HASH(dst->sin_addr.s_addr), ia_hash) { + /* + * If the addr to forward to is one + * of ours, we pretend to + * be the destination for this packet. 
+ */ + if (IA_SIN(ia)->sin_addr.s_addr == + dst->sin_addr.s_addr) + break; + } + if (ia) { /* tell ip_input "dont filter" */ + struct m_hdr tag; + + tag.mh_type = MT_TAG; + tag.mh_flags = PACKET_TAG_IPFORWARD; + tag.mh_data = (caddr_t)args.next_hop; + tag.mh_next = m; + + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = ifunit("lo0"); + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m0->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip_input((struct mbuf *)&tag); + goto done; + } + /* Some of the logic for this was + * nicked from above. + * + * This rewrites the cached route in a local PCB. + * Is this what we want to do? + */ + bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); + + ro_fwd->ro_rt = 0; + rtalloc_ign(ro_fwd, RTF_PRCLONING); + + if (ro_fwd->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + + ia = ifatoia(ro_fwd->ro_rt->rt_ifa); + ifp = ro_fwd->ro_rt->rt_ifp; + ro_fwd->ro_rt->rt_use++; + if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *) + ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + isbroadcast = + (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + if (ro->ro_rt) + RTFREE(ro->ro_rt); + ro->ro_rt = ro_fwd->ro_rt; + dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + +#endif /* ... block to be put into a function */ + /* + * If we added a default src ip earlier, + * which would have been gotten from the-then + * interface, do it again, from the new one. + */ + if (src_was_INADDR_ANY) + ip->ip_src = IA_SIN(ia)->sin_addr; + goto pass ; + } + + /* + * if we get here, none of the above matches, and + * we have to drop the pkt + */ + m_freem(m); + error = EACCES; /* not sure this is the right error msg */ + goto done; + } + +pass: + /* 127/8 must not appear on wire - RFC1122. */ + if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + if ((ifp->if_flags & IFF_LOOPBACK) == 0) { + ipstat.ips_badaddr++; + error = EADDRNOTAVAIL; + goto bad; + } + } + + m->m_pkthdr.csum_flags |= CSUM_IP; + sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; + if (sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + sw_csum &= ~CSUM_DELAY_DATA; + } + m->m_pkthdr.csum_flags &= ifp->if_hwassist; + + /* + * If small enough for interface, or the interface will take + * care of the fragmentation for us, can just send directly. + */ + if ((u_short)ip->ip_len <= ifp->if_mtu || + ifp->if_hwassist & CSUM_FRAGMENT) { + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + } + + /* Record statistics for this interface address. */ + if (!(flags & IP_FORWARDING) && ia) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } + +#ifdef IPSEC + /* clean ipsec history once it goes out of the node */ + ipsec_delaux(m); +#endif + + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. 
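+ * As a rough worked example (assuming a 1500-byte MTU and a 20-byte
+ * header): len = (1500 - 20) & ~7 = 1480, so a 4000-byte datagram is
+ * sent as fragments carrying 1480, 1480 and 1020 bytes of payload,
+ * with fragment offsets kept in units of 8 bytes.  The PAGE_SIZE-
+ * aligned path below only comes into play when the MTU allows more
+ * than a page of payload per fragment.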
+ */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + /* + * This case can happen if the user changed the MTU + * of an interface after enabling IP on it. Because + * most netifs don't keep track of routes pointing to + * them, there is no way for one to update all its + * routes when the MTU is changed. + */ + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + ipstat.ips_cantfrag++; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + /* + * if the interface will not calculate checksums on + * fragmented packets, then do it here. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && + (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + if (len > PAGE_SIZE) { + /* + * Fragement large datagrams such that each segment + * contains a multiple of PAGE_SIZE amount of data, + * plus headers. This enables a receiver to perform + * page-flipping zero-copy optimizations. + */ + + int newlen; + struct mbuf *mtmp; + + for (mtmp = m, off = 0; + mtmp && ((off + mtmp->m_len) <= ifp->if_mtu); + mtmp = mtmp->m_next) { + off += mtmp->m_len; + } + /* + * firstlen (off - hlen) must be aligned on an + * 8-byte boundary + */ + if (off < hlen) + goto smart_frag_failure; + off = ((off - hlen) & ~7) + hlen; + newlen = (~PAGE_MASK) & ifp->if_mtu; + if ((newlen + sizeof (struct ip)) > ifp->if_mtu) { + /* we failed, go back the default */ +smart_frag_failure: + newlen = len; + off = hlen + len; + } + +/* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n", + len, hlen, sizeof (struct ip), newlen, off);*/ + + len = newlen; + + } else { + off = hlen + len; + } + + + + { + int mhlen, firstlen = off - hlen; + struct mbuf **mnext = &m->m_nextpkt; + int nfrags = 1; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (; off < (u_short)ip->ip_len; off += len) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; + if (off + len >= (u_short)ip->ip_len) + len = (u_short)ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; /* ??? 
*/ + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; + mhip->ip_off = htons(mhip->ip_off); + mhip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (mhip->ip_vhl == IP_VHL_BORING) { + mhip->ip_sum = in_cksum_hdr(mhip); + } else { + mhip->ip_sum = in_cksum(m, mhlen); + } + } + *mnext = m; + mnext = &m->m_nextpkt; + nfrags++; + } + ipstat.ips_ofragments += nfrags; + + /* set first/last markers for fragment chain */ + m->m_flags |= M_LASTFRAG; + m0->m_flags |= M_FIRSTFRAG | M_FRAG; + m0->m_pkthdr.csum_data = nfrags; + + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m = m0; + m_adj(m, hlen + firstlen - (u_short)ip->ip_len); + m->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m->m_pkthdr.len); + ip->ip_off |= IP_MF; + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + } +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; +#ifdef IPSEC + /* clean ipsec history once it goes out of the node */ + ipsec_delaux(m); +#endif + if (error == 0) { + /* Record statistics for this interface address. */ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } + + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + } else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + } +done: +#ifdef IPSEC + if (ro == &iproute && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%p\n", sp)); + key_freesp(sp); + } +#endif /* IPSEC */ + return (error); +bad: + m_freem(m); + goto done; +} + +void +in_delayed_cksum(struct mbuf *m) +{ + struct ip *ip; + u_short csum, offset; + + ip = mtod(m, struct ip *); + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; + csum = in_cksum_skip(m, ip->ip_len, offset); + if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) + csum = 0xffff; + offset += m->m_pkthdr.csum_data; /* checksum offset */ + + if (offset + sizeof(u_short) > m->m_len) { + printf("delayed m_pullup, m->len: %d off: %d p: %d\n", + m->m_len, offset, ip->ip_p); + /* + * XXX + * this shouldn't happen, but if it does, the + * correct behavior may be to insert the checksum + * in the existing chain instead of rearranging it. + */ + m = m_pullup(m, offset + sizeof(u_short)); + } + *(u_short *)(m->m_data + offset) = csum; +} + +/* + * Insert IP options into preformed packet. + * Adjust IP destination as required for IP source routing, + * as indicated by a non-zero in_addr at the start of the options. + * + * XXX This routine assumes that the packet has no options in place. 
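+ * (The opt mbuf is normally the one built by ip_pcbopts() and stored in
+ * the pcb: a struct ipoption, i.e. the first-hop destination, if source
+ * routed, followed by the raw option bytes, with m_len covering both.)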
+ */ +static struct mbuf * +ip_insertoptions(m, opt, phlen) + register struct mbuf *m; + struct mbuf *opt; + int *phlen; +{ + register struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + register struct ip *ip = mtod(m, struct ip *); + unsigned optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) + return (m); /* XXX should fail */ + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_HEADER); + if (n == 0) + return (m); + n->m_pkthdr.rcvif = (struct ifnet *)0; + n->m_pkthdr.len = m->m_pkthdr.len + optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy(p->ipopt_list, ip + 1, optlen); + *phlen = sizeof(struct ip) + optlen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, + * omitting those not copied during fragmentation. + */ +int +ip_optcopy(ip, jp) + struct ip *ip, *jp; +{ + register u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. */ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } + + KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), + ("ip_optcopy: malformed ipv4 option")); + optlen = cp[IPOPT_OLEN]; + KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, + ("ip_optcopy: malformed ipv4 option")); + + /* bogus lengths should have been caught by ip_dooptions */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy(cp, dp, optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * IP socket option processing. + */ +int +ip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + error = optval = 0; + if (sopt->sopt_level != IPPROTO_IP) { + return (EINVAL); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: +#endif + { + struct mbuf *m; + if (sopt->sopt_valsize > MLEN) { + error = EMSGSIZE; + break; + } + MGET(m, sopt->sopt_td ? 
M_TRYWAIT : M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + break; + } + m->m_len = sopt->sopt_valsize; + error = sooptcopyin(sopt, mtod(m, char *), m->m_len, + m->m_len); + + return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, + m)); + } + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + case IP_FAITH: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case IP_TOS: + inp->inp_ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip_ttl = optval; + break; +#define OPTSET(bit) \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + OPTSET(INP_RECVIF); + break; + + case IP_FAITH: + OPTSET(INP_FAITH); + break; + } + break; +#undef OPTSET + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_setmoptions(sopt, &inp->inp_moptions); + break; + + case IP_PORTRANGE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (optval) { + case IP_PORTRANGE_DEFAULT: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); + break; + + case IP_PORTRANGE_HIGH: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + break; + + case IP_PORTRANGE_LOW: + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; + break; + + default: + error = EINVAL; + break; + } + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + caddr_t req; + size_t len = 0; + int priv; + struct mbuf *m; + int optname; + + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ + break; + if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ + break; + priv = (sopt->sopt_td != NULL && + suser(sopt->sopt_td) != 0) ? 0 : 1; + req = mtod(m, caddr_t); + len = m->m_len; + optname = sopt->sopt_name; + error = ipsec4_set_policy(inp, optname, req, len, priv); + m_freem(m); + break; + } +#endif /*IPSEC*/ + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_OPTIONS: + case IP_RETOPTS: + if (inp->inp_options) + error = sooptcopyout(sopt, + mtod(inp->inp_options, + char *), + inp->inp_options->m_len); + else + sopt->sopt_valsize = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + case IP_PORTRANGE: + case IP_FAITH: + switch (sopt->sopt_name) { + + case IP_TOS: + optval = inp->inp_ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip_ttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + optval = OPTBIT(INP_RECVIF); + break; + + case IP_PORTRANGE: + if (inp->inp_flags & INP_HIGHPORT) + optval = IP_PORTRANGE_HIGH; + else if (inp->inp_flags & INP_LOWPORT) + optval = IP_PORTRANGE_LOW; + else + optval = 0; + break; + + case IP_FAITH: + optval = OPTBIT(INP_FAITH); + break; + } + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_getmoptions(sopt, inp->inp_moptions); + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + struct mbuf *m = NULL; + caddr_t req = NULL; + size_t len = 0; + + if (m != 0) { + req = mtod(m, caddr_t); + len = m->m_len; + } + error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); + if (error == 0) + error = soopt_mcopyout(sopt, m); /* XXX */ + if (error == 0) + m_freem(m); + break; + } +#endif /*IPSEC*/ + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Set up IP options in pcb for insertion in output packets. + * Store in mbuf with pointer in pcbopt, adding pseudo-option + * with destination address if source routed. + */ +static int +ip_pcbopts(optname, pcbopt, m) + int optname; + struct mbuf **pcbopt; + register struct mbuf *m; +{ + register int cnt, optlen; + register u_char *cp; + u_char opt; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == (struct mbuf *)0 || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m) + (void)m_free(m); + return (0); + } + + if (m->m_len % sizeof(int32_t)) + goto bad; + /* + * IP first-hop destination address will be stored before + * actual options; move other options back + * and clear it when none present. + */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); + bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + goto bad; + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * user process specifies route as: + * ->A->B->C->D + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear in + * actual IP option, but is stored before the options. + */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. 
+ */ + ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + + sizeof(struct in_addr)), + (caddr_t)&cp[IPOPT_OFFSET+1], + (unsigned)cnt + sizeof(struct in_addr)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * XXX + * The whole multicast option thing needs to be re-thought. + * Several of these options are equally applicable to non-multicast + * transmission, and one (IP_MULTICAST_TTL) totally duplicates a + * standard option (IP_TTL). + */ + +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. + */ +static struct ifnet * +ip_multicast_if(a, ifindexp) + struct in_addr *a; + int *ifindexp; +{ + int ifindex; + struct ifnet *ifp; + + if (ifindexp) + *ifindexp = 0; + if (ntohl(a->s_addr) >> 24 == 0) { + ifindex = ntohl(a->s_addr) & 0xffffff; + if (ifindex < 0 || if_index < ifindex) + return NULL; + ifp = ifnet_byindex(ifindex); + if (ifindexp) + *ifindexp = ifindex; + } else { + INADDR_TO_IFP(*a, ifp); + } + return ifp; +} + +/* + * Set the IP multicast options in response to user setsockopt(). + */ +static int +ip_setmoptions(sopt, imop) + struct sockopt *sopt; + struct ip_moptions **imop; +{ + int error = 0; + int i; + struct in_addr addr; + struct ip_mreq mreq; + struct ifnet *ifp; + struct ip_moptions *imo = *imop; + struct route ro; + struct sockaddr_in *dst; + int ifindex; + int s; + + if (imo == NULL) { + /* + * No multicast option buffer attached to the pcb; + * allocate one and initialize to default values. + */ + imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + } + + switch (sopt->sopt_name) { + /* store an index number for the vif you wanna use in the send */ + case IP_MULTICAST_VIF: + if (legal_vif_num == 0) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + break; + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; + break; + + case IP_MULTICAST_IF: + /* + * Select the interface for outgoing multicast packets. + */ + error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); + if (error) + break; + /* + * INADDR_ANY is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. + */ + if (addr.s_addr == INADDR_ANY) { + imo->imo_multicast_ifp = NULL; + break; + } + /* + * The selected interface is identified by its local + * IP address. Find the interface and confirm that + * it supports multicasting. + */ + s = splimp(); + ifp = ip_multicast_if(&addr, &ifindex); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + splx(s); + error = EADDRNOTAVAIL; + break; + } + imo->imo_multicast_ifp = ifp; + if (ifindex) + imo->imo_multicast_addr = addr; + else + imo->imo_multicast_addr.s_addr = INADDR_ANY; + splx(s); + break; + + case IP_MULTICAST_TTL: + /* + * Set the IP time-to-live for outgoing multicast packets. + * The original multicast API required a char argument, + * which is inconsistent with the rest of the socket API. + * We allow either a char or an int. 
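+ * E.g. both of these forms are accepted from an application socket s
+ * (sketch):
+ *
+ *    u_char cttl = 32;
+ *    setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &cttl, sizeof(cttl));
+ *
+ *    int ittl = 32;
+ *    setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ittl, sizeof(ittl));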
+ */ + if (sopt->sopt_valsize == 1) { + u_char ttl; + error = sooptcopyin(sopt, &ttl, 1, 1); + if (error) + break; + imo->imo_multicast_ttl = ttl; + } else { + u_int ttl; + error = sooptcopyin(sopt, &ttl, sizeof ttl, + sizeof ttl); + if (error) + break; + if (ttl > 255) + error = EINVAL; + else + imo->imo_multicast_ttl = ttl; + } + break; + + case IP_MULTICAST_LOOP: + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. The original multicast API required a + * char argument, which is inconsistent with the rest + * of the socket API. We allow either a char or an int. + */ + if (sopt->sopt_valsize == 1) { + u_char loop; + error = sooptcopyin(sopt, &loop, 1, 1); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } else { + u_int loop; + error = sooptcopyin(sopt, &loop, sizeof loop, + sizeof loop); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } + break; + + case IP_ADD_MEMBERSHIP: + /* + * Add a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + s = splimp(); + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) { + bzero((caddr_t)&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq.imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + ifp = ip_multicast_if(&mreq.imr_interface, NULL); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq.imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + splx(s); + break; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + splx(s); + break; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + splx(s); + break; + } + ++imo->imo_num_memberships; + splx(s); + break; + + case IP_DROP_MEMBERSHIP: + /* + * Drop a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + + s = splimp(); + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + ifp = ip_multicast_if(&mreq.imr_interface, NULL); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + } + /* + * Find the membership in the membership array. 
+ */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq.imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + splx(s); + break; + + default: + error = EOPNOTSUPP; + break; + } + + /* + * If all options have default values, no need to keep the mbuf. + */ + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_vif == -1 && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + free(*imop, M_IPMOPTS); + *imop = NULL; + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +static int +ip_getmoptions(sopt, imo) + struct sockopt *sopt; + register struct ip_moptions *imo; +{ + struct in_addr addr; + struct in_ifaddr *ia; + int error, optval; + u_char coptval; + + error = 0; + switch (sopt->sopt_name) { + case IP_MULTICAST_VIF: + if (imo != NULL) + optval = imo->imo_multicast_vif; + else + optval = -1; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + if (imo == NULL || imo->imo_multicast_ifp == NULL) + addr.s_addr = INADDR_ANY; + else if (imo->imo_multicast_addr.s_addr) { + /* return the value user has set */ + addr = imo->imo_multicast_addr; + } else { + IFP_TO_IA(imo->imo_multicast_ifp, ia); + addr.s_addr = (ia == NULL) ? INADDR_ANY + : IA_SIN(ia)->sin_addr.s_addr; + } + error = sooptcopyout(sopt, &addr, sizeof addr); + break; + + case IP_MULTICAST_TTL: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else + optval = coptval = imo->imo_multicast_ttl; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else + optval = coptval = imo->imo_multicast_loop; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); +} + +/* + * Discard the IP multicast options. + */ +void +ip_freemoptions(imo) + register struct ip_moptions *imo; +{ + register int i; + + if (imo != NULL) { + for (i = 0; i < imo->imo_num_memberships; ++i) + in_delmulti(imo->imo_membership[i]); + free(imo, M_IPMOPTS); + } +} + +/* + * Routine called from ip_output() to loop back a copy of an IP multicast + * packet to the input queue of a specified interface. Note that this + * calls the output routine of the loopback "driver", but with an interface + * pointer that might NOT be a loopback interface -- evil, but easier than + * replicating that code here. 
+ */ +static void +ip_mloopback(ifp, m, dst, hlen) + struct ifnet *ifp; + register struct mbuf *m; + register struct sockaddr_in *dst; + int hlen; +{ + register struct ip *ip; + struct mbuf *copym; + + copym = m_copy(m, 0, M_COPYALL); + if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_pullup(copym, hlen); + if (copym != NULL) { + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(copym, hlen); + } + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ +#if 1 /* XXX */ + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } +#endif + +#ifdef notdef + copym->m_pkthdr.rcvif = ifp; + ip_input(copym); +#else + /* if the checksum hasn't been computed, mark it as valid */ + if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + copym->m_pkthdr.csum_data = 0xffff; + } + if_simloop(ifp, copym, dst->sin_family, 0); +#endif + } +} diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h new file mode 100644 index 0000000..e084d1d --- /dev/null +++ b/sys/netinet/ip_var.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_VAR_H_ +#define _NETINET_IP_VAR_H_ + +#include <sys/queue.h> + +/* + * Overlay for ip header used by other protocols (tcp, udp). + */ +struct ipovly { + u_char ih_x1[9]; /* (unused) */ + u_char ih_pr; /* protocol */ + u_short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; + +/* + * Ip reassembly queue structure. Each fragment + * being reassembled is attached to one of these structures. + * They are timed out after ipq_ttl drops to 0, and may also + * be reclaimed if memory becomes tight. + */ +struct ipq { + TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ + u_char ipq_ttl; /* time for reass q to live */ + u_char ipq_p; /* protocol of this fragment */ + u_short ipq_id; /* sequence id for reassembly */ + struct mbuf *ipq_frags; /* to ip headers of fragments */ + struct in_addr ipq_src,ipq_dst; +#ifdef IPDIVERT + u_int32_t ipq_div_info; /* ipfw divert port & flags */ + u_int16_t ipq_div_cookie; /* ipfw divert cookie */ +#endif +}; + +/* + * Structure stored in mbuf in inpcb.ip_options + * and passed to ip_output when ip options are in use. + * The actual length of the options (including ipopt_dst) + * is in m_len. + */ +#define MAX_IPOPTLEN 40 + +struct ipoption { + struct in_addr ipopt_dst; /* first-hop dst if source routed */ + char ipopt_list[MAX_IPOPTLEN]; /* options proper */ +}; + +/* + * Structure attached to inpcb.ip_moptions and + * passed to ip_output when IP multicast options are in use. + */ +struct ip_moptions { + struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ + struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ + u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ + u_char imo_multicast_loop; /* 1 => hear sends if a member */ + u_short imo_num_memberships; /* no. 
memberships this socket */ + struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; + u_long imo_multicast_vif; /* vif num outgoing multicasts */ +}; + +struct ipstat { + u_long ips_total; /* total packets received */ + u_long ips_badsum; /* checksum bad */ + u_long ips_tooshort; /* packet too short */ + u_long ips_toosmall; /* not enough data */ + u_long ips_badhlen; /* ip header length < data size */ + u_long ips_badlen; /* ip length < ip header length */ + u_long ips_fragments; /* fragments received */ + u_long ips_fragdropped; /* frags dropped (dups, out of space) */ + u_long ips_fragtimeout; /* fragments timed out */ + u_long ips_forward; /* packets forwarded */ + u_long ips_fastforward; /* packets fast forwarded */ + u_long ips_cantforward; /* packets rcvd for unreachable dest */ + u_long ips_redirectsent; /* packets forwarded on same net */ + u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_delivered; /* datagrams delivered to upper level*/ + u_long ips_localout; /* total ip packets generated here */ + u_long ips_odropped; /* lost packets due to nobufs, etc. */ + u_long ips_reassembled; /* total packets reassembled ok */ + u_long ips_fragmented; /* datagrams successfully fragmented */ + u_long ips_ofragments; /* output fragments created */ + u_long ips_cantfrag; /* don't fragment flag was set, etc. */ + u_long ips_badoptions; /* error in option processing */ + u_long ips_noroute; /* packets discarded due to no route */ + u_long ips_badvers; /* ip version != 4 */ + u_long ips_rawout; /* total raw ip packets generated */ + u_long ips_toolong; /* ip length > max ip packet size */ + u_long ips_notmember; /* multicasts for unregistered grps */ + u_long ips_nogif; /* no match gif found */ + u_long ips_badaddr; /* invalid address on header */ +}; + +#ifdef _KERNEL + +/* flags passed to ip_output as last parameter */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define IP_RAWOUTPUT 0x2 /* raw ip header exists */ +#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ + +struct ip; +struct inpcb; +struct route; +struct sockopt; + +extern struct ipstat ipstat; +#ifndef RANDOM_IP_ID +extern u_short ip_id; /* ip packet ctr, for ids */ +#endif +extern int ip_defttl; /* default IP ttl */ +extern int ipforwarding; /* ip forwarding */ +extern struct route ipforward_rt; /* ip forwarding cached route */ +extern u_char ip_protox[]; +extern struct socket *ip_rsvpd; /* reservation protocol daemon */ +extern struct socket *ip_mrouter; /* multicast routing daemon */ +extern int (*legal_vif_num)(int); +extern u_long (*ip_mcast_src)(int); +extern int rsvp_on; +extern struct pr_usrreqs rip_usrreqs; + +int ip_ctloutput(struct socket *, struct sockopt *sopt); +void ip_drain(void); +void ip_freemoptions(struct ip_moptions *); +void ip_init(void); +extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +int ip_output(struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *); +struct in_ifaddr * + ip_rtaddr(struct in_addr, struct route *); +void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *); +void ip_slowtimo(void); +struct mbuf * + ip_srcroute(void); +void ip_stripoptions(struct mbuf *, struct mbuf *); +#ifdef RANDOM_IP_ID +u_int16_t + ip_randomid(void); +#endif +int rip_ctloutput(struct socket *, struct sockopt *); +void rip_ctlinput(int, struct sockaddr *, void *); +void rip_init(void); +void rip_input(struct 
mbuf *, int); +int rip_output(struct mbuf *, struct socket *, u_long); +void ipip_input(struct mbuf *, int); +void rsvp_input(struct mbuf *, int); +int ip_rsvp_init(struct socket *); +int ip_rsvp_done(void); +int ip_rsvp_vif_init(struct socket *, struct sockopt *); +int ip_rsvp_vif_done(struct socket *, struct sockopt *); +void ip_rsvp_force_done(struct socket *); + +#ifdef IPDIVERT +void div_init(void); +void div_input(struct mbuf *, int); +void divert_packet(struct mbuf *m, int incoming, int port, int rule); +extern struct pr_usrreqs div_usrreqs; +#endif + +void in_delayed_cksum(struct mbuf *m); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_IP_VAR_H_ */ diff --git a/sys/netinet/ipprotosw.h b/sys/netinet/ipprotosw.h new file mode 100644 index 0000000..bdc4c73 --- /dev/null +++ b/sys/netinet/ipprotosw.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 1995, 1996, 1997, 1998, and 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)protosw.h 8.1 (Berkeley) 6/2/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IPPROTOSW_H_ +#define _NETINET_IPPROTOSW_H_ + +/* + * For pfil_head structure. + */ +#include <net/pfil.h> + +/* Forward declare these structures referenced from prototypes below. */ +struct mbuf; +struct sockaddr; +struct socket; +struct sockopt; + +struct ipprotosw { + short pr_type; /* socket type used for */ + struct domain *pr_domain; /* domain protocol a member of */ + short pr_protocol; /* protocol number */ + short pr_flags; /* see below */ +/* protocol-protocol hooks */ + pr_in_input_t *pr_input; /* input to protocol (from below) */ + pr_output_t *pr_output; /* output to protocol (from above) */ + pr_ctlinput_t *pr_ctlinput; /* control input (from below) */ + pr_ctloutput_t *pr_ctloutput; /* control output (from above) */ +/* user-protocol hook */ + void *pr_ousrreq; +/* utility hooks */ + pr_init_t *pr_init; + pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */ + pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */ + pr_drain_t *pr_drain; /* flush any excess space possible */ + + struct pr_usrreqs *pr_usrreqs; /* supersedes pr_usrreq() */ + struct pfil_head pr_pfh; +}; + +#endif /* !_NETINET_IPPROTOSW_H_ */ diff --git a/sys/netinet/libalias/HISTORY b/sys/netinet/libalias/HISTORY new file mode 100644 index 0000000..c5bca59 --- /dev/null +++ b/sys/netinet/libalias/HISTORY @@ -0,0 +1,145 @@ +$FreeBSD$ + +Version 1.0: August 11, 1996 (cjm) + +Version 1.1: August 20, 1996 (cjm) + - Host accepts incoming connections for ports 0 to 1023. + +Version 1.2: September 7, 1996 (cjm) + - Fragment handling error in alias_db.c corrected. + +Version 1.3: September 15, 1996 (cjm) + - Generalized mechanism for handling incoming + connections (no more 0 to 1023 restriction). + + - Increased ICMP support (will handle traceroute now). + + - Improved TCP close connection logic. + +Version 1.4: September 16, 1996 (cjm) + +Version 1.5: September 17, 1996 (cjm) + - Corrected error in handling incoming UDP packets + with zero checksum. + +Version 1.6: September 18, 1996 + - Simplified ICMP data storage. Will now handle + tracert from Win95 and NT as well as FreeBSD + traceroute, which uses UDP packets to non-existent + ports. + +Version 1.7: January 9, 1997 (cjm) + - Reduced malloc() activity for ICMP echo and + timestamp requests. + + - Added handling for out-of-order IP fragments. + + - Switched to differential checksum computation + for IP headers (TCP, UDP and ICMP checksums + were already differential). + + - Accepts FTP data connections from other than + port 20. This allows one ftp connections + from two hosts which are both running packet + aliasing. + + - Checksum error on FTP transfers. 
Problem + in code located by Martin Renters and + Brian Somers. + +Version 1.8: January 14, 1997 (cjm) + - Fixed data type error in function StartPoint() + in alias_db.c (this bug did not exist before v1.7) + Problem in code located by Ari Suutari. + +Version 1.9: February 1, 1997 (Eivind Eklund <perhaps@yes.no>) + - Added support for IRC DCC (ee) + + - Changed the aliasing routines to use ANSI style + throughout (ee) + + - Minor API changes for integration with other + programs than PPP (ee) + + - Fixed minor security hole in alias_ftp.c for + other applications of the aliasing software. + Hole could _not_ manifest in ppp+pktAlias, but + could potentially manifest in other applications + of the aliasing. (ee) + + - Connections initiated from packet aliasing + host machine will not have their port number + aliased unless it conflicts with an aliasing + port already being used. (There is an option + to disable this for debugging) (cjm) + + - Sockets will be allocated in cases where + there might be port interference with the + host machine. This can be disabled in cases + where the ppp host will be acting purely as a + masquerading router and not generate any + traffic of its own. + (cjm) + +Version 2.0: March, 1997 (cjm) + - Aliasing links are cleared only when a host interface address + changes. + + - PacketAliasPermanentLink() API added. + + - Option for only aliasing private, unregistered + IP addresses added. + + - Substantial rework to the aliasing lookup engine. + +Version 2.1: May, 1997 (cjm) + - Continuing rework to the aliasing lookup engine + to support multiple incoming addresses and static + NAT. PacketAliasRedirectPort() and + PacketAliasRedirectAddr() added to API. + + - Now supports outgoing as well as incoming ICMP + error messages. + +Version 2.2: July, 1997 (cjm) + - Rationalized API function names to all begin with + "PacketAlias..." Old function names are retained + for backwards compatibility. + + - Packet aliasing engine will now free memory of + fragments which are never resolved after a timeout + period. Once a fragment is resolved, it becomes + the users responsibility to free the memory. + +Version 2.3: August 11, 1997 (cjm) + - Problem associated with socket file descriptor + accumulation in alias_db.c corrected. The sockets + had to be closed when a binding failed. Problem + in code located by Gordon Burditt. + +Version 2.4: September 1, 1997 (cjm) + - PKT_ALIAS_UNREGISTERED_ONLY option repaired. + This part of the code was incorrectly re-implemented + in version 2.1. + +Version 2.5: December, 1997 (ee) + - Added PKT_ALIAS_PUNCH_FW mode for firewall + bypass of FTP/IRC DCC data connections. Also added + improved TCP connection monitoring. + +Version 2.6: May, 1998 (amurai) + - Added supporting routine for NetBios over TCP/IP. + +Version 3.0: January 1, 1999 + - Transparent proxying support added. + - PPTP redirecting support added based on patches + contributed by Dru Nelson <dnelson@redwoodsoft.com>. + +Version 3.1: May, 2000 (Erik Salander, erik@whistle.com) + - Added support to alias 227 replies, allows aliasing for + FTP servers in passive mode. + - Added support for PPTP aliasing. + +Version 3.2: July, 2000 (Erik Salander, erik@whistle.com and + Junichi Satoh, junichi@junichi.org) + - Added support for streaming media (RTSP and PNA) aliasing. 
diff --git a/sys/netinet/libalias/Makefile b/sys/netinet/libalias/Makefile new file mode 100644 index 0000000..a6f577d --- /dev/null +++ b/sys/netinet/libalias/Makefile @@ -0,0 +1,13 @@ +# $FreeBSD$ + +LIB= alias +SHLIB_MAJOR= 4 +SHLIB_MINOR= 0 +CFLAGS+= -Wall -Wmissing-prototypes +SRCS= alias.c alias_cuseeme.c alias_db.c alias_ftp.c alias_irc.c \ + alias_nbt.c alias_pptp.c alias_proxy.c alias_smedia.c \ + alias_util.c +INCS= alias.h +MAN= libalias.3 + +.include <bsd.lib.mk> diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c new file mode 100644 index 0000000..320c5c2 --- /dev/null +++ b/sys/netinet/libalias/alias.c @@ -0,0 +1,1574 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */ + +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + Alias.c provides supervisory control for the functions of the + packet aliasing software. It consists of routines to monitor + TCP connection state, protocol-specific aliasing routines, + fragment handling and the following outside world functional + interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn, + PacketAliasIn and PacketAliasOut. + + The other C program files are briefly described. The data + structure framework which holds information needed to translate + packets is encapsulated in alias_db.c. Data is accessed by + function calls, so other segments of the program need not know + about the underlying data structures. Alias_ftp.c contains + special code for modifying the ftp PORT command used to establish + data connections, while alias_irc.c does the same for IRC + DCC. Alias_util.c contains a few utility routines. + + Version 1.0 August, 1996 (cjm) + + Version 1.1 August 20, 1996 (cjm) + PPP host accepts incoming connections for ports 0 to 1023. + (Gary Roberts pointed out the need to handle incoming + connections.) + + Version 1.2 September 7, 1996 (cjm) + Fragment handling error in alias_db.c corrected. + (Tom Torrance helped fix this problem.) 
+ + Version 1.4 September 16, 1996 (cjm) + - A more generalized method for handling incoming + connections, without the 0-1023 restriction, is + implemented in alias_db.c + - Improved ICMP support in alias.c. Traceroute + packet streams can now be correctly aliased. + - TCP connection closing logic simplified in + alias.c and now allows for additional 1 minute + "grace period" after FIN or RST is observed. + + Version 1.5 September 17, 1996 (cjm) + Corrected error in handling incoming UDP packets with 0 checksum. + (Tom Torrance helped fix this problem.) + + Version 1.6 September 18, 1996 (cjm) + Simplified ICMP aliasing scheme. Should now support + traceroute from Win95 as well as FreeBSD. + + Version 1.7 January 9, 1997 (cjm) + - Out-of-order fragment handling. + - IP checksum error fixed for ftp transfers + from aliasing host. + - Integer return codes added to all + aliasing/de-aliasing functions. + - Some obsolete comments cleaned up. + - Differential checksum computations for + IP header (TCP, UDP and ICMP were already + differential). + + Version 2.1 May 1997 (cjm) + - Added support for outgoing ICMP error + messages. + - Added two functions PacketAliasIn2() + and PacketAliasOut2() for dynamic address + control (e.g. round-robin allocation of + incoming packets). + + Version 2.2 July 1997 (cjm) + - Rationalized API function names to begin + with "PacketAlias..." + - Eliminated PacketAliasIn2() and + PacketAliasOut2() as poorly conceived. + + Version 2.3 Dec 1998 (dillon) + - Major bounds checking additions, see FreeBSD/CVS + + Version 3.1 May, 2000 (salander) + - Added hooks to handle PPTP. + + Version 3.2 July, 2000 (salander and satoh) + - Added PacketUnaliasOut routine. + - Added hooks to handle RTSP/RTP. + + See HISTORY file for additional revisions. +*/ + +#include <sys/types.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include <stdio.h> + +#include "alias_local.h" +#include "alias.h" + +#define NETBIOS_NS_PORT_NUMBER 137 +#define NETBIOS_DGM_PORT_NUMBER 138 +#define FTP_CONTROL_PORT_NUMBER 21 +#define IRC_CONTROL_PORT_NUMBER_1 6667 +#define IRC_CONTROL_PORT_NUMBER_2 6668 +#define CUSEEME_PORT_NUMBER 7648 +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define TFTP_PORT_NUMBER 69 +#define PPTP_CONTROL_PORT_NUMBER 1723 + + + + +/* TCP Handling Routines + + TcpMonitorIn() -- These routines monitor TCP connections, and + TcpMonitorOut() delete a link when a connection is closed. + +These routines look for SYN, FIN and RST flags to determine when TCP +connections open and close. When a TCP connection closes, the data +structure containing packet aliasing information is deleted after +a timeout period. 
+*/ + +/* Local prototypes */ +static void TcpMonitorIn(struct ip *, struct alias_link *); + +static void TcpMonitorOut(struct ip *, struct alias_link *); + + +static void +TcpMonitorIn(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateIn(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateIn(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + +static void +TcpMonitorOut(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateOut(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateOut(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + + + + + +/* Protocol Specific Packet Aliasing Routines + + IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2() + IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2() + ProtoAliasIn(), ProtoAliasOut() + UdpAliasIn(), UdpAliasOut() + TcpAliasIn(), TcpAliasOut() + +These routines handle protocol specific details of packet aliasing. +One may observe a certain amount of repetitive arithmetic in these +functions, the purpose of which is to compute a revised checksum +without actually summing over the entire data packet, which could be +unnecessarily time consuming. + +The purpose of the packet aliasing routines is to replace the source +address of the outgoing packet and then correctly put it back for +any incoming packets. For TCP and UDP, ports are also re-mapped. + +For ICMP echo/timestamp requests and replies, the following scheme +is used: the ID number is replaced by an alias for the outgoing +packet. + +ICMP error messages are handled by looking at the IP fragment +in the data section of the message. + +For TCP and UDP protocols, a port number is chosen for an outgoing +packet, and then incoming packets are identified by IP address and +port numbers. For TCP packets, there is additional logic in the event +that sequence and ACK numbers have been altered (as in the case for +FTP data port commands). + +The port numbers used by the packet aliasing module are not true +ports in the Unix sense. No sockets are actually bound to ports. +They are more correctly thought of as placeholders. + +All packets go through the aliasing mechanism, whether they come from +the gateway machine or other machines on a local area network. +*/ + + +/* Local prototypes */ +static int IcmpAliasIn1(struct ip *); +static int IcmpAliasIn2(struct ip *); +static int IcmpAliasIn (struct ip *); + +static int IcmpAliasOut1(struct ip *); +static int IcmpAliasOut2(struct ip *); +static int IcmpAliasOut (struct ip *); + +static int ProtoAliasIn(struct ip *); +static int ProtoAliasOut(struct ip *); + +static int UdpAliasOut(struct ip *); +static int UdpAliasIn (struct ip *); + +static int TcpAliasOut(struct ip *, int); +static int TcpAliasIn (struct ip *); + + +static int +IcmpAliasIn1(struct ip *pip) +{ +/* + De-alias incoming echo and timestamp replies. + Alias incoming echo and timestamp requests. 
+*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Get source address from ICMP data field and restore original data */ + link = FindIcmpIn(pip->ip_src, pip->ip_dst, ic->icmp_id, 1); + if (link != NULL) + { + u_short original_id; + int accumulate; + + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Put original sequence number back in */ + ic->icmp_id = original_id; + +/* Put original address back into IP header */ + { + struct in_addr original_address; + + original_address = GetOriginalAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +IcmpAliasIn2(struct ip *pip) +{ +/* + Alias incoming ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = &ic->icmp_ip; + + ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2)); + tc = (struct tcphdr *) ud; + ic2 = (struct icmp *) ud; + + if (ip->ip_p == IPPROTO_UDP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + link = FindIcmpIn(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + link = NULL; + } else + link = NULL; + + if (link != NULL) + { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(link); + original_port = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_src = original_address; + ud->uh_sport = original_port; + } + else if (ip->ip_p == IPPROTO_ICMP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(link); + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic2->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address of original IP packet and sequence number of + embedded 
ICMP datagram */ + ip->ip_src = original_address; + ic2->icmp_id = original_id; + } + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasIn(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasIn1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasIn2(pip); + break; + case ICMP_ECHO: + case ICMP_TSTAMP: + iresult = IcmpAliasIn1(pip); + break; + } + return(iresult); +} + + +static int +IcmpAliasOut1(struct ip *pip) +{ +/* + Alias outgoing echo and timestamp requests. + De-alias outgoing echo and timestamp replies. +*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Save overwritten data for when echo packet returns */ + link = FindIcmpOut(pip->ip_src, pip->ip_dst, ic->icmp_id, 1); + if (link != NULL) + { + u_short alias_id; + int accumulate; + + alias_id = GetAliasPort(link); + +/* Since data field is being modified, adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Alias sequence number */ + ic->icmp_id = alias_id; + +/* Change source address */ + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut2(struct ip *pip) +{ +/* + Alias outgoing ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = &ic->icmp_ip; + + ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2)); + tc = (struct tcphdr *) ud; + ic2 = (struct icmp *) ud; + + if (ip->ip_p == IPPROTO_UDP) + link = FindUdpTcpOut(ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + link = FindUdpTcpOut(ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + link = FindIcmpOut(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + link = NULL; + } else + link = NULL; + + if (link != NULL) + { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr alias_address; + u_short alias_port; + + alias_address = GetAliasAddress(link); + alias_port = GetAliasPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ud->uh_dport; + accumulate -= alias_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original TCP/UDP packet was destined for. 
+ */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + +/* Alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_dst = alias_address; + ud->uh_dport = alias_port; + } + else if (ip->ip_p == IPPROTO_ICMP) + { + u_short *sptr; + int accumulate; + struct in_addr alias_address; + u_short alias_id; + + alias_address = GetAliasAddress(link); + alias_id = GetAliasPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic2->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original ICMP message was destined for. + */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + +/* Alias address of original IP packet and sequence number of + embedded ICMP datagram */ + ip->ip_dst = alias_address; + ic2->icmp_id = alias_id; + } + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHO: + case ICMP_TSTAMP: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasOut1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasOut2(pip); + break; + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + iresult = IcmpAliasOut1(pip); + } + return(iresult); +} + + + +static int +ProtoAliasIn(struct ip *pip) +{ +/* + Handle incoming IP packets. The + only thing which is done in this case is to alias + the dest IP address of the packet to our inside + machine. +*/ + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + link = FindProtoIn(pip->ip_src, pip->ip_dst, pip->ip_p); + if (link != NULL) + { + struct in_addr original_address; + + original_address = GetOriginalAddress(link); + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +ProtoAliasOut(struct ip *pip) +{ +/* + Handle outgoing IP packets. The + only thing which is done in this case is to alias + the source IP address of the packet. 
+*/ + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + link = FindProtoOut(pip->ip_src, pip->ip_dst, pip->ip_p); + if (link != NULL) + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +UdpAliasIn(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, 1); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + u_short alias_port; + int accumulate; + u_short *sptr; + int r = 0; + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + alias_port = ud->uh_dport; + ud->uh_dport = GetOriginalPort(link); + +/* Special processing for IP encoding protocols */ + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeIn(pip, original_address); +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER) + r = AliasHandleUdpNbt(pip, link, &original_address, ud->uh_dport); + else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER) + r = AliasHandleUdpNbtNS(pip, link, &alias_address, &alias_port, + &original_address, &ud->uh_dport); + +/* If UDP checksum is not zero, then adjust since destination port */ +/* is being unaliased and destination address is being altered. */ + if (ud->uh_sum != 0) + { + accumulate = alias_port; + accumulate -= ud->uh_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + /* + * If we cannot figure out the packet, ignore it. 
+ */ + if (r < 0) + return(PKT_ALIAS_IGNORED); + else + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +UdpAliasOut(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, 1); + if (link != NULL) + { + u_short alias_port; + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + alias_port = GetAliasPort(link); + +/* Special processing for IP encoding protocols */ + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeOut(pip, link); +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER) + AliasHandleUdpNbt(pip, link, &alias_address, alias_port); + else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER) + AliasHandleUdpNbtNS(pip, link, &pip->ip_src, &ud->uh_sport, + &alias_address, &alias_port); +/* + * We don't know in advance what TID the TFTP server will choose, + * so we create a wilcard link (destination port is unspecified) + * that will match any TID from a given destination. + */ + else if (ntohs(ud->uh_dport) == TFTP_PORT_NUMBER) + FindRtspOut(pip->ip_src, pip->ip_dst, + ud->uh_sport, alias_port, IPPROTO_UDP); + +/* If UDP checksum is not zero, adjust since source port is */ +/* being aliased and source address is being altered */ + if (ud->uh_sum != 0) + { + int accumulate; + u_short *sptr; + + accumulate = ud->uh_sport; + accumulate -= alias_port; + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } + +/* Put alias port in UDP header */ + ud->uh_sport = alias_port; + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + +static int +TcpAliasIn(struct ip *pip) +{ + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, + !(packetAliasMode & PKT_ALIAS_PROXY_ONLY)); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + struct in_addr proxy_address; + u_short alias_port; + u_short proxy_port; + int accumulate; + u_short *sptr; + +/* Special processing for IP encoding protocols */ + if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER) + AliasHandlePptpIn(pip, link); + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + proxy_address = GetProxyAddress(link); + alias_port = tc->th_dport; + tc->th_dport = GetOriginalPort(link); + proxy_port = GetProxyPort(link); + +/* Adjust TCP checksum since destination port is being unaliased */ +/* and destination port is being altered. 
*/ + accumulate = alias_port; + accumulate -= tc->th_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a proxy, then modify the TCP source port and + checksum accumulation */ + if (proxy_port != 0) + { + accumulate += tc->th_sport; + tc->th_sport = proxy_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_address; + accumulate -= *sptr++; + accumulate -= *sptr; + } + +/* See if ACK number needs to be modified */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaAckIn(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_ack; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_ack = htonl(ntohl(tc->th_ack) - delta); + sptr = (u_short *) &tc->th_ack; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Restore original IP address */ + sptr = (u_short *) &pip->ip_dst; + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = original_address; + sptr = (u_short *) &pip->ip_dst; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a transparent proxy packet, then modify the source + address */ + if (proxy_address.s_addr != 0) + { + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + pip->ip_src = proxy_address; + sptr = (u_short *) &pip->ip_src; + accumulate -= *sptr++; + accumulate -= *sptr; + } + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + +/* Monitor TCP connection state */ + TcpMonitorIn(pip, link); + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +TcpAliasOut(struct ip *pip, int maxpacketsize) +{ + int proxy_type; + u_short dest_port; + u_short proxy_server_port; + struct in_addr dest_address; + struct in_addr proxy_server_address; + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + proxy_type = ProxyCheck(pip, &proxy_server_address, &proxy_server_port); + + if (proxy_type == 0 && (packetAliasMode & PKT_ALIAS_PROXY_ONLY)) + return PKT_ALIAS_OK; + +/* If this is a transparent proxy, save original destination, + then alter the destination and adjust checksums */ + dest_port = tc->th_dport; + dest_address = pip->ip_dst; + if (proxy_type != 0) + { + int accumulate; + u_short *sptr; + + accumulate = tc->th_dport; + tc->th_dport = proxy_server_port; + accumulate -= tc->th_dport; + + sptr = (u_short *) &(pip->ip_dst); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_server_address; + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + sptr = (u_short *) &(pip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = proxy_server_address; + sptr = (u_short *) &(pip->ip_dst); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, 1); + if (link !=NULL) + { + u_short alias_port; + struct in_addr alias_address; + int accumulate; + u_short *sptr; + +/* Save original destination address, if this is a proxy packet. + Also modify packet to include destination encoding. This may + change the size of IP header. 
*/ + if (proxy_type != 0) + { + SetProxyPort(link, dest_port); + SetProxyAddress(link, dest_address); + ProxyModify(link, pip, maxpacketsize, proxy_type); + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + } + +/* Get alias address and port */ + alias_port = GetAliasPort(link); + alias_address = GetAliasAddress(link); + +/* Monitor TCP connection state */ + TcpMonitorOut(pip, link); + +/* Special processing for IP encoding protocols */ + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == FTP_CONTROL_PORT_NUMBER) + AliasHandleFtpOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_2) + AliasHandleIrcOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2 + || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_2) + AliasHandleRtspOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER) + AliasHandlePptpOut(pip, link); + +/* Adjust TCP checksum since source port is being aliased */ +/* and source address is being altered */ + accumulate = tc->th_sport; + tc->th_sport = alias_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* Modify sequence number if necessary */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaSeqOut(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_seq; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_seq = htonl(ntohl(tc->th_seq) + delta); + sptr = (u_short *) &tc->th_seq; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Change source address */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_src = alias_address; + sptr = (u_short *) &(pip->ip_src); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + + +/* Fragment Handling + + FragmentIn() + FragmentOut() + +The packet aliasing module has a limited ability for handling IP +fragments. If the ICMP, TCP or UDP header is in the first fragment +received, then the ID number of the IP packet is saved, and other +fragments are identified according to their ID number and IP address +they were sent from. Pointers to unresolved fragments can also be +saved and recalled when a header fragment is seen. 
+*/ + +/* Local prototypes */ +static int FragmentIn(struct ip *); +static int FragmentOut(struct ip *); + + +static int +FragmentIn(struct ip *pip) +{ + struct alias_link *link; + + link = FindFragmentIn2(pip->ip_src, pip->ip_dst, pip->ip_id); + if (link != NULL) + { + struct in_addr original_address; + + GetFragmentAddr(link, &original_address); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_UNRESOLVED_FRAGMENT); +} + + +static int +FragmentOut(struct ip *pip) +{ + struct in_addr alias_address; + + alias_address = FindAliasAddress(pip->ip_src); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); +} + + + + + + +/* Outside World Access + + PacketAliasSaveFragment() + PacketAliasGetFragment() + PacketAliasFragmentIn() + PacketAliasIn() + PacketAliasOut() + PacketUnaliasOut() + +(prototypes in alias.h) +*/ + + +int +PacketAliasSaveFragment(char *ptr) +{ + int iresult; + struct alias_link *link; + struct ip *pip; + + pip = (struct ip *) ptr; + link = AddFragmentPtrLink(pip->ip_src, pip->ip_id); + iresult = PKT_ALIAS_ERROR; + if (link != NULL) + { + SetFragmentPtr(link, ptr); + iresult = PKT_ALIAS_OK; + } + return(iresult); +} + + +char * +PacketAliasGetFragment(char *ptr) +{ + struct alias_link *link; + char *fptr; + struct ip *pip; + + pip = (struct ip *) ptr; + link = FindFragmentPtr(pip->ip_src, pip->ip_id); + if (link != NULL) + { + GetFragmentPtr(link, &fptr); + SetFragmentPtr(link, NULL); + SetExpire(link, 0); /* Deletes link */ + + return(fptr); + } + else + { + return(NULL); + } +} + + +void +PacketAliasFragmentIn(char *ptr, /* Points to correctly de-aliased + header fragment */ + char *ptr_fragment /* Points to fragment which must + be de-aliased */ + ) +{ + struct ip *pip; + struct ip *fpip; + + pip = (struct ip *) ptr; + fpip = (struct ip *) ptr_fragment; + + DifferentialChecksum(&fpip->ip_sum, + (u_short *) &pip->ip_dst, + (u_short *) &fpip->ip_dst, + 2); + fpip->ip_dst = pip->ip_dst; +} + + +int +PacketAliasIn(char *ptr, int maxpacketsize) +{ + struct in_addr alias_addr; + struct ip *pip; + int iresult; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasOut(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + alias_addr = pip->ip_dst; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + iresult = PKT_ALIAS_IGNORED; + if ( (ntohs(pip->ip_off) & IP_OFFMASK) == 0 ) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasIn(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasIn(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasIn(pip); + break; + case IPPROTO_GRE: + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY || + AliasHandlePptpGreIn(pip) == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasIn(pip); + break; + default: + iresult = ProtoAliasIn(pip); + break; + } + + if (ntohs(pip->ip_off) & IP_MF) + { + struct alias_link *link; + + link = FindFragmentIn1(pip->ip_src, alias_addr, pip->ip_id); + if (link != NULL) + { + iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT; + SetFragmentAddr(link, pip->ip_dst); + } + else + { + iresult = PKT_ALIAS_ERROR; + } + } + } + 
else + { + iresult = FragmentIn(pip); + } + + return(iresult); +} + + + +/* Unregistered address ranges */ + +/* 10.0.0.0 -> 10.255.255.255 */ +#define UNREG_ADDR_A_LOWER 0x0a000000 +#define UNREG_ADDR_A_UPPER 0x0affffff + +/* 172.16.0.0 -> 172.31.255.255 */ +#define UNREG_ADDR_B_LOWER 0xac100000 +#define UNREG_ADDR_B_UPPER 0xac1fffff + +/* 192.168.0.0 -> 192.168.255.255 */ +#define UNREG_ADDR_C_LOWER 0xc0a80000 +#define UNREG_ADDR_C_UPPER 0xc0a8ffff + +int +PacketAliasOut(char *ptr, /* valid IP packet */ + int maxpacketsize /* How much the packet data may grow + (FTP and IRC inline changes) */ + ) +{ + int iresult; + struct in_addr addr_save; + struct ip *pip; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasIn(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + addr_save = GetDefaultAliasAddress(); + if (packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY) + { + u_long addr; + int iclass; + + iclass = 0; + addr = ntohl(pip->ip_src.s_addr); + if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER) + iclass = 3; + else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER) + iclass = 2; + else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER) + iclass = 1; + + if (iclass == 0) + { + SetDefaultAliasAddress(pip->ip_src); + } + } + + iresult = PKT_ALIAS_IGNORED; + if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasOut(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasOut(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasOut(pip, maxpacketsize); + break; + case IPPROTO_GRE: + if (AliasHandlePptpGreOut(pip) == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasOut(pip); + break; + default: + iresult = ProtoAliasOut(pip); + break; + } + } + else + { + iresult = FragmentOut(pip); + } + + SetDefaultAliasAddress(addr_save); + return(iresult); +} + +int +PacketUnaliasOut(char *ptr, /* valid IP packet */ + int maxpacketsize /* for error checking */ + ) +{ + struct ip *pip; + struct icmp *ic; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + int iresult = PKT_ALIAS_IGNORED; + + pip = (struct ip *) ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return(iresult); + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + tc = (struct tcphdr *) ud; + ic = (struct icmp *) ud; + + /* Find a link */ + if (pip->ip_p == IPPROTO_UDP) + link = FindUdpTcpIn(pip->ip_dst, pip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (pip->ip_p == IPPROTO_TCP) + link = FindUdpTcpIn(pip->ip_dst, pip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (pip->ip_p == IPPROTO_ICMP) + link = FindIcmpIn(pip->ip_dst, pip->ip_src, ic->icmp_id, 0); + else + link = NULL; + + /* Change it from an aliased packet to an unaliased packet */ + if (link != NULL) + { + if (pip->ip_p == IPPROTO_UDP || pip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(link); + original_port = GetOriginalPort(link); + + /* Adjust TCP/UDP checksum */ + sptr = (u_short *) &(pip->ip_src); + accumulate = 
*sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + + if (pip->ip_p == IPPROTO_UDP) { + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } else { + accumulate += tc->th_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + } + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_src, + 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + if (pip->ip_p == IPPROTO_UDP) + ud->uh_sport = original_port; + else + tc->th_sport = original_port; + + iresult = PKT_ALIAS_OK; + + } else if (pip->ip_p == IPPROTO_ICMP) { + + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(link); + original_id = GetOriginalPort(link); + + /* Adjust ICMP checksum */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_src, + 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + ic->icmp_id = original_id; + + iresult = PKT_ALIAS_OK; + } + } + return(iresult); + +} diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h new file mode 100644 index 0000000..9df8929 --- /dev/null +++ b/sys/netinet/libalias/alias.h @@ -0,0 +1,182 @@ +/* lint -save -library Flexelint comment for external headers */ + +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Alias.h defines the outside world interfaces for the packet aliasing + * software. + * + * This software is placed into the public domain with no restrictions on its + * distribution. + */ + +#ifndef _ALIAS_H_ +#define _ALIAS_H_ + +/* The external interface to libalias, the packet aliasing engine. 
*/ + +/* Initialization and control functions. */ +void PacketAliasInit(void); +void PacketAliasSetAddress(struct in_addr _addr); +void PacketAliasSetFWBase(unsigned int _base, unsigned int _num); +unsigned int + PacketAliasSetMode(unsigned int _flags, unsigned int _mask); +void PacketAliasUninit(void); + +/* Packet Handling functions. */ +int PacketAliasIn(char *_ptr, int _maxpacketsize); +int PacketAliasOut(char *_ptr, int _maxpacketsize); +int PacketUnaliasOut(char *_ptr, int _maxpacketsize); + +/* Port and address redirection functions. */ + +/* + * An anonymous structure, a pointer to which is returned from + * PacketAliasRedirectAddr(), PacketAliasRedirectPort() or + * PacketAliasRedirectProto(), passed to PacketAliasAddServer(), + * and freed by PacketAliasRedirectDelete(). + */ +struct alias_link; + +int PacketAliasAddServer(struct alias_link *_link, + struct in_addr _addr, unsigned short _port); +struct alias_link * + PacketAliasRedirectAddr(struct in_addr _src_addr, + struct in_addr _alias_addr); +void PacketAliasRedirectDelete(struct alias_link *_link); +struct alias_link * + PacketAliasRedirectPort(struct in_addr _src_addr, + unsigned short _src_port, struct in_addr _dst_addr, + unsigned short _dst_port, struct in_addr _alias_addr, + unsigned short _alias_port, unsigned char _proto); +struct alias_link * + PacketAliasRedirectProto(struct in_addr _src_addr, + struct in_addr _dst_addr, struct in_addr _alias_addr, + unsigned char _proto); + +/* Fragment Handling functions. */ +void PacketAliasFragmentIn(char *_ptr, char *_ptr_fragment); +char *PacketAliasGetFragment(char *_ptr); +int PacketAliasSaveFragment(char *_ptr); + +/* Miscellaneous functions. */ +int PacketAliasCheckNewLink(void); +unsigned short + PacketAliasInternetChecksum(unsigned short *_ptr, int _nbytes); +void PacketAliasSetTarget(struct in_addr _target_addr); + +/* Transparent proxying routines. */ +int PacketAliasProxyRule(const char *_cmd); + +/* Mode flags, set using PacketAliasSetMode() */ + +/* + * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log + * every time a link is created or deleted. This is useful for debugging. + */ +#define PKT_ALIAS_LOG 0x01 + +/* + * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp, + * telnet or web servers will be prevented by the aliasing mechanism. + */ +#define PKT_ALIAS_DENY_INCOMING 0x02 + +/* + * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the + * same port as they originated on. This allows e.g. rsh to work *99% of the + * time*, but _not_ 100% (it will be slightly flakey instead of not working + * at all). This mode bit is set by PacketAliasInit(), so it is a default + * mode of operation. + */ +#define PKT_ALIAS_SAME_PORTS 0x04 + +/* + * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g. + * destination port and/or address is zero), the packet aliasing engine will + * attempt to allocate a socket for the aliasing port it chooses. This will + * avoid interference with the host machine. Fully specified links do not + * require this. This bit is set after a call to PacketAliasInit(), so it is + * a default mode of operation. + */ +#define PKT_ALIAS_USE_SOCKETS 0x08 + +/*- + * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with + * unregistered source addresses will be aliased. 
Private + * addresses are those in the following ranges: + * + * 10.0.0.0 -> 10.255.255.255 + * 172.16.0.0 -> 172.31.255.255 + * 192.168.0.0 -> 192.168.255.255 + */ +#define PKT_ALIAS_UNREGISTERED_ONLY 0x10 + +/* + * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic + * aliasing links will be reset whenever PacketAliasSetAddress() changes the + * default aliasing address. If the default aliasing address is left + * unchanged by this function call, then the table of dynamic aliasing links + * will be left intact. This bit is set after a call to PacketAliasInit(). + */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + +#ifndef NO_FW_PUNCH +/* + * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will + * create a 'hole' in the firewall to allow the transfers to work. The + * ipfw rule number that the hole is created with is controlled by + * PacketAliasSetFWBase(). The hole will be attached to that + * particular alias_link, so when the link goes away the hole is deleted. + */ +#define PKT_ALIAS_PUNCH_FW 0x100 +#endif + +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + +/* Function return codes. */ +#define PKT_ALIAS_ERROR -1 +#define PKT_ALIAS_OK 1 +#define PKT_ALIAS_IGNORED 2 +#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3 +#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4 + +#endif /* !_ALIAS_H_ */ + +/* lint -restore */ diff --git a/sys/netinet/libalias/alias_cuseeme.c b/sys/netinet/libalias/alias_cuseeme.c new file mode 100644 index 0000000..2c0587e --- /dev/null +++ b/sys/netinet/libalias/alias_cuseeme.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> + * with the aid of code written by + * Junichi SATOH <junichi@astec.co.jp> 1996, 1997. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> + +#include "alias_local.h" + +/* CU-SeeMe Data Header */ +struct cu_header { + u_int16_t dest_family; + u_int16_t dest_port; + u_int32_t dest_addr; + int16_t family; + u_int16_t port; + u_int32_t addr; + u_int32_t seq; + u_int16_t msg; + u_int16_t data_type; + u_int16_t packet_len; +}; + +/* Open Continue Header */ +struct oc_header { + u_int16_t client_count; /* Number of client info structs */ + u_int32_t seq_no; + char user_name[20]; + char reserved[4]; /* flags, version stuff, etc */ +}; + +/* client info structures */ +struct client_info { + u_int32_t address; /* Client address */ + char reserved[8]; /* Flags, pruning bitfield, packet counts etc */ +}; + +void +AliasHandleCUSeeMeOut(struct ip *pip, struct alias_link *link) +{ + struct udphdr *ud; + + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + if (ntohs(ud->uh_ulen) - sizeof(struct udphdr) >= sizeof(struct cu_header)) { + struct cu_header *cu; + struct alias_link *cu_link; + + cu = (struct cu_header *)(ud + 1); + if (cu->addr) + cu->addr = (u_int32_t)GetAliasAddress(link).s_addr; + + cu_link = FindUdpTcpOut(pip->ip_src, GetDestAddress(link), + ud->uh_dport, 0, IPPROTO_UDP, 1); + +#ifndef NO_FW_PUNCH + if (cu_link) + PunchFWHole(cu_link); +#endif + } +} + +void +AliasHandleCUSeeMeIn(struct ip *pip, struct in_addr original_addr) +{ + struct in_addr alias_addr; + struct udphdr *ud; + struct cu_header *cu; + struct oc_header *oc; + struct client_info *ci; + char *end; + int i; + + alias_addr.s_addr = pip->ip_dst.s_addr; + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + cu = (struct cu_header *)(ud + 1); + oc = (struct oc_header *)(cu + 1); + ci = (struct client_info *)(oc + 1); + end = (char *)ud + ntohs(ud->uh_ulen); + + if ((char *)oc <= end) { + if(cu->dest_addr) + cu->dest_addr = (u_int32_t)original_addr.s_addr; + if(ntohs(cu->data_type) == 101) + /* Find and change our address */ + for(i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++) + if(ci->address == (u_int32_t)alias_addr.s_addr) { + ci->address = (u_int32_t)original_addr.s_addr; + break; + } + } +} diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c new file mode 100644 index 0000000..52384b3 --- /dev/null +++ b/sys/netinet/libalias/alias_db.c @@ -0,0 +1,2812 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */ + +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + Alias_db.c encapsulates all data structures used for storing + packet aliasing data. Other parts of the aliasing software + access data through functions provided in this file. + + Data storage is based on the notion of a "link", which is + established for ICMP echo/reply packets, UDP datagrams and + TCP stream connections. A link stores the original source + and destination addresses. For UDP and TCP, it also stores + source and destination port numbers, as well as an alias + port number. Links are also used to store information about + fragments. + + There is a facility for sweeping through and deleting old + links as new packets are sent through. A simple timeout is + used for ICMP and UDP links. TCP links are left alone unless + there is an incomplete connection, in which case the link + can be deleted after a certain amount of time. + + + Initial version: August, 1996 (cjm) + + Version 1.4: September 16, 1996 (cjm) + Facility for handling incoming links added. + + Version 1.6: September 18, 1996 (cjm) + ICMP data handling simplified. + + Version 1.7: January 9, 1997 (cjm) + Fragment handling simplified. + Saves pointers for unresolved fragments. + Permits links for unspecified remote ports + or unspecified remote addresses. + Fixed bug which did not properly zero port + table entries after a link was deleted. + Cleaned up some obsolete comments. + + Version 1.8: January 14, 1997 (cjm) + Fixed data type error in StartPoint(). + (This error did not exist prior to v1.7 + and was discovered and fixed by Ari Suutari) + + Version 1.9: February 1, 1997 + Optionally, connections initiated from packet aliasing host + machine will will not have their port number aliased unless it + conflicts with an aliasing port already being used. (cjm) + + All options earlier being #ifdef'ed are now available through + a new interface, SetPacketAliasMode(). This allows run time + control (which is now available in PPP+pktAlias through the + 'alias' keyword). (ee) + + Added ability to create an alias port without + either destination address or port specified. + port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee) + + Removed K&R style function headers + and general cleanup. (ee) + + Added packetAliasMode to replace compiler #defines's (ee) + + Allocates sockets for partially specified + ports if ALIAS_USE_SOCKETS defined. (cjm) + + Version 2.0: March, 1997 + SetAliasAddress() will now clean up alias links + if the aliasing address is changed. (cjm) + + PacketAliasPermanentLink() function added to support permanent + links. (J. Fortes suggested the need for this.) + Examples: + + (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port + + (192.168.0.2, port 21) <-> alias port 3604, known dest addr + unknown dest port + + These permanent links allow for incoming connections to + machines on the local network. They can be given with a + user-chosen amount of specificity, with increasing specificity + meaning more security. 
(cjm) + + Quite a bit of rework to the basic engine. The portTable[] + array, which kept track of which ports were in use was replaced + by a table/linked list structure. (cjm) + + SetExpire() function added. (cjm) + + DeleteLink() no longer frees memory association with a pointer + to a fragment (this bug was first recognized by E. Eklund in + v1.9). + + Version 2.1: May, 1997 (cjm) + Packet aliasing engine reworked so that it can handle + multiple external addresses rather than just a single + host address. + + PacketAliasRedirectPort() and PacketAliasRedirectAddr() + added to the API. The first function is a more generalized + version of PacketAliasPermanentLink(). The second function + implements static network address translation. + + Version 3.2: July, 2000 (salander and satoh) + Added FindNewPortGroup to get contiguous range of port values. + + Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing + link but not actually add one. + + Added FindRtspOut, which is closely derived from FindUdpTcpOut, + except that the alias port (from FindNewPortGroup) is provided + as input. + + See HISTORY file for additional revisions. +*/ + + +/* System include files */ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> + +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> + +/* BSD network include files */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include "alias.h" +#include "alias_local.h" + + + +/* + Constants (note: constants are also defined + near relevant functions or structs) +*/ + +/* Sizes of input and output link tables */ +#define LINK_TABLE_OUT_SIZE 101 +#define LINK_TABLE_IN_SIZE 4001 + +/* Parameters used for cleanup of expired links */ +#define ALIAS_CLEANUP_INTERVAL_SECS 60 +#define ALIAS_CLEANUP_MAX_SPOKES 30 + +/* Timeouts (in seconds) for different link types */ +#define ICMP_EXPIRE_TIME 60 +#define UDP_EXPIRE_TIME 60 +#define PROTO_EXPIRE_TIME 60 +#define FRAGMENT_ID_EXPIRE_TIME 10 +#define FRAGMENT_PTR_EXPIRE_TIME 30 + +/* TCP link expire time for different cases */ +/* When the link has been used and closed - minimal grace time to + allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */ +#ifndef TCP_EXPIRE_DEAD +# define TCP_EXPIRE_DEAD 10 +#endif + +/* When the link has been used and closed on one side - the other side + is allowed to still send data */ +#ifndef TCP_EXPIRE_SINGLEDEAD +# define TCP_EXPIRE_SINGLEDEAD 90 +#endif + +/* When the link isn't yet up */ +#ifndef TCP_EXPIRE_INITIAL +# define TCP_EXPIRE_INITIAL 300 +#endif + +/* When the link is up */ +#ifndef TCP_EXPIRE_CONNECTED +# define TCP_EXPIRE_CONNECTED 86400 +#endif + + +/* Dummy port number codes used for FindLinkIn/Out() and AddLink(). + These constants can be anything except zero, which indicates an + unknown port number. */ + +#define NO_DEST_PORT 1 +#define NO_SRC_PORT 1 + + + +/* Data Structures + + The fundamental data structure used in this program is + "struct alias_link". Whenever a TCP connection is made, + a UDP datagram is sent out, or an ICMP echo request is made, + a link record is made (if it has not already been created). + The link record is identified by the source address/port + and the destination address/port. In the case of an ICMP + echo request, the source port is treated as being equivalent + with the 16-bit ID number of the ICMP packet. + + The link record also can store some auxiliary data. 
For + TCP connections that have had sequence and acknowledgment + modifications, data space is available to track these changes. + A state field is used to keep track in changes to the TCP + connection state. ID numbers of fragments can also be + stored in the auxiliary space. Pointers to unresolved + fragments can also be stored. + + The link records support two independent chainings. Lookup + tables for input and out tables hold the initial pointers + the link chains. On input, the lookup table indexes on alias + port and link type. On output, the lookup table indexes on + source address, destination address, source port, destination + port and link type. +*/ + +struct ack_data_record /* used to save changes to ACK/sequence numbers */ +{ + u_long ack_old; + u_long ack_new; + int delta; + int active; +}; + +struct tcp_state /* Information about TCP connection */ +{ + int in; /* State for outside -> inside */ + int out; /* State for inside -> outside */ + int index; /* Index to ACK data array */ + int ack_modified; /* Indicates whether ACK and sequence numbers */ + /* been modified */ +}; + +#define N_LINK_TCP_DATA 3 /* Number of distinct ACK number changes + saved for a modified TCP stream */ +struct tcp_dat +{ + struct tcp_state state; + struct ack_data_record ack[N_LINK_TCP_DATA]; + int fwhole; /* Which firewall record is used for this hole? */ +}; + +struct server /* LSNAT server pool (circular list) */ +{ + struct in_addr addr; + u_short port; + struct server *next; +}; + +struct alias_link /* Main data structure */ +{ + struct in_addr src_addr; /* Address and port information */ + struct in_addr dst_addr; + struct in_addr alias_addr; + struct in_addr proxy_addr; + u_short src_port; + u_short dst_port; + u_short alias_port; + u_short proxy_port; + struct server *server; + + int link_type; /* Type of link: TCP, UDP, ICMP, proto, frag */ + +/* values for link_type */ +#define LINK_ICMP IPPROTO_ICMP +#define LINK_UDP IPPROTO_UDP +#define LINK_TCP IPPROTO_TCP +#define LINK_FRAGMENT_ID (IPPROTO_MAX + 1) +#define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2) +#define LINK_ADDR (IPPROTO_MAX + 3) +#define LINK_PPTP (IPPROTO_MAX + 4) + + int flags; /* indicates special characteristics */ + int pflags; /* protocol-specific flags */ + +/* flag bits */ +#define LINK_UNKNOWN_DEST_PORT 0x01 +#define LINK_UNKNOWN_DEST_ADDR 0x02 +#define LINK_PERMANENT 0x04 +#define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */ +#define LINK_UNFIREWALLED 0x08 + + int timestamp; /* Time link was last accessed */ + int expire_time; /* Expire time for link */ + + int sockfd; /* socket descriptor */ + + LIST_ENTRY(alias_link) list_out; /* Linked list of pointers for */ + LIST_ENTRY(alias_link) list_in; /* input and output lookup tables */ + + union /* Auxiliary data */ + { + char *frag_ptr; + struct in_addr frag_addr; + struct tcp_dat *tcp; + } data; +}; + + + + + +/* Global Variables + + The global variables listed here are only accessed from + within alias_db.c and so are prefixed with the static + designation. +*/ + +int packetAliasMode; /* Mode flags */ + /* - documented in alias.h */ + +static struct in_addr aliasAddress; /* Address written onto source */ + /* field of IP packet. 
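+                                       Set by PacketAliasSetAddress()
+                                       and temporarily overridden by
+                                       PacketAliasOut() in
+                                       PKT_ALIAS_UNREGISTERED_ONLY
+                                       mode.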
*/ + +static struct in_addr targetAddress; /* IP address incoming packets */ + /* are sent to if no aliasing */ + /* link already exists */ + +static struct in_addr nullAddress; /* Used as a dummy parameter for */ + /* some function calls */ +static LIST_HEAD(, alias_link) +linkTableOut[LINK_TABLE_OUT_SIZE]; /* Lookup table of pointers to */ + /* chains of link records. Each */ +static LIST_HEAD(, alias_link) /* link record is doubly indexed */ +linkTableIn[LINK_TABLE_IN_SIZE]; /* into input and output lookup */ + /* tables. */ + +static int icmpLinkCount; /* Link statistics */ +static int udpLinkCount; +static int tcpLinkCount; +static int pptpLinkCount; +static int protoLinkCount; +static int fragmentIdLinkCount; +static int fragmentPtrLinkCount; +static int sockCount; + +static int cleanupIndex; /* Index to chain of link table */ + /* being inspected for old links */ + +static int timeStamp; /* System time in seconds for */ + /* current packet */ + +static int lastCleanupTime; /* Last time IncrementalCleanup() */ + /* was called */ + +static int houseKeepingResidual; /* used by HouseKeeping() */ + +static int deleteAllLinks; /* If equal to zero, DeleteLink() */ + /* will not remove permanent links */ + +static FILE *monitorFile; /* File descriptor for link */ + /* statistics monitoring file */ + +static int newDefaultLink; /* Indicates if a new aliasing */ + /* link has been created after a */ + /* call to PacketAliasIn/Out(). */ + +#ifndef NO_FW_PUNCH +static int fireWallFD = -1; /* File descriptor to be able to */ + /* control firewall. Opened by */ + /* PacketAliasSetMode on first */ + /* setting the PKT_ALIAS_PUNCH_FW */ + /* flag. */ +#endif + + + + + + + +/* Internal utility routines (used only in alias_db.c) + +Lookup table starting points: + StartPointIn() -- link table initial search point for + incoming packets + StartPointOut() -- link table initial search point for + outgoing packets + +Miscellaneous: + SeqDiff() -- difference between two TCP sequences + ShowAliasStats() -- send alias statistics to a monitor file +*/ + + +/* Local prototypes */ +static u_int StartPointIn(struct in_addr, u_short, int); + +static u_int StartPointOut(struct in_addr, struct in_addr, + u_short, u_short, int); + +static int SeqDiff(u_long, u_long); + +static void ShowAliasStats(void); + +#ifndef NO_FW_PUNCH +/* Firewall control */ +static void InitPunchFW(void); +static void UninitPunchFW(void); +static void ClearFWHole(struct alias_link *link); +#endif + +/* Log file control */ +static void InitPacketAliasLog(void); +static void UninitPacketAliasLog(void); + +static u_int +StartPointIn(struct in_addr alias_addr, + u_short alias_port, + int link_type) +{ + u_int n; + + n = alias_addr.s_addr; + if (link_type != LINK_PPTP) + n += alias_port; + n += link_type; + return(n % LINK_TABLE_IN_SIZE); +} + + +static u_int +StartPointOut(struct in_addr src_addr, struct in_addr dst_addr, + u_short src_port, u_short dst_port, int link_type) +{ + u_int n; + + n = src_addr.s_addr; + n += dst_addr.s_addr; + if (link_type != LINK_PPTP) { + n += src_port; + n += dst_port; + } + n += link_type; + + return(n % LINK_TABLE_OUT_SIZE); +} + + +static int +SeqDiff(u_long x, u_long y) +{ +/* Return the difference between two TCP sequence numbers */ + +/* + This function is encapsulated in case there are any unusual + arithmetic conditions that need to be considered. 
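+
+   As an illustration with made-up values: if x carries TCP sequence
+   number 1000 and y carries 1500, both in network byte order,
+   SeqDiff(x, y) returns 500.  The callers GetDeltaAckIn() and
+   GetDeltaSeqOut() only act on results that are >= 0, i.e. on recorded
+   entries that do not lie ahead of the packet's number.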
+*/ + + return (ntohl(y) - ntohl(x)); +} + + +static void +ShowAliasStats(void) +{ +/* Used for debugging */ + + if (monitorFile) + { + fprintf(monitorFile, "icmp=%d, udp=%d, tcp=%d, pptp=%d, proto=%d, frag_id=%d frag_ptr=%d", + icmpLinkCount, + udpLinkCount, + tcpLinkCount, + pptpLinkCount, + protoLinkCount, + fragmentIdLinkCount, + fragmentPtrLinkCount); + + fprintf(monitorFile, " / tot=%d (sock=%d)\n", + icmpLinkCount + udpLinkCount + + tcpLinkCount + + pptpLinkCount + + protoLinkCount + + fragmentIdLinkCount + + fragmentPtrLinkCount, + sockCount); + + fflush(monitorFile); + } +} + + + + + +/* Internal routines for finding, deleting and adding links + +Port Allocation: + GetNewPort() -- find and reserve new alias port number + GetSocket() -- try to allocate a socket for a given port + +Link creation and deletion: + CleanupAliasData() - remove all link chains from lookup table + IncrementalCleanup() - look for stale links in a single chain + DeleteLink() - remove link + AddLink() - add link + ReLink() - change link + +Link search: + FindLinkOut() - find link for outgoing packets + FindLinkIn() - find link for incoming packets + +Port search: + FindNewPortGroup() - find an available group of ports +*/ + +/* Local prototypes */ +static int GetNewPort(struct alias_link *, int); + +static u_short GetSocket(u_short, int *, int); + +static void CleanupAliasData(void); + +static void IncrementalCleanup(void); + +static void DeleteLink(struct alias_link *); + +static struct alias_link * +AddLink(struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +ReLink(struct alias_link *, + struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +FindLinkOut(struct in_addr, struct in_addr, u_short, u_short, int, int); + +static struct alias_link * +FindLinkIn(struct in_addr, struct in_addr, u_short, u_short, int, int); + + +#define ALIAS_PORT_BASE 0x08000 +#define ALIAS_PORT_MASK 0x07fff +#define ALIAS_PORT_MASK_EVEN 0x07ffe +#define GET_NEW_PORT_MAX_ATTEMPTS 20 + +#define GET_ALIAS_PORT -1 +#define GET_ALIAS_ID GET_ALIAS_PORT + +#define FIND_EVEN_ALIAS_BASE 1 + +/* GetNewPort() allocates port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because GetNewPort() looks for + unused triplets: (dest addr, dest port, alias port). */ + +static int +GetNewPort(struct alias_link *link, int alias_port_param) +{ + int i; + int max_trials; + u_short port_sys; + u_short port_net; + +/* + Description of alias_port_param for GetNewPort(). When + this parameter is zero or positive, it precisely specifies + the port number. GetNewPort() will return this number + without check that it is in use. + + When this parameter is GET_ALIAS_PORT, it indicates to get a randomly + selected port number. +*/ + + if (alias_port_param == GET_ALIAS_PORT) + { + /* + * The aliasing port is automatically selected + * by one of two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (packetAliasMode & PKT_ALIAS_SAME_PORTS) + { + /* + * When the PKT_ALIAS_SAME_PORTS option is + * chosen, the first try will be the + * actual source port. If this is already + * in use, the remainder of the trials + * will be random. + */ + port_net = link->src_port; + port_sys = ntohs(port_net); + } + else + { + /* First trial and all subsequent are random. 
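+			   With ALIAS_PORT_MASK = 0x07fff and
+			   ALIAS_PORT_BASE = 0x08000 the candidate ports
+			   produced here always fall in the range
+			   0x8000-0xffff (32768-65535), in host order
+			   until the htons() conversion below.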
*/ + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + } + else if (alias_port_param >= 0 && alias_port_param < 0x10000) + { + link->alias_port = (u_short) alias_port_param; + return(0); + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetNewPort(): "); + fprintf(stderr, "input parameter error\n"); +#endif + return(-1); + } + + +/* Port number search */ + for (i=0; i<max_trials; i++) + { + int go_ahead; + struct alias_link *search_result; + + search_result = FindLinkIn(link->dst_addr, link->alias_addr, + link->dst_port, port_net, + link->link_type, 0); + + if (search_result == NULL) + go_ahead = 1; + else if (!(link->flags & LINK_PARTIALLY_SPECIFIED) + && (search_result->flags & LINK_PARTIALLY_SPECIFIED)) + go_ahead = 1; + else + go_ahead = 0; + + if (go_ahead) + { + if ((packetAliasMode & PKT_ALIAS_USE_SOCKETS) + && (link->flags & LINK_PARTIALLY_SPECIFIED) + && ((link->link_type == LINK_TCP) || + (link->link_type == LINK_UDP))) + { + if (GetSocket(port_net, &link->sockfd, link->link_type)) + { + link->alias_port = port_net; + return(0); + } + } + else + { + link->alias_port = port_net; + return(0); + } + } + + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetnewPort(): "); + fprintf(stderr, "could not find free port\n"); +#endif + + return(-1); +} + + +static u_short +GetSocket(u_short port_net, int *sockfd, int link_type) +{ + int err; + int sock; + struct sockaddr_in sock_addr; + + if (link_type == LINK_TCP) + sock = socket(AF_INET, SOCK_STREAM, 0); + else if (link_type == LINK_UDP) + sock = socket(AF_INET, SOCK_DGRAM, 0); + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "incorrect link type\n"); +#endif + return(0); + } + + if (sock < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "socket() error %d\n", *sockfd); +#endif + return(0); + } + + sock_addr.sin_family = AF_INET; + sock_addr.sin_addr.s_addr = htonl(INADDR_ANY); + sock_addr.sin_port = port_net; + + err = bind(sock, + (struct sockaddr *) &sock_addr, + sizeof(sock_addr)); + if (err == 0) + { + sockCount++; + *sockfd = sock; + return(1); + } + else + { + close(sock); + return(0); + } +} + + +/* FindNewPortGroup() returns a base port number for an available + range of contiguous port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because FindNewPortGroup() + looks for unused triplets: (dest addr, dest port, alias port). */ + +int +FindNewPortGroup(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + u_short port_count, + u_char proto, + u_char align) +{ + int i, j; + int max_trials; + u_short port_sys; + int link_type; + + /* + * Get link_type from protocol + */ + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (0); + break; + } + + /* + * The aliasing port is automatically selected + * by one of two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (packetAliasMode & PKT_ALIAS_SAME_PORTS) { + /* + * When the ALIAS_SAME_PORTS option is + * chosen, the first try will be the + * actual source port. If this is already + * in use, the remainder of the trials + * will be random. 
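+		 *
+		 * Whichever way the base is chosen, each trial below
+		 * tests port_count consecutive alias ports and succeeds
+		 * only if the entire group is free.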
+ */ + port_sys = ntohs(src_port); + + } else { + + /* First trial and all subsequent are random. */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +/* Port number search */ + for (i = 0; i < max_trials; i++) { + + struct alias_link *search_result; + + for (j = 0; j < port_count; j++) + if (0 != (search_result = FindLinkIn(dst_addr, alias_addr, + dst_port, htons(port_sys + j), + link_type, 0))) + break; + + /* Found a good range, return base */ + if (j == port_count) + return (htons(port_sys)); + + /* Find a new base to try */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +#ifdef DEBUG + fprintf(stderr, "PacketAlias/FindNewPortGroup(): "); + fprintf(stderr, "could not find free port(s)\n"); +#endif + + return(0); +} + +static void +CleanupAliasData(void) +{ + struct alias_link *link; + int i, icount; + + icount = 0; + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + { + link = LIST_FIRST(&linkTableOut[i]); + while (link != NULL) + { + struct alias_link *link_next; + link_next = LIST_NEXT(link, list_out); + icount++; + DeleteLink(link); + link = link_next; + } + } + + cleanupIndex =0; +} + + +static void +IncrementalCleanup(void) +{ + int icount; + struct alias_link *link; + + icount = 0; + link = LIST_FIRST(&linkTableOut[cleanupIndex++]); + while (link != NULL) + { + int idelta; + struct alias_link *link_next; + + link_next = LIST_NEXT(link, list_out); + idelta = timeStamp - link->timestamp; + switch (link->link_type) + { + case LINK_TCP: + if (idelta > link->expire_time) + { + struct tcp_dat *tcp_aux; + + tcp_aux = link->data.tcp; + if (tcp_aux->state.in != ALIAS_TCP_STATE_CONNECTED + || tcp_aux->state.out != ALIAS_TCP_STATE_CONNECTED) + { + DeleteLink(link); + icount++; + } + } + break; + default: + if (idelta > link->expire_time) + { + DeleteLink(link); + icount++; + } + break; + } + link = link_next; + } + + if (cleanupIndex == LINK_TABLE_OUT_SIZE) + cleanupIndex = 0; +} + +static void +DeleteLink(struct alias_link *link) +{ + +/* Don't do anything if the link is marked permanent */ + if (deleteAllLinks == 0 && link->flags & LINK_PERMANENT) + return; + +#ifndef NO_FW_PUNCH +/* Delete associated firewall hole, if any */ + ClearFWHole(link); +#endif + +/* Free memory allocated for LSNAT server pool */ + if (link->server != NULL) { + struct server *head, *curr, *next; + + head = curr = link->server; + do { + next = curr->next; + free(curr); + } while ((curr = next) != head); + } + +/* Adjust output table pointers */ + LIST_REMOVE(link, list_out); + +/* Adjust input table pointers */ + LIST_REMOVE(link, list_in); + +/* Close socket, if one has been allocated */ + if (link->sockfd != -1) + { + sockCount--; + close(link->sockfd); + } + +/* Link-type dependent cleanup */ + switch(link->link_type) + { + case LINK_ICMP: + icmpLinkCount--; + break; + case LINK_UDP: + udpLinkCount--; + break; + case LINK_TCP: + tcpLinkCount--; + free(link->data.tcp); + break; + case LINK_PPTP: + pptpLinkCount--; + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount--; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount--; + if (link->data.frag_ptr != NULL) + free(link->data.frag_ptr); + break; + case LINK_ADDR: + break; + default: + protoLinkCount--; + break; + } + +/* Free memory */ + free(link); + +/* Write statistics, if logging enabled */ + if (packetAliasMode & 
PKT_ALIAS_LOG) + { + ShowAliasStats(); + } +} + + +static struct alias_link * +AddLink(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + u_int start_point; /* zero, equal to alias port */ + struct alias_link *link; + + link = malloc(sizeof(struct alias_link)); + if (link != NULL) + { + /* Basic initialization */ + link->src_addr = src_addr; + link->dst_addr = dst_addr; + link->alias_addr = alias_addr; + link->proxy_addr.s_addr = INADDR_ANY; + link->src_port = src_port; + link->dst_port = dst_port; + link->proxy_port = 0; + link->server = NULL; + link->link_type = link_type; + link->sockfd = -1; + link->flags = 0; + link->pflags = 0; + link->timestamp = timeStamp; + + /* Expiration time */ + switch (link_type) + { + case LINK_ICMP: + link->expire_time = ICMP_EXPIRE_TIME; + break; + case LINK_UDP: + link->expire_time = UDP_EXPIRE_TIME; + break; + case LINK_TCP: + link->expire_time = TCP_EXPIRE_INITIAL; + break; + case LINK_PPTP: + link->flags |= LINK_PERMANENT; /* no timeout. */ + break; + case LINK_FRAGMENT_ID: + link->expire_time = FRAGMENT_ID_EXPIRE_TIME; + break; + case LINK_FRAGMENT_PTR: + link->expire_time = FRAGMENT_PTR_EXPIRE_TIME; + break; + case LINK_ADDR: + break; + default: + link->expire_time = PROTO_EXPIRE_TIME; + break; + } + + /* Determine alias flags */ + if (dst_addr.s_addr == INADDR_ANY) + link->flags |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + link->flags |= LINK_UNKNOWN_DEST_PORT; + + /* Determine alias port */ + if (GetNewPort(link, alias_port_param) != 0) + { + free(link); + return(NULL); + } + + /* Link-type dependent initialization */ + switch(link_type) + { + struct tcp_dat *aux_tcp; + + case LINK_ICMP: + icmpLinkCount++; + break; + case LINK_UDP: + udpLinkCount++; + break; + case LINK_TCP: + aux_tcp = malloc(sizeof(struct tcp_dat)); + if (aux_tcp != NULL) + { + int i; + + tcpLinkCount++; + aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.index = 0; + aux_tcp->state.ack_modified = 0; + for (i=0; i<N_LINK_TCP_DATA; i++) + aux_tcp->ack[i].active = 0; + aux_tcp->fwhole = -1; + link->data.tcp = aux_tcp; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink: "); + fprintf(stderr, " cannot allocate auxiliary TCP data\n"); +#endif + free(link); + return (NULL); + } + break; + case LINK_PPTP: + pptpLinkCount++; + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount++; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount++; + break; + case LINK_ADDR: + break; + default: + protoLinkCount++; + break; + } + + /* Set up pointers for output lookup table */ + start_point = StartPointOut(src_addr, dst_addr, + src_port, dst_port, link_type); + LIST_INSERT_HEAD(&linkTableOut[start_point], link, list_out); + + /* Set up pointers for input lookup table */ + start_point = StartPointIn(alias_addr, link->alias_port, link_type); + LIST_INSERT_HEAD(&linkTableIn[start_point], link, list_in); + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink(): "); + fprintf(stderr, "malloc() call failed.\n"); +#endif + } + + if (packetAliasMode & PKT_ALIAS_LOG) + { + ShowAliasStats(); + } + + return(link); +} + +static struct alias_link * +ReLink(struct alias_link *old_link, + struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short 
dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + struct alias_link *new_link; /* zero, equal to alias port */ + + new_link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port_param, + link_type); +#ifndef NO_FW_PUNCH + if (new_link != NULL && + old_link->link_type == LINK_TCP && + old_link->data.tcp->fwhole > 0) { + PunchFWHole(new_link); + } +#endif + DeleteLink(old_link); + return new_link; +} + +static struct alias_link * +_FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); + LIST_FOREACH(link, &linkTableOut[i], list_out) + { + if (link->src_addr.s_addr == src_addr.s_addr + && link->server == NULL + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->src_port == src_port + && link->link_type == link_type) + { + link->timestamp = timeStamp; + break; + } + } + +/* Search for partially specified links. */ + if (link == NULL && replace_partial_links) + { + if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY) + { + link = _FindLinkOut(src_addr, dst_addr, src_port, 0, + link_type, 0); + if (link == NULL) + link = _FindLinkOut(src_addr, nullAddress, src_port, + dst_port, link_type, 0); + } + if (link == NULL && + (dst_port != 0 || dst_addr.s_addr != INADDR_ANY)) + { + link = _FindLinkOut(src_addr, nullAddress, src_port, 0, + link_type, 0); + } + if (link != NULL) + { + link = ReLink(link, + src_addr, dst_addr, link->alias_addr, + src_port, dst_port, link->alias_port, + link_type); + } + } + + return(link); +} + +static struct alias_link * +FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkOut(src_addr, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default source address + (i.e. device interface address) without knowing + in advance what that address is. */ + if (aliasAddress.s_addr != 0 && + src_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkOut(nullAddress, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + +static struct alias_link * +_FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + int flags_in; + u_int start_point; + struct alias_link *link; + struct alias_link *link_fully_specified; + struct alias_link *link_unknown_all; + struct alias_link *link_unknown_dst_addr; + struct alias_link *link_unknown_dst_port; + +/* Initialize pointers */ + link_fully_specified = NULL; + link_unknown_all = NULL; + link_unknown_dst_addr = NULL; + link_unknown_dst_port = NULL; + +/* If either the dest addr or port is unknown, the search + loop will have to know about this. 
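+
+   Since LINK_PARTIALLY_SPECIFIED is the logical OR of
+   LINK_UNKNOWN_DEST_ADDR and LINK_UNKNOWN_DEST_PORT, the test on
+   (flags_in | link->flags) below treats a candidate link as fully
+   specified only when both the incoming packet and the link supply a
+   destination address and a destination port.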
*/ + + flags_in = 0; + if (dst_addr.s_addr == INADDR_ANY) + flags_in |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + flags_in |= LINK_UNKNOWN_DEST_PORT; + +/* Search loop */ + start_point = StartPointIn(alias_addr, alias_port, link_type); + LIST_FOREACH(link, &linkTableIn[start_point], list_in) + { + int flags; + + flags = flags_in | link->flags; + if (!(flags & LINK_PARTIALLY_SPECIFIED)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->link_type == link_type) + { + link_fully_specified = link; + break; + } + } + else if ((flags & LINK_UNKNOWN_DEST_ADDR) + && (flags & LINK_UNKNOWN_DEST_PORT)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type) + { + if (link_unknown_all == NULL) + link_unknown_all = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_ADDR) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_port == dst_port) + { + if (link_unknown_dst_addr == NULL) + link_unknown_dst_addr = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_PORT) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_addr.s_addr == dst_addr.s_addr) + { + if (link_unknown_dst_port == NULL) + link_unknown_dst_port = link; + } + } + } + + + + if (link_fully_specified != NULL) + { + link_fully_specified->timestamp = timeStamp; + link = link_fully_specified; + } + else if (link_unknown_dst_port != NULL) + link = link_unknown_dst_port; + else if (link_unknown_dst_addr != NULL) + link = link_unknown_dst_addr; + else if (link_unknown_all != NULL) + link = link_unknown_all; + else + return (NULL); + + if (replace_partial_links && + (link->flags & LINK_PARTIALLY_SPECIFIED || link->server != NULL)) + { + struct in_addr src_addr; + u_short src_port; + + if (link->server != NULL) { /* LSNAT link */ + src_addr = link->server->addr; + src_port = link->server->port; + link->server = link->server->next; + } else { + src_addr = link->src_addr; + src_port = link->src_port; + } + + link = ReLink(link, + src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + } + + return (link); +} + +static struct alias_link * +FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkIn(dst_addr, alias_addr, dst_port, alias_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default aliasing address + (i.e. device interface address) without knowing + in advance what that address is. 
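+	   To that end, when the packet's source address matches the
+	   current aliasing address, the lookup is retried below with
+	   nullAddress (a wildcard) as the source.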
*/ + if (aliasAddress.s_addr != 0 && + alias_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkIn(dst_addr, nullAddress, dst_port, alias_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + + + +/* External routines for finding/adding links + +-- "external" means outside alias_db.c, but within alias*.c -- + + FindIcmpIn(), FindIcmpOut() + FindFragmentIn1(), FindFragmentIn2() + AddFragmentPtrLink(), FindFragmentPtr() + FindProtoIn(), FindProtoOut() + FindUdpTcpIn(), FindUdpTcpOut() + AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(), + FindPptpOutByPeerCallId(), FindPptpInByPeerCallId() + FindOriginalAddress(), FindAliasAddress() + +(prototypes in alias_local.h) +*/ + + +struct alias_link * +FindIcmpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short id_alias, + int create) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, id_alias, + LINK_ICMP, 0); + if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + id_alias, NO_DEST_PORT, id_alias, + LINK_ICMP); + } + + return (link); +} + + +struct alias_link * +FindIcmpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short id, + int create) +{ + struct alias_link * link; + + link = FindLinkOut(src_addr, dst_addr, + id, NO_DEST_PORT, + LINK_ICMP, 0); + if (link == NULL && create) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + id, NO_DEST_PORT, GET_ALIAS_ID, + LINK_ICMP); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn1(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short ip_id) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); + + if (link == NULL) + { + link = AddLink(nullAddress, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn2(struct in_addr dst_addr, /* Doesn't add a link if one */ + struct in_addr alias_addr, /* is not found. 
*/ + u_short ip_id) +{ + return FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); +} + + +struct alias_link * +AddFragmentPtrLink(struct in_addr dst_addr, + u_short ip_id) +{ + return AddLink(nullAddress, dst_addr, nullAddress, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR); +} + + +struct alias_link * +FindFragmentPtr(struct in_addr dst_addr, + u_short ip_id) +{ + return FindLinkIn(dst_addr, nullAddress, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR, 0); +} + + +struct alias_link * +FindProtoIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, 0, + proto, 1); + + if (link == NULL && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + + return (link); +} + + +struct alias_link * +FindProtoOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_char proto) +{ + struct alias_link *link; + + link = FindLinkOut(src_addr, dst_addr, + NO_SRC_PORT, NO_DEST_PORT, + proto, 1); + + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + + return (link); +} + + +struct alias_link * +FindUdpTcpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkIn(dst_addr, alias_addr, + dst_port, alias_port, + link_type, create); + + if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + alias_port, dst_port, alias_port, + link_type); + } + + return(link); +} + + +struct alias_link * +FindUdpTcpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkOut(src_addr, dst_addr, src_port, dst_port, link_type, create); + + if (link == NULL && create) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, GET_ALIAS_PORT, + link_type); + } + + return(link); +} + + +struct alias_link * +AddPptp(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t src_call_id) +{ + struct alias_link *link; + + link = AddLink(src_addr, dst_addr, alias_addr, + src_call_id, 0, GET_ALIAS_PORT, + LINK_PPTP); + + return (link); +} + + +struct alias_link * +FindPptpOutByCallId(struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t src_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableOut[i], list_out) + if (link->link_type == LINK_PPTP && + link->src_addr.s_addr == src_addr.s_addr && + 
link->dst_addr.s_addr == dst_addr.s_addr && + link->src_port == src_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpOutByPeerCallId(struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableOut[i], list_out) + if (link->link_type == LINK_PPTP && + link->src_addr.s_addr == src_addr.s_addr && + link->dst_addr.s_addr == dst_addr.s_addr && + link->dst_port == dst_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpInByCallId(struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointIn(alias_addr, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableIn[i], list_in) + if (link->link_type == LINK_PPTP && + link->dst_addr.s_addr == dst_addr.s_addr && + link->alias_addr.s_addr == alias_addr.s_addr && + link->dst_port == dst_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpInByPeerCallId(struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t alias_call_id) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + 0/* any */, alias_call_id, + LINK_PPTP, 0); + + + return (link); +} + + +struct alias_link * +FindRtspOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkOut(src_addr, dst_addr, src_port, 0, link_type, 1); + + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, 0, alias_port, + link_type); + } + + return(link); +} + + +struct in_addr +FindOriginalAddress(struct in_addr alias_addr) +{ + struct alias_link *link; + + link = FindLinkIn(nullAddress, alias_addr, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + newDefaultLink = 1; + if (targetAddress.s_addr == INADDR_ANY) + return alias_addr; + else if (targetAddress.s_addr == INADDR_NONE) + return aliasAddress; + else + return targetAddress; + } + else + { + if (link->server != NULL) { /* LSNAT link */ + struct in_addr src_addr; + + src_addr = link->server->addr; + link->server = link->server->next; + return (src_addr); + } else if (link->src_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->src_addr; + } +} + + +struct in_addr +FindAliasAddress(struct in_addr original_addr) +{ + struct alias_link *link; + + link = FindLinkOut(original_addr, nullAddress, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + return aliasAddress; + } + else + { + if (link->alias_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->alias_addr; + } +} + + +/* External routines for getting or changing link data + (external to alias_db.c, but internal to alias*.c) + + SetFragmentData(), GetFragmentData() + SetFragmentPtr(), GetFragmentPtr() + SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut() + GetOriginalAddress(), GetDestAddress(), GetAliasAddress() + GetOriginalPort(), GetAliasPort() + SetAckModified(), GetAckModified() + GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq() + SetProtocolFlags(), GetProtocolFlags() + SetDestCallId() +*/ + + +void +SetFragmentAddr(struct alias_link *link, struct in_addr src_addr) +{ + 
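+	/*
+	 * Stash the address in the link's auxiliary data; the
+	 * counterpart GetFragmentAddr() below reads it back.
+	 */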
link->data.frag_addr = src_addr; +} + + +void +GetFragmentAddr(struct alias_link *link, struct in_addr *src_addr) +{ + *src_addr = link->data.frag_addr; +} + + +void +SetFragmentPtr(struct alias_link *link, char *fptr) +{ + link->data.frag_ptr = fptr; +} + + +void +GetFragmentPtr(struct alias_link *link, char **fptr) +{ + *fptr = link->data.frag_ptr; +} + + +void +SetStateIn(struct alias_link *link, int state) +{ + /* TCP input state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_DEAD; + else + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (link->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: + abort(); + } + link->data.tcp->state.in = state; +} + + +void +SetStateOut(struct alias_link *link, int state) +{ + /* TCP output state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_DEAD; + else + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (link->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: + abort(); + } + link->data.tcp->state.out = state; +} + + +int +GetStateIn(struct alias_link *link) +{ + /* TCP input state */ + return link->data.tcp->state.in; +} + + +int +GetStateOut(struct alias_link *link) +{ + /* TCP output state */ + return link->data.tcp->state.out; +} + + +struct in_addr +GetOriginalAddress(struct alias_link *link) +{ + if (link->src_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return(link->src_addr); +} + + +struct in_addr +GetDestAddress(struct alias_link *link) +{ + return(link->dst_addr); +} + + +struct in_addr +GetAliasAddress(struct alias_link *link) +{ + if (link->alias_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->alias_addr; +} + + +struct in_addr +GetDefaultAliasAddress() +{ + return aliasAddress; +} + + +void +SetDefaultAliasAddress(struct in_addr alias_addr) +{ + aliasAddress = alias_addr; +} + + +u_short +GetOriginalPort(struct alias_link *link) +{ + return(link->src_port); +} + + +u_short +GetAliasPort(struct alias_link *link) +{ + return(link->alias_port); +} + +#ifndef NO_FW_PUNCH +static u_short +GetDestPort(struct alias_link *link) +{ + return(link->dst_port); +} +#endif + +void +SetAckModified(struct alias_link *link) +{ +/* Indicate that ACK numbers have been modified in a TCP connection */ + link->data.tcp->state.ack_modified = 1; +} + + +struct in_addr +GetProxyAddress(struct alias_link *link) +{ + return link->proxy_addr; +} + + +void +SetProxyAddress(struct alias_link *link, struct in_addr addr) +{ + link->proxy_addr = addr; +} + + +u_short +GetProxyPort(struct alias_link *link) +{ + return link->proxy_port; +} + + +void +SetProxyPort(struct alias_link *link, u_short port) +{ + link->proxy_port = port; +} + + +int +GetAckModified(struct alias_link *link) +{ +/* See if ACK numbers have been modified */ + return link->data.tcp->state.ack_modified; +} + + +int +GetDeltaAckIn(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the ACK number has been altered for an incoming +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. 
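+Of the entries still marked active, the one whose recorded ack_new is
+closest to, and not ahead of, the packet's ACK value is selected and its
+delta returned; if no entry qualifies, the returned delta is zero.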
+*/ + + int i; + struct tcphdr *tc; + int delta, ack_diff_min; + u_long ack; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + ack = tc->th_ack; + + delta = 0; + ack_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int ack_diff; + + ack_diff = SeqDiff(x.ack_new, ack); + if (ack_diff >= 0) + { + if (ack_diff_min >= 0) + { + if (ack_diff < ack_diff_min) + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + else + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + } + } + return (delta); +} + + +int +GetDeltaSeqOut(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the sequence number has been altered for an outgoing +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. +*/ + + int i; + struct tcphdr *tc; + int delta, seq_diff_min; + u_long seq; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + seq = tc->th_seq; + + delta = 0; + seq_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int seq_diff; + + seq_diff = SeqDiff(x.ack_old, seq); + if (seq_diff >= 0) + { + if (seq_diff_min >= 0) + { + if (seq_diff < seq_diff_min) + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + else + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + } + } + return (delta); +} + + +void +AddSeq(struct ip *pip, struct alias_link *link, int delta) +{ +/* +When a TCP packet has been altered in length, save this +information in a circular list. If enough packets have +been altered, then this list will begin to overwrite itself. +*/ + + struct tcphdr *tc; + struct ack_data_record x; + int hlen, tlen, dlen; + int i; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + x.ack_old = htonl(ntohl(tc->th_seq) + dlen); + x.ack_new = htonl(ntohl(tc->th_seq) + dlen + delta); + x.delta = delta; + x.active = 1; + + i = link->data.tcp->state.index; + link->data.tcp->ack[i] = x; + + i++; + if (i == N_LINK_TCP_DATA) + link->data.tcp->state.index = 0; + else + link->data.tcp->state.index = i; +} + +void +SetExpire(struct alias_link *link, int expire) +{ + if (expire == 0) + { + link->flags &= ~LINK_PERMANENT; + DeleteLink(link); + } + else if (expire == -1) + { + link->flags |= LINK_PERMANENT; + } + else if (expire > 0) + { + link->expire_time = expire; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/SetExpire(): "); + fprintf(stderr, "error in expire parameter\n"); +#endif + } +} + +void +ClearCheckNewLink(void) +{ + newDefaultLink = 0; +} + +void +SetProtocolFlags(struct alias_link *link, int pflags) +{ + + link->pflags = pflags;; +} + +int +GetProtocolFlags(struct alias_link *link) +{ + + return (link->pflags); +} + +void +SetDestCallId(struct alias_link *link, u_int16_t cid) +{ + + deleteAllLinks = 1; + link = ReLink(link, link->src_addr, link->dst_addr, link->alias_addr, + link->src_port, cid, link->alias_port, link->link_type); + deleteAllLinks = 0; +} + + +/* Miscellaneous Functions + + HouseKeeping() + InitPacketAliasLog() + UninitPacketAliasLog() +*/ + +/* + Whenever an outgoing or incoming packet is handled, HouseKeeping() + is called to find and remove timed-out aliasing links. Logic exists + to sweep through the entire table and linked list structure + every 60 seconds. 
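+
+ More precisely, the number of output-table chains ("spokes") cleaned
+ on each call is proportional to the time elapsed since the previous
+ cleanup, scaled so that all LINK_TABLE_OUT_SIZE chains are covered
+ roughly once every ALIAS_CLEANUP_INTERVAL_SECS seconds and capped at
+ ALIAS_CLEANUP_MAX_SPOKES per call; the factor-of-100 arithmetic keeps
+ the fractional remainder in houseKeepingResidual.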
+ + (prototype in alias_local.h) +*/ + +void +HouseKeeping(void) +{ + int i, n, n100; + struct timeval tv; + struct timezone tz; + + /* + * Save system time (seconds) in global variable timeStamp for + * use by other functions. This is done so as not to unnecessarily + * waste timeline by making system calls. + */ + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + + /* Compute number of spokes (output table link chains) to cover */ + n100 = LINK_TABLE_OUT_SIZE * 100 + houseKeepingResidual; + n100 *= timeStamp - lastCleanupTime; + n100 /= ALIAS_CLEANUP_INTERVAL_SECS; + + n = n100/100; + + /* Handle different cases */ + if (n > ALIAS_CLEANUP_MAX_SPOKES) + { + n = ALIAS_CLEANUP_MAX_SPOKES; + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n > 0) + { + lastCleanupTime = timeStamp; + houseKeepingResidual = n100 - 100*n; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/HouseKeeping(): "); + fprintf(stderr, "something unexpected in time values\n"); +#endif + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + } +} + + +/* Init the log file and enable logging */ +static void +InitPacketAliasLog(void) +{ + if ((~packetAliasMode & PKT_ALIAS_LOG) + && (monitorFile = fopen("/var/log/alias.log", "w"))) + { + packetAliasMode |= PKT_ALIAS_LOG; + fprintf(monitorFile, + "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n"); + } +} + + +/* Close the log-file and disable logging. */ +static void +UninitPacketAliasLog(void) +{ + if (monitorFile) { + fclose(monitorFile); + monitorFile = NULL; + } + packetAliasMode &= ~PKT_ALIAS_LOG; +} + + + + + + +/* Outside world interfaces + +-- "outside world" means other than alias*.c routines -- + + PacketAliasRedirectPort() + PacketAliasAddServer() + PacketAliasRedirectProto() + PacketAliasRedirectAddr() + PacketAliasRedirectDelete() + PacketAliasSetAddress() + PacketAliasInit() + PacketAliasUninit() + PacketAliasSetMode() + +(prototypes in alias.h) +*/ + +/* Redirection from a specific public addr:port to a + private addr:port */ +struct alias_link * +PacketAliasRedirectPort(struct in_addr src_addr, u_short src_port, + struct in_addr dst_addr, u_short dst_port, + struct in_addr alias_addr, u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch(proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: +#ifdef DEBUG + fprintf(stderr, "PacketAliasRedirectPort(): "); + fprintf(stderr, "only TCP and UDP protocols allowed\n"); +#endif + return NULL; + } + + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectPort(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + +/* Add server to the pool of servers */ +int +PacketAliasAddServer(struct alias_link *link, struct in_addr addr, u_short port) +{ + struct server *server; + + server = malloc(sizeof(struct server)); + + if (server != NULL) { + struct server *head; + + server->addr = addr; + server->port = port; + + head = link->server; + if (head == NULL) + server->next = server; + else { + struct server *s; + + for (s = head; s->next != head; s = s->next); + s->next = server; + server->next = head; + } + link->server = server; + return (0); + } else + return (-1); +} + +/* 
Redirect packets of a given IP protocol from a specific + public address to a private address */ +struct alias_link * +PacketAliasRedirectProto(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *link; + + link = AddLink(src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectProto(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + +/* Static address translation */ +struct alias_link * +PacketAliasRedirectAddr(struct in_addr src_addr, + struct in_addr alias_addr) +{ + struct alias_link *link; + + link = AddLink(src_addr, nullAddress, alias_addr, + 0, 0, 0, + LINK_ADDR); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectAddr(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + + +void +PacketAliasRedirectDelete(struct alias_link *link) +{ +/* This is a dangerous function to put in the API, + because an invalid pointer can crash the program. */ + + deleteAllLinks = 1; + DeleteLink(link); + deleteAllLinks = 0; +} + + +void +PacketAliasSetAddress(struct in_addr addr) +{ + if (packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE + && aliasAddress.s_addr != addr.s_addr) + CleanupAliasData(); + + aliasAddress = addr; +} + + +void +PacketAliasSetTarget(struct in_addr target_addr) +{ + targetAddress = target_addr; +} + + +void +PacketAliasInit(void) +{ + int i; + struct timeval tv; + struct timezone tz; + static int firstCall = 1; + + if (firstCall == 1) + { + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + lastCleanupTime = tv.tv_sec; + houseKeepingResidual = 0; + + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + LIST_INIT(&linkTableOut[i]); + for (i=0; i<LINK_TABLE_IN_SIZE; i++) + LIST_INIT(&linkTableIn[i]); + + atexit(PacketAliasUninit); + firstCall = 0; + } + else + { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + } + + aliasAddress.s_addr = INADDR_ANY; + targetAddress.s_addr = INADDR_ANY; + + icmpLinkCount = 0; + udpLinkCount = 0; + tcpLinkCount = 0; + pptpLinkCount = 0; + protoLinkCount = 0; + fragmentIdLinkCount = 0; + fragmentPtrLinkCount = 0; + sockCount = 0; + + cleanupIndex =0; + + packetAliasMode = PKT_ALIAS_SAME_PORTS + | PKT_ALIAS_USE_SOCKETS + | PKT_ALIAS_RESET_ON_ADDR_CHANGE; +} + +void +PacketAliasUninit(void) { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + UninitPacketAliasLog(); +#ifndef NO_FW_PUNCH + UninitPunchFW(); +#endif +} + + +/* Change mode for some operations */ +unsigned int +PacketAliasSetMode( + unsigned int flags, /* Which state to bring flags to */ + unsigned int mask /* Mask of which flags to affect (use 0 to do a + probe for flag values) */ +) +{ +/* Enable logging? */ + if (flags & mask & PKT_ALIAS_LOG) + { + InitPacketAliasLog(); /* Do the enable */ + } else +/* _Disable_ logging? */ + if (~flags & mask & PKT_ALIAS_LOG) { + UninitPacketAliasLog(); + } + +#ifndef NO_FW_PUNCH +/* Start punching holes in the firewall? */ + if (flags & mask & PKT_ALIAS_PUNCH_FW) { + InitPunchFW(); + } else +/* Stop punching holes in the firewall? 
*/ + if (~flags & mask & PKT_ALIAS_PUNCH_FW) { + UninitPunchFW(); + } +#endif + +/* Other flags can be set/cleared without special action */ + packetAliasMode = (flags & mask) | (packetAliasMode & ~mask); + return packetAliasMode; +} + + +int +PacketAliasCheckNewLink(void) +{ + return newDefaultLink; +} + + +#ifndef NO_FW_PUNCH + +/***************** + Code to support firewall punching. This shouldn't really be in this + file, but making variables global is evil too. + ****************/ + +/* Firewall include files */ +#include <net/if.h> +#include <netinet/ip_fw.h> +#include <string.h> +#include <err.h> + +static void ClearAllFWHoles(void); + +static int fireWallBaseNum; /* The first firewall entry free for our use */ +static int fireWallNumNums; /* How many entries can we use? */ +static int fireWallActiveNum; /* Which entry did we last use? */ +static char *fireWallField; /* bool array for entries */ + +#define fw_setfield(field, num) \ +do { \ + (field)[(num) - fireWallBaseNum] = 1; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_clrfield(field, num) \ +do { \ + (field)[(num) - fireWallBaseNum] = 0; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_tstfield(field, num) ((field)[(num) - fireWallBaseNum]) + +static void +InitPunchFW(void) { + fireWallField = malloc(fireWallNumNums); + if (fireWallField) { + memset(fireWallField, 0, fireWallNumNums); + if (fireWallFD < 0) { + fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + } + ClearAllFWHoles(); + fireWallActiveNum = fireWallBaseNum; + } +} + +static void +UninitPunchFW(void) { + ClearAllFWHoles(); + if (fireWallFD >= 0) + close(fireWallFD); + fireWallFD = -1; + if (fireWallField) + free(fireWallField); + fireWallField = NULL; + packetAliasMode &= ~PKT_ALIAS_PUNCH_FW; +} + +/* Make a certain link go through the firewall */ +void +PunchFWHole(struct alias_link *link) { + int r; /* Result code */ + struct ip_fw rule; /* On-the-fly built rule */ + int fwhole; /* Where to punch hole */ + +/* Don't do anything unless we are asked to */ + if ( !(packetAliasMode & PKT_ALIAS_PUNCH_FW) || + fireWallFD < 0 || + link->link_type != LINK_TCP) + return; + + memset(&rule, 0, sizeof rule); + +/** Build rule **/ + + /* Find empty slot */ + for (fwhole = fireWallActiveNum; + fwhole < fireWallBaseNum + fireWallNumNums && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole == fireWallBaseNum + fireWallNumNums) { + for (fwhole = fireWallBaseNum; + fwhole < fireWallActiveNum && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole == fireWallActiveNum) { + /* No rule point empty - we can't punch more holes. */ + fireWallActiveNum = fireWallBaseNum; +#ifdef DEBUG + fprintf(stderr, "libalias: Unable to create firewall hole!\n"); +#endif + return; + } + } + /* Start next search at next position */ + fireWallActiveNum = fwhole+1; + + /* Build generic part of the two rules */ + rule.fw_number = fwhole; + IP_FW_SETNSRCP(&rule, 1); /* Number of source ports. */ + IP_FW_SETNDSTP(&rule, 1); /* Number of destination ports. 
*/ + rule.fw_flg = IP_FW_F_ACCEPT | IP_FW_F_IN | IP_FW_F_OUT; + rule.fw_prot = IPPROTO_TCP; + rule.fw_smsk.s_addr = INADDR_BROADCAST; + rule.fw_dmsk.s_addr = INADDR_BROADCAST; + + /* Build and apply specific part of the rules */ + rule.fw_src = GetOriginalAddress(link); + rule.fw_dst = GetDestAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetOriginalPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetDestPort(link)); + + /* Skip non-bound links - XXX should not be strictly necessary, + but seems to leave hole if not done. Leak of non-bound links? + (Code should be left even if the problem is fixed - it is a + clear optimization) */ + if (rule.fw_uar.fw_pts[0] != 0 && rule.fw_uar.fw_pts[1] != 0) { + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)"); +#endif + rule.fw_src = GetDestAddress(link); + rule.fw_dst = GetOriginalAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetDestPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetOriginalPort(link)); + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)"); +#endif + } +/* Indicate hole applied */ + link->data.tcp->fwhole = fwhole; + fw_setfield(fireWallField, fwhole); +} + +/* Remove a hole in a firewall associated with a particular alias + link. Calling this too often is harmless. */ +static void +ClearFWHole(struct alias_link *link) { + if (link->link_type == LINK_TCP) { + int fwhole = link->data.tcp->fwhole; /* Where is the firewall hole? */ + struct ip_fw rule; + + if (fwhole < 0) + return; + + memset(&rule, 0, sizeof rule); + rule.fw_number = fwhole; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + fw_clrfield(fireWallField, fwhole); + link->data.tcp->fwhole = -1; + } +} + +/* Clear out the entire range dedicated to firewall holes. */ +static void +ClearAllFWHoles(void) { + struct ip_fw rule; /* On-the-fly built rule */ + int i; + + if (fireWallFD < 0) + return; + + memset(&rule, 0, sizeof rule); + for (i = fireWallBaseNum; i < fireWallBaseNum + fireWallNumNums; i++) { + rule.fw_number = i; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + } + memset(fireWallField, 0, fireWallNumNums); +} +#endif + +void +PacketAliasSetFWBase(unsigned int base, unsigned int num) { +#ifndef NO_FW_PUNCH + fireWallBaseNum = base; + fireWallNumNums = num; +#endif +} diff --git a/sys/netinet/libalias/alias_ftp.c b/sys/netinet/libalias/alias_ftp.c new file mode 100644 index 0000000..efc78c7 --- /dev/null +++ b/sys/netinet/libalias/alias_ftp.c @@ -0,0 +1,583 @@ +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + Alias_ftp.c performs special processing for FTP sessions under + TCP. Specifically, when a PORT/EPRT command from the client + side or 227/229 reply from the server is sent, it is intercepted + and modified. The address is changed to the gateway machine + and an aliasing port is used. + + For this routine to work, the message must fit entirely into a + single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new message will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + + References: RFC 959, RFC 2428. + + Initial version: August, 1996 (cjm) + + Version 1.6 + Brian Somers and Martin Renters identified an IP checksum + error for modified IP packets. + + Version 1.7: January 9, 1996 (cjm) + Differential checksum computation for change + in IP packet length. + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + within the packet aliasing module. + + Version 3.1: May, 2000 (eds) + Add support for passive mode, alias the 227 replies. + + See HISTORY file for record of revisions. +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +#define FTP_CONTROL_PORT_NUMBER 21 +#define MAX_MESSAGE_SIZE 128 + +/* FTP protocol flags. */ +#define WAIT_CRLF 0x01 + +enum ftp_message_type { + FTP_PORT_COMMAND, + FTP_EPRT_COMMAND, + FTP_227_REPLY, + FTP_229_REPLY, + FTP_UNKNOWN_MESSAGE +}; + +static int ParseFtpPortCommand(char *, int); +static int ParseFtpEprtCommand(char *, int); +static int ParseFtp227Reply(char *, int); +static int ParseFtp229Reply(char *, int); +static void NewFtpMessage(struct ip *, struct alias_link *, int, int); + +static struct in_addr true_addr; /* in network byte order. */ +static u_short true_port; /* in host byte order. 
*/ + +void +AliasHandleFtpOut( +struct ip *pip, /* IP packet to examine/patch */ +struct alias_link *link, /* The link to go through (aliased port) */ +int maxpacketsize /* The maximum size this packet can grow to (including headers) */) +{ + int hlen, tlen, dlen, pflags; + char *sptr; + struct tcphdr *tc; + int ftp_message_type; + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Place string pointer and beginning of data */ + sptr = (char *) pip; + sptr += hlen; + +/* + * Check that data length is not too long and previous message was + * properly terminated with CRLF. + */ + pflags = GetProtocolFlags(link); + if (dlen <= MAX_MESSAGE_SIZE && !(pflags & WAIT_CRLF)) { + ftp_message_type = FTP_UNKNOWN_MESSAGE; + + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER) { +/* + * When aliasing a client, check for the PORT/EPRT command. + */ + if (ParseFtpPortCommand(sptr, dlen)) + ftp_message_type = FTP_PORT_COMMAND; + else if (ParseFtpEprtCommand(sptr, dlen)) + ftp_message_type = FTP_EPRT_COMMAND; + } else { +/* + * When aliasing a server, check for the 227/229 reply. + */ + if (ParseFtp227Reply(sptr, dlen)) + ftp_message_type = FTP_227_REPLY; + else if (ParseFtp229Reply(sptr, dlen)) { + ftp_message_type = FTP_229_REPLY; + true_addr.s_addr = pip->ip_src.s_addr; + } + } + + if (ftp_message_type != FTP_UNKNOWN_MESSAGE) + NewFtpMessage(pip, link, maxpacketsize, ftp_message_type); + } + +/* Track the msgs which are CRLF term'd for PORT/PASV FW breach */ + + if (dlen) { /* only if there's data */ + sptr = (char *) pip; /* start over at beginning */ + tlen = ntohs(pip->ip_len); /* recalc tlen, pkt may have grown */ + if (sptr[tlen-2] == '\r' && sptr[tlen-1] == '\n') + pflags &= ~WAIT_CRLF; + else + pflags |= WAIT_CRLF; + SetProtocolFlags(link, pflags); + } +} + +static int +ParseFtpPortCommand(char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "PORT A,D,D,R,PO,RT". */ + + /* Return if data length is too short. */ + if (dlen < 18) + return 0; + + addr = port = octet = 0; + state = -4; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case -4: if (ch == 'P') state++; else return 0; break; + case -3: if (ch == 'O') state++; else return 0; break; + case -2: if (ch == 'R') state++; else return 0; break; + case -1: if (ch == 'T') state++; else return 0; break; + + case 0: + if (isspace(ch)) + break; + else + state++; + case 1: case 3: case 5: case 7: case 9: case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 2: case 4: case 6: case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 10: case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || state == 12) { + port = (port << 8) + octet; + state++; + } else + return 0; + break; + } + } + + if (state == 13) { + true_addr.s_addr = htonl(addr); + true_port = port; + return 1; + } else + return 0; +} + +static int +ParseFtpEprtCommand(char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "EPRT |1|A.D.D.R|PORT|". */ + + /* Return if data length is too short. 
*/ + if (dlen < 18) + return 0; + + addr = port = octet = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + state = -4; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -4: if (ch == 'E') state++; else return 0; break; + case -3: if (ch == 'P') state++; else return 0; break; + case -2: if (ch == 'R') state++; else return 0; break; + case -1: if (ch == 'T') state++; else return 0; break; + + case 0: + if (!isspace(ch)) { + delim = ch; + state++; + } + break; + case 1: + if (ch == '1') /* IPv4 address */ + state++; + else + return 0; + break; + case 2: + if (ch == delim) + state++; + else + return 0; + break; + case 3: case 5: case 7: case 9: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 4: case 6: case 8: case 10: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == '.' || state == 10) { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 11: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return 0; + break; + case 12: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return 0; + break; + } + } + + if (state == 13) { + true_addr.s_addr = htonl(addr); + true_port = port; + return 1; + } else + return 0; +} + +static int +ParseFtp227Reply(char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "227 Entering Passive Mode (A,D,D,R,PO,RT)" */ + + /* Return if data length is too short. */ + if (dlen < 17) + return 0; + + addr = port = octet = 0; + + state = -3; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -3: if (ch == '2') state++; else return 0; break; + case -2: if (ch == '2') state++; else return 0; break; + case -1: if (ch == '7') state++; else return 0; break; + + case 0: + if (ch == '(') + state++; + break; + case 1: case 3: case 5: case 7: case 9: case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 2: case 4: case 6: case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 10: case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || (state == 12 && ch == ')')) { + port = (port << 8) + octet; + state++; + } else + return 0; + break; + } + } + + if (state == 13) { + true_port = port; + true_addr.s_addr = htonl(addr); + return 1; + } else + return 0; +} + +static int +ParseFtp229Reply(char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_short port; + + /* Format: "229 Entering Extended Passive Mode (|||PORT|)" */ + + /* Return if data length is too short. 
*/ + if (dlen < 11) + return 0; + + port = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + + state = -3; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -3: if (ch == '2') state++; else return 0; break; + case -2: if (ch == '2') state++; else return 0; break; + case -1: if (ch == '9') state++; else return 0; break; + + case 0: + if (ch == '(') + state++; + break; + case 1: + delim = ch; + state++; + break; + case 2: case 3: + if (ch == delim) + state++; + else + return 0; + break; + case 4: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return 0; + break; + case 5: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return 0; + break; + case 6: + if (ch == ')') + state++; + else + return 0; + break; + } + } + + if (state == 7) { + true_port = port; + return 1; + } else + return 0; +} + +static void +NewFtpMessage(struct ip *pip, + struct alias_link *link, + int maxpacketsize, + int ftp_message_type) +{ + struct alias_link *ftp_link; + +/* Security checks. */ + if (pip->ip_src.s_addr != true_addr.s_addr) + return; + + if (true_port < IPPORT_RESERVED) + return; + +/* Establish link to address and port found in FTP control message. */ + ftp_link = FindUdpTcpOut(true_addr, GetDestAddress(link), + htons(true_port), 0, IPPROTO_TCP, 1); + + if (ftp_link != NULL) + { + int slen, hlen, tlen, dlen; + struct tcphdr *tc; + +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(ftp_link); +#endif + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Create new FTP message. */ + { + char stemp[MAX_MESSAGE_SIZE + 1]; + char *sptr; + u_short alias_port; + u_char *ptr; + int a1, a2, a3, a4, p1, p2; + struct in_addr alias_address; + +/* Decompose alias address into quad format */ + alias_address = GetAliasAddress(link); + ptr = (u_char *) &alias_address.s_addr; + a1 = *ptr++; a2=*ptr++; a3=*ptr++; a4=*ptr; + + alias_port = GetAliasPort(ftp_link); + + switch (ftp_message_type) + { + case FTP_PORT_COMMAND: + case FTP_227_REPLY: + /* Decompose alias port into pair format. */ + ptr = (char *) &alias_port; + p1 = *ptr++; p2=*ptr; + + if (ftp_message_type == FTP_PORT_COMMAND) { + /* Generate PORT command string. */ + sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n", + a1,a2,a3,a4,p1,p2); + } else { + /* Generate 227 reply string. */ + sprintf(stemp, + "227 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n", + a1,a2,a3,a4,p1,p2); + } + break; + case FTP_EPRT_COMMAND: + /* Generate EPRT command string. */ + sprintf(stemp, "EPRT |1|%d.%d.%d.%d|%d|\r\n", + a1,a2,a3,a4,ntohs(alias_port)); + break; + case FTP_229_REPLY: + /* Generate 229 reply string. */ + sprintf(stemp, "229 Entering Extended Passive Mode (|||%d|)\r\n", + ntohs(alias_port)); + break; + } + +/* Save string length for IP header modification */ + slen = strlen(stemp); + +/* Copy modified buffer into IP packet. 
*/ + sptr = (char *) pip; sptr += hlen; + strncpy(sptr, stemp, maxpacketsize-hlen); + } + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen-dlen); + } + +/* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + slen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + +/* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + } + else + { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n"); +#endif + } +} diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c new file mode 100644 index 0000000..82c39e3 --- /dev/null +++ b/sys/netinet/libalias/alias_irc.c @@ -0,0 +1,357 @@ +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* Alias_irc.c intercepts packages contain IRC CTCP commands, and + changes DCC commands to export a port on the aliasing host instead + of an aliased host. + + For this routine to work, the DCC command must fit entirely into a + single TCP packet. This will usually happen, but is not + guaranteed. + + The interception is likely to change the length of the packet. + The handling of this is copied more-or-less verbatim from + ftp_alias.c + + Initial version: Eivind Eklund <perhaps@yes.no> (ee) 97-01-29 + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + withing the packet alising module. +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <limits.h> + +#include "alias_local.h" + +/* Local defines */ +#define DBprintf(a) + + +void +AliasHandleIrcOut(struct ip *pip, /* IP packet to examine */ + struct alias_link *link, /* Which link are we on? 
*/ + int maxsize /* Maximum size of IP packet including headers */ + ) +{ + int hlen, tlen, dlen; + struct in_addr true_addr; + u_short true_port; + char *sptr; + struct tcphdr *tc; + int i; /* Iterator through the source */ + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Return if data length is too short - assume an entire PRIVMSG in each packet. */ + if (dlen<sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a")-1) + return; + +/* Place string pointer at beginning of data */ + sptr = (char *) pip; + sptr += hlen; + maxsize -= hlen; /* We're interested in maximum size of data, not packet */ + + /* Search for a CTCP command [Note 1] */ + for( i=0; i<dlen; i++ ) { + if(sptr[i]=='\001') + goto lFOUND_CTCP; + } + return; /* No CTCP commands in */ + /* Handle CTCP commands - the buffer may have to be copied */ +lFOUND_CTCP: + { + char newpacket[65536]; /* Estimate of maximum packet size :) */ + int copyat = i; /* Same */ + int iCopy = 0; /* How much data have we written to copy-back string? */ + unsigned long org_addr; /* Original IP address */ + unsigned short org_port; /* Original source port address */ + lCTCP_START: + if( i >= dlen || iCopy >= sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start character */ + /* Start of a CTCP */ + if( i+4 >= dlen ) /* Too short for DCC */ + goto lBAD_CTCP; + if( sptr[i+0] != 'D' ) + goto lBAD_CTCP; + if( sptr[i+1] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+2] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+3] != ' ' ) + goto lBAD_CTCP; + /* We have a DCC command - handle it! */ + i+= 4; /* Skip "DCC " */ + if( iCopy+4 > sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = 'D'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = ' '; + + DBprintf(("Found DCC\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring command...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during command\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done command - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-command)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring filename...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during filename\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done filename - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-filename)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Fetching IP address\n")); + /* Fetch 
IP address */ + org_addr = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_addr > ULONG_MAX/10UL ) { /* Terminate on overflow */ + DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i])); + goto lBAD_CTCP; + } + org_addr *= 10; + org_addr += sptr[i++]-'0'; + } + DBprintf(("Skipping space\n")); + if( i+1 >= dlen || sptr[i] != ' ' ) { + DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i+1, dlen, sptr[i])); + goto lBAD_CTCP; + } + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway, so we might + as well play it safe */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("Packet failure - space overflow.\n")); + goto lPACKET_DONE; + } + } + DBprintf(("Fetching port number\n")); + /* Fetch source port */ + org_port = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_port > 6554 ) { /* Terminate on overflow (65536/10 rounded up*/ + DBprintf(("DCC: port number overflow\n")); + goto lBAD_CTCP; + } + org_port *= 10; + org_port += sptr[i++]-'0'; + } + /* Skip illegal addresses (or early termination) */ + if( i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ') ) { + DBprintf(("Bad port termination\n")); + goto lBAD_CTCP; + } + DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port)); + + /* We've got the address and port - now alias it */ + { + struct alias_link *dcc_link; + struct in_addr destaddr; + + + true_port = htons(org_port); + true_addr.s_addr = htonl(org_addr); + destaddr.s_addr = 0; + + /* Sanity/Security checking */ + if (!org_addr || !org_port || + pip->ip_src.s_addr != true_addr.s_addr || + org_port < IPPORT_RESERVED) + goto lBAD_CTCP; + + /* Steal the FTP_DATA_PORT - it doesn't really matter, and this + would probably allow it through at least _some_ + firewalls. */ + dcc_link = FindUdpTcpOut(true_addr, destaddr, + true_port, 0, + IPPROTO_TCP, 1); + DBprintf(("Got a DCC link\n")); + if ( dcc_link ) { + struct in_addr alias_address; /* Address from aliasing */ + u_short alias_port; /* Port given by aliasing */ + int n; + +#ifndef NO_FW_PUNCH + /* Generate firewall hole as appropriate */ + PunchFWHole(dcc_link); +#endif + + alias_address = GetAliasAddress(link); + n = snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%lu ", (u_long)htonl(alias_address.s_addr)); + if( n < 0 ) { + DBprintf(("DCC packet construct failure.\n")); + goto lBAD_CTCP; + } + if( (iCopy += n) >= sizeof(newpacket) ) { /* Truncated/fit exactly - bad news */ + DBprintf(("DCC constructed packet overflow.\n")); + goto lBAD_CTCP; + } + alias_port = GetAliasPort(dcc_link); + n = snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%u", htons(alias_port) ); + if( n < 0 ) { + DBprintf(("DCC packet construct failure.\n")); + goto lBAD_CTCP; + } + iCopy += n; + /* Done - truncated cases will be taken care of by lBAD_CTCP */ + DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port)); + } + } + /* An uninteresting CTCP - state entered right after '\001' has + been pushed. 
Also used to copy the rest of a DCC, after IP + address and port has been handled */ + lBAD_CTCP: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lNORMAL_TEXT; + } + } + goto lPACKET_DONE; + /* Normal text */ + lNORMAL_TEXT: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lCTCP_START; + } + } + /* Handle the end of a packet */ + lPACKET_DONE: + iCopy = iCopy > maxsize-copyat ? maxsize-copyat : iCopy; + memcpy(sptr+copyat, newpacket, iCopy); + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+copyat+iCopy-dlen); + } + + /* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + iCopy + copyat); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + return; + } +} + +/* Notes: + [Note 1] + The initial search will most often fail; it could be replaced with a 32-bit specific search. + Such a search would be done for 32-bit unsigned value V: + V ^= 0x01010101; (Search is for null bytes) + if( ((V-0x01010101)^V) & 0x80808080 ) { + (found a null bytes which was a 01 byte) + } + To assert that the processor is 32-bits, do + extern int ircdccar[32]; (32 bits) + extern int ircdccar[CHAR_BIT*sizeof(unsigned int)]; + which will generate a type-error on all but 32-bit machines. + + [Note 2] This routine really ought to be replaced with one that + creates a transparent proxy on the aliasing host, to allow arbitary + changes in the TCP stream. This should not be too difficult given + this base; I (ee) will try to do this some time later. + */ diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h new file mode 100644 index 0000000..c25259a --- /dev/null +++ b/sys/netinet/libalias/alias_local.h @@ -0,0 +1,229 @@ +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * Alias_local.h contains the function prototypes for alias.c, + * alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well + * as any future add-ons). It also includes macros, globals and + * struct definitions shared by more than one alias*.c file. + * + * This include file is intended to be used only within the aliasing + * software. Outside world interfaces are defined in alias.h + * + * This software is placed into the public domain with no restrictions + * on its distribution. + * + * Initial version: August, 1996 (cjm) + * + * <updated several times by original author and Eivind Eklund> + */ + +#ifndef _ALIAS_LOCAL_H_ +#define _ALIAS_LOCAL_H_ + +/* Macros */ + +/* + * The following macro is used to update an + * internet checksum. "delta" is a 32-bit + * accumulation of all the changes to the + * checksum (adding in new 16-bit words and + * subtracting out old words), and "cksum" + * is the checksum value to be updated. + */ +#define ADJUST_CHECKSUM(acc, cksum) \ + do { \ + acc += cksum; \ + if (acc < 0) { \ + acc = -acc; \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) ~acc; \ + } else { \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) acc; \ + } \ + } while (0) + +/* Globals */ + +extern int packetAliasMode; + +/* Prototypes */ + +/* General utilities */ +u_short IpChecksum(struct ip *_pip); +u_short TcpChecksum(struct ip *_pip); +void DifferentialChecksum(u_short *_cksum, u_short *_new, u_short *_old, + int _n); + +/* Internal data access */ +struct alias_link * + FindIcmpIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _id_alias, int _create); +struct alias_link * + FindIcmpOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _id, int _create); +struct alias_link * + FindFragmentIn1(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * + FindFragmentIn2(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * + AddFragmentPtrLink(struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * + FindFragmentPtr(struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * + FindProtoIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_char _proto); +struct alias_link * + FindProtoOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_char _proto); +struct alias_link * + FindUdpTcpIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _dst_port, u_short _alias_port, u_char _proto, int _create); +struct alias_link * + FindUdpTcpOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _dst_port, u_char _proto, int _create); +struct alias_link * + AddPptp(struct in_addr _src_addr, struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _src_call_id); +struct alias_link * + FindPptpOutByCallId(struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _src_call_id); +struct alias_link * + FindPptpInByCallId(struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _dst_call_id); +struct alias_link * + FindPptpOutByPeerCallId(struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _dst_call_id); +struct alias_link * + FindPptpInByPeerCallId(struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _alias_call_id); +struct alias_link * + FindRtspOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _alias_port, u_char _proto); +struct in_addr + 
FindOriginalAddress(struct in_addr _alias_addr); +struct in_addr + FindAliasAddress(struct in_addr _original_addr); + +/* External data access/modification */ +int FindNewPortGroup(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _src_port, u_short _dst_port, u_short _port_count, + u_char _proto, u_char _align); +void GetFragmentAddr(struct alias_link *_link, struct in_addr *_src_addr); +void SetFragmentAddr(struct alias_link *_link, struct in_addr _src_addr); +void GetFragmentPtr(struct alias_link *_link, char **_fptr); +void SetFragmentPtr(struct alias_link *_link, char *fptr); +void SetStateIn(struct alias_link *_link, int _state); +void SetStateOut(struct alias_link *_link, int _state); +int GetStateIn(struct alias_link *_link); +int GetStateOut(struct alias_link *_link); +struct in_addr + GetOriginalAddress(struct alias_link *_link); +struct in_addr + GetDestAddress(struct alias_link *_link); +struct in_addr + GetAliasAddress(struct alias_link *_link); +struct in_addr + GetDefaultAliasAddress(void); +void SetDefaultAliasAddress(struct in_addr _alias_addr); +u_short GetOriginalPort(struct alias_link *_link); +u_short GetAliasPort(struct alias_link *_link); +struct in_addr + GetProxyAddress(struct alias_link *_link); +void SetProxyAddress(struct alias_link *_link, struct in_addr _addr); +u_short GetProxyPort(struct alias_link *_link); +void SetProxyPort(struct alias_link *_link, u_short _port); +void SetAckModified(struct alias_link *_link); +int GetAckModified(struct alias_link *_link); +int GetDeltaAckIn(struct ip *_pip, struct alias_link *_link); +int GetDeltaSeqOut(struct ip *_pip, struct alias_link *_link); +void AddSeq(struct ip *_pip, struct alias_link *_link, int _delta); +void SetExpire(struct alias_link *_link, int _expire); +void ClearCheckNewLink(void); +void SetProtocolFlags(struct alias_link *_link, int _pflags); +int GetProtocolFlags(struct alias_link *_link); +void SetDestCallId(struct alias_link *_link, u_int16_t _cid); +#ifndef NO_FW_PUNCH +void PunchFWHole(struct alias_link *_link); +#endif + +/* Housekeeping function */ +void HouseKeeping(void); + +/* Tcp specfic routines */ +/* lint -save -library Suppress flexelint warnings */ + +/* FTP routines */ +void AliasHandleFtpOut(struct ip *_pip, struct alias_link *_link, + int _maxpacketsize); + +/* IRC routines */ +void AliasHandleIrcOut(struct ip *_pip, struct alias_link *_link, + int _maxsize); + +/* RTSP routines */ +void AliasHandleRtspOut(struct ip *_pip, struct alias_link *_link, + int _maxpacketsize); + +/* PPTP routines */ +void AliasHandlePptpOut(struct ip *_pip, struct alias_link *_link); +void AliasHandlePptpIn(struct ip *_pip, struct alias_link *_link); +int AliasHandlePptpGreOut(struct ip *_pip); +int AliasHandlePptpGreIn(struct ip *_pip); + +/* NetBIOS routines */ +int AliasHandleUdpNbt(struct ip *_pip, struct alias_link *_link, + struct in_addr *_alias_address, u_short _alias_port); +int AliasHandleUdpNbtNS(struct ip *_pip, struct alias_link *_link, + struct in_addr *_alias_address, u_short *_alias_port, + struct in_addr *_original_address, u_short *_original_port); + +/* CUSeeMe routines */ +void AliasHandleCUSeeMeOut(struct ip *_pip, struct alias_link *_link); +void AliasHandleCUSeeMeIn(struct ip *_pip, struct in_addr _original_addr); + +/* Transparent proxy routines */ +int ProxyCheck(struct ip *_pip, struct in_addr *_proxy_server_addr, + u_short *_proxy_server_port); +void ProxyModify(struct alias_link *_link, struct ip *_pip, + int _maxpacketsize, int _proxy_type); + +enum 
alias_tcp_state { + ALIAS_TCP_STATE_NOT_CONNECTED, + ALIAS_TCP_STATE_CONNECTED, + ALIAS_TCP_STATE_DISCONNECTED +}; + +/*lint -restore */ + +#endif /* !_ALIAS_LOCAL_H_ */ diff --git a/sys/netinet/libalias/alias_nbt.c b/sys/netinet/libalias/alias_nbt.c new file mode 100644 index 0000000..89e8089 --- /dev/null +++ b/sys/netinet/libalias/alias_nbt.c @@ -0,0 +1,704 @@ +/*- + * Written by Atsushi Murai <amurai@spec.co.jp> + * Copyright (c) 1998, System Planning and Engineering Co. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * TODO: + * oClean up. + * oConsidering for word alignment for other platform. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + alias_nbt.c performs special processing for NetBios over TCP/IP + sessions by UDP. + + Initial version: May, 1998 (Atsushi Murai <amurai@spec.co.jp>) + + See HISTORY file for record of revisions. 
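+
+    Both the NetBIOS datagram service and the name service are covered:
+    the source address and port embedded in datagram headers, and the
+    addresses carried in name-service resource records, are rewritten as
+    packets cross the aliasing host, with the UDP checksum adjusted
+    incrementally.  (These services conventionally use UDP ports 137
+    and 138.)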
+*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +typedef struct { + struct in_addr oldaddr; + u_short oldport; + struct in_addr newaddr; + u_short newport; + u_short *uh_sum; +} NBTArguments; + +typedef struct { + unsigned char type; + unsigned char flags; + u_short id; + struct in_addr source_ip; + u_short source_port; + u_short len; + u_short offset; +} NbtDataHeader; + +#define OpQuery 0 +#define OpUnknown 4 +#define OpRegist 5 +#define OpRelease 6 +#define OpWACK 7 +#define OpRefresh 8 +typedef struct { + u_short nametrid; + u_short dir:1, opcode:4, nmflags:7, rcode:4; + u_short qdcount; + u_short ancount; + u_short nscount; + u_short arcount; +} NbtNSHeader; + +#define FMT_ERR 0x1 +#define SRV_ERR 0x2 +#define IMP_ERR 0x4 +#define RFS_ERR 0x5 +#define ACT_ERR 0x6 +#define CFT_ERR 0x7 + + +#ifdef DEBUG +static void PrintRcode( u_char rcode ) { + + switch (rcode) { + case FMT_ERR: + printf("\nFormat Error."); + case SRV_ERR: + printf("\nSever failure."); + case IMP_ERR: + printf("\nUnsupported request error.\n"); + case RFS_ERR: + printf("\nRefused error.\n"); + case ACT_ERR: + printf("\nActive error.\n"); + case CFT_ERR: + printf("\nName in conflict error.\n"); + default: + printf("\n?%c?=%0x\n", '?', rcode ); + + } +} +#endif + + +/* Handling Name field */ +static u_char *AliasHandleName ( u_char *p, char *pmax ) { + + u_char *s; + u_char c; + int compress; + + /* Following length field */ + + if (p == NULL || (char *)p >= pmax) + return(NULL); + + if (*p & 0xc0 ) { + p = p + 2; + if ((char *)p > pmax) + return(NULL); + return ((u_char *)p); + } + while ( ( *p & 0x3f) != 0x00 ) { + s = p + 1; + if ( *p == 0x20 ) + compress = 1; + else + compress = 0; + + /* Get next length field */ + p = (u_char *)(p + (*p & 0x3f) + 1); + if ((char *)p > pmax) { + p = NULL; + break; + } +#ifdef DEBUG + printf(":"); +#endif + while (s < p) { + if ( compress == 1 ) { + c = (u_char )(((((*s & 0x0f) << 4) | (*(s+1) & 0x0f)) - 0x11)); +#ifdef DEBUG + if (isprint( c ) ) + printf("%c", c ); + else + printf("<0x%02x>", c ); +#endif + s +=2; + } else { +#ifdef DEBUG + printf("%c", *s); +#endif + s++; + } + } +#ifdef DEBUG + printf(":"); +#endif + fflush(stdout); + } + + /* Set up to out of Name field */ + if (p == NULL || (char *)p >= pmax) + p = NULL; + else + p++; + return ((u_char *)p); +} + +/* + * NetBios Datagram Handler (IP/UDP) + */ +#define DGM_DIRECT_UNIQ 0x10 +#define DGM_DIRECT_GROUP 0x11 +#define DGM_BROADCAST 0x12 +#define DGM_ERROR 0x13 +#define DGM_QUERY 0x14 +#define DGM_POSITIVE_RES 0x15 +#define DGM_NEGATIVE_RES 0x16 + +int AliasHandleUdpNbt( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short alias_port +) { + struct udphdr * uh; + NbtDataHeader *ndh; + u_char *p = NULL; + char *pmax; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + ndh = (NbtDataHeader *)((char *)uh + (sizeof (struct udphdr))); + if ((char *)(ndh + 1) > pmax) + return(-1); +#ifdef DEBUG + printf("\nType=%02x,", ndh->type ); +#endif + switch ( ndh->type ) { + case DGM_DIRECT_UNIQ: + case DGM_DIRECT_GROUP: + case DGM_BROADCAST: + p = (u_char *)ndh + 14; + p = AliasHandleName ( p, pmax ); /* Source Name */ + p = 
AliasHandleName ( p, pmax ); /* Destination Name */ + break; + case DGM_ERROR: + p = (u_char *)ndh + 11; + break; + case DGM_QUERY: + case DGM_POSITIVE_RES: + case DGM_NEGATIVE_RES: + p = (u_char *)ndh + 10; + p = AliasHandleName ( p, pmax ); /* Destination Name */ + break; + } + if (p == NULL || (char *)p > pmax) + p = NULL; +#ifdef DEBUG + printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); +#endif + /* Doing a IP address and Port number Translation */ + if ( uh->uh_sum != 0 ) { + int acc; + u_short *sptr; + acc = ndh->source_port; + acc -= alias_port; + sptr = (u_short *) &(ndh->source_ip); + acc += *sptr++; + acc += *sptr; + sptr = (u_short *) alias_address; + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, uh->uh_sum); + } + ndh->source_ip = *alias_address; + ndh->source_port = alias_port; +#ifdef DEBUG + printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); + fflush(stdout); +#endif + return((p == NULL) ? -1 : 0); +} +/* Question Section */ +#define QS_TYPE_NB 0x0020 +#define QS_TYPE_NBSTAT 0x0021 +#define QS_CLAS_IN 0x0001 +typedef struct { + u_short type; /* The type of Request */ + u_short class; /* The class of Request */ +} NBTNsQuestion; + +static u_char * +AliasHandleQuestion( + u_short count, + NBTNsQuestion *q, + char *pmax, + NBTArguments *nbtarg) +{ + + while ( count != 0 ) { + /* Name Filed */ + q = (NBTNsQuestion *)AliasHandleName((u_char *)q, pmax); + + if (q == NULL || (char *)(q + 1) > pmax) { + q = NULL; + break; + } + + /* Type and Class filed */ + switch ( ntohs(q->type) ) { + case QS_TYPE_NB: + case QS_TYPE_NBSTAT: + q= q+1; + break; + default: +#ifdef DEBUG + printf("\nUnknown Type on Question %0x\n", ntohs(q->type) ); +#endif + break; + } + count--; + } + + /* Set up to out of Question Section */ + return ((u_char *)q); +} + +/* Resource Record */ +#define RR_TYPE_A 0x0001 +#define RR_TYPE_NS 0x0002 +#define RR_TYPE_NULL 0x000a +#define RR_TYPE_NB 0x0020 +#define RR_TYPE_NBSTAT 0x0021 +#define RR_CLAS_IN 0x0001 +#define SizeOfNsResource 8 +typedef struct { + u_short type; + u_short class; + unsigned int ttl; + u_short rdlen; +} NBTNsResource; + +#define SizeOfNsRNB 6 +typedef struct { + u_short g:1, ont:2, resv:13; + struct in_addr addr; +} NBTNsRNB; + +static u_char * +AliasHandleResourceNB( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsRNB *nb; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + /* Check out a length */ + bcount = ntohs(q->rdlen); + + /* Forward to Resource NB position */ + nb = (NBTNsRNB *)((u_char *)q + SizeOfNsResource); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s, %dbytes] ",inet_ntoa(nbtarg->newaddr ), bcount); +#endif + while ( nb != NULL && bcount != 0 ) { + if ((char *)(nb + 1) > pmax) { + nb = NULL; + break; + } +#ifdef DEBUG + printf("<%s>", inet_ntoa(nb->addr) ); +#endif + if (!bcmp(&nbtarg->oldaddr,&nb->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(nb->addr); + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &(nbtarg->newaddr); + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + + nb->addr = nbtarg->newaddr; +#ifdef DEBUG + printf("O"); +#endif + } +#ifdef DEBUG + else { + printf("."); + } +#endif + nb=(NBTNsRNB *)((u_char *)nb + SizeOfNsRNB); + bcount -= SizeOfNsRNB; + } + if (nb == NULL || (char *)(nb + 1) > pmax) { + nb = NULL; + } + + return ((u_char 
*)nb); +} + +#define SizeOfResourceA 6 +typedef struct { + struct in_addr addr; +} NBTNsResourceA; + +static u_char * +AliasHandleResourceA( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceA *a; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource A position */ + a = (NBTNsResourceA *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("Arec [%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s]",inet_ntoa(nbtarg->newaddr )); +#endif + while ( bcount != 0 ) { + if (a == NULL || (char *)(a + 1) > pmax) + return(NULL); +#ifdef DEBUG + printf("..%s", inet_ntoa(a->addr) ); +#endif + if ( !bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(a->addr); /* Old */ + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &nbtarg->newaddr; /* New */ + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + + a->addr = nbtarg->newaddr; + } + a++; /*XXXX*/ + bcount -= SizeOfResourceA; + } + if (a == NULL || (char *)(a + 1) > pmax) + a = NULL; + return ((u_char *)a); +} + +typedef struct { + u_short opcode:4, flags:8, resv:4; +} NBTNsResourceNULL; + +static u_char * +AliasHandleResourceNULL( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ + while ( bcount != 0 ) { + if ((char *)(n + 1) > pmax) { + n = NULL; + break; + } + n++; + bcount -= sizeof(NBTNsResourceNULL); + } + if ((char *)(n + 1) > pmax) + n = NULL; + + return ((u_char *)n); +} + +static u_char * +AliasHandleResourceNS( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)n, pmax ); /* XXX */ + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +typedef struct { + u_short numnames; +} NBTNsResourceNBSTAT; + +static u_char * +AliasHandleResourceNBSTAT( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNBSTAT *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NBSTAT position */ + n = (NBTNsResourceNBSTAT *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +static u_char * +AliasHandleResource( + u_short count, + NBTNsResource *q, + char *pmax, + NBTArguments + *nbtarg) +{ + while ( count != 0 ) { + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)q, pmax ); + + if (q == NULL || (char *)(q + 1) > pmax) + break; +#ifdef DEBUG + printf("type=%02x, count=%d\n", ntohs(q->type), count ); +#endif + + /* Type and Class filed */ 
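+	/* The NB and A handlers below rewrite any embedded address that
+	   matches the old one (fixing up the UDP checksum); the NS, NULL
+	   and NBSTAT handlers only skip over the record.  Each handler
+	   returns a pointer positioned after the record it consumed, or
+	   NULL for a truncated record, which ends this loop on the next
+	   pass. */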
+ switch ( ntohs(q->type) ) { + case RR_TYPE_NB: + q = (NBTNsResource *)AliasHandleResourceNB( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_A: + q = (NBTNsResource *)AliasHandleResourceA( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NS: + q = (NBTNsResource *)AliasHandleResourceNS( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NULL: + q = (NBTNsResource *)AliasHandleResourceNULL( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NBSTAT: + q = (NBTNsResource *)AliasHandleResourceNBSTAT( + q, + pmax, + nbtarg + ); + break; + default: +#ifdef DEBUG + printf( + "\nUnknown Type of Resource %0x\n", + ntohs(q->type) + ); +#endif + break; + } + count--; + } + fflush(stdout); + return ((u_char *)q); +} + +int AliasHandleUdpNbtNS( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short *alias_port, + struct in_addr *original_address, + u_short *original_port ) +{ + struct udphdr * uh; + NbtNSHeader * nsh; + u_char * p; + char *pmax; + NBTArguments nbtarg; + + /* Set up Common Parameter */ + nbtarg.oldaddr = *alias_address; + nbtarg.oldport = *alias_port; + nbtarg.newaddr = *original_address; + nbtarg.newport = *original_port; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + nbtarg.uh_sum = &(uh->uh_sum); + nsh = (NbtNSHeader *)((char *)uh + (sizeof(struct udphdr))); + p = (u_char *)(nsh + 1); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + if ((char *)(nsh + 1) > pmax) + return(-1); + +#ifdef DEBUG + printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x" + ", an=%04x, ns=%04x, ar=%04x, [%d]-->", + nsh->dir ? "Response": "Request", + nsh->nametrid, + nsh->opcode, + nsh->nmflags, + nsh->rcode, + ntohs(nsh->qdcount), + ntohs(nsh->ancount), + ntohs(nsh->nscount), + ntohs(nsh->arcount), + (u_char *)p -(u_char *)nsh + ); +#endif + + /* Question Entries */ + if (ntohs(nsh->qdcount) !=0 ) { + p = AliasHandleQuestion( + ntohs(nsh->qdcount), + (NBTNsQuestion *)p, + pmax, + &nbtarg + ); + } + + /* Answer Resource Records */ + if (ntohs(nsh->ancount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->ancount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Authority Resource Recodrs */ + if (ntohs(nsh->nscount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->nscount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Additional Resource Recodrs */ + if (ntohs(nsh->arcount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->arcount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + +#ifdef DEBUG + PrintRcode(nsh->rcode); +#endif + return ((p == NULL) ? -1 : 0); +} diff --git a/sys/netinet/libalias/alias_pptp.c b/sys/netinet/libalias/alias_pptp.c new file mode 100644 index 0000000..2d3d9ac --- /dev/null +++ b/sys/netinet/libalias/alias_pptp.c @@ -0,0 +1,369 @@ +/* + * alias_pptp.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc. 
trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. + * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * Author: Erik Salander <erik@whistle.com> + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + Alias_pptp.c performs special processing for PPTP sessions under TCP. + Specifically, watch PPTP control messages and alias the Call ID or the + Peer's Call ID in the appropriate messages. Note, PPTP requires + "de-aliasing" of incoming packets, this is different than any other + TCP applications that are currently (ie. FTP, IRC and RTSP) aliased. + + For Call IDs encountered for the first time, a PPTP alias link is created. + The PPTP alias link uses the Call ID in place of the original port number. + An alias Call ID is created. + + For this routine to work, the PPTP control messages must fit entirely + into a single TCP packet. This is typically the case, but is not + required by the spec. + + Unlike some of the other TCP applications that are aliased (ie. FTP, + IRC and RTSP), the PPTP control messages that need to be aliased are + guaranteed to remain the same length. The aliased Call ID is a fixed + length field. + + Reference: RFC 2637 + + Initial version: May, 2000 (eds) + +*/ + +/* Includes */ +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <stdio.h> + +#include "alias_local.h" + +/* + * PPTP definitions + */ + +struct grehdr /* Enhanced GRE header. */ +{ + u_int16_t gh_flags; /* Flags. */ + u_int16_t gh_protocol; /* Protocol type. */ + u_int16_t gh_length; /* Payload length. */ + u_int16_t gh_call_id; /* Call ID. */ + u_int32_t gh_seq_no; /* Sequence number (optional). */ + u_int32_t gh_ack_no; /* Acknowledgment number (optional). */ +}; +typedef struct grehdr GreHdr; + +/* The PPTP protocol ID used in the GRE 'proto' field. */ +#define PPTP_GRE_PROTO 0x880b + +/* Bits that must be set a certain way in all PPTP/GRE packets. 
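The two constants defined next spell out which bits those are. In the first 32 bits of an enhanced GRE header, 0x2001 in the upper half means the key-present flag is set and the version is 1, the lower half is the PPP protocol type 0x880b used by PPTP, and the mask 0xef7fffff ignores only the optional sequence-present and acknowledgment-present flags. A standalone sketch of the same test that AliasHandlePptpGreOut() and AliasHandlePptpGreIn() apply further down (the helper name is invented; the word is read with memcpy only to sidestep alignment assumptions):

#include <sys/types.h>
#include <string.h>
#include <arpa/inet.h>

#define PPTP_GRE_PROTO  0x880b
#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO)
#define PPTP_INIT_MASK  0xef7fffff

/* Return non-zero if the buffer starts with a PPTP-style GRE header. */
int
is_pptp_gre(const void *buf)
{
        u_int32_t w;    /* flags and version above the protocol type */

        memcpy(&w, buf, sizeof(w));
        return ((ntohl(w) & PPTP_INIT_MASK) == PPTP_INIT_VALUE);
}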
*/ +#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO) +#define PPTP_INIT_MASK 0xef7fffff + +#define PPTP_MAGIC 0x1a2b3c4d +#define PPTP_CTRL_MSG_TYPE 1 + +enum { + PPTP_StartCtrlConnRequest = 1, + PPTP_StartCtrlConnReply = 2, + PPTP_StopCtrlConnRequest = 3, + PPTP_StopCtrlConnReply = 4, + PPTP_EchoRequest = 5, + PPTP_EchoReply = 6, + PPTP_OutCallRequest = 7, + PPTP_OutCallReply = 8, + PPTP_InCallRequest = 9, + PPTP_InCallReply = 10, + PPTP_InCallConn = 11, + PPTP_CallClearRequest = 12, + PPTP_CallDiscNotify = 13, + PPTP_WanErrorNotify = 14, + PPTP_SetLinkInfo = 15 +}; + + /* Message structures */ + struct pptpMsgHead { + u_int16_t length; /* total length */ + u_int16_t msgType; /* PPTP message type */ + u_int32_t magic; /* magic cookie */ + u_int16_t type; /* control message type */ + u_int16_t resv0; /* reserved */ + }; + typedef struct pptpMsgHead *PptpMsgHead; + + struct pptpCodes { + u_int8_t resCode; /* Result Code */ + u_int8_t errCode; /* Error Code */ + }; + typedef struct pptpCodes *PptpCode; + + struct pptpCallIds { + u_int16_t cid1; /* Call ID field #1 */ + u_int16_t cid2; /* Call ID field #2 */ + }; + typedef struct pptpCallIds *PptpCallId; + +static PptpCallId AliasVerifyPptp(struct ip *, u_int16_t *); + + +void +AliasHandlePptpOut(struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link) /* The PPTP control link */ +{ + struct alias_link *pptp_link; + PptpCallId cptr; + PptpCode codes; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) { + case PPTP_OutCallRequest: + case PPTP_OutCallReply: + case PPTP_InCallRequest: + case PPTP_InCallReply: + /* Establish PPTP link for address and Call ID found in control message. */ + pptp_link = AddPptp(GetOriginalAddress(link), GetDestAddress(link), + GetAliasAddress(link), cptr->cid1); + break; + case PPTP_CallClearRequest: + case PPTP_CallDiscNotify: + /* Find PPTP link for address and Call ID found in control message. */ + pptp_link = FindPptpOutByCallId(GetOriginalAddress(link), + GetDestAddress(link), + cptr->cid1); + break; + default: + return; + } + + if (pptp_link != NULL) { + int accumulate = cptr->cid1; + + /* alias the Call Id */ + cptr->cid1 = GetAliasPort(pptp_link); + + /* Compute TCP checksum for revised packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + accumulate -= cptr->cid1; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + switch (ctl_type) { + case PPTP_OutCallReply: + case PPTP_InCallReply: + codes = (PptpCode)(cptr + 1); + if (codes->resCode == 1) /* Connection established, */ + SetDestCallId(pptp_link, /* note the Peer's Call ID. */ + cptr->cid2); + else + SetExpire(pptp_link, 0); /* Connection refused. */ + break; + case PPTP_CallDiscNotify: /* Connection closed. 
*/ + SetExpire(pptp_link, 0); + break; + } + } +} + +void +AliasHandlePptpIn(struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link) /* The PPTP control link */ +{ + struct alias_link *pptp_link; + PptpCallId cptr; + u_int16_t *pcall_id; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) + { + case PPTP_InCallConn: + case PPTP_WanErrorNotify: + case PPTP_SetLinkInfo: + pcall_id = &cptr->cid1; + break; + case PPTP_OutCallReply: + case PPTP_InCallReply: + pcall_id = &cptr->cid2; + break; + case PPTP_CallDiscNotify: /* Connection closed. */ + pptp_link = FindPptpInByCallId(GetDestAddress(link), + GetAliasAddress(link), + cptr->cid1); + if (pptp_link != NULL) + SetExpire(pptp_link, 0); + return; + default: + return; + } + + /* Find PPTP link for address and Call ID found in PPTP Control Msg */ + pptp_link = FindPptpInByPeerCallId(GetDestAddress(link), + GetAliasAddress(link), + *pcall_id); + + if (pptp_link != NULL) { + int accumulate = *pcall_id; + + /* De-alias the Peer's Call Id. */ + *pcall_id = GetOriginalPort(pptp_link); + + /* Compute TCP checksum for modified packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + accumulate -= *pcall_id; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + if (ctl_type == PPTP_OutCallReply || ctl_type == PPTP_InCallReply) { + PptpCode codes = (PptpCode)(cptr + 1); + + if (codes->resCode == 1) /* Connection established, */ + SetDestCallId(pptp_link, /* note the Call ID. */ + cptr->cid1); + else + SetExpire(pptp_link, 0); /* Connection refused. */ + } + } +} + +static PptpCallId +AliasVerifyPptp(struct ip *pip, u_int16_t *ptype) /* IP packet to examine/patch */ +{ + int hlen, tlen, dlen; + PptpMsgHead hptr; + struct tcphdr *tc; + + /* Calculate some lengths */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Verify data length */ + if (dlen < (sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds))) + return(NULL); + + /* Move up to PPTP message header */ + hptr = (PptpMsgHead)(((char *) pip) + hlen); + + /* Return the control message type */ + *ptype = ntohs(hptr->type); + + /* Verify PPTP Control Message */ + if ((ntohs(hptr->msgType) != PPTP_CTRL_MSG_TYPE) || + (ntohl(hptr->magic) != PPTP_MAGIC)) + return(NULL); + + /* Verify data length. */ + if ((*ptype == PPTP_OutCallReply || *ptype == PPTP_InCallReply) && + (dlen < sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds) + + sizeof(struct pptpCodes))) + return (NULL); + else + return (PptpCallId)(hptr + 1); +} + + +int +AliasHandlePptpGreOut(struct ip *pip) +{ + GreHdr *gr; + struct alias_link *link; + + gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2)); + + /* Check GRE header bits. */ + if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + link = FindPptpOutByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (link != NULL) { + struct in_addr alias_addr = GetAliasAddress(link); + + /* Change source IP address. */ + DifferentialChecksum(&pip->ip_sum, + (u_short *)&alias_addr, + (u_short *)&pip->ip_src, + 2); + pip->ip_src = alias_addr; + } + + return (0); +} + + +int +AliasHandlePptpGreIn(struct ip *pip) +{ + GreHdr *gr; + struct alias_link *link; + + gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2)); + + /* Check GRE header bits. 
*/ + if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + link = FindPptpInByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (link != NULL) { + struct in_addr src_addr = GetOriginalAddress(link); + + /* De-alias the Peer's Call Id. */ + gr->gh_call_id = GetOriginalPort(link); + + /* Restore original IP address. */ + DifferentialChecksum(&pip->ip_sum, + (u_short *)&src_addr, + (u_short *)&pip->ip_dst, + 2); + pip->ip_dst = src_addr; + } + + return (0); +} diff --git a/sys/netinet/libalias/alias_proxy.c b/sys/netinet/libalias/alias_proxy.c new file mode 100644 index 0000000..3bd007c --- /dev/null +++ b/sys/netinet/libalias/alias_proxy.c @@ -0,0 +1,837 @@ +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* file: alias_proxy.c + + This file encapsulates special operations related to transparent + proxy redirection. This is where packets with a particular destination, + usually tcp port 80, are redirected to a proxy server. + + When packets are proxied, the destination address and port are + modified. In certain cases, it is necessary to somehow encode + the original address/port info into the packet. Two methods are + presently supported: addition of a [DEST addr port] string at the + beginning a of tcp stream, or inclusion of an optional field + in the IP header. + + There is one public API function: + + PacketAliasProxyRule() -- Adds and deletes proxy + rules. + + Rules are stored in a linear linked list, so lookup efficiency + won't be too good for large lists. 
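As a concrete picture of the first encoding mentioned above: the TCP stream handed to the proxy server simply starts with a short ASCII tag of the form "[DEST a.b.c.d port]", written by ProxyEncodeTcpStream() further down in this file. A proxy could peel the tag off roughly as follows; the function name and the assumption of a NUL-terminated buffer are illustrative only:

#include <sys/types.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Recover the original destination from a "[DEST a.b.c.d port]" prefix. */
int
parse_dest_tag(const char *buf, struct in_addr *addr, int *port)
{
        char addrstr[16];

        if (sscanf(buf, "[DEST %15s %d]", addrstr, port) != 2)
                return (-1);            /* no tag at the front of the stream */
        if (inet_aton(addrstr, addr) == 0)
                return (-1);            /* malformed dotted-quad address */
        return (0);
}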
+ + + Initial development: April, 1998 (cjm) +*/ + + +/* System includes */ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> + +#include <sys/types.h> +#include <sys/socket.h> + +/* BSD IPV4 includes */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <arpa/inet.h> + +#include "alias_local.h" /* Functions used by alias*.c */ +#include "alias.h" /* Public API functions for libalias */ + + + +/* + Data structures + */ + +/* + * A linked list of arbitrary length, based on struct proxy_entry is + * used to store proxy rules. + */ +struct proxy_entry +{ +#define PROXY_TYPE_ENCODE_NONE 1 +#define PROXY_TYPE_ENCODE_TCPSTREAM 2 +#define PROXY_TYPE_ENCODE_IPHDR 3 + int rule_index; + int proxy_type; + u_char proto; + u_short proxy_port; + u_short server_port; + + struct in_addr server_addr; + + struct in_addr src_addr; + struct in_addr src_mask; + + struct in_addr dst_addr; + struct in_addr dst_mask; + + struct proxy_entry *next; + struct proxy_entry *last; +}; + + + +/* + File scope variables +*/ + +static struct proxy_entry *proxyList; + + + +/* Local (static) functions: + + IpMask() -- Utility function for creating IP + masks from integer (1-32) specification. + IpAddr() -- Utility function for converting string + to IP address + IpPort() -- Utility function for converting string + to port number + RuleAdd() -- Adds an element to the rule list. + RuleDelete() -- Removes an element from the rule list. + RuleNumberDelete() -- Removes all elements from the rule list + having a certain rule number. + ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning + of a TCP stream. + ProxyEncodeIpHeader() -- Adds an IP option indicating the true + destination of a proxied IP packet +*/ + +static int IpMask(int, struct in_addr *); +static int IpAddr(char *, struct in_addr *); +static int IpPort(char *, int, int *); +static void RuleAdd(struct proxy_entry *); +static void RuleDelete(struct proxy_entry *); +static int RuleNumberDelete(int); +static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int); +static void ProxyEncodeIpHeader(struct ip *, int); + +static int +IpMask(int nbits, struct in_addr *mask) +{ + int i; + u_int imask; + + if (nbits < 0 || nbits > 32) + return -1; + + imask = 0; + for (i=0; i<nbits; i++) + imask = (imask >> 1) + 0x80000000; + mask->s_addr = htonl(imask); + + return 0; +} + +static int +IpAddr(char *s, struct in_addr *addr) +{ + if (inet_aton(s, addr) == 0) + return -1; + else + return 0; +} + +static int +IpPort(char *s, int proto, int *port) +{ + int n; + + n = sscanf(s, "%d", port); + if (n != 1) + { + struct servent *se; + + if (proto == IPPROTO_TCP) + se = getservbyname(s, "tcp"); + else if (proto == IPPROTO_UDP) + se = getservbyname(s, "udp"); + else + return -1; + + if (se == NULL) + return -1; + + *port = (u_int) ntohs(se->s_port); + } + + return 0; +} + +void +RuleAdd(struct proxy_entry *entry) +{ + int rule_index; + struct proxy_entry *ptr; + struct proxy_entry *ptr_last; + + if (proxyList == NULL) + { + proxyList = entry; + entry->last = NULL; + entry->next = NULL; + return; + } + + rule_index = entry->rule_index; + ptr = proxyList; + ptr_last = NULL; + while (ptr != NULL) + { + if (ptr->rule_index >= rule_index) + { + if (ptr_last == NULL) + { + entry->next = proxyList; + entry->last = NULL; + proxyList->last = entry; + proxyList = entry; + return; + } + + ptr_last->next = entry; + ptr->last = entry; + entry->last = 
ptr_last; + entry->next = ptr; + return; + } + ptr_last = ptr; + ptr = ptr->next; + } + + ptr_last->next = entry; + entry->last = ptr_last; + entry->next = NULL; +} + +static void +RuleDelete(struct proxy_entry *entry) +{ + if (entry->last != NULL) + entry->last->next = entry->next; + else + proxyList = entry->next; + + if (entry->next != NULL) + entry->next->last = entry->last; + + free(entry); +} + +static int +RuleNumberDelete(int rule_index) +{ + int err; + struct proxy_entry *ptr; + + err = -1; + ptr = proxyList; + while (ptr != NULL) + { + struct proxy_entry *ptr_next; + + ptr_next = ptr->next; + if (ptr->rule_index == rule_index) + { + err = 0; + RuleDelete(ptr); + } + + ptr = ptr_next; + } + + return err; +} + +static void +ProxyEncodeTcpStream(struct alias_link *link, + struct ip *pip, + int maxpacketsize) +{ + int slen; + char buffer[40]; + struct tcphdr *tc; + +/* Compute pointer to tcp header */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + +/* Don't modify if once already modified */ + + if (GetAckModified (link)) + return; + +/* Translate destination address and port to string form */ + snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]", + inet_ntoa(GetProxyAddress (link)), (u_int) ntohs(GetProxyPort (link))); + +/* Pad string out to a multiple of two in length */ + slen = strlen(buffer); + switch (slen % 2) + { + case 0: + strcat(buffer, " \n"); + slen += 2; + break; + case 1: + strcat(buffer, "\n"); + slen += 1; + } + +/* Check for packet overflow */ + if ((ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize) + return; + +/* Shift existing TCP data and insert destination string */ + { + int dlen; + int hlen; + u_char *p; + + hlen = (pip->ip_hl + tc->th_off) << 2; + dlen = ntohs (pip->ip_len) - hlen; + +/* Modify first packet that has data in it */ + + if (dlen == 0) + return; + + p = (char *) pip; + p += hlen; + + memmove(p + slen, p, dlen); + memcpy(p, buffer, slen); + } + +/* Save information about modified sequence number */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen); + } + +/* Update IP header packet length and checksum */ + { + int accumulate; + + accumulate = pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + slen); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + +/* Update TCP checksum; use TcpChecksum since so many things have + already changed.
*/ + + tc->th_sum = 0; + tc->th_sum = TcpChecksum (pip); +} + +static void +ProxyEncodeIpHeader(struct ip *pip, + int maxpacketsize) +{ +#define OPTION_LEN_BYTES 8 +#define OPTION_LEN_INT16 4 +#define OPTION_LEN_INT32 2 + u_char option[OPTION_LEN_BYTES]; + +#ifdef DEBUG + fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip)); +#endif + +/* Check to see that there is room to add an IP option */ + if (pip->ip_hl > (0x0f - OPTION_LEN_INT32)) + return; + +/* Build option and copy into packet */ + { + u_char *ptr; + struct tcphdr *tc; + + ptr = (u_char *) pip; + ptr += 20; + memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20); + + option[0] = 0x64; /* class: 3 (reserved), option 4 */ + option[1] = OPTION_LEN_BYTES; + + memcpy(&option[2], (u_char *) &pip->ip_dst, 4); + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + memcpy(&option[6], (u_char *) &tc->th_sport, 2); + + memcpy(ptr, option, 8); + } + +/* Update checksum, header length and packet length */ + { + int i; + int accumulate; + u_short *sptr; + + sptr = (u_short *) option; + accumulate = 0; + for (i=0; i<OPTION_LEN_INT16; i++) + accumulate -= *(sptr++); + + sptr = (u_short *) pip; + accumulate += *sptr; + pip->ip_hl += OPTION_LEN_INT32; + accumulate -= *sptr; + + accumulate += pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } +#undef OPTION_LEN_BYTES +#undef OPTION_LEN_INT16 +#undef OPTION_LEN_INT32 +#ifdef DEBUG + fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip)); +#endif +} + + +/* Functions by other packet alias source files + + ProxyCheck() -- Checks whether an outgoing packet should + be proxied. + ProxyModify() -- Encodes the original destination address/port + for a packet which is to be redirected to + a proxy server. 
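The second encoding, produced by ProxyEncodeIpHeader() above, leaves the payload alone and instead inserts an 8-byte IP option (type 0x64: copy flag clear, class 3, number 4) carrying the original destination address in bytes 2-5 and a 16-bit TCP port, copied from the TCP header, in bytes 6-7. A sketch of how a receiver could dig that information back out; this walker is illustrative and not part of libalias:

#include <sys/types.h>
#include <string.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>

#define PROXY_OPTION_TYPE 0x64          /* as written by ProxyEncodeIpHeader() */
#define PROXY_OPTION_LEN  8

/* Scan the IP options for the proxy option; 0 on success, -1 otherwise. */
int
find_proxied_dest(struct ip *pip, struct in_addr *addr, u_short *port)
{
        u_char *opt = (u_char *)(pip + 1);      /* options follow the 20-byte base header */
        u_char *end = (u_char *)pip + (pip->ip_hl << 2);

        while (opt < end && opt[0] != IPOPT_EOL) {
                if (opt[0] == IPOPT_NOP) {      /* single-byte option */
                        opt++;
                        continue;
                }
                if (opt + 2 > end || opt[1] < 2 || opt + opt[1] > end)
                        return (-1);            /* malformed option list */
                if (opt[0] == PROXY_OPTION_TYPE && opt[1] == PROXY_OPTION_LEN) {
                        memcpy(addr, opt + 2, sizeof(*addr));   /* original ip_dst */
                        memcpy(port, opt + 6, sizeof(*port));   /* port, network order */
                        return (0);
                }
                opt += opt[1];
        }
        return (-1);
}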
+*/ + +int +ProxyCheck(struct ip *pip, + struct in_addr *proxy_server_addr, + u_short *proxy_server_port) +{ + u_short dst_port; + struct in_addr src_addr; + struct in_addr dst_addr; + struct proxy_entry *ptr; + + src_addr = pip->ip_src; + dst_addr = pip->ip_dst; + dst_port = ((struct tcphdr *) ((char *) pip + (pip->ip_hl << 2))) + ->th_dport; + + ptr = proxyList; + while (ptr != NULL) + { + u_short proxy_port; + + proxy_port = ptr->proxy_port; + if ((dst_port == proxy_port || proxy_port == 0) + && pip->ip_p == ptr->proto + && src_addr.s_addr != ptr->server_addr.s_addr) + { + struct in_addr src_addr_masked; + struct in_addr dst_addr_masked; + + src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr; + dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr; + + if ((src_addr_masked.s_addr == ptr->src_addr.s_addr) + && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) + { + if ((*proxy_server_port = ptr->server_port) == 0) + *proxy_server_port = dst_port; + *proxy_server_addr = ptr->server_addr; + return ptr->proxy_type; + } + } + ptr = ptr->next; + } + + return 0; +} + +void +ProxyModify(struct alias_link *link, + struct ip *pip, + int maxpacketsize, + int proxy_type) +{ + switch (proxy_type) + { + case PROXY_TYPE_ENCODE_IPHDR: + ProxyEncodeIpHeader(pip, maxpacketsize); + break; + + case PROXY_TYPE_ENCODE_TCPSTREAM: + ProxyEncodeTcpStream(link, pip, maxpacketsize); + break; + } +} + + +/* + Public API functions +*/ + +int +PacketAliasProxyRule(const char *cmd) +{ +/* + * This function takes command strings of the form: + * + * server <addr>[:<port>] + * [port <port>] + * [rule n] + * [proto tcp|udp] + * [src <addr>[/n]] + * [dst <addr>[/n]] + * [type encode_tcp_stream|encode_ip_hdr|no_encode] + * + * delete <rule number> + * + * Subfields can be in arbitrary order. Port numbers and addresses + * must be in either numeric or symbolic form. An optional rule number + * is used to control the order in which rules are searched. If two + * rules have the same number, then search order cannot be guaranteed, + * and the rules should be disjoint. If no rule number is specified, + * then 0 is used, and group 0 rules are always checked before any + * others. 
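Put together, a caller that wants outbound HTTP redirected through a local proxy might issue rules like the ones below; the addresses, the 3128 proxy port and the rule number are invented for illustration:

#include "alias.h"

/*
 * Install (and later remove) a transparent proxy rule.  Assumes the
 * usual PacketAliasInit()/PacketAliasSetAddress() setup has been done.
 */
int
install_example_rules(void)
{
        /* Web traffic from 10.0.0.0/8 goes to a local proxy; the real
           destination is preserved in an IP option (see above). */
        if (PacketAliasProxyRule("rule 10 server 10.0.0.5:3128 "
            "port 80 proto tcp src 10.0.0.0/8 type encode_ip_hdr") != 0)
                return (-1);

        /* Rules are later removed by number. */
        return (PacketAliasProxyRule("delete 10"));
}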
+ */ + int i, n, len; + int cmd_len; + int token_count; + int state; + char *token; + char buffer[256]; + char str_port[sizeof(buffer)]; + char str_server_port[sizeof(buffer)]; + char *res = buffer; + + int rule_index; + int proto; + int proxy_type; + int proxy_port; + int server_port; + struct in_addr server_addr; + struct in_addr src_addr, src_mask; + struct in_addr dst_addr, dst_mask; + struct proxy_entry *proxy_entry; + +/* Copy command line into a buffer */ + cmd += strspn(cmd, " \t"); + cmd_len = strlen(cmd); + if (cmd_len > (sizeof(buffer) - 1)) + return -1; + strcpy(buffer, cmd); + +/* Convert to lower case */ + len = strlen(buffer); + for (i=0; i<len; i++) + buffer[i] = tolower((unsigned char)buffer[i]); + +/* Set default proxy type */ + +/* Set up default values */ + rule_index = 0; + proxy_type = PROXY_TYPE_ENCODE_NONE; + proto = IPPROTO_TCP; + proxy_port = 0; + server_addr.s_addr = 0; + server_port = 0; + src_addr.s_addr = 0; + IpMask(0, &src_mask); + dst_addr.s_addr = 0; + IpMask(0, &dst_mask); + + str_port[0] = 0; + str_server_port[0] = 0; + +/* Parse command string with state machine */ +#define STATE_READ_KEYWORD 0 +#define STATE_READ_TYPE 1 +#define STATE_READ_PORT 2 +#define STATE_READ_SERVER 3 +#define STATE_READ_RULE 4 +#define STATE_READ_DELETE 5 +#define STATE_READ_PROTO 6 +#define STATE_READ_SRC 7 +#define STATE_READ_DST 8 + state = STATE_READ_KEYWORD; + token = strsep(&res, " \t"); + token_count = 0; + while (token != NULL) + { + token_count++; + switch (state) + { + case STATE_READ_KEYWORD: + if (strcmp(token, "type") == 0) + state = STATE_READ_TYPE; + else if (strcmp(token, "port") == 0) + state = STATE_READ_PORT; + else if (strcmp(token, "server") == 0) + state = STATE_READ_SERVER; + else if (strcmp(token, "rule") == 0) + state = STATE_READ_RULE; + else if (strcmp(token, "delete") == 0) + state = STATE_READ_DELETE; + else if (strcmp(token, "proto") == 0) + state = STATE_READ_PROTO; + else if (strcmp(token, "src") == 0) + state = STATE_READ_SRC; + else if (strcmp(token, "dst") == 0) + state = STATE_READ_DST; + else + return -1; + break; + + case STATE_READ_TYPE: + if (strcmp(token, "encode_ip_hdr") == 0) + proxy_type = PROXY_TYPE_ENCODE_IPHDR; + else if (strcmp(token, "encode_tcp_stream") == 0) + proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM; + else if (strcmp(token, "no_encode") == 0) + proxy_type = PROXY_TYPE_ENCODE_NONE; + else + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_PORT: + strcpy(str_port, token); + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SERVER: + { + int err; + char *p; + char s[sizeof(buffer)]; + + p = token; + while (*p != ':' && *p != 0) + p++; + + if (*p != ':') + { + err = IpAddr(token, &server_addr); + if (err) + return -1; + } + else + { + *p = ' '; + + n = sscanf(token, "%s %s", s, str_server_port); + if (n != 2) + return -1; + + err = IpAddr(s, &server_addr); + if (err) + return -1; + } + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_RULE: + n = sscanf(token, "%d", &rule_index); + if (n != 1 || rule_index < 0) + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_DELETE: + { + int err; + int rule_to_delete; + + if (token_count != 2) + return -1; + + n = sscanf(token, "%d", &rule_to_delete); + if (n != 1) + return -1; + err = RuleNumberDelete(rule_to_delete); + if (err) + return -1; + return 0; + } + + case STATE_READ_PROTO: + if (strcmp(token, "tcp") == 0) + proto = IPPROTO_TCP; + else if (strcmp(token, "udp") == 0) + proto = IPPROTO_UDP; + else + return -1; + state = 
STATE_READ_KEYWORD; + break; + + case STATE_READ_SRC: + case STATE_READ_DST: + { + int err; + char *p; + struct in_addr mask; + struct in_addr addr; + + p = token; + while (*p != '/' && *p != 0) + p++; + + if (*p != '/') + { + IpMask(32, &mask); + err = IpAddr(token, &addr); + if (err) + return -1; + } + else + { + int nbits; + char s[sizeof(buffer)]; + + *p = ' '; + n = sscanf(token, "%s %d", s, &nbits); + if (n != 2) + return -1; + + err = IpAddr(s, &addr); + if (err) + return -1; + + err = IpMask(nbits, &mask); + if (err) + return -1; + } + + if (state == STATE_READ_SRC) + { + src_addr = addr; + src_mask = mask; + } + else + { + dst_addr = addr; + dst_mask = mask; + } + } + state = STATE_READ_KEYWORD; + break; + + default: + return -1; + break; + } + + do { + token = strsep(&res, " \t"); + } while (token != NULL && !*token); + } +#undef STATE_READ_KEYWORD +#undef STATE_READ_TYPE +#undef STATE_READ_PORT +#undef STATE_READ_SERVER +#undef STATE_READ_RULE +#undef STATE_READ_DELETE +#undef STATE_READ_PROTO +#undef STATE_READ_SRC +#undef STATE_READ_DST + +/* Convert port strings to numbers. This needs to be done after + the string is parsed, because the prototype might not be designated + before the ports (which might be symbolic entries in /etc/services) */ + + if (strlen(str_port) != 0) + { + int err; + + err = IpPort(str_port, proto, &proxy_port); + if (err) + return -1; + } + else + { + proxy_port = 0; + } + + if (strlen(str_server_port) != 0) + { + int err; + + err = IpPort(str_server_port, proto, &server_port); + if (err) + return -1; + } + else + { + server_port = 0; + } + +/* Check that at least the server address has been defined */ + if (server_addr.s_addr == 0) + return -1; + +/* Add to linked list */ + proxy_entry = malloc(sizeof(struct proxy_entry)); + if (proxy_entry == NULL) + return -1; + + proxy_entry->proxy_type = proxy_type; + proxy_entry->rule_index = rule_index; + proxy_entry->proto = proto; + proxy_entry->proxy_port = htons(proxy_port); + proxy_entry->server_port = htons(server_port); + proxy_entry->server_addr = server_addr; + proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr; + proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr; + proxy_entry->src_mask = src_mask; + proxy_entry->dst_mask = dst_mask; + + RuleAdd(proxy_entry); + + return 0; +} diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c new file mode 100644 index 0000000..027a724 --- /dev/null +++ b/sys/netinet/libalias/alias_smedia.c @@ -0,0 +1,433 @@ +/* + * alias_smedia.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc. trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. 
+ * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * Copyright (c) 2000 Junichi SATOH <junichi@astec.co.jp> + * <junichi@junichi.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Erik Salander <erik@whistle.com> + * Junichi SATOH <junichi@astec.co.jp> + * <junichi@junichi.org> + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + Alias_smedia.c is meant to contain the aliasing code for streaming media + protocols. It performs special processing for RSTP sessions under TCP. + Specifically, when a SETUP request is sent by a client, or a 200 reply + is sent by a server, it is intercepted and modified. The address is + changed to the gateway machine and an aliasing port is used. + + More specifically, the "client_port" configuration parameter is + parsed for SETUP requests. The "server_port" configuration parameter is + parsed for 200 replies eminating from a server. This is intended to handle + the unicast case. + + RTSP also allows a redirection of a stream to another client by using the + "destination" configuration parameter. 
The destination config parm would + indicate a different IP address. This function is NOT supported by the + RTSP translation code below. + + The RTSP multicast functions without any address translation intervention. + + For this routine to work, the SETUP/200 must fit entirely + into a single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new SETUP/200 will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + Initial version: May, 2000 (eds) +*/ + +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include "alias_local.h" + +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define RTSP_PORT_GROUP 2 + +#define ISDIGIT(a) (((a) >= '0') && ((a) <= '9')) + +static int +search_string(char *data, int dlen, const char *search_str) +{ + int i, j, k; + int search_str_len; + + search_str_len = strlen(search_str); + for (i = 0; i < dlen - search_str_len; i++) { + for (j = i, k = 0; j < dlen - search_str_len; j++, k++) { + if (data[j] != search_str[k] && + data[j] != search_str[k] - ('a' - 'A')) { + break; + } + if (k == search_str_len - 1) { + return j + 1; + } + } + } + return -1; +} + +static int +alias_rtsp_out(struct ip *pip, + struct alias_link *link, + char *data, + const char *port_str) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + int i, j, pos, state, port_dlen, new_dlen, delta; + u_short p[2], new_len; + u_short sport, eport, base_port; + u_short salias = 0, ealias = 0, base_alias = 0; + const char *transport_str = "transport:"; + char newdata[2048], *port_data, *port_newdata, stemp[80]; + int links_created = 0, pkt_updated = 0; + struct alias_link *rtsp_link = NULL; + struct in_addr null_addr; + + /* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Find keyword, "Transport: " */ + pos = search_string(data, dlen, transport_str); + if (pos < 0) { + return -1; + } + port_data = data + pos; + port_dlen = dlen - pos; + + memcpy(newdata, data, pos); + port_newdata = newdata + pos; + + while (port_dlen > strlen(port_str)) { + /* Find keyword, appropriate port string */ + pos = search_string(port_data, port_dlen, port_str); + if (pos < 0) { + break; + } + + memcpy (port_newdata, port_data, pos + 1); + port_newdata += (pos + 1); + + p[0] = p[1] = 0; + sport = eport = 0; + state = 0; + for (i = pos; i < port_dlen; i++) { + switch(state) { + case 0: + if (port_data[i] == '=') { + state++; + } + break; + case 1: + if (ISDIGIT(port_data[i])) { + p[0] = p[0] * 10 + port_data[i] - '0'; + } else { + if (port_data[i] == ';') { + state = 3; + } + if (port_data[i] == '-') { + state++; + } + } + break; + case 2: + if (ISDIGIT(port_data[i])) { + p[1] = p[1] * 10 + port_data[i] - '0'; + } else { + state++; + } + break; + case 3: + base_port = p[0]; + sport = htons(p[0]); + eport = htons(p[1]); + + if (!links_created) { + + links_created = 1; + /* Find an even numbered port number base that + satisfies the contiguous number of ports we need */ + 
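The state machine above (states 0 through 2) is only pulling one or two decimal port numbers out of an attribute such as "client_port=5002-5003;" in the RTSP Transport header; the attribute name arrives here as port_str (client_port for SETUP requests, server_port for 200 replies). On a NUL-terminated copy the extraction could be written as the sketch below, which ignores the in-place, unterminated packet buffer the real code has to cope with:

#include <stdio.h>

/* Extract the port(s) from a "client_port=..." Transport attribute. */
int
parse_port_attribute(const char *attr, unsigned int *lo, unsigned int *hi)
{
        if (sscanf(attr, "client_port=%u-%u", lo, hi) == 2)
                return (2);             /* a LO-HI range was given */
        *hi = 0;
        if (sscanf(attr, "client_port=%u", lo) == 1)
                return (1);             /* single port, no range */
        return (0);                     /* attribute not recognized */
}

With the ports in hand, the code that follows asks FindNewPortGroup() for a matching pair of alias ports.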
null_addr.s_addr = 0; + if (0 == (salias = FindNewPortGroup(null_addr, + FindAliasAddress(pip->ip_src), + sport, 0, + RTSP_PORT_GROUP, + IPPROTO_UDP, 1))) { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot find contiguous RTSP data ports\n"); +#endif + } else { + + base_alias = ntohs(salias); + for (j = 0; j < RTSP_PORT_GROUP; j++) { + /* Establish link to port found in RTSP packet */ + rtsp_link = FindRtspOut(GetOriginalAddress(link), null_addr, + htons(base_port + j), htons(base_alias + j), + IPPROTO_UDP); + if (rtsp_link != NULL) { +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(rtsp_link); +#endif + } else { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot allocate RTSP data ports\n"); +#endif + break; + } + } + } + ealias = htons(base_alias + (RTSP_PORT_GROUP - 1)); + } + + if (salias && rtsp_link) { + + pkt_updated = 1; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(salias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + + if (eport != 0) { + *port_newdata = '-'; + port_newdata++; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(ealias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + } + + *port_newdata = ';'; + port_newdata++; + } + state++; + break; + } + if (state > 3) { + break; + } + } + port_data += i; + port_dlen -= i; + } + + if (!pkt_updated) + return -1; + + memcpy (port_newdata, port_data, port_dlen); + port_newdata += port_dlen; + *port_newdata = '\0'; + + /* Create new packet */ + new_dlen = port_newdata - newdata; + memcpy (data, newdata, new_dlen); + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta + new_dlen - dlen); + + new_len = htons(hlen + new_dlen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + + return 0; +} + +/* Support the protocol used by early versions of RealPlayer */ + +static int +alias_pna_out(struct ip *pip, + struct alias_link *link, + char *data, + int dlen) +{ + struct alias_link *pna_links; + u_short msg_id, msg_len; + char *work; + u_short alias_port, port; + struct tcphdr *tc; + + work = data; + work += 5; + while (work + 4 < data + dlen) { + memcpy(&msg_id, work, 2); + work += 2; + memcpy(&msg_len, work, 2); + work += 2; + if (ntohs(msg_id) == 0) { + /* end of options */ + return 0; + } + if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { + memcpy(&port, work, 2); + pna_links = FindUdpTcpOut(pip->ip_src, GetDestAddress(link), + port, 0, IPPROTO_UDP, 1); + if (pna_links != NULL) { +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(pna_links); +#endif + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + alias_port = GetAliasPort(pna_links); + memcpy(work, &alias_port, 2); + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + } + } + work += ntohs(msg_len); + } + + return 0; +} + +void +AliasHandleRtspOut(struct ip *pip, struct alias_link *link, int maxpacketsize) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + char *data; + const char *setup = "SETUP", *pna = "PNA", *str200 = "200"; + const char *okstr = "OK", *client_port_str = "client_port"; + const char *server_port_str = "server_port"; + int i, parseOk; + + tc = (struct tcphdr *)((char *)pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + data = (char*)pip; + data += hlen; + + /* 
When aliasing a client, check for the SETUP request */ + if ((ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1) || + (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2)) { + + if (dlen >= strlen(setup)) { + if (memcmp(data, setup, strlen(setup)) == 0) { + alias_rtsp_out(pip, link, data, client_port_str); + return; + } + } + if (dlen >= strlen(pna)) { + if (memcmp(data, pna, strlen(pna)) == 0) { + alias_pna_out(pip, link, data, dlen); + } + } + + } else { + + /* When aliasing a server, check for the 200 reply + Accomodate varying number of blanks between 200 & OK */ + + if (dlen >= strlen(str200)) { + + for (parseOk = 0, i = 0; + i <= dlen - strlen(str200); + i++) { + if (memcmp(&data[i], str200, strlen(str200)) == 0) { + parseOk = 1; + break; + } + } + if (parseOk) { + + i += strlen(str200); /* skip string found */ + while(data[i] == ' ') /* skip blank(s) */ + i++; + + if ((dlen - i) >= strlen(okstr)) { + + if (memcmp(&data[i], okstr, strlen(okstr)) == 0) + alias_rtsp_out(pip, link, data, server_port_str); + + } + } + } + } +} diff --git a/sys/netinet/libalias/alias_util.c b/sys/netinet/libalias/alias_util.c new file mode 100644 index 0000000..787c859 --- /dev/null +++ b/sys/netinet/libalias/alias_util.c @@ -0,0 +1,169 @@ +/*- + * Copyright (c) 2001 Charles Mott <cm@linktel.net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + + +/* + Alias_util.c contains general utilities used by other functions + in the packet aliasing module. At the moment, there are functions + for computing IP header and TCP packet checksums. + + The checksum routines are based upon example code in a Unix networking + text written by Stevens (sorry, I can't remember the title -- but + at least this is a good author). + + Initial Version: August, 1996 (cjm) + + Version 1.7: January 9, 1997 + Added differential checksum update function. +*/ + +/* +Note: the checksum routines assume that the actual checksum word has +been zeroed out. 
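Concretely, the usual pattern is to zero the field, compute, store, and optionally re-run the routine over the finished header as a self-check. A sketch using PacketAliasInternetChecksum(), which is declared in alias.h and defined just below (the wrapper name here is invented):

#include <sys/types.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>

#include "alias.h"              /* PacketAliasInternetChecksum() */

/* Fill in ip_sum and return non-zero if the stored value verifies. */
int
ip_checksum_roundtrip(struct ip *pip)
{
        int hlen = pip->ip_hl << 2;

        pip->ip_sum = 0;
        pip->ip_sum = PacketAliasInternetChecksum((u_short *)pip, hlen);

        /* Summing a correct header, checksum included, yields zero. */
        return (PacketAliasInternetChecksum((u_short *)pip, hlen) == 0);
}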
If the checksum word is filled with the proper value, +then these routines will give a result of zero (useful for testing +purposes); +*/ + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias.h" +#include "alias_local.h" + +u_short +PacketAliasInternetChecksum(u_short *ptr, int nbytes) +{ + int sum, oddbyte; + + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + return(~sum); +} + +u_short +IpChecksum(struct ip *pip) +{ + return( PacketAliasInternetChecksum((u_short *) pip, + (pip->ip_hl << 2)) ); + +} + +u_short +TcpChecksum(struct ip *pip) +{ + u_short *ptr; + struct tcphdr *tc; + int nhdr, ntcp, nbytes; + int sum, oddbyte; + + nhdr = pip->ip_hl << 2; + ntcp = ntohs(pip->ip_len) - nhdr; + + tc = (struct tcphdr *) ((char *) pip + nhdr); + ptr = (u_short *) tc; + +/* Add up TCP header and data */ + nbytes = ntcp; + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + +/* "Pseudo-header" data */ + ptr = (u_short *) &(pip->ip_dst); + sum += *ptr++; + sum += *ptr; + ptr = (u_short *) &(pip->ip_src); + sum += *ptr++; + sum += *ptr; + sum += htons((u_short) ntcp); + sum += htons((u_short) pip->ip_p); + +/* Roll over carry bits */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + +/* Return checksum */ + return((u_short) ~sum); +} + + +void +DifferentialChecksum(u_short *cksum, u_short *new, u_short *old, int n) +{ + int i; + int accumulate; + + accumulate = *cksum; + for (i=0; i<n; i++) + { + accumulate -= *new++; + accumulate += *old++; + } + + if (accumulate < 0) + { + accumulate = -accumulate; + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) ~accumulate; + } + else + { + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) accumulate; + } +} + diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3 new file mode 100644 index 0000000..cd8b97c --- /dev/null +++ b/sys/netinet/libalias/libalias.3 @@ -0,0 +1,981 @@ +.\"- +.\" Copyright (c) 2001 Charles Mott <cm@linktel.net> +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd April 13, 2000 +.Dt LIBALIAS 3 +.Os +.Sh NAME +.Nm libalias +.Nd packet aliasing library for masquerading and network address translation +.Sh SYNOPSIS +.In sys/types.h +.In netinet/in.h +.In alias.h +.Pp +Function prototypes are given in the main body of the text. +.Sh DESCRIPTION +The +.Nm +library is a collection of functions for aliasing and de-aliasing of IP +packets, intended for masquerading and network address translation (NAT). +.Sh INTRODUCTION +This library is a moderately portable set of functions designed to assist +in the process of IP masquerading and network address translation. +Outgoing packets from a local network with unregistered IP addresses can +be aliased to appear as if they came from an accessible IP address. +Incoming packets are then de-aliased so that they are sent to the correct +machine on the local network. +.Pp +A certain amount of flexibility is built into the packet aliasing engine. +In the simplest mode of operation, a many-to-one address mapping takes +place between local network and the packet aliasing host. +This is known as IP masquerading. +In addition, one-to-one mappings between local and public addresses can +also be implemented, which is known as static NAT. +In between these extremes, different groups of private addresses can be +linked to different public addresses, comprising several distinct +many-to-one mappings. +Also, a given public address and port can be statically redirected to a +private address/port. +.Pp +The packet aliasing engine was designed to operate in user space outside +of the kernel, without any access to private kernel data structure, but +the source code can also be ported to a kernel environment. +.Sh INITIALIZATION AND CONTROL +Two special functions, +.Fn PacketAliasInit +and +.Fn PacketAliasSetAddress , +must always be called before any packet handling may be performed. +In addition, the operating mode of the packet aliasing engine can be +customized by calling +.Fn PacketAliasSetMode . +.Pp +.Ft void +.Fn PacketAliasInit void +.Bd -ragged -offset indent +This function has no arguments or return value and is used to initialize +internal data structures. +The following mode bits are always set after calling +.Fn PacketAliasInit . +See the description of +.Fn PacketAliasSetMode +below for the meaning of these mode bits. +.Pp +.Bl -item -offset indent -compact +.It +.Dv PKT_ALIAS_SAME_PORTS +.It +.Dv PKT_ALIAS_USE_SOCKETS +.It +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +.El +.Pp +This function will always return the packet aliasing engine to the same +initial state. +.Fn PacketAliasSetAddress +must be called afterwards, and any desired changes from the default mode +bits listed above require a call to +.Fn PacketAliasSetMode . +.Pp +It is mandatory that this function be called at the beginning of a program +prior to any packet handling. 
+.Ed +.Pp +.Ft void +.Fn PacketAliasUninit void +.Bd -ragged -offset indent +This function has no arguments or return value and is used to clear any +resources attached to internal data structures. +.Pp +This functions should be called when a program stops using the aliasing +engine; it does, amongst other things, clear out any firewall holes. +To provide backwards compatibility and extra security, it is added to +the +.Xr atexit 3 +chain by +.Fn PacketAliasInit . +Calling it multiple times is harmless. +.Ed +.Pp +.Ft void +.Fn PacketAliasSetAddress "struct in_addr addr" +.Bd -ragged -offset indent +This function sets the source address to which outgoing packets from the +local area network are aliased. +All outgoing packets are re-mapped to this address unless overridden by a +static address mapping established by +.Fn PacketAliasRedirectAddr . +.Pp +If the +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +mode bit is set (the default mode of operation), then the internal aliasing +link tables will be reset any time the aliasing address changes. +This is useful for interfaces such as +.Xr ppp 8 , +where the IP +address may or may not change on successive dial-up attempts. +.Pp +If the +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +mode bit is set to zero, this function can also be used to dynamically change +the aliasing address on a packet to packet basis (it is a low overhead call). +.Pp +It is mandatory that this function be called prior to any packet handling. +.Ed +.Pp +.Ft unsigned int +.Fn PacketAliasSetMode "unsigned int flags" "unsigned int mask" +.Bd -ragged -offset indent +This function sets or clears mode bits +according to the value of +.Fa flags . +Only bits marked in +.Fa mask +are affected. +The following mode bits are defined in +.Aq Pa alias.h : +.Bl -tag -width indent +.It Dv PKT_ALIAS_LOG +Enables logging into +.Pa /var/log/alias.log . +Each time an aliasing link is created or deleted, the log file is appended +with the current number of ICMP, TCP and UDP links. +Mainly useful for debugging when the log file is viewed continuously with +.Xr tail 1 . +.It Dv PKT_ALIAS_DENY_INCOMING +If this mode bit is set, all incoming packets associated with new TCP +connections or new UDP transactions will be marked for being ignored +.Fn ( PacketAliasIn +returns +.Dv PKT_ALIAS_IGNORED +code) +by the calling program. +Response packets to connections or transactions initiated from the packet +aliasing host or local network will be unaffected. +This mode bit is useful for implementing a one-way firewall. +.It Dv PKT_ALIAS_SAME_PORTS +If this mode bit is set, the packet aliasing engine will attempt to leave +the alias port numbers unchanged from the actual local port numbers. +This can be done as long as the quintuple (proto, alias addr, alias port, +remote addr, remote port) is unique. +If a conflict exists, a new aliasing port number is chosen even if this +mode bit is set. +.It Dv PKT_ALIAS_USE_SOCKETS +This bit should be set when the packet aliasing host originates network +traffic as well as forwards it. +When the packet aliasing host is waiting for a connection from an unknown +host address or unknown port number (e.g. an FTP data connection), this +mode bit specifies that a socket be allocated as a place holder to prevent +port conflicts. +Once a connection is established, usually within a minute or so, the socket +is closed. +.It Dv PKT_ALIAS_UNREGISTERED_ONLY +If this mode bit is set, traffic on the local network which does not +originate from unregistered address spaces will be ignored. 
+Standard Class A, B and C unregistered addresses are: +.Bd -literal -offset indent +10.0.0.0 -> 10.255.255.255 (Class A subnet) +172.16.0.0 -> 172.31.255.255 (Class B subnets) +192.168.0.0 -> 192.168.255.255 (Class C subnets) +.Ed +.Pp +This option is useful in the case that packet aliasing host has both +registered and unregistered subnets on different interfaces. +The registered subnet is fully accessible to the outside world, so traffic +from it does not need to be passed through the packet aliasing engine. +.It Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +When this mode bit is set and +.Fn PacketAliasSetAddress +is called to change the aliasing address, the internal link table of the +packet aliasing engine will be cleared. +This operating mode is useful for +.Xr ppp 8 +links where the interface address can sometimes change or remain the same +between dial-up attempts. +If this mode bit is not set, the link table will never be reset in the event +of an address change. +.It Dv PKT_ALIAS_PUNCH_FW +This option makes +.Nm +`punch holes' in an +.Xr ipfirewall 4 +based firewall for FTP/IRC DCC connections. +The holes punched are bound by from/to IP address and port; it will not be +possible to use a hole for another connection. +A hole is removed when the connection that uses it dies. +To cater to unexpected death of a program using +.Nm +(e.g. kill -9), +changing the state of the flag will clear the entire firewall range +allocated for holes. +This will also happen on the initial call to +.Fn PacketAliasSetFWBase . +This call must happen prior to setting this flag. +.It Dv PKT_ALIAS_REVERSE +This option makes +.Nm +reverse the way it handles incoming and outgoing packets, allowing it +to be fed with data that passes through the internal interface rather +than the external one. +.It Dv PKT_ALIAS_PROXY_ONLY +This option tells +.Nm +to obey transparent proxy rules only. +Normal packet aliasing is not performed. +See +.Fn PacketAliasProxyRule +below for details. +.El +.Ed +.Pp +.Ft void +.Fn PacketAliasSetFWBase "unsigned int base" "unsigned int num" +.Bd -ragged -offset indent +Set firewall range allocated for punching firewall holes (with the +.Dv PKT_ALIAS_PUNCH_FW +flag). +The range will be cleared for all rules on initialization. +.Ed +.Sh PACKET HANDLING +The packet handling functions are used to modify incoming (remote to local) +and outgoing (local to remote) packets. +The calling program is responsible for receiving and sending packets via +network interfaces. +.Pp +Along with +.Fn PacketAliasInit +and +.Fn PacketAliasSetAddress , +the two packet handling functions, +.Fn PacketAliasIn +and +.Fn PacketAliasOut , +comprise minimal set of functions needed for a basic IP masquerading +implementation. +.Pp +.Ft int +.Fn PacketAliasIn "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An incoming packet coming from a remote machine to the local network is +de-aliased by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +indicates the size of the data structure containing the packet and should +be at least as large as the actual packet size. +.Pp +Return codes: +.Bl -tag -width indent +.It Dv PKT_ALIAS_OK +The packet aliasing process was successful. +.It Dv PKT_ALIAS_IGNORED +The packet was ignored and not de-aliased. +This can happen if the protocol is unrecognized, possibly an ICMP message +type is not handled or if incoming packets for new connections are being +ignored (if +.Dv PKT_ALIAS_DENY_INCOMING +mode bit was set by +.Fn PacketAliasSetMode ) . 
+.It Dv PKT_ALIAS_UNRESOLVED_FRAGMENT +This is returned when a fragment cannot be resolved because the header +fragment has not been sent yet. +In this situation, fragments must be saved with +.Fn PacketAliasSaveFragment +until a header fragment is found. +.It Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT +The packet aliasing process was successful, and a header fragment was found. +This is a signal to retrieve any unresolved fragments with +.Fn PacketAliasGetFragment +and de-alias them with +.Fn PacketAliasFragmentIn . +.It Dv PKT_ALIAS_ERROR +An internal error within the packet aliasing engine occurred. +.El +.Ed +.Pp +.Ft int +.Fn PacketAliasOut "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An outgoing packet coming from the local network to a remote machine is +aliased by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +indicates the maximum packet size permissible should the packet length be +changed. +IP encoding protocols place address and port information in the encapsulated +data stream which has to be modified and can account for changes in packet +length. +Well known examples of such protocols are FTP and IRC DCC. +.Pp +Return codes: +.Bl -tag -width indent +.It Dv PKT_ALIAS_OK +The packet aliasing process was successful. +.It Dv PKT_ALIAS_IGNORED +The packet was ignored and not aliased. +This can happen if the protocol is unrecognized, or possibly an ICMP message +type is not handled. +.It Dv PKT_ALIAS_ERROR +An internal error within the packet aliasing engine occurred. +.El +.Ed +.Sh PORT AND ADDRESS REDIRECTION +The functions described in this section allow machines on the local network +to be accessible in some degree to new incoming connections from the external +network. +Individual ports can be re-mapped or static network address translations can +be designated. +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectPort +.Fa "struct in_addr local_addr" +.Fa "u_short local_port" +.Fa "struct in_addr remote_addr" +.Fa "u_short remote_port" +.Fa "struct in_addr alias_addr" +.Fa "u_short alias_port" +.Fa "u_char proto" +.Fc +.Bd -ragged -offset indent +This function specifies that traffic from a given remote address/port to +an alias address/port be redirected to a specified local address/port. +The parameter +.Fa proto +can be either +.Dv IPPROTO_TCP +or +.Dv IPPROTO_UDP , +as defined in +.Aq Pa netinet/in.h . +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established +by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectPort +is called, a zero reference will track this change. +.Pp +If the link is further set up to operate for a load sharing, then +.Fa local_addr +and +.Fa local_port +are ignored, and are selected dynamically from the server pool, as described in +.Fn PacketAliasAddServer +below. +.Pp +If +.Fa remote_addr +is zero, this indicates to redirect packets from any remote address. +Likewise, if +.Fa remote_port +is zero, this indicates to redirect packets originating from any remote +port number. +Almost always, the remote port specification will be zero, but non-zero +remote addresses can sometimes be useful for firewalling. +If two calls to +.Fn PacketAliasRedirectPort +overlap in their address/port specifications, then the most recent call +will have precedence. +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . 
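Before the remaining redirection details, the two packet-handling calls above can be tied together as in the following sketch. It is only an illustration: BUFSIZE, the handler names, and the assumption that the caller obtains raw IP datagrams from somewhere such as a divert socket or a tunnel device are not part of the library interface, and the fragment-related return codes are handled separately (see FRAGMENT HANDLING below).

#include <sys/types.h>
#include <netinet/in.h>
#include <alias.h>

#define BUFSIZE 65535			/* assumed maximum IP packet size */

/* Packet travelling from the local network out to the internet. */
static void
handle_outgoing(char *buf)
{
	if (PacketAliasOut(buf, BUFSIZE) == PKT_ALIAS_OK) {
		/* forward buf on the external interface */
	}
}

/* Packet arriving from the internet for the local network. */
static void
handle_incoming(char *buf)
{
	switch (PacketAliasIn(buf, BUFSIZE)) {
	case PKT_ALIAS_OK:
		/* forward buf on the internal interface */
		break;
	case PKT_ALIAS_IGNORED:
		/* unrecognized protocol or denied new connection:
		 * drop or pass according to local policy */
		break;
	default:
		/* fragments and errors: see FRAGMENT HANDLING below */
		break;
	}
}

A calling program would typically invoke these from its main read loop, once per raw IP datagram received.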
+If +.Dv NULL +is returned, then the function call did not complete successfully. +.Pp +All port numbers should be in network address byte order, so it is necessary +to use +.Xr htons 3 +to convert these parameters from internally readable numbers to network byte +order. +Addresses are also in network byte order, which is implicit in the use of the +.Fa struct in_addr +data type. +.Ed +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectAddr +.Fa "struct in_addr local_addr" +.Fa "struct in_addr alias_addr" +.Fc +.Bd -ragged -offset indent +This function designates that all incoming traffic to +.Fa alias_addr +be redirected to +.Fa local_addr . +Similarly, all outgoing traffic from +.Fa local_addr +is aliased to +.Fa alias_addr . +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectAddr +is called, a zero reference will track this change. +.Pp +If the link is further set up to operate for a load sharing, then +.Fa local_addr +is ignored, and is selected dynamically from the server pool, as described in +.Fn PacketAliasAddServer +below. +.Pp +If subsequent calls to +.Fn PacketAliasRedirectAddr +use the same aliasing address, all new incoming traffic to this aliasing +address will be redirected to the local address made in the last function +call. +New traffic generated by any of the local machines, designated in the +several function calls, will be aliased to the same address. +Consider the following example: +.Bd -literal -offset indent +PacketAliasRedirectAddr(inet_aton("192.168.0.2"), + inet_aton("141.221.254.101")); +PacketAliasRedirectAddr(inet_aton("192.168.0.3"), + inet_aton("141.221.254.101")); +PacketAliasRedirectAddr(inet_aton("192.168.0.4"), + inet_aton("141.221.254.101")); +.Ed +.Pp +Any outgoing connections such as +.Xr telnet 1 +or +.Xr ftp 1 +from 192.168.0.2, 192.168.0.3 and 192.168.0.4 will appear to come from +141.221.254.101. +Any incoming connections to 141.221.254.101 will be directed to 192.168.0.4. +.Pp +Any calls to +.Fn PacketAliasRedirectPort +will have precedence over address mappings designated by +.Fn PacketAliasRedirectAddr . +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . +If +.Dv NULL +is returned, then the function call did not complete successfully. +.Ed +.Pp +.Ft int +.Fo PacketAliasAddServer +.Fa "struct alias_link *link" +.Fa "struct in_addr addr" +.Fa "u_short port" +.Fc +.Bd -ragged -offset indent +This function sets the +.Fa link +up for Load Sharing using IP Network Address Translation (RFC 2391, LSNAT). +LSNAT operates as follows. +A client attempts to access a server by using the server virtual address. +The LSNAT router transparently redirects the request to one of the hosts +in server pool, selected using a real-time load sharing algorithm. +Multiple sessions may be initiated from the same client, and each session +could be directed to a different host based on load balance across server +pool hosts at the time. +If load share is desired for just a few specific services, the configuration +on LSNAT could be defined to restrict load share for just the services +desired. +.Pp +Currently, only the simplest selection algorithm is implemented, where a +host is selected on a round-robin basis only, without regard to load on +the host. 
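As a sketch of the load-sharing arrangement just described (the required creation order, link first and then servers, is spelled out in the paragraphs that follow), a wildcard TCP redirect can be populated with a small round-robin pool. The addresses, port and helper name below are hypothetical; htons(3) is used because ports are expected in network byte order, as noted above for PacketAliasRedirectPort.

#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <alias.h>

/*
 * Share inbound TCP port 80 on the alias address between two internal
 * web servers, selected round-robin by the engine.
 */
static int
setup_web_pool(void)
{
	struct alias_link *lnk;
	struct in_addr any, alias, srv1, srv2;

	any.s_addr = INADDR_ANY;		/* match any remote host and port */
	inet_aton("203.0.113.1", &alias);	/* public (alias) address, example */
	inet_aton("192.168.0.10", &srv1);	/* pool members, example values */
	inet_aton("192.168.0.11", &srv2);

	/* local_addr/local_port are ignored for load-shared links; they are
	 * chosen from the server pool at connection time instead. */
	lnk = PacketAliasRedirectPort(any, 0, any, 0,
	    alias, htons(80), IPPROTO_TCP);
	if (lnk == NULL)
		return (-1);
	if (PacketAliasAddServer(lnk, srv1, htons(80)) == -1 ||
	    PacketAliasAddServer(lnk, srv2, htons(80)) == -1)
		return (-1);
	return (0);
}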
+.Pp +First, the +.Fa link +is created by either +.Fn PacketAliasRedirectPort +or +.Fn PacketAliasRedirectAddr . +Then, +.Fn PacketAliasAddServer +is called multiple times to add entries to the +.Fa link Ns 's +server pool. +.Pp +For links created with +.Fn PacketAliasRedirectAddr , +the +.Fa port +argument is ignored and could have any value, e.g. htons(~0). +.Pp +This function returns 0 on success, -1 otherwise. +.Ed +.Pp +.Ft void +.Fn PacketAliasRedirectDelete "struct alias_link *link" +.Bd -ragged -offset indent +This function will delete a specific static redirect rule entered by +.Fn PacketAliasRedirectPort +or +.Fn PacketAliasRedirectAddr . +The parameter +.Fa link +is the pointer returned by either of the redirection functions. +If an invalid pointer is passed to +.Fn PacketAliasRedirectDelete , +then a program crash or unpredictable operation could result, so it is +necessary to be careful using this function. +.Ed +.Pp +.Ft int +.Fn PacketAliasProxyRule "const char *cmd" +.Bd -ragged -offset indent +The passed +.Fa cmd +string consists of one or more pairs of words. +The first word in each pair is a token and the second is the value that +should be applied for that token. +Tokens and their argument types are as follows: +.Bl -tag -width indent +.It Cm type encode_ip_hdr | encode_tcp_stream | no_encode +In order to support transparent proxying, it is necessary to somehow +pass the original address and port information into the new destination +server. +If +.Cm encode_ip_hdr +is specified, the original address and port is passed as an extra IP +option. +If +.Cm encode_tcp_stream +is specified, the original address and port is passed as the first +piece of data in the TCP stream in the format +.Dq DEST Ar IP port . +.It Cm port Ar portnum +Only packets with the destination port +.Ar portnum +are proxied. +.It Cm server Ar host Ns Xo +.Op : Ns Ar portnum +.Xc +This specifies the +.Ar host +and +.Ar portnum +that the data is to be redirected to. +.Ar host +must be an IP address rather than a DNS host name. +If +.Ar portnum +is not specified, the destination port number is not changed. +.Pp +The +.Ar server +specification is mandatory unless the +.Cm delete +command is being used. +.It Cm rule Ar index +Normally, each call to +.Fn PacketAliasProxyRule +inserts the next rule at the start of a linear list of rules. +If an +.Ar index +is specified, the new rule will be checked after all rules with lower +indices. +Calls to +.Fn PacketAliasProxyRule +that do not specify a rule are assigned rule 0. +.It Cm delete Ar index +This token and its argument MUST NOT be used with any other tokens. +When used, all existing rules with the given +.Ar index +are deleted. +.It Cm proto tcp | udp +If specified, only packets of the given protocol type are matched. +.It Cm src Ar IP Ns Xo +.Op / Ns Ar bits +.Xc +If specified, only packets with a source address matching the given +.Ar IP +are matched. +If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. +.It Cm dst Ar IP Ns Xo +.Op / Ns Ar bits +.Xc +If specified, only packets with a destination address matching the given +.Ar IP +are matched. +If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. 
+.El +.Pp +This function is usually used to redirect outgoing connections for +internal machines that are not permitted certain types of internet +access, or to restrict access to certain external machines. +.Ed +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectProto +.Fa "struct in_addr local_addr" +.Fa "struct in_addr remote_addr" +.Fa "struct in_addr alias_addr" +.Fa "u_char proto" +.Fc +.Bd -ragged -offset indent +This function specifies that any IP packet with protocol number of +.Fa proto +from a given remote address to an alias address be +redirected to a specified local address. +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established +by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectProto +is called, a zero reference will track this change. +.Pp +If +.Fa remote_addr +is zero, this indicates to redirect packets from any remote address. +Non-zero remote addresses can sometimes be useful for firewalling. +.Pp +If two calls to +.Fn PacketAliasRedirectProto +overlap in their address specifications, then the most recent call +will have precedence. +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . +If +.Dv NULL +is returned, then the function call did not complete successfully. +.Ed +.Sh FRAGMENT HANDLING +The functions in this section are used to deal with incoming fragments. +.Pp +Outgoing fragments are handled within +.Fn PacketAliasOut +by changing the address according to any applicable mapping set by +.Fn PacketAliasRedirectAddr , +or the default aliasing address set by +.Fn PacketAliasSetAddress . +.Pp +Incoming fragments are handled in one of two ways. +If the header of a fragmented IP packet has already been seen, then all +subsequent fragments will be re-mapped in the same manner the header +fragment was. +Fragments which arrive before the header are saved and then retrieved +once the header fragment has been resolved. +.Pp +.Ft int +.Fn PacketAliasSaveFragment "char *ptr" +.Bd -ragged -offset indent +When +.Fn PacketAliasIn +returns +.Dv PKT_ALIAS_UNRESOLVED_FRAGMENT , +this function can be used to save the pointer to the unresolved fragment. +.Pp +It is implicitly assumed that +.Fa ptr +points to a block of memory allocated by +.Xr malloc 3 . +If the fragment is never resolved, the packet aliasing engine will +automatically free the memory after a timeout period. +[Eventually this function should be modified so that a callback function +for freeing memory is passed as an argument.] +.Pp +This function returns +.Dv PKT_ALIAS_OK +if it was successful and +.Dv PKT_ALIAS_ERROR +if there was an error. +.Ed +.Pp +.Ft char * +.Fn PacketAliasGetFragment "char *buffer" +.Bd -ragged -offset indent +This function can be used to retrieve fragment pointers saved by +.Fn PacketAliasSaveFragment . +The IP header fragment pointed to by +.Fa buffer +is the header fragment indicated when +.Fn PacketAliasIn +returns +.Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT . +Once a fragment pointer is retrieved, it becomes the calling program's +responsibility to free the dynamically allocated memory for the fragment. +.Pp +.Fn PacketAliasGetFragment +can be called sequentially until there are no more fragments available, +at which time it returns +.Dv NULL . 
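Putting the fragment-related return codes and calls together, a sketch of the incoming path might look as follows. BUFSIZE and the handler name are assumptions; pkt must be malloc(3)-allocated, as PacketAliasSaveFragment() requires, and the sketch also uses PacketAliasFragmentIn(), which is described immediately below.

#include <sys/types.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <alias.h>

#define BUFSIZE 65535			/* assumed maximum IP packet size */

/* De-alias one incoming packet, dealing with out-of-order fragments. */
static void
handle_in_with_fragments(char *pkt)
{
	char *frag;

	switch (PacketAliasIn(pkt, BUFSIZE)) {
	case PKT_ALIAS_UNRESOLVED_FRAGMENT:
		/* Header fragment not seen yet: hand the fragment (and its
		 * malloc(3)ed memory) over to the engine for later. */
		PacketAliasSaveFragment(pkt);
		break;
	case PKT_ALIAS_FOUND_HEADER_FRAGMENT:
		/* Header arrived: de-alias every fragment saved earlier. */
		while ((frag = PacketAliasGetFragment(pkt)) != NULL) {
			PacketAliasFragmentIn(pkt, frag);
			/* forward frag, then release it -- freeing retrieved
			 * fragments is the caller's responsibility */
			free(frag);
		}
		/* pkt itself has been de-aliased and can be forwarded too */
		break;
	case PKT_ALIAS_OK:
		/* ordinary packet: forward pkt */
		break;
	default:
		break;
	}
}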
+.Ed +.Pp +.Ft void +.Fn PacketAliasFragmentIn "char *header" "char *fragment" +.Bd -ragged -offset indent +When a fragment is retrieved with +.Fn PacketAliasGetFragment , +it can then be de-aliased with a call to +.Fn PacketAliasFragmentIn . +The +.Fa header +argument is the pointer to a header fragment used as a template, and +.Fa fragment +is the pointer to the packet to be de-aliased. +.Ed +.Sh MISCELLANEOUS FUNCTIONS +.Ft void +.Fn PacketAliasSetTarget "struct in_addr addr" +.Bd -ragged -offset indent +When an incoming packet not associated with any pre-existing aliasing link +arrives at the host machine, it will be sent to the address indicated by a +call to +.Fn PacketAliasSetTarget . +.Pp +If this function is called with an +.Dv INADDR_NONE +address argument, then all new incoming packets go to the address set by +.Fn PacketAliasSetAddress . +.Pp +If this function is not called, or is called with an +.Dv INADDR_ANY +address argument, then all new incoming packets go to the address specified +in the packet. +This allows external machines to talk directly to internal machines if they +can route packets to the machine in question. +.Ed +.Pp +.Ft int +.Fn PacketAliasCheckNewLink void +.Bd -ragged -offset indent +This function returns a non-zero value when a new aliasing link is created. +In circumstances where incoming traffic is being sequentially sent to +different local servers, this function can be used to trigger when +.Fn PacketAliasSetTarget +is called to change the default target address. +.Ed +.Pp +.Ft u_short +.Fn PacketAliasInternetChecksum "u_short *buffer" "int nbytes" +.Bd -ragged -offset indent +This is a utility function that does not seem to be available elsewhere and +is included as a convenience. +It computes the internet checksum, which is used in both IP and +protocol-specific headers (TCP, UDP, ICMP). +.Pp +The +.Fa buffer +argument points to the data block to be checksummed, and +.Fa nbytes +is the number of bytes. +The 16-bit checksum field should be zeroed before computing the checksum. +.Pp +Checksums can also be verified by operating on a block of data including +its checksum. +If the checksum is valid, +.Fn PacketAliasInternetChecksum +will return zero. +.Ed +.Pp +.Ft int +.Fn PacketUnaliasOut "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An outgoing packet, which has already been aliased, +has its private address/port information restored by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +is provided for error checking purposes. +This function can be used if an already-aliased packet needs to have its +original IP header restored for further processing (eg. logging). +.Ed +.Sh BUGS +PPTP aliasing does not work when more than one internal client +connects to the same external server at the same time, because +PPTP requires a single TCP control connection to be established +between any two IP addresses. +.Sh AUTHORS +.An Charles Mott Aq cm@linktel.net , +versions 1.0 - 1.8, 2.0 - 2.4. +.An Eivind Eklund Aq eivind@FreeBSD.org , +versions 1.8b, 1.9 and 2.5. +Added IRC DCC support as well as contributing a number of architectural +improvements; added the firewall bypass for FTP/IRC DCC. +.An Erik Salander Aq erik@whistle.com +added support for PPTP and RTSP. +.An Junichi Satoh Aq junichi@junichi.org +added support for RTSP/PNA. +.Sh ACKNOWLEDGMENTS +Listed below, in approximate chronological order, are individuals who +have provided valuable comments and/or debugging assistance. 
+.Pp +.Bd -ragged -offset indent +.An -split +.An Gary Roberts +.An Tom Torrance +.An Reto Burkhalter +.An Martin Renters +.An Brian Somers +.An Paul Traina +.An Ari Suutari +.An Dave Remien +.An J. Fortes +.An Andrzej Bialecki +.An Gordon Burditt +.Ed +.Sh CONCEPTUAL BACKGROUND +This section is intended for those who are planning to modify the source +code or want to create somewhat esoteric applications using the packet +aliasing functions. +.Pp +The conceptual framework under which the packet aliasing engine operates +is described here. +Central to the discussion is the idea of an +.Em aliasing link +which describes the relationship for a given packet transaction between +the local machine, aliased identity and remote machine. +It is discussed how such links come into existence and are destroyed. +.Ss ALIASING LINKS +There is a notion of an +.Em aliasing link , +which is a 7-tuple describing a specific translation: +.Bd -literal -offset indent +(local addr, local port, alias addr, alias port, + remote addr, remote port, protocol) +.Ed +.Pp +Outgoing packets have the local address and port number replaced with the +alias address and port number. +Incoming packets undergo the reverse process. +The packet aliasing engine attempts to match packets against an internal +table of aliasing links to determine how to modify a given IP packet. +Both the IP header and protocol dependent headers are modified as necessary. +Aliasing links are created and deleted as necessary according to network +traffic. +.Pp +Protocols can be TCP, UDP or even ICMP in certain circumstances. +(Some types of ICMP packets can be aliased according to sequence or ID +number which acts as an equivalent port number for identifying how +individual packets should be handled.) +.Pp +Each aliasing link must have a unique combination of the following five +quantities: alias address/port, remote address/port and protocol. +This ensures that several machines on a local network can share the +same aliasing IP address. +In cases where conflicts might arise, the aliasing port is chosen so that +uniqueness is maintained. +.Ss STATIC AND DYNAMIC LINKS +Aliasing links can either be static or dynamic. +Static links persist indefinitely and represent fixed rules for translating +IP packets. +Dynamic links come into existence for a specific TCP connection or UDP +transaction or ICMP ECHO sequence. +For the case of TCP, the connection can be monitored to see when the +associated aliasing link should be deleted. +Aliasing links for UDP transactions (and ICMP ECHO and TIMESTAMP requests) +work on a simple timeout rule. +When no activity is observed on a dynamic link for a certain amount of time +it is automatically deleted. +Timeout rules also apply to TCP connections which do not open or close +properly. +.Ss PARTIALLY SPECIFIED ALIASING LINKS +Aliasing links can be partially specified, meaning that the remote address +and/or remote port are unknown. +In this case, when a packet matching the incomplete specification is found, +a fully specified dynamic link is created. +If the original partially specified link is dynamic, it will be deleted +after the fully specified link is created, otherwise it will persist. +.Pp +For instance, a partially specified link might be +.Bd -literal -offset indent +(192.168.0.4, 23, 204.228.203.215, 8066, 0, 0, tcp) +.Ed +.Pp +The zeros denote unspecified components for the remote address and port. 
+If this link were static it would have the effect of redirecting all +incoming traffic from port 8066 of 204.228.203.215 to port 23 (telnet) +of machine 192.168.0.4 on the local network. +Each individual telnet connection would initiate the creation of a distinct +dynamic link. +.Ss DYNAMIC LINK CREATION +In addition to aliasing links, there are also address mappings that can be +stored within the internal data table of the packet aliasing mechanism. +.Bd -literal -offset indent +(local addr, alias addr) +.Ed +.Pp +Address mappings are searched when creating new dynamic links. +.Pp +All outgoing packets from the local network automatically create a dynamic +link if they do not match an already existing fully specified link. +If an address mapping exists for the outgoing packet, this determines +the alias address to be used. +If no mapping exists, then a default address, usually the address of the +packet aliasing host, is used. +If necessary, this default address can be changed as often as each individual +packet arrives. +.Pp +The aliasing port number is determined such that the new dynamic link does +not conflict with any existing links. +In the default operating mode, the packet aliasing engine attempts to set +the aliasing port equal to the local port number. +If this results in a conflict, then port numbers are randomly chosen until +a unique aliasing link can be established. +In an alternate operating mode, the first choice of an aliasing port is also +random and unrelated to the local port number. diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c new file mode 100644 index 0000000..f104cfc --- /dev/null +++ b/sys/netinet/raw_ip.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_random_ip_id.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_mroute.h> + +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +struct inpcbhead ripcb; +struct inpcbinfo ripcbinfo; + +/* control hooks for ipfw and dummynet */ +ip_fw_ctl_t *ip_fw_ctl_ptr; +ip_dn_ctl_t *ip_dn_ctl_ptr; + +/* + * Nominal space allocated to a raw ip socket. + */ +#define RIPSNDQ 8192 +#define RIPRCVQ 8192 + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +void +rip_init() +{ + INP_INFO_LOCK_INIT(&ripcbinfo, "rip"); + LIST_INIT(&ripcb); + ripcbinfo.listhead = &ripcb; + /* + * XXX We don't use the hash list for raw IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); + ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); + ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); +} + +static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; +/* + * Setup generic address and protocol structures + * for raw_input routine, then pass them along with + * mbuf chain. + */ +void +rip_input(m, off) + struct mbuf *m; + int off; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct inpcb *inp; + struct inpcb *last = 0; + struct mbuf *opts = 0; + int proto = ip->ip_p; + + ripsrc.sin_addr = ip->ip_src; + LIST_FOREACH(inp, &ripcb, inp_list) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_ip_p && inp->inp_ip_p != proto) + continue; + if (inp->inp_laddr.s_addr && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (last) { + struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (n && ipsec4_in_reject_so(n, last->inp_socket)) { + m_freem(n); + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + } else +#endif /*IPSEC*/ + if (n) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, n); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, n, + opts) == 0) { + /* should notify about lost packet */ + m_freem(n); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + opts = 0; + } + } + last = inp; + } +#ifdef IPSEC + /* check AH/ESP integrity. 
*/ + if (last && ipsec4_in_reject_so(m, last->inp_socket)) { + m_freem(m); + ipsecstat.in_polvio++; + ipstat.ips_delivered--; + /* do not inject data to pcb */ + } else +#endif /*IPSEC*/ + if (last) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, m, opts) == 0) { + m_freem(m); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Generate IP header and pass packet to ip_output. + * Tack on options user may have setup with control call. + */ +int +rip_output(m, so, dst) + struct mbuf *m; + struct socket *so; + u_long dst; +{ + register struct ip *ip; + register struct inpcb *inp = sotoinpcb(so); + int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + + /* + * If the user handed us a complete IP packet, use it. + * Otherwise, allocate an mbuf for a header and fill it in. + */ + if ((inp->inp_flags & INP_HDRINCL) == 0) { + if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + M_PREPEND(m, sizeof(struct ip), M_TRYWAIT); + ip = mtod(m, struct ip *); + ip->ip_tos = inp->inp_ip_tos; + ip->ip_off = 0; + ip->ip_p = inp->inp_ip_p; + ip->ip_len = m->m_pkthdr.len; + ip->ip_src = inp->inp_laddr; + ip->ip_dst.s_addr = dst; + ip->ip_ttl = inp->inp_ip_ttl; + } else { + if (m->m_pkthdr.len > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + ip = mtod(m, struct ip *); + /* don't allow both user specified and setsockopt options, + and don't allow packet length sizes that will crash */ + if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) + && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { + m_freem(m); + return EINVAL; + } + if (ip->ip_id == 0) +#ifdef RANDOM_IP_ID + ip->ip_id = ip_randomid(); +#else + ip->ip_id = htons(ip_id++); +#endif + /* XXX prevent ip_output from overwriting header fields */ + flags |= IP_RAWOUTPUT; + ipstat.ips_rawout++; + } + +#ifdef IPSEC + if (ipsec_setsocket(m, so) != 0) { + m_freem(m); + return ENOBUFS; + } +#endif /*IPSEC*/ + + return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + inp->inp_moptions)); +} + +/* + * Raw IP socket option processing. + */ +int +rip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + if (sopt->sopt_level != IPPROTO_IP) + return (EINVAL); + + error = 0; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + optval = inp->inp_flags & INP_HDRINCL; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_FW_ADD: /* ADD actually returns the body... 
*/ + case IP_FW_GET: + if (IPFW_LOADED) + error = ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + + case IP_DUMMYNET_GET: + if (DUMMYNET_LOADED) + error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break ; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_get(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + break; + + case IP_FW_ADD: + case IP_FW_DEL: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_RESETLOG: + if (IPFW_LOADED) + error = ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: + case IP_DUMMYNET_FLUSH: + if (DUMMYNET_LOADED) + error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT ; + break ; + + case IP_RSVP_ON: + error = ip_rsvp_init(so); + break; + + case IP_RSVP_OFF: + error = ip_rsvp_done(); + break; + + /* XXX - should be combined */ + case IP_RSVP_VIF_ON: + error = ip_rsvp_vif_init(so, sopt); + break; + + case IP_RSVP_VIF_OFF: + error = ip_rsvp_vif_done(so, sopt); + break; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_set(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + } + + return (error); +} + +/* + * This function exists solely to receive the PRC_IFDOWN messages which + * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, + * and calls in_ifadown() to remove all routes corresponding to that address. + * It also receives the PRC_IFUP messages from if_up() and reinstalls the + * interface routes. + */ +void +rip_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct in_ifaddr *ia; + struct ifnet *ifp; + int err; + int flags; + + switch (cmd) { + case PRC_IFDOWN: + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa + && (ia->ia_flags & IFA_ROUTE)) { + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ia->ia_ifp, ia); + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. 
+ */ + in_ifadown(&ia->ia_ifa, 0); + break; + } + } + break; + + case PRC_IFUP: + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa) + break; + } + if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) + return; + flags = RTF_UP; + ifp = ia->ia_ifa.ifa_ifp; + + if ((ifp->if_flags & IFF_LOOPBACK) + || (ifp->if_flags & IFF_POINTOPOINT)) + flags |= RTF_HOST; + + err = rtinit(&ia->ia_ifa, RTM_ADD, flags); + if (err == 0) + ia->ia_flags |= IFA_ROUTE; + break; + } +} + +u_long rip_sendspace = RIPSNDQ; +u_long rip_recvspace = RIPRCVQ; + +SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, + &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); +SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, + &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); + +static int +rip_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("rip_attach"); + if (td && (error = suser(td)) != 0) + return error; + + error = soreserve(so, rip_sendspace, rip_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &ripcbinfo, td); + splx(s); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_p = proto; + inp->inp_ip_ttl = ip_defttl; + return 0; +} + +static int +rip_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("rip_detach"); + if (so == ip_mrouter) + ip_mrouter_done(); + ip_rsvp_force_done(so); + if (so == ip_rsvpd) + ip_rsvp_done(); + in_pcbdetach(inp); + return 0; +} + +static int +rip_abort(struct socket *so) +{ + soisdisconnected(so); + return rip_detach(so); +} + +static int +rip_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return rip_abort(so); +} + +static int +rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + + if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) || + (addr->sin_addr.s_addr && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) + return EADDRNOTAVAIL; + inp->inp_laddr = addr->sin_addr; + return 0; +} + +static int +rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + if (TAILQ_EMPTY(&ifnet)) + return EADDRNOTAVAIL; + if ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) + return EAFNOSUPPORT; + inp->inp_faddr = addr->sin_addr; + soisconnected(so); + return 0; +} + +static int +rip_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + struct inpcb *inp = sotoinpcb(so); + register u_long dst; + + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + m_freem(m); + return EISCONN; + } + dst = inp->inp_faddr.s_addr; + } else { + if (nam == NULL) { + m_freem(m); + return ENOTCONN; + } + dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; + } + return rip_output(m, so, dst); +} + +static int +rip_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The 
process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = ripcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = ripcbinfo.ipi_gencnt; + n = ripcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt) { + if (cr_canseesocket(req->td->td_ucred, + inp->inp_socket)) + continue; + inp_list[i++] = inp; + } + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = ripcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = ripcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +/* + * This is the wrapper function for in_setsockaddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +rip_sockaddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setsockaddr(so, nam, &ripcbinfo)); +} + +/* + * This is the wrapper function for in_setpeeraddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +rip_peeraddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setpeeraddr(so, nam, &ripcbinfo)); +} + + +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, + rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); + +struct pr_usrreqs rip_usrreqs = { + rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, + pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, + pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, + rip_sockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h new file mode 100644 index 0000000..fee449f --- /dev/null +++ b/sys/netinet/tcp.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +typedef u_int32_t tcp_seq; +typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ + +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + +/* + * TCP header. + * Per RFC 793, September, 1981. + */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR) + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOPT_NOP 1 +#define TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 /* Experimental */ +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 /* Experimental */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_TSTAMP_HDR \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) + +#define TCPOPT_CC 11 /* CC options: RFC-1644 */ +#define TCPOPT_CCNEW 12 +#define TCPOPT_CCECHO 13 +#define TCPOLEN_CC 6 +#define TCPOLEN_CC_APPA (TCPOLEN_CC+2) +#define TCPOPT_CC_HDR(ccopt) \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|(ccopt)<<8|TCPOLEN_CC) + +/* + * Default maximum segment size for TCP. + * With an IP MSS of 576, this is 536, + * but 512 is probably more convenient. + * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). 
+ */ +#define TCP_MSS 512 + +/* + * Default maximum segment size for TCP6. + * With an IP6 MSS of 1280, this is 1220, + * but 1024 is probably more convenient. (xxx kazu in doubt) + * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + */ +#define TCP6_MSS 1024 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXBURST 4 /* maximum segments in a burst */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ + +/* + * User-settable options (used with setsockopt). + */ +#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#define TCP_MAXSEG 0x02 /* set maximum segment size */ +#define TCP_NOPUSH 0x04 /* don't push last block of write */ +#define TCP_NOOPT 0x08 /* don't use TCP options */ + +#endif diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c new file mode 100644 index 0000000..89e9d7c --- /dev/null +++ b/sys/netinet/tcp_debug.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#ifndef INET +#error The option TCPDEBUG requires option INET. 
+#endif + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/protosw.h> +#include <sys/socket.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_debug.h> + +#ifdef TCPDEBUG +static int tcpconsdebug = 0; +#endif + +static struct tcp_debug tcp_debug[TCP_NDEBUG]; +static int tcp_debx; + +/* + * Tcp debug routines + */ +void +tcp_trace(act, ostate, tp, ipgen, th, req) + short act, ostate; + struct tcpcb *tp; + void *ipgen; + struct tcphdr *th; + int req; +{ +#ifdef INET6 + int isipv6; +#endif /* INET6 */ + tcp_seq seq, ack; + int len, flags; + struct tcp_debug *td = &tcp_debug[tcp_debx++]; + +#ifdef INET6 + isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0; +#endif /* INET6 */ + td->td_family = +#ifdef INET6 + (isipv6 != 0) ? AF_INET6 : +#endif + AF_INET; + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + td->td_time = iptime(); + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp) + td->td_cb = *tp; + else + bzero((caddr_t)&td->td_cb, sizeof (*tp)); + if (ipgen) { + switch (td->td_family) { + case AF_INET: + bcopy((caddr_t)ipgen, (caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + break; +#ifdef INET6 + case AF_INET6: + bcopy((caddr_t)ipgen, (caddr_t)td->td_ip6buf, + sizeof(td->td_ip6buf)); + bzero((caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + break; +#endif + default: + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + bzero((caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + break; + } + } else { + bzero((caddr_t)&td->td_ti.ti_i, sizeof(td->td_ti.ti_i)); + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + } + if (th) { + switch (td->td_family) { + case AF_INET: + td->td_ti.ti_t = *th; + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + break; +#ifdef INET6 + case AF_INET6: + td->td_ti6.th = *th; + bzero((caddr_t)&td->td_ti.ti_t, + sizeof(td->td_ti.ti_t)); + break; +#endif + default: + bzero((caddr_t)&td->td_ti.ti_t, + sizeof(td->td_ti.ti_t)); + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + break; + } + } else { + bzero((caddr_t)&td->td_ti.ti_t, sizeof(td->td_ti.ti_t)); + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + } + td->td_req = req; +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp) + printf("%p %s:", tp, tcpstates[ostate]); + else + printf("???????? "); + printf("%s ", tanames[act]); + switch (act) { + + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ipgen == NULL || th == NULL) + break; + seq = th->th_seq; + ack = th->th_ack; + len = +#ifdef INET6 + isipv6 ? 
((struct ip6_hdr *)ipgen)->ip6_plen : +#endif + ((struct ip *)ipgen)->ip_len; + if (act == TA_OUTPUT) { + seq = ntohl(seq); + ack = ntohl(ack); + len = ntohs((u_short)len); + } + if (act == TA_OUTPUT) + len -= sizeof (struct tcphdr); + if (len) + printf("[%x..%x)", seq, seq+len); + else + printf("%x", seq); + printf("@%x, urp=%x", ack, th->th_urp); + flags = th->th_flags; + if (flags) { + char *cp = "<"; +#define pf(f) { \ + if (th->th_flags & TH_##f) { \ + printf("%s%s", cp, #f); \ + cp = ","; \ + } \ +} + pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG); + printf(">"); + } + break; + + case TA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", tcptimers[req>>8]); + break; + } + if (tp) + printf(" -> %s", tcpstates[tp->t_state]); + /* print out internal state of tp !?! */ + printf("\n"); + if (tp == 0) + return; + printf( + "\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n", + (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up, + (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max); + printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n", + (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd); +#endif /* TCPDEBUG */ +} diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h new file mode 100644 index 0000000..773d3e4 --- /dev/null +++ b/sys/netinet/tcp_debug.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_DEBUG_H_ +#define _NETINET_TCP_DEBUG_H_ + +struct tcp_debug { + n_time td_time; + short td_act; + short td_ostate; + caddr_t td_tcb; + int td_family; + /* + * Co-existense of td_ti and td_ti6 below is ugly, but it is necessary + * to achieve backword compatibility to some extent. + */ + struct tcpiphdr td_ti; + struct { +#if !defined(_KERNEL) && defined(INET6) + struct ip6_hdr ip6; +#else + u_char ip6buf[40]; /* sizeof(struct ip6_hdr) */ +#endif + struct tcphdr th; + } td_ti6; +#define td_ip6buf td_ti6.ip6buf + short td_req; + struct tcpcb td_cb; +}; + +#define TA_INPUT 0 +#define TA_OUTPUT 1 +#define TA_USER 2 +#define TA_RESPOND 3 +#define TA_DROP 4 + +#ifdef TANAMES +static char *tanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define TCP_NDEBUG 100 + +#ifndef _KERNEL +/* XXX common variables for broken applications. */ +struct tcp_debug tcp_debug[TCP_NDEBUG]; +int tcp_debx; +#endif + +#endif /* !_NETINET_TCP_DEBUG_H_ */ diff --git a/sys/netinet/tcp_fsm.h b/sys/netinet/tcp_fsm.h new file mode 100644 index 0000000..c8f87f6 --- /dev/null +++ b/sys/netinet/tcp_fsm.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_FSM_H_ +#define _NETINET_TCP_FSM_H_ + +/* + * TCP FSM state definitions. + * Per RFC793, September, 1981. 
+ */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* for KAME src sync over BSD*'s */ +#define TCP6_NSTATES TCP_NSTATES +#define TCP6S_CLOSED TCPS_CLOSED +#define TCP6S_LISTEN TCPS_LISTEN +#define TCP6S_SYN_SENT TCPS_SYN_SENT +#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED +#define TCP6S_ESTABLISHED TCPS_ESTABLISHED +#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT +#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1 +#define TCP6S_CLOSING TCPS_CLOSING +#define TCP6S_LAST_ACK TCPS_LAST_ACK +#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2 +#define TCP6S_TIME_WAIT TCPS_TIME_WAIT + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) +#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. + * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally + * determined by state, with the proviso that TH_FIN is sent only + * if all data queued for output is included in the segment. + */ +static u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, /* 0, CLOSED */ + 0, /* 1, LISTEN */ + TH_SYN, /* 2, SYN_SENT */ + TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */ + TH_ACK, /* 4, ESTABLISHED */ + TH_ACK, /* 5, CLOSE_WAIT */ + TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */ + TH_FIN|TH_ACK, /* 7, CLOSING */ + TH_FIN|TH_ACK, /* 8, LAST_ACK */ + TH_ACK, /* 9, FIN_WAIT_2 */ + TH_ACK, /* 10, TIME_WAIT */ +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +const char *tcpstates[] = { + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif + +#endif diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c new file mode 100644 index 0000000..0fb62e0 --- /dev/null +++ b/sys/netinet/tcp_input.c @@ -0,0 +1,2785 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/ip6_var.h> +#include <netinet6/nd6.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> + +#endif /* TCPDEBUG */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#include <netkey/key.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); + +static int tcprexmtthresh = 3; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +struct inpcbhead tcb; +#define tcb6 
tcb /* for KAME src sync over BSD*'s */ +struct inpcbinfo tcbinfo; +struct mtx *tcbinfo_mtx; + +static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, + struct mbuf *); +static void tcp_xmit_timer(struct tcpcb *, int); +static int tcp_newreno(struct tcpcb *, struct tcphdr *); + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Indicate whether this ack should be delayed. We can delay the ack if + * - delayed acks are enabled and + * - there is no delayed ack timer in progress and + * - our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + */ +#define DELAY_ACK(tp) \ + (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ + (tp->t_flags & TF_RXWIN0SENT) == 0) + +static int +tcp_reass(tp, th, tlenp, m) + register struct tcpcb *tp; + register struct tcphdr *th; + int *tlenp; + struct mbuf *m; +{ + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with th==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (th == 0) + goto present; + + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } + + /* + * Find a segment which begins after this one does. + */ + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + break; + p = q; + } + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + if (i > 0) { + if (i >= *tlenp) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += *tlenp; + m_freem(m); + FREE(te, M_TSEGQ); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += *tlenp; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + if (i <= 0) + break; + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); + break; + } + + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } + + /* Insert the new segment queue entry into place. 
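The trimming above leans entirely on modular sequence arithmetic: casting the 32-bit difference to a signed integer gives the right answer even when the sequence space has wrapped. A self-contained userland sketch of the same idea, assuming a simplified segment described by (seq, len) with no mbufs; trim_against_predecessor() and the sample numbers are illustrative, not the kernel's code.

#include <stdint.h>
#include <stdio.h>

/* Modular sequence comparison, as in <netinet/tcp_seq.h>. */
#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)

/*
 * Given a queued segment [qseq, qseq+qlen) that precedes an arriving
 * segment starting at *seq with *len bytes, trim (or reject) the new
 * segment the way tcp_reass() does: the signed 32-bit difference
 * keeps the math correct across sequence-space wraparound.
 * Returns 0 if the new segment is a complete duplicate.
 */
static int
trim_against_predecessor(uint32_t qseq, int qlen, uint32_t *seq, int *len)
{
        int32_t overlap;

        overlap = (int32_t)(qseq + qlen - *seq);
        if (overlap <= 0)
                return (1);             /* no overlap, keep as is */
        if (overlap >= *len)
                return (0);             /* fully covered: drop duplicate */
        *seq += overlap;                /* drop the overlapping prefix */
        *len -= overlap;
        return (1);
}

int
main(void)
{
        uint32_t seq = 4;               /* sequence space already wrapped */
        int len = 100;

        printf("SEQ_LT(0xfffffff0, 0x10) = %d\n",
            SEQ_LT(0xfffffff0u, 0x10u));        /* 1: still "less than" */

        /* Queued segment [0xfffffff8, 0xfffffff8+16) ends 4 bytes into it. */
        if (trim_against_predecessor(0xfffffff8u, 16, &seq, &len))
                printf("keep %d bytes starting at %u\n", len, (unsigned)seq);
        return (0);
}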
*/ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + + if (p == NULL) { + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + } else { + LIST_INSERT_AFTER(p, te, tqe_q); + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + if (so->so_state & SS_CANTRCVMORE) + m_freem(q->tqe_m); + else + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + struct in6_ifaddr *ia6; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? + */ + ia6 = ip6_getdstifaddr(m); + if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp); + return IPPROTO_DONE; +} +#endif + +void +tcp_input(m, off0) + register struct mbuf *m; + int off0; +{ + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; + register struct inpcb *inp = NULL; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + int drop_hdrlen; + register struct tcpcb *tp = 0; + register int thflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ + int headlocked = 0; + +#ifdef TCPDEBUG + u_char tcp_saveipgen[40]; + /* the size of the above must be of max ip header, now IPv6 */ + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ + struct sockaddr_in *next_hop = NULL; + int rstreason; /* For badport_bandlim accounting purposes */ + + /* Grab info from MT_TAG mbufs prepended to the chain. */ + for (;m && m->m_type == MT_TAG; m = m->m_next) { + if (m->m_tag_id == PACKET_TAG_IPFORWARD) + next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; + } + +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + + /* + * Be proactive about unspecified IPv6 address in source. + * As we use all-zero to indicate unbounded/unconnected pcb, + * unspecified IPv6 address can be used to confuse us. + * + * Note that packets with unspecified IPv6 destination is + * already dropped in ip6_input. 
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { + /* XXX stat */ + goto drop; + } + } else +#endif /* INET6 */ + { + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + if (off0 > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + tlen = ip->ip_len; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + + ip->ip_len + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + /* + * Checksum extended TCP header and data. + */ + len = sizeof (struct ip) + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + ipov->ih_len = htons(ipov->ih_len); + th->th_sum = in_cksum(m, len); + } + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; /* tlen is used instead of ti->ti_len */ + if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + } + } + optlen = off - sizeof (struct tcphdr); + optp = (u_char *)(th + 1); + } + thflags = th->th_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is a violation of the TCP specification. + */ + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. + */ + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); + + /* + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. + * XXX: the call of ip6_savecontrol() has been obsoleted based on + * latest version of the advanced API (20020110). + */ + drop_hdrlen = off0 + off; + + /* + * Locate pcb for segment. + */ + INP_INFO_WLOCK(&tcbinfo); + headlocked = 1; +findpcb: + /* IPFIREWALL_FORWARD section */ + if (next_hop != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { + /* + * Transparently forwarded. Pretend to be the destination. + * already got one like this? 
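The in6_cksum()/in_cksum() verification earlier in this function is the standard ones'-complement Internet checksum taken over a pseudo-header plus the TCP header and payload. A minimal flat-buffer version of that arithmetic (RFC 1071 style) is sketched below; the kernel variant walks mbuf chains and reuses the ipovly overlay as the pseudo-header, which this deliberately omits.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Ones'-complement sum over a flat buffer, folding carries back in,
 * as in RFC 1071.  The bytes are summed as big-endian 16-bit words;
 * an odd trailing byte is padded with zero.
 */
static uint16_t
cksum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)                       /* fold into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int
main(void)
{
        /* Bytes from the RFC 1071 example: sum is 0xddf2, checksum 0x220d. */
        uint8_t data[] = { 0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7 };

        printf("0x%04x\n", cksum(data, sizeof(data)));
        return (0);
}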
+ */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (next_hop->sin_port == 0) { + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, next_hop->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ip->ip_src, th->th_sport, + next_hop->sin_addr, + ntohs(next_hop->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + } else + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } + +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain) { +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ + +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } + switch (log_in_vain) { + case 1: + if(thflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); + break; + default: + break; + } + } + if (blackhole) { + switch (blackhole) { + case 1: + if (thflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + INP_LOCK(inp); + tp = intotcpcb(inp); + if (tp == 0) { + INP_UNLOCK(inp); + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. 
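The computation just below applies RFC 1323 window scaling: the 16-bit window field is shifted left by the peer's announced scale, except in SYN segments, where scaling is never in effect. A small userland sketch of that rule; segment_window() is an illustrative name, and only the TH_SYN value is taken from <netinet/tcp.h>.

#include <stdint.h>
#include <stdio.h>

#define TH_SYN  0x02

/*
 * Mirror of the "tiwin" computation: the advertised window is scaled
 * only once both sides have agreed, and never on a SYN segment.
 */
static uint32_t
segment_window(uint16_t th_win, int thflags, unsigned int snd_scale)
{
        if (thflags & TH_SYN)
                return (th_win);                /* scale not yet in effect */
        return ((uint32_t)th_win << snd_scale);
}

int
main(void)
{
        /* 0xffff with a scale of 6 advertises roughly a 4 MB window. */
        printf("%u\n", segment_window(0xffff, 0, 6));           /* 4194240 */
        printf("%u\n", segment_window(0xffff, TH_SYN, 6));      /* 65535 */
        return (0);
}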
*/ + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; + else + tiwin = th->th_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { + struct in_conninfo inc; +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; + } +#endif + /* skip if this isn't a listen socket */ + if ((so->so_options & SO_ACCEPTCONN) == 0) + goto after_listen; +#ifdef INET6 + inc.inc_isipv6 = isipv6; + if (isipv6) { + inc.inc6_faddr = ip6->ip6_src; + inc.inc6_laddr = ip6->ip6_dst; + inc.inc6_route.ro_rt = NULL; /* XXX */ + + } else +#endif /* INET6 */ + { + inc.inc_faddr = ip->ip_src; + inc.inc_laddr = ip->ip_dst; + inc.inc_route.ro_rt = NULL; /* XXX */ + } + inc.inc_fport = th->th_sport; + inc.inc_lport = th->th_dport; + + /* + * If the state is LISTEN then ignore segment if it contains + * a RST. If the segment contains an ACK then it is bad and + * send a RST. If it does not contain a SYN then it is not + * interesting; drop it. + * + * If the state is SYN_RECEIVED (syncache) and seg contains + * an ACK, but not for our SYN/ACK, send a RST. If the seg + * contains a RST, check the sequence number to see if it + * is a valid reset segment. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + if (!syncache_expand(&inc, th, &so, m)) { + /* + * No syncache entry, or ACK was not + * for our SYN/ACK. Send a RST. + */ + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + if (so == NULL) { + /* + * Could not complete 3-way handshake, + * connection is being closed down, and + * syncache will free mbuf. + */ + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return; + } + /* + * Socket is created in state SYN_RECEIVED. + * Continue processing segment. + */ + INP_UNLOCK(inp); + inp = sotoinpcb(so); + INP_LOCK(inp); + tp = intotcpcb(inp); + /* + * This is what would have happened in + * tcp_output() when the SYN,ACK was sent. + */ + tp->snd_up = tp->snd_una; + tp->snd_max = tp->snd_nxt = tp->iss + 1; + tp->last_ack_sent = tp->rcv_nxt; +/* + * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled + * until the _second_ ACK is received: + * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. + * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, + * move to ESTAB, set snd_wnd to tiwin. + */ + tp->snd_wnd = tiwin; /* unscaled */ + goto after_listen; + } + if (thflags & TH_RST) { + syncache_chkrst(&inc, th); + goto drop; + } + if (thflags & TH_ACK) { + syncache_badack(&inc); + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } + + /* + * Segment's flags are (SYN) or (SYN|FIN). + */ +#ifdef INET6 + /* + * If deprecated address is forbidden, + * we do not accept SYN to deprecated interface + * address to prevent any new inbound connection from + * getting established. + * When we do not accept SYN, we send a TCP RST, + * with deprecated source address (instead of dropping + * it). We compromise it as it is much better for peer + * to send a RST, and RST will be the final packet + * for the exchange. + * + * If we do not forbid deprecated addresses, we accept + * the SYN packet. RFC2462 does not suggest dropping + * SYN in this case. 
+ * If we decipher RFC2462 5.5.4, it says like this: + * 1. use of deprecated addr with existing + * communication is okay - "SHOULD continue to be + * used" + * 2. use of it with new communication: + * (2a) "SHOULD NOT be used if alternate address + * with sufficient scope is available" + * (2b) nothing mentioned otherwise. + * Here we fall into (2b) case as we have no choice in + * our source address selection - we must obey the peer. + * + * The wording in RFC2462 is confusing, and there are + * multiple description text for deprecated address + * handling - worse, they are not exactly the same. + * I believe 5.5.4 is the best one, so we follow 5.5.4. + */ + if (isipv6 && !ip6_use_deprecated) { + struct in6_ifaddr *ia6; + + if ((ia6 = ip6_getdstifaddr(m)) && + (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { + INP_UNLOCK(inp); + tp = NULL; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + } +#endif + /* + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + */ + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * + * Note that it is quite possible to receive unicast + * link-layer packets with a broadcast IP address. Use + * in_broadcast() to find them. + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + /* + * SYN appears to be valid; create compressed TCP state + * for syncache, or perform t/tcp connection. + */ + if (so->so_qlen <= so->so_qlimit) { + tcp_dooptions(&to, optp, optlen, 1); + if (!syncache_add(&inc, &to, th, &so, m)) + goto drop; + if (so == NULL) { + /* + * Entry added to syncache, mbuf used to + * send SYN,ACK packet. + */ + KASSERT(headlocked, ("headlocked")); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return; + } + /* + * Segment passed TAO tests. + */ + INP_UNLOCK(inp); + inp = sotoinpcb(so); + INP_LOCK(inp); + tp = intotcpcb(inp); + tp->snd_wnd = tiwin; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (DELAY_ACK(tp) && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + tcpstat.tcps_connects++; + soisconnected(so); + goto trimthenstep6; + } + goto drop; + } +after_listen: + +/* XXX temp debugging */ + /* should not happen - syncache should pick up these connections */ + if (tp->t_state == TCPS_LISTEN) + panic("tcp_input: TCPS_LISTEN"); + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. 
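The DELAY_ACK() test used in the paths above boils down to three conditions: delayed ACKs are enabled, no delayed-ACK timer is already pending, and the last ACK we sent did not advertise a zero window (an ACK that reopens a zero window must never be delayed). A plain-logic sketch of that decision, with the kernel's callout and TF_RXWIN0SENT flag replaced by illustrative ints.

#include <stdio.h>

/*
 * Returns nonzero when an ACK may be held back in the hope of
 * piggy-backing it on outgoing data.
 */
static int
should_delay_ack(int delack_enabled, int delack_timer_pending,
    int last_win_was_zero)
{
        return (delack_enabled && !delack_timer_pending &&
            !last_win_was_zero);
}

int
main(void)
{
        printf("%d\n", should_delay_ack(1, 0, 0));  /* 1: delay */
        printf("%d\n", should_delay_ack(1, 0, 1));  /* 0: reopening a zero window */
        printf("%d\n", should_delay_ack(1, 1, 0));  /* 0: timer already pending */
        return (0);
}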
+ */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options. + * XXX this is tradtitional behavior, may need to be cleaned up. + */ + tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + if (thflags & TH_SYN) { + if (to.to_flags & TOF_SCALE) { + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = to.to_requested_s_scale; + } + if (to.to_flags & TOF_TS) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = ticks; + } + if (to.to_flags & (TOF_CC|TOF_CCNEW)) + tp->t_flags |= TF_RCVD_CC; + if (to.to_flags & TOF_MSS) + tcp_mss(tp, to.to_mss); + } + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + th->th_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * this is a pure ack for outstanding data. 
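Header prediction, as set up above, is just a conjunction of cheap tests that pick out a uni-directional transfer in the common case. The sketch below collects the core of them into one predicate; struct conn and predict_ok() are illustrative stand-ins for the tcpcb, and the timestamp and T/TCP clauses of the real test are omitted.

#include <stdint.h>
#include <stdio.h>

#define TH_FIN  0x01
#define TH_SYN  0x02
#define TH_RST  0x04
#define TH_ACK  0x10
#define TH_URG  0x20

/* Minimal stand-in for the fields the prediction test consults. */
struct conn {
        int      established;
        uint32_t rcv_nxt;
        uint32_t snd_wnd;
        uint32_t snd_nxt, snd_max;
};

/*
 * Fast path only if: connection is ESTABLISHED, ACK is the only
 * control flag, the segment is exactly in sequence, the advertised
 * window did not change, and we are not retransmitting.
 */
static int
predict_ok(const struct conn *c, int thflags, uint32_t th_seq, uint32_t tiwin)
{
        return (c->established &&
            (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
            th_seq == c->rcv_nxt &&
            tiwin != 0 && tiwin == c->snd_wnd &&
            c->snd_nxt == c->snd_max);
}

int
main(void)
{
        struct conn c = { 1, 1000, 65535, 5000, 5000 };

        printf("%d\n", predict_ok(&c, TH_ACK, 1000, 65535));        /* 1 */
        printf("%d\n", predict_ok(&c, TH_ACK|TH_URG, 1000, 65535)); /* 0 */
        return (0);
}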
+ */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flags & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = th->th_ack; + m_freem(m); + ND6_HINT(tp); /* some progress has been done */ + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + } + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += tlen; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ + /* + * Add data to socket buffer. + */ + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (DELAY_ACK(tp)) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + INP_UNLOCK(inp); + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. 
+ * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + if (thflags & TH_RST) { + if (thflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((thflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = th->th_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flags & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp) && tlen != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. 
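One detail of the SYN-ACK processing above worth calling out: window scaling is enabled only when both TF_REQ_SCALE (we asked for it in our SYN) and TF_RCVD_SCALE (the peer's SYN carried the option) are set; otherwise both directions silently fall back to unscaled windows. A tiny sketch of that agreement test; the F_* flag values below are illustrative, not the kernel's.

#include <stdio.h>

#define F_REQ_SCALE     0x01    /* we sent the window scale option */
#define F_RCVD_SCALE    0x02    /* the peer sent it too */

/* Scaling takes effect only when both flags are present. */
static int
scale_negotiated(int flags)
{
        return ((flags & (F_REQ_SCALE|F_RCVD_SCALE)) ==
            (F_REQ_SCALE|F_RCVD_SCALE));
}

int
main(void)
{
        printf("%d\n", scale_negotiated(F_REQ_SCALE));               /* 0 */
        printf("%d\n", scale_negotiated(F_REQ_SCALE|F_RCVD_SCALE));  /* 1 */
        return (0);
}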
+ */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flags & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((thflags & TH_SYN) && + (to.to_flags & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. 
+ * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += tlen; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. 
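The PAWS check above drops segments whose timestamp is older than the last one accepted, unless ts_recent itself has gone stale (roughly 24 days idle), in which case it is invalidated rather than trusted. A userland sketch under those assumptions; paws_reject() and the paws_idle parameter are illustrative stand-ins for the kernel's comparison against TCP_PAWS_IDLE.

#include <stdint.h>
#include <stdio.h>

/* Modular timestamp comparison, as in <netinet/tcp_seq.h>. */
#define TSTMP_LT(a, b)  ((int32_t)((a) - (b)) < 0)

/*
 * Returns 1 when the segment should be dropped (after ACKing it);
 * clears *ts_recent instead when the recorded timestamp is too old
 * to be meaningful.
 */
static int
paws_reject(uint32_t tsval, uint32_t *ts_recent, uint32_t ts_recent_age,
    uint32_t now, uint32_t paws_idle)
{
        if (*ts_recent == 0 || !TSTMP_LT(tsval, *ts_recent))
                return (0);
        if ((int32_t)(now - ts_recent_age) > (int32_t)paws_idle) {
                *ts_recent = 0;         /* stale: stop using it */
                return (0);
        }
        return (1);                     /* old duplicate: reject */
}

int
main(void)
{
        uint32_t ts_recent = 5000;

        printf("%d\n", paws_reject(4000, &ts_recent, 100, 200, 1000000)); /* 1 */
        printf("%d\n", paws_reject(6000, &ts_recent, 100, 200, 1000000)); /* 0 */
        return (0);
}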
This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (thflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(th->th_seq, tp->rcv_nxt)) { + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (thflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. 
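The two todrop computations above clip an arriving segment to the receive window: already-delivered bytes are removed from the front, and anything beyond rcv_nxt + rcv_wnd from the back. A simplified sketch of just that arithmetic, with the SYN/FIN/urgent-pointer adjustments left out; trim_to_window() is an illustrative name, not the kernel's.

#include <stdint.h>
#include <stdio.h>

/*
 * Keep only the bytes of [*seq, *seq + *len) that fall inside the
 * window [rcv_nxt, rcv_nxt + rcv_wnd).  Complete duplicates end up
 * with *len == 0.
 */
static void
trim_to_window(uint32_t rcv_nxt, uint32_t rcv_wnd, uint32_t *seq, int *len)
{
        int32_t todrop;

        todrop = (int32_t)(rcv_nxt - *seq);             /* front */
        if (todrop > 0) {
                if (todrop > *len)
                        todrop = *len;
                *seq += todrop;
                *len -= todrop;
        }
        todrop = (int32_t)(*seq + *len - (rcv_nxt + rcv_wnd));  /* back */
        if (todrop > 0)
                *len -= (todrop > *len ? *len : todrop);
}

int
main(void)
{
        uint32_t seq = 990;
        int len = 100;

        /* Window is [1000, 1064): only 64 of the 100 bytes survive. */
        trim_to_window(1000, 64, &seq, &len);
        printf("seq=%u len=%d\n", (unsigned)seq, len);  /* seq=1000 len=64 */
        return (0);
}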
+ */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. 
+ */ + if (!callout_active(tp->tt_rexmt) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + if (tcp_do_newreno && SEQ_LT(th->th_ack, + tp->snd_recover)) { + /* False retransmit, should not + * cut window + */ + tp->snd_cwnd += tp->t_maxseg; + tp->t_dupacks = 0; + (void) tcp_output(tp); + goto drop; + } + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->snd_recover = tp->snd_max; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tcp_do_newreno == 0) { + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } else if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, th)) { + /* + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = + tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } + if (tp->t_dupacks < tcprexmtthresh) + tp->t_dupacks = 0; + if (SEQ_GT(th->th_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. 
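The duplicate-ACK block above is the fast-retransmit entry point: on the third duplicate, halve the effective window (in whole segments, never below two) to get the new ssthresh, retransmit the presumed-lost segment from th_ack with a one-segment window, then inflate cwnd by one segment per duplicate ACK so new data keeps the pipe full. The arithmetic, and only the arithmetic, is sketched below with illustrative sample values; snd_recover, the timers, and the NewReno partial-ACK handling are omitted.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t snd_wnd = 65535, snd_cwnd = 32000, maxseg = 1460;
        uint32_t dupacks = 3, win, ssthresh, cwnd;

        /* Half the effective window, in segments, floored at two. */
        win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / maxseg;
        if (win < 2)
                win = 2;
        ssthresh = win * maxseg;

        /* Retransmit one segment, then inflate for the dup ACKs seen. */
        cwnd = maxseg;
        printf("retransmit with cwnd=%u\n", cwnd);
        cwnd = ssthresh + maxseg * dupacks;
        printf("ssthresh=%u cwnd=%u\n", ssthresh, cwnd);
        return (0);
}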
+ */ + if (to.to_flags & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + /* + * If t_dupacks != 0 here, it indicates that we are still + * in NewReno fast recovery mode, so we leave the congestion + * window alone. + */ + if (tcp_do_newreno == 0 || tp->t_dupacks == 0) + tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = th->th_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. 
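The congestion-window growth earlier in this ACK-processing block opens cwnd by a full segment per ACK while at or below ssthresh (slow start, exponential per round trip) and by roughly maxseg^2/cwnd per ACK above it (about one segment per round trip, congestion avoidance). A small sketch of that increment rule; grow_cwnd() is an illustrative name and the cap mirrors TCP_MAXWIN << snd_scale.

#include <stdint.h>
#include <stdio.h>

/* One cwnd update per newly-ACKed segment, capped at the scaled maximum. */
static uint32_t
grow_cwnd(uint32_t cwnd, uint32_t ssthresh, uint32_t maxseg, uint32_t cap)
{
        uint32_t incr = maxseg;

        if (cwnd > ssthresh)
                incr = maxseg * maxseg / cwnd;
        return (cwnd + incr < cap ? cwnd + incr : cap);
}

int
main(void)
{
        uint32_t cwnd = 1460, maxseg = 1460, cap = 65535U << 6;
        int i;

        /* Growth slows sharply once cwnd passes ssthresh (4 segments here). */
        for (i = 0; i < 6; i++) {
                cwnd = grow_cwnd(cwnd, 4 * maxseg, maxseg, cap);
                printf("ack %d: cwnd=%u\n", i + 1, cwnd);
        }
        return (0);
}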
+ */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((tlen || (thflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which inludes th into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. 
This handle the common case inline (segment + * is the next to be received on an established connection, and the + * queue is empty), avoiding linkage into and removal from the queue + * and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ + if (th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + if (DELAY_ACK(tp)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); + sbappend(&so->so_rcv, m); + sorwakeup(so); + } else { + thflags = tcp_reass(tp, th, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. 
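The reassembly fast path above boils down to a single classification: an in-order segment arriving on an established connection with an empty reassembly queue is appended directly to the receive buffer and may be ACKed lazily, while everything else goes through tcp_reass() and forces an immediate ACK so the peer's fast retransmit can trigger. A hedged user-space sketch of just that decision, with invented field names:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

struct rcv_state {
        tcp_seq rcv_nxt;        /* next in-order sequence number expected */
        int     queue_empty;    /* no out-of-order segments buffered */
        int     established;    /* handshake has completed */
};

enum rcv_action { DELIVER_DELACK, REASSEMBLE_ACKNOW };

/*
 * In-order data on an established connection with nothing queued takes
 * the fast path; anything else is reassembled and acknowledged at once.
 */
enum rcv_action
classify_segment(const struct rcv_state *rs, tcp_seq seg_seq)
{
        if (seg_seq == rs->rcv_nxt && rs->queue_empty && rs->established)
                return (DELIVER_DELACK);
        return (REASSEMBLE_ACKNOW);
}

int
main(void)
{
        struct rcv_state rs = { 5000, 1, 1 };

        printf("%s\n", classify_segment(&rs, 5000) == DELIVER_DELACK ?
            "fast path" : "reassemble");
        return (0);
}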
+ * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ + + /* + * Perform bandwidth limiting. + */ + if (badport_bandlim(rstreason) < 0) + goto drop; + +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + if (tp) + INP_UNLOCK(inp); + + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); + else { + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); + } + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp) + INP_UNLOCK(inp); + m_freem(m); + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + return; +} + +/* + * Parse TCP options and place in tcpopt. + */ +static void +tcp_dooptions(to, cp, cnt, is_syn) + struct tcpopt *to; + u_char *cp; + int cnt; +{ + int opt, optlen; + + to->to_flags = 0; + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = cp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_MSS; + bcopy((char *)cp + 2, + (char *)&to->to_mss, sizeof(to->to_mss)); + to->to_mss = ntohs(to->to_mss); + break; + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (! 
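tcp_dooptions(), which begins above and continues below, walks the option block as a sequence of kind/length records: EOL terminates parsing, NOP is a single filler byte, and every other option carries a length byte covering the whole record. The following user-space sketch applies the same walk to extract just the MSS option; it assumes a flat byte buffer rather than an mbuf and is illustrative only.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_EOL      0
#define TCPOPT_NOP      1
#define TCPOPT_MAXSEG   2
#define TCPOLEN_MAXSEG  4

/* Returns the MSS value if the option is present and well formed, else 0. */
uint16_t
parse_mss(const uint8_t *cp, int cnt)
{
        int opt, optlen;
        uint16_t mss;

        for (; cnt > 0; cnt -= optlen, cp += optlen) {
                opt = cp[0];
                if (opt == TCPOPT_EOL)
                        break;
                if (opt == TCPOPT_NOP) {
                        optlen = 1;
                        continue;
                }
                if (cnt < 2)
                        break;
                optlen = cp[1];
                if (optlen < 2 || optlen > cnt)
                        break;          /* malformed option, stop parsing */
                if (opt == TCPOPT_MAXSEG && optlen == TCPOLEN_MAXSEG) {
                        memcpy(&mss, cp + 2, sizeof(mss));
                        return (ntohs(mss));
                }
        }
        return (0);
}

int
main(void)
{
        /* NOP, NOP, then MSS = 1460 in network byte order */
        uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

        printf("mss option: %u\n", parse_mss(opts, (int)sizeof(opts)));
        return (0);
}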
is_syn) + continue; + to->to_flags |= TOF_SCALE; + to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flags |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + to->to_tsval = ntohl(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + to->to_tsecr = ntohl(to->to_tsecr); + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flags |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + to->to_cc = ntohl(to->to_cc); + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + to->to_cc = ntohl(to->to_cc); + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + to->to_ccecho = ntohl(to->to_ccecho); + break; + default: + continue; + } + } +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, th, m, off) + struct socket *so; + struct tcphdr *th; + register struct mbuf *m; + int off; /* delayed to be droped hdrlen */ +{ + int cnt = off + th->th_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). 
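The fixed-point updates in tcp_xmit_timer() above implement Jacobson's estimator: srtt is an exponentially weighted average with gain 1/8, rttvar smooths the absolute deviation with gain 1/4, the timeout is srtt plus four times rttvar, and the variance is seeded at half the first measurement. The sketch below shows the same estimator in floating point, without the kernel's scaling shifts; units and sample values are arbitrary.

#include <stdio.h>
#include <math.h>

struct rtt_est {
        double srtt;    /* smoothed round-trip time */
        double rttvar;  /* smoothed mean deviation */
};

/* Feed one RTT sample, return the resulting retransmit timeout. */
double
rtt_sample(struct rtt_est *e, double rtt)
{
        if (e->srtt == 0.0) {
                /* first measurement: variance starts at half the RTT */
                e->srtt = rtt;
                e->rttvar = rtt / 2.0;
        } else {
                double delta = rtt - e->srtt;

                e->srtt += delta / 8.0;                   /* alpha = 7/8 */
                e->rttvar += (fabs(delta) - e->rttvar) / 4.0; /* beta = 3/4 */
        }
        return (e->srtt + 4.0 * e->rttvar);
}

int
main(void)
{
        struct rtt_est e = { 0.0, 0.0 };
        double samples[] = { 100.0, 120.0, 90.0, 300.0, 110.0 };  /* ms */
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("rtt %5.1f ms -> rto %6.1f ms\n",
                    samples[i], rtt_sample(&e, samples[i]));
        return (0);
}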
+ */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + + inp = tp->t_inpcb; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&inp->inp_inc); + else +#endif + rt = tcp_rtlookup(&inp->inp_inc); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? 
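TCPT_RANGESET above then bounds the computed retransmit value: never below the larger of the configured minimum and the latest measurement plus two ticks, never above the ceiling. A plain-C equivalent with the fixed-point scaling left out and unit-free parameters, shown only to make the clamp explicit:

#include <stdio.h>

int
clamp_rto(int srtt, int rttvar, int latest_rtt, int rttmin, int rexmtmax)
{
        int rto = srtt + 4 * rttvar;
        int lower = (rttmin > latest_rtt + 2) ? rttmin : latest_rtt + 2;

        if (rto < lower)
                rto = lower;            /* minimum feasible timer */
        if (rto > rexmtmax)
                rto = rexmtmax;         /* absolute ceiling */
        return (rto);
}

int
main(void)
{
        printf("rto = %d ticks\n", clamp_rto(8, 3, 7, 3, 128));
        return (0);
}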
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + * else, use the link mtu. + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - min_protoh; + else + { + mss = +#ifdef INET6 + (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. 
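Stripped of the route and mbuf plumbing, the sizing in tcp_mss() above reduces to a few arithmetic steps: start from the path or link MTU, subtract the fixed IP plus TCP header overhead, cap at the peer's offer, reserve room for options sent on every segment, and round large values down to a multiple of the cluster size. The user-space sketch below assumes those constants; it is not the kernel routine.

#include <stdio.h>

#define SKETCH_MCLBYTES         2048    /* assumed mbuf cluster size */
#define SKETCH_TSTAMP_OPTS      12      /* timestamp option plus padding */

int
effective_maxseg(int mtu, int peer_offer, int fixed_hdrs, int per_seg_opts)
{
        int mss = mtu - fixed_hdrs;

        if (peer_offer > 0 && peer_offer < mss)
                mss = peer_offer;       /* never exceed what the peer offered */
        mss -= per_seg_opts;            /* room for per-segment options */
        if (mss > SKETCH_MCLBYTES)
                mss -= mss % SKETCH_MCLBYTES;   /* align to cluster size */
        return (mss);
}

int
main(void)
{
        /* Ethernet MTU, peer offered 1460, IPv4+TCP headers, timestamps on. */
        printf("maxseg = %d\n",
            effective_maxseg(1500, 1460, 40, SKETCH_TSTAMP_OPTS));
        return (0);
}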
+ */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); + if (rt == NULL) + return +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + return rt->rt_ifp->if_mtu - min_protoh; +} + + +/* + * Checks for partial ack. If partial ack arrives, force the retransmission + * of the next unacknowledged segment, do not clear tp->t_dupacks, and return + * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to + * be started again. If the ack advances at least to tp->snd_recover, return 0. + */ +static int +tcp_newreno(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_seq onxt = tp->snd_nxt; + u_long ocwnd = tp->snd_cwnd; + + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset + * (tp->snd_una has not yet been updated when this function + * is called) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. + */ + tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); + return (1); + } + return (0); +} diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c new file mode 100644 index 0000000..47a1873 --- /dev/null +++ b/sys/netinet/tcp_output.c @@ -0,0 +1,970 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
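tcp_newreno() above treats an ACK that stops short of snd_recover as a partial ACK: retransmit the next hole and deflate the window by the amount newly acknowledged, staying in recovery. The sketch below models only that bookkeeping with an invented structure; the kernel additionally juggles snd_cwnd around a call to tcp_output() to emit the retransmission, which is elided here.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)

struct fastrecovery {
        tcp_seq  snd_una;       /* oldest unacknowledged sequence number */
        tcp_seq  snd_nxt;       /* next sequence number to send */
        tcp_seq  snd_recover;   /* snd_max when recovery was entered */
        uint32_t snd_cwnd;      /* congestion window, bytes */
        uint32_t maxseg;
};

/*
 * Returns 1 while still in recovery (partial ACK), 0 once the ACK
 * reaches snd_recover.  snd_una itself is updated by the caller later,
 * as in the kernel.
 */
int
partial_ack(struct fastrecovery *fr, tcp_seq ack)
{
        if (SEQ_LT(ack, fr->snd_recover)) {
                fr->snd_nxt = ack;      /* force retransmit of the hole */
                fr->snd_cwnd -= (ack - fr->snd_una - fr->maxseg);
                return (1);
        }
        return (0);
}

int
main(void)
{
        struct fastrecovery fr = { 1000, 9000, 9000, 8 * 1460, 1460 };
        int recovering = partial_ack(&fr, 4000);

        printf("still recovering: %d, cwnd now %u\n", recovering, fr.snd_cwnd);
        return (0);
}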
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + +int path_mtu_discovery = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, + &path_mtu_discovery, 1, "Enable Path MTU Discovery"); + +int ss_fltsz = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz, 1, "Slow start flight size"); + +int ss_fltsz_local = 4; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz_local, 1, "Slow start flight size for local networks"); + +int tcp_do_newreno = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, + 0, "Enable NewReno Algorithms"); +/* + * Tcp output routine: figure out what should be sent and send it. 
+ */ +int +tcp_output(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + long len, win; + int off, flags, error; + struct mbuf *m; + struct ip *ip = NULL; + struct ipovly *ipov = NULL; + struct tcphdr *th; + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen; + int idle, sendalot; +#if 0 + int maxburst = TCP_MAXBURST; +#endif + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; + + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); + if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ + int ss = ss_fltsz; +#ifdef INET6 + if (isipv6) { + if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) + ss = ss_fltsz_local; + } else +#endif /* INET6 */ + if (in_localaddr(tp->t_inpcb->inp_faddr)) + ss = ss_fltsz_local; + tp->snd_cwnd = tp->t_maxseg * ss; + } + tp->t_flags &= ~TF_LASTIDLE; + if (idle) { + if (tp->t_flags & TF_MORETOCOME) { + tp->t_flags |= TF_LASTIDLE; + idle = 0; + } + } +again: + sendalot = 0; + off = tp->snd_nxt - tp->snd_una; + win = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unsent data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + win = 1; + } else { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } + + len = (long)ulmin(so->so_snd.sb_cc, win) - off; + + if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + flags &= ~TH_SYN; + off--, len++; + if (len > 0 && tp->t_state == TCPS_SYN_SENT && + taop->tao_ccsent == 0) + return 0; + } + + /* + * Be careful not to send data and/or FIN on SYN segments + * in cases when no CC option will be sent. 
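The first thing tcp_output() above computes is how much new data may leave right now: the usable window is the smaller of the peer's advertised window and the congestion window, and the sendable length is whatever is buffered within that window minus what is already in flight. A negative result corresponds to the len < 0 case handled below (an unacknowledged FIN, or a window that shrank behind data already sent). A minimal stand-alone version of that arithmetic, with illustrative parameter names:

#include <stdio.h>

long
sendable_bytes(unsigned long sb_cc,     /* bytes sitting in the send buffer */
    unsigned long snd_wnd,              /* peer's advertised window */
    unsigned long snd_cwnd,             /* congestion window */
    unsigned long off)                  /* snd_nxt - snd_una, bytes in flight */
{
        unsigned long win = snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd;
        unsigned long usable = sb_cc < win ? sb_cc : win;

        return ((long)usable - (long)off);
}

int
main(void)
{
        /* 8 KB buffered, peer window 16 KB, cwnd 4 KB, 1 KB already in flight */
        printf("can send %ld bytes\n", sendable_bytes(8192, 16384, 4096, 1024));
        return (0);
}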
+ * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && + ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || + ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (win == 0) { + callout_stop(tp->tt_rexmt); + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (!callout_active(tp->tt_persist)) + tcp_setpersist(tp); + } + } + if (len > tp->t_maxseg) { + len = tp->t_maxseg; + sendalot = 1; + } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + + win = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. We transmit under the following + * conditions when len is non-zero: + * + * - We have a full segment + * - This is the last buffer in a write()/send() and we are + * either idle or running NODELAY + * - we've timed out (e.g. persist timer) + * - we have more then 1/2 the maximum send window's worth of + * data (receiver may be limited the window size) + * - we need to retransmit + */ + if (len) { + if (len == tp->t_maxseg) + goto send; + /* + * NOTE! on localhost connections an 'ack' from the remote + * end may occur synchronously with the output and cause + * us to flush a buffer queued with moretocome. XXX + * + * note: the len + off check is almost certainly unnecessary. + */ + if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ + (idle || (tp->t_flags & TF_NODELAY)) && + len + off >= so->so_snd.sb_cc && + (tp->t_flags & TF_NOPUSH) == 0) { + goto send; + } + if (tp->t_force) /* typ. timeout case */ + goto send; + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + */ + if (win > 0) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. 
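The sender-side silly window avoidance rules listed above can be read as a single predicate: send if a full segment is ready, if this is the tail of a write and Nagle does not apply, if the persist timer forced a probe, if at least half the largest window the peer has ever offered is ready, or if a retransmission is due. The sketch below spells that out with invented field names; the TF_MORETOCOME and TF_NOPUSH refinements are omitted.

#include <stdio.h>

struct send_state {
        long    len;            /* bytes we could transmit now */
        long    off;            /* offset of snd_nxt into the send buffer */
        long    sb_cc;          /* bytes in the send buffer */
        long    maxseg;
        long    max_sndwnd;     /* largest window the peer has offered */
        int     idle_or_nodelay;
        int     forced;         /* persist timer fired */
        int     retransmitting; /* snd_nxt < snd_max */
};

int
should_send_data(const struct send_state *s)
{
        if (s->len <= 0)
                return (0);
        if (s->len == s->maxseg)
                return (1);     /* a full segment */
        if (s->idle_or_nodelay && s->len + s->off >= s->sb_cc)
                return (1);     /* last chunk of a write, no Nagle delay */
        if (s->forced)
                return (1);     /* persist probe */
        if (s->max_sndwnd > 0 && s->len >= s->max_sndwnd / 2)
                return (1);     /* half the peer's best window is ready */
        if (s->retransmitting)
                return (1);
        return (0);
}

int
main(void)
{
        struct send_state s = { 512, 0, 512, 1460, 65535, 1, 0, 0 };

        printf("send now: %d\n", should_send_data(&s));
        return (0);
}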
The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * callout_active(tp->tt_persist) + * is true when we are in persist state. + * tp->t_force + * is set when we are called to send a persist packet. + * callout_active(tp->tt_rexmt) + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ + return (0); + +send: + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. + * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES + */ + optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + else +#endif + hdrlen = sizeof (struct tcpiphdr); + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + if ((tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + opt[0] = TCPOPT_MAXSEG; + opt[1] = TCPOLEN_MAXSEG; + mss = htons((u_short) tcp_mssopt(tp)); + (void)memcpy(opt + 2, &mss, sizeof(mss)); + optlen = TCPOLEN_MAXSEG; + + if ((tp->t_flags & TF_REQ_SCALE) && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_SCALE))) { + *((u_int32_t *)(opt + optlen)) = htonl( + TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | + TCPOLEN_WINDOW << 8 | + tp->request_r_scale); + optlen += 4; + } + } + } + + /* + * Send a timestamp and echo-reply if this is a SYN and our side + * wants to use timestamps (TF_REQ_TSTMP is set) or both our side + * and our peer have sent timestamps in our SYN's. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (flags & TH_RST) == 0 && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_TSTMP))) { + u_int32_t *lp = (u_int32_t *)(opt + optlen); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(ticks); + *lp = htonl(tp->ts_recent); + optlen += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send `CC-family' options if our side wants to use them (TF_REQ_CC), + * options are allowed (!TF_NOOPT) and it's not a RST. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (flags & TH_RST) == 0) { + switch (flags & (TH_SYN|TH_ACK)) { + /* + * This is a normal ACK, send CC if we received CC before + * from our peer. + */ + case TH_ACK: + if (!(tp->t_flags & TF_RCVD_CC)) + break; + /*FALLTHROUGH*/ + + /* + * We can only get here in T/TCP's SYN_SENT* state, when + * we're a sending a non-SYN segment without waiting for + * the ACK of our SYN. A check above assures that we only + * do this if our peer understands T/TCP. + */ + case 0: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + + optlen += 4; + break; + + /* + * This is our initial SYN, check whether we have to use + * CC or CC.new. 
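On a SYN, the block above lays the options out by hand: a four-byte MSS option, then a NOP so the three-byte window-scale option lands on a 32-bit boundary. The user-space sketch below builds the same eight bytes into a flat buffer; it is illustrative and skips the timestamp and CC options the kernel may append afterwards.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP      1
#define TCPOPT_MAXSEG   2
#define TCPOLEN_MAXSEG  4
#define TCPOPT_WINDOW   3
#define TCPOLEN_WINDOW  3

/* Returns the option length written (8 bytes, already a multiple of 4). */
int
build_syn_options(unsigned char *opt, uint16_t mss, uint8_t wscale)
{
        uint16_t nmss = htons(mss);
        int optlen = 0;

        opt[optlen++] = TCPOPT_MAXSEG;
        opt[optlen++] = TCPOLEN_MAXSEG;
        memcpy(opt + optlen, &nmss, sizeof(nmss));
        optlen += sizeof(nmss);

        opt[optlen++] = TCPOPT_NOP;     /* pad so the next option aligns */
        opt[optlen++] = TCPOPT_WINDOW;
        opt[optlen++] = TCPOLEN_WINDOW;
        opt[optlen++] = wscale;

        return (optlen);
}

int
main(void)
{
        unsigned char opt[40];
        int i, n = build_syn_options(opt, 1460, 2);

        for (i = 0; i < n; i++)
                printf("%02x ", opt[i]);
        printf("\n");
        return (0);
}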
+ */ + case TH_SYN: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? + TCPOPT_CCNEW : TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + optlen += 4; + break; + + /* + * This is a SYN,ACK; send CC and CC.echo if we received + * CC from our peer. + */ + case (TH_SYN|TH_ACK): + if (tp->t_flags & TF_RCVD_CC) { + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_send); + optlen += 4; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CCECHO; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_recv); + optlen += 4; + } + break; + } + } + + hdrlen += optlen; + +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; +#ifdef IPSEC + ipoptlen += ipsec_hdrsiz_tcp(tp); +#endif + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. + */ + if (len + optlen + ipoptlen > tp->t_maxopd) { + /* + * If there is still more to send, don't close the connection. + */ + flags &= ~TH_FIN; + len = tp->t_maxopd - optlen - ipoptlen; + sendalot = 1; + } + +/*#ifdef DIAGNOSTIC*/ +#ifdef INET6 + if (max_linkhdr + hdrlen > MCLBYTES) +#else + if (max_linkhdr + hdrlen > MHLEN) +#endif + panic("tcphdr too big"); +/*#endif*/ + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ + if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. + */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(m); + error = ENOBUFS; + goto out; + } + } +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(so->so_snd.sb_mb, off, (int) len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) 
+ */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + } else { + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + MH_ALIGN(m, hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + tcp_fillheaders(tp, ip6, th); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)(ip + 1); + /* this picks up the pseudo header (w/o the length) */ + tcp_fillheaders(tp, ip, th); + } + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (len || (flags & (TH_SYN|TH_FIN)) + || callout_active(tp->tt_persist)) + th->th_seq = htonl(tp->snd_nxt); + else + th->th_seq = htonl(tp->snd_max); + th->th_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + th->th_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) + win = 0; + if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) + win = (long)(tp->rcv_adv - tp->rcv_nxt); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + + + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised + * a 0 window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is attempting + * to read more data then can be buffered prior to transmitting on + * the connection. + */ + if (win == 0) + tp->t_flags |= TF_RXWIN0SENT; + else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + th->th_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. 
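The receive-window advertisement computed above follows three rules: suppress windows that are both smaller than a quarter of the buffer and smaller than one segment (receiver-side silly window avoidance), never advertise less than what has already been advertised, and cap at the largest representable window. A stand-alone version with the scale shift folded into a parameter; names are illustrative.

#include <stdio.h>

long
advertised_window(long space,           /* free space in the receive buffer */
    long hiwat,                         /* receive buffer size */
    long maxseg,
    long already_advertised,            /* rcv_adv - rcv_nxt */
    long maxwin)                        /* TCP_MAXWIN << rcv_scale */
{
        long win = space;

        if (win < hiwat / 4 && win < maxseg)
                win = 0;                /* silly window avoidance */
        if (win < already_advertised)
                win = already_advertised;       /* never shrink the window */
        if (win > maxwin)
                win = maxwin;
        return (win);
}

int
main(void)
{
        /* 300 free bytes in a 64 KB buffer: too small, advertise 0 */
        printf("advertise %ld\n", advertised_window(300, 65536, 1460, 0, 65535));
        return (0);
}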
+ */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (isipv6) + /* + * ip6_plen is not need to be filled now, and will be filled + * in ip6_output. + */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen + len); + else +#endif /* INET6 */ + { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + if (len + optlen) + th->th_sum = in_addword(th->th_sum, + htons((u_short)(optlen + len))); + + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (!callout_active(tp->tt_rexmt) && + tp->snd_nxt != tp->snd_una) { + if (callout_active(tp->tt_persist)) { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#endif + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ + /* + * m->m_pkthdr.len should have been set before cksum calcuration, + * because in6_cksum() need it. + */ +#ifdef INET6 + if (isipv6) { + /* + * we separately set hoplimit for every segment, since the + * user might want to change the value via setsockopt. + * Also, desired default hop limit might be changed via + * Neighbor Discovery. + */ + ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? + tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + + /* TODO: IPv6 IP6TOS_ECT bit on */ +#ifdef IPSEC + if (ipsec_setsocket(m, so) != 0) { + m_freem(m); + error = ENOBUFS; + goto out; + } +#endif /*IPSEC*/ + error = ip6_output(m, + tp->t_inpcb->in6p_outputopts, + &tp->t_inpcb->in6p_route, + (so->so_options & SO_DONTROUTE), NULL, NULL); + } else +#endif /* INET6 */ + { + struct rtentry *rt; + ip->ip_len = m->m_pkthdr.len; +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? 
+ tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + else +#endif /* INET6 */ + ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ + /* + * See if we should do MTU discovery. We do it only if the following + * are true: + * 1) we have a valid route to the destination + * 2) the MTU is not locked (if it is, then discovery has been + * disabled) + */ + if (path_mtu_discovery + && (rt = tp->t_inpcb->inp_route.ro_rt) + && rt->rt_flags & RTF_UP + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + ip->ip_off |= IP_DF; + } +#ifdef IPSEC + ipsec_setsocket(m, so); +#endif /*IPSEC*/ + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + } + if (error) { + + /* + * We know that the packet was lost, so back out the + * sequence number advance, if any. + */ + if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { + /* + * No need to check for TH_FIN here because + * the TF_SENTFIN flag handles that case. + */ + if ((flags & TH_SYN) == 0) + tp->snd_nxt -= len; + } + +out: + if (error == ENOBUFS) { + if (!callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + tcp_quench(tp->t_inpcb, 0); + return (0); + } + if (error == EMSGSIZE) { + /* + * ip_output() will have already fixed the route + * for us. tcp_mtudisc() will, as its last action, + * initiate retransmission, so it is important to + * not do so here. + */ + tcp_mtudisc(tp->t_inpcb, 0); + return 0; + } + if ((error == EHOSTUNREACH || error == ENETDOWN) + && TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + return (error); + } + tcpstat.tcps_sndtotal++; + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent. + */ + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~TF_ACKNOW; + if (tcp_delack_enabled) + callout_stop(tp->tt_delack); +#if 0 + /* + * This completely breaks TCP if newreno is turned on. What happens + * is that if delayed-acks are turned on on the receiver, this code + * on the transmitter effectively destroys the TCP window, forcing + * it to four packets (1.5Kx4 = 6K window). + */ + if (sendalot && (!tcp_do_newreno || --maxburst)) + goto again; +#endif + if (sendalot) + goto again; + return (0); +} + +void +tcp_setpersist(tp) + register struct tcpcb *tp; +{ + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int tt; + + if (callout_active(tp->tt_rexmt)) + panic("tcp_setpersist: retransmit pending"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c new file mode 100644 index 0000000..0fb62e0 --- /dev/null +++ b/sys/netinet/tcp_reass.c @@ -0,0 +1,2785 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
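tcp_setpersist() above derives a base interval from the smoothed RTT, scales it by an exponential backoff table indexed by t_rxtshift, and clamps the result between TCPTV_PERSMIN and TCPTV_PERSMAX. The sketch below reproduces that schedule with assumed constants: the table shape matches the usual tcp_backoff progression, but the floor and ceiling values are placeholders, not the kernel's tunables.

#include <stdio.h>

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
#define SKETCH_PERSMIN  5       /* assumed floor, ticks */
#define SKETCH_PERSMAX  60      /* assumed ceiling, ticks */

int
persist_interval(int base, int shift)
{
        int n = (int)(sizeof(backoff) / sizeof(backoff[0]));
        int t;

        if (shift >= n)
                shift = n - 1;
        t = base * backoff[shift];
        if (t < SKETCH_PERSMIN)
                t = SKETCH_PERSMIN;
        if (t > SKETCH_PERSMAX)
                t = SKETCH_PERSMAX;
        return (t);
}

int
main(void)
{
        int shift;

        for (shift = 0; shift < 8; shift++)
                printf("probe %d: %d ticks\n", shift, persist_interval(3, shift));
        return (0);
}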
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/ip6_var.h> +#include <netinet6/nd6.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> + +#endif /* TCPDEBUG */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#include <netkey/key.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); + +static int tcprexmtthresh = 3; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log 
all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +struct inpcbhead tcb; +#define tcb6 tcb /* for KAME src sync over BSD*'s */ +struct inpcbinfo tcbinfo; +struct mtx *tcbinfo_mtx; + +static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, + struct mbuf *); +static void tcp_xmit_timer(struct tcpcb *, int); +static int tcp_newreno(struct tcpcb *, struct tcphdr *); + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Indicate whether this ack should be delayed. We can delay the ack if + * - delayed acks are enabled and + * - there is no delayed ack timer in progress and + * - our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + */ +#define DELAY_ACK(tp) \ + (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \ + (tp->t_flags & TF_RXWIN0SENT) == 0) + +static int +tcp_reass(tp, th, tlenp, m) + register struct tcpcb *tp; + register struct tcphdr *th; + int *tlenp; + struct mbuf *m; +{ + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with th==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (th == 0) + goto present; + + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } + + /* + * Find a segment which begins after this one does. + */ + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + break; + p = q; + } + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + if (i > 0) { + if (i >= *tlenp) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += *tlenp; + m_freem(m); + FREE(te, M_TSEGQ); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += *tlenp; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. 
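When a queued segment precedes the arriving one, tcp_reass() above computes how many of the new bytes it already covers and either drops the arrival as a pure duplicate or trims the overlap off its front; the loop that follows applies the same arithmetic in the other direction to segments queued after it. A user-space sketch of the leading-overlap case, operating on plain counters instead of mbufs:

#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

/*
 * Returns -1 if the previous segment already covers everything (drop
 * the arrival), otherwise the number of bytes trimmed from its front
 * (0 if there was no overlap).  The kernel performs the trim with
 * m_adj() on the mbuf chain.
 */
int
trim_leading(tcp_seq prev_seq, int prev_len, tcp_seq *seq, int *len)
{
        int i = (int)(prev_seq + prev_len - *seq);      /* overlap in bytes */

        if (i <= 0)
                return (0);
        if (i >= *len)
                return (-1);
        *seq += i;
        *len -= i;
        return (i);
}

int
main(void)
{
        tcp_seq seq = 1000;
        int len = 500;
        /* previous segment covered [800, 1100): trims 100 bytes */
        int trimmed = trim_leading(800, 300, &seq, &len);

        printf("trimmed %d, now seq %u len %d\n", trimmed, seq, len);
        return (0);
}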
+ */ + while (q) { + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + if (i <= 0) + break; + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); + break; + } + + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } + + /* Insert the new segment queue entry into place. */ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + + if (p == NULL) { + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + } else { + LIST_INSERT_AFTER(p, te, tqe_q); + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + if (so->so_state & SS_CANTRCVMORE) + m_freem(q->tqe_m); + else + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + struct in6_ifaddr *ia6; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? + */ + ia6 = ip6_getdstifaddr(m); + if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp); + return IPPROTO_DONE; +} +#endif + +void +tcp_input(m, off0) + register struct mbuf *m; + int off0; +{ + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; + register struct inpcb *inp = NULL; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + int drop_hdrlen; + register struct tcpcb *tp = 0; + register int thflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ + int headlocked = 0; + +#ifdef TCPDEBUG + u_char tcp_saveipgen[40]; + /* the size of the above must be of max ip header, now IPv6 */ + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ + struct sockaddr_in *next_hop = NULL; + int rstreason; /* For badport_bandlim accounting purposes */ + + /* Grab info from MT_TAG mbufs prepended to the chain. */ + for (;m && m->m_type == MT_TAG; m = m->m_next) { + if (m->m_tag_id == PACKET_TAG_IPFORWARD) + next_hop = (struct sockaddr_in *)m->m_hdr.mh_data; + } + +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 
1 : 0; +#endif + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + + /* + * Be proactive about unspecified IPv6 address in source. + * As we use all-zero to indicate unbounded/unconnected pcb, + * unspecified IPv6 address can be used to confuse us. + * + * Note that packets with unspecified IPv6 destination is + * already dropped in ip6_input. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { + /* XXX stat */ + goto drop; + } + } else +#endif /* INET6 */ + { + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + if (off0 > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + tlen = ip->ip_len; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + + ip->ip_len + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + /* + * Checksum extended TCP header and data. + */ + len = sizeof (struct ip) + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + ipov->ih_len = htons(ipov->ih_len); + th->th_sum = in_cksum(m, len); + } + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; /* tlen is used instead of ti->ti_len */ + if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + } + } + optlen = off - sizeof (struct tcphdr); + optp = (u_char *)(th + 1); + } + thflags = th->th_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is a violation of the TCP specification. + */ + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. 
+ */ + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); + + /* + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. + * XXX: the call of ip6_savecontrol() has been obsoleted based on + * latest version of the advanced API (20020110). + */ + drop_hdrlen = off0 + off; + + /* + * Locate pcb for segment. + */ + INP_INFO_WLOCK(&tcbinfo); + headlocked = 1; +findpcb: + /* IPFIREWALL_FORWARD section */ + if (next_hop != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { + /* + * Transparently forwarded. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (next_hop->sin_port == 0) { + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, next_hop->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ip->ip_src, th->th_sport, + next_hop->sin_addr, + ntohs(next_hop->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + } else + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } + +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. 
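+ * (Editorial note, not part of the original commit: as the code below
+ * shows, log_in_vain == 1 logs only attempts carrying SYN while
+ * log_in_vain == 2 logs every segment to a closed port, and a nonzero
+ * blackhole setting suppresses the RST -- 1 only for SYN segments,
+ * any other nonzero value for all segments.)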
+ */ + if (inp == NULL) { + if (log_in_vain) { +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ + +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } + switch (log_in_vain) { + case 1: + if(thflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); + break; + default: + break; + } + } + if (blackhole) { + switch (blackhole) { + case 1: + if (thflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + INP_LOCK(inp); + tp = intotcpcb(inp); + if (tp == 0) { + INP_UNLOCK(inp); + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; + else + tiwin = th->th_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { + struct in_conninfo inc; +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; + } +#endif + /* skip if this isn't a listen socket */ + if ((so->so_options & SO_ACCEPTCONN) == 0) + goto after_listen; +#ifdef INET6 + inc.inc_isipv6 = isipv6; + if (isipv6) { + inc.inc6_faddr = ip6->ip6_src; + inc.inc6_laddr = ip6->ip6_dst; + inc.inc6_route.ro_rt = NULL; /* XXX */ + + } else +#endif /* INET6 */ + { + inc.inc_faddr = ip->ip_src; + inc.inc_laddr = ip->ip_dst; + inc.inc_route.ro_rt = NULL; /* XXX */ + } + inc.inc_fport = th->th_sport; + inc.inc_lport = th->th_dport; + + /* + * If the state is LISTEN then ignore segment if it contains + * a RST. If the segment contains an ACK then it is bad and + * send a RST. If it does not contain a SYN then it is not + * interesting; drop it. + * + * If the state is SYN_RECEIVED (syncache) and seg contains + * an ACK, but not for our SYN/ACK, send a RST. If the seg + * contains a RST, check the sequence number to see if it + * is a valid reset segment. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + if (!syncache_expand(&inc, th, &so, m)) { + /* + * No syncache entry, or ACK was not + * for our SYN/ACK. Send a RST. + */ + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + if (so == NULL) { + /* + * Could not complete 3-way handshake, + * connection is being closed down, and + * syncache will free mbuf. + */ + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return; + } + /* + * Socket is created in state SYN_RECEIVED. + * Continue processing segment. + */ + INP_UNLOCK(inp); + inp = sotoinpcb(so); + INP_LOCK(inp); + tp = intotcpcb(inp); + /* + * This is what would have happened in + * tcp_output() when the SYN,ACK was sent. 
+ */ + tp->snd_up = tp->snd_una; + tp->snd_max = tp->snd_nxt = tp->iss + 1; + tp->last_ack_sent = tp->rcv_nxt; +/* + * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled + * until the _second_ ACK is received: + * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window. + * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale, + * move to ESTAB, set snd_wnd to tiwin. + */ + tp->snd_wnd = tiwin; /* unscaled */ + goto after_listen; + } + if (thflags & TH_RST) { + syncache_chkrst(&inc, th); + goto drop; + } + if (thflags & TH_ACK) { + syncache_badack(&inc); + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } + + /* + * Segment's flags are (SYN) or (SYN|FIN). + */ +#ifdef INET6 + /* + * If deprecated address is forbidden, + * we do not accept SYN to deprecated interface + * address to prevent any new inbound connection from + * getting established. + * When we do not accept SYN, we send a TCP RST, + * with deprecated source address (instead of dropping + * it). We compromise it as it is much better for peer + * to send a RST, and RST will be the final packet + * for the exchange. + * + * If we do not forbid deprecated addresses, we accept + * the SYN packet. RFC2462 does not suggest dropping + * SYN in this case. + * If we decipher RFC2462 5.5.4, it says like this: + * 1. use of deprecated addr with existing + * communication is okay - "SHOULD continue to be + * used" + * 2. use of it with new communication: + * (2a) "SHOULD NOT be used if alternate address + * with sufficient scope is available" + * (2b) nothing mentioned otherwise. + * Here we fall into (2b) case as we have no choice in + * our source address selection - we must obey the peer. + * + * The wording in RFC2462 is confusing, and there are + * multiple description text for deprecated address + * handling - worse, they are not exactly the same. + * I believe 5.5.4 is the best one, so we follow 5.5.4. + */ + if (isipv6 && !ip6_use_deprecated) { + struct in6_ifaddr *ia6; + + if ((ia6 = ip6_getdstifaddr(m)) && + (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { + INP_UNLOCK(inp); + tp = NULL; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + } +#endif + /* + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + */ + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * + * Note that it is quite possible to receive unicast + * link-layer packets with a broadcast IP address. Use + * in_broadcast() to find them. + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + /* + * SYN appears to be valid; create compressed TCP state + * for syncache, or perform t/tcp connection. + */ + if (so->so_qlen <= so->so_qlimit) { + tcp_dooptions(&to, optp, optlen, 1); + if (!syncache_add(&inc, &to, th, &so, m)) + goto drop; + if (so == NULL) { + /* + * Entry added to syncache, mbuf used to + * send SYN,ACK packet. 
+ */ + KASSERT(headlocked, ("headlocked")); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return; + } + /* + * Segment passed TAO tests. + */ + INP_UNLOCK(inp); + inp = sotoinpcb(so); + INP_LOCK(inp); + tp = intotcpcb(inp); + tp->snd_wnd = tiwin; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (DELAY_ACK(tp) && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + tcpstat.tcps_connects++; + soisconnected(so); + goto trimthenstep6; + } + goto drop; + } +after_listen: + +/* XXX temp debugging */ + /* should not happen - syncache should pick up these connections */ + if (tp->t_state == TCPS_LISTEN) + panic("tcp_input: TCPS_LISTEN"); + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options. + * XXX this is tradtitional behavior, may need to be cleaned up. + */ + tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + if (thflags & TH_SYN) { + if (to.to_flags & TOF_SCALE) { + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = to.to_requested_s_scale; + } + if (to.to_flags & TOF_TS) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = ticks; + } + if (to.to_flags & (TOF_CC|TOF_CCNEW)) + tp->t_flags |= TF_RCVD_CC; + if (to.to_flags & TOF_MSS) + tcp_mss(tp, to.to_mss); + } + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + th->th_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. 
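+ * (Editorial note, not part of the original commit: the intent of the
+ * last_ack_sent test appears to be that only a segment at or below
+ * the last ACK we sent may update ts_recent, so a segment arriving
+ * ahead of rcv_nxt cannot plant a newer timestamp that would later
+ * make PAWS reject still-valid earlier segments.)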
+ * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flags & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = th->th_ack; + m_freem(m); + ND6_HINT(tp); /* some progress has been done */ + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + } + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += tlen; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ + /* + * Add data to socket buffer. + */ + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (DELAY_ACK(tp)) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + INP_UNLOCK(inp); + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. 
+ * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + if (thflags & TH_RST) { + if (thflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((thflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = th->th_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flags & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp) && tlen != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. 
+ */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flags & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((thflags & TH_SYN) && + (to.to_flags & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. 
+ * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += tlen; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. 
This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (thflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(th->th_seq, tp->rcv_nxt)) { + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (thflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. 
+ */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. 
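+ * (Editorial note, illustrative arithmetic not in the original
+ * source, assuming the NewReno false-retransmit branch below is not
+ * taken: with a hypothetical t_maxseg of 1000 bytes and snd_wnd ==
+ * snd_cwnd == 20000, the third duplicate ACK sets ssthresh to
+ * (min(20000,20000)/2/1000) * 1000 = 10000, retransmits one segment
+ * with cwnd temporarily set to 1000, then inflates cwnd to
+ * 10000 + 3*1000 = 13000; each further duplicate ACK adds another
+ * 1000.)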
+ */ + if (!callout_active(tp->tt_rexmt) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + if (tcp_do_newreno && SEQ_LT(th->th_ack, + tp->snd_recover)) { + /* False retransmit, should not + * cut window + */ + tp->snd_cwnd += tp->t_maxseg; + tp->t_dupacks = 0; + (void) tcp_output(tp); + goto drop; + } + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->snd_recover = tp->snd_max; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tcp_do_newreno == 0) { + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } else if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, th)) { + /* + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = + tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } + if (tp->t_dupacks < tcprexmtthresh) + tp->t_dupacks = 0; + if (SEQ_GT(th->th_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. 
+ */ + if (to.to_flags & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + /* + * If t_dupacks != 0 here, it indicates that we are still + * in NewReno fast recovery mode, so we leave the congestion + * window alone. + */ + if (tcp_do_newreno == 0 || tp->t_dupacks == 0) + tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = th->th_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. 
+ */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + KASSERT(headlocked, ("headlocked")); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((tlen || (thflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which inludes th into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. 
This handle the common case inline (segment + * is the next to be received on an established connection, and the + * queue is empty), avoiding linkage into and removal from the queue + * and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ + if (th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + if (DELAY_ACK(tp)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); + sbappend(&so->so_rcv, m); + sorwakeup(so); + } else { + thflags = tcp_reass(tp, th, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. 
+ * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + INP_UNLOCK(inp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ + + /* + * Perform bandwidth limiting. + */ + if (badport_bandlim(rstreason) < 0) + goto drop; + +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + if (tp) + INP_UNLOCK(inp); + + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); + else { + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); + } + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp) + INP_UNLOCK(inp); + m_freem(m); + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + return; +} + +/* + * Parse TCP options and place in tcpopt. + */ +static void +tcp_dooptions(to, cp, cnt, is_syn) + struct tcpopt *to; + u_char *cp; + int cnt; +{ + int opt, optlen; + + to->to_flags = 0; + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = cp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_MSS; + bcopy((char *)cp + 2, + (char *)&to->to_mss, sizeof(to->to_mss)); + to->to_mss = ntohs(to->to_mss); + break; + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (! 
is_syn) + continue; + to->to_flags |= TOF_SCALE; + to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flags |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + to->to_tsval = ntohl(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + to->to_tsecr = ntohl(to->to_tsecr); + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flags |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + to->to_cc = ntohl(to->to_cc); + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + to->to_cc = ntohl(to->to_cc); + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!is_syn) + continue; + to->to_flags |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + to->to_ccecho = ntohl(to->to_ccecho); + break; + default: + continue; + } + } +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, th, m, off) + struct socket *so; + struct tcphdr *th; + register struct mbuf *m; + int off; /* delayed to be droped hdrlen */ +{ + int cnt = off + th->th_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). 
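+ * (Editorial note, not part of the original commit: with rttvar set
+ * to rtt/2, the retransmit value of srtt + 4*rttvar used below works
+ * out to rtt + 4*(rtt/2) = 3*rtt, which is where the 3*rtt figure
+ * above comes from.)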
+ */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + + inp = tp->t_inpcb; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&inp->inp_inc); + else +#endif + rt = tcp_rtlookup(&inp->inp_inc); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + * else, use the link mtu. + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - min_protoh; + else + { + mss = +#ifdef INET6 + (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. 
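In effect, tcp_mss() clamps the path MTU (minus protocol headers) against the peer's offer, trims the space consumed by options sent on every segment, and rounds large values down to a multiple of an mbuf cluster. A condensed, standalone sketch of that arithmetic, where CLUSTER_BYTES and the parameter names are illustrative stand-ins:

#define CLUSTER_BYTES   2048                    /* stands in for MCLBYTES */

static int
mss_select(int path_mtu, int hdr_len, int peer_offer, int opt_len)
{
        int mss = path_mtu - hdr_len;           /* payload the path can carry */

        if (peer_offer > 0 && peer_offer < mss)
                mss = peer_offer;               /* never exceed the peer's offer */
        mss -= opt_len;                         /* room for options sent every segment */
        if (mss > CLUSTER_BYTES)                /* keep large segments cluster-aligned */
                mss -= mss % CLUSTER_BYTES;
        return (mss);
}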
+ */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); + if (rt == NULL) + return +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + return rt->rt_ifp->if_mtu - min_protoh; +} + + +/* + * Checks for partial ack. If partial ack arrives, force the retransmission + * of the next unacknowledged segment, do not clear tp->t_dupacks, and return + * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to + * be started again. If the ack advances at least to tp->snd_recover, return 0. + */ +static int +tcp_newreno(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_seq onxt = tp->snd_nxt; + u_long ocwnd = tp->snd_cwnd; + + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset + * (tp->snd_una has not yet been updated when this function + * is called) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. + */ + tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); + return (1); + } + return (0); +} diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h new file mode 100644 index 0000000..5850ccc --- /dev/null +++ b/sys/netinet/tcp_seq.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_SEQ_H_ +#define _NETINET_TCP_SEQ_H_ +/* + * TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* + * TCP connection counts are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define CC_LT(a,b) ((int)((a)-(b)) < 0) +#define CC_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define CC_GT(a,b) ((int)((a)-(b)) > 0) +#define CC_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* Macro to increment a CC: skip 0 which has a special meaning */ +#define CC_INC(c) (++(c) == 0 ? ++(c) : (c)) + +/* + * Macros to initialize tcp sequence numbers for + * send and receive from initial send and receive + * sequence numbers. + */ +#define tcp_rcvseqinit(tp) \ + (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1 + +#define tcp_sendseqinit(tp) \ + (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ + (tp)->snd_recover = (tp)->iss + +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz) + /* timestamp wrap-around time */ + +#ifdef _KERNEL +extern tcp_cc tcp_ccgen; /* global connection count */ +#endif /* _KERNEL */ +#endif /* _NETINET_TCP_SEQ_H_ */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c new file mode 100644 index 0000000..f7800d2 --- /dev/null +++ b/sys/netinet/tcp_subr.c @@ -0,0 +1,1510 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
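The SEQ_* comparisons above stay correct across the 2^32 wrap because the unsigned subtraction is reinterpreted as a signed 32-bit quantity. A small self-contained check (int32_t replaces the header's plain int to keep the example portable):

#include <assert.h>
#include <stdint.h>

#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
#define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)

int
main(void)
{
        uint32_t near_wrap = 0xfffffff0U;       /* just below the 2^32 wrap */
        uint32_t wrapped = 0x00000010U;         /* 0x20 bytes later, mod 2^32 */

        assert(SEQ_LT(near_wrap, wrapped));     /* holds although near_wrap > wrapped */
        assert(SEQ_GT(wrapped, near_wrap));
        return (0);
}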
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/random.h> + +#include <vm/uma.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif +#include <netinet6/ip6protosw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> +#include <sys/md5.h> + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / 
PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static int icmp_may_rst = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static int tcp_isn_reseed_interval = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, + &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); + +static void tcp_cleartaocache(void); +static struct inpcb *tcp_notify(struct inpcb *, int); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). 
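The union trick described above pads the inpcb portion of the allocation up to the next multiple of ALIGNMENT, so the tcpcb that follows always starts at the same aligned offset within a single zone allocation. A compact illustration of the rounding, where demo_inp and demo_inp_tp are stand-ins rather than the real structures:

#include <assert.h>
#include <stddef.h>

#define ALIGNMENT       32
#define ALIGNM1         (ALIGNMENT - 1)
#define ROUNDUP32(x)    (((x) + ALIGNM1) & ~ALIGNM1)

struct demo_inp {
        char body[45];                          /* arbitrary, non-aligned size */
};

struct demo_inp_tp {
        union {
                struct demo_inp inp;
                char align[ROUNDUP32(sizeof(struct demo_inp))];
        } u;
        int tcb;                                /* stand-in for struct tcpcb */
};

int
main(void)
{
        /* The union is padded to the next multiple of 32 (here 64), so
         * the "tcpcb" member lands at a fixed, aligned offset. */
        assert(sizeof(((struct demo_inp_tp *)0)->u) == 64);
        assert(offsetof(struct demo_inp_tp, tcb) % ALIGNMENT == 0);
        return (0);
}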
+ */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize = TCBHASHSIZE; + + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + panic("tcp_init"); +#undef TCP_MINPROTOHDR + + syncache_init(); +} + +/* + * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. + * tcp_template used to store this data in mbufs, but we now recopy it out + * of the tcpcb each time to conserve mbufs. + */ +void +tcp_fillheaders(tp, ip_ptr, tcp_ptr) + struct tcpcb *tp; + void *ip_ptr; + void *tcp_ptr; +{ + struct inpcb *inp = tp->t_inpcb; + struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)ip_ptr; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + tcp_hdr->th_sum = 0; + } else +#endif + { + struct ip *ip = (struct ip *) ip_ptr; + + ip->ip_vhl = IP_VHL_BORING; + ip->ip_tos = 0; + ip->ip_len = 0; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_ttl = 0; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP)); + } + + tcp_hdr->th_sport = inp->inp_lport; + tcp_hdr->th_dport = inp->inp_fport; + tcp_hdr->th_seq = 0; + tcp_hdr->th_ack = 0; + tcp_hdr->th_x2 = 0; + tcp_hdr->th_off = 5; + tcp_hdr->th_flags = 0; + tcp_hdr->th_win = 0; + tcp_hdr->th_urp = 0; +} + +/* + * Create template to be used to send tcp packets on a connection. + * Allocates an mbuf and fills in a skeletal tcp/ip header. The only + * use for this function is in keepalives, which use tcp_respond. 
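tcp_init() above insists on a power-of-two hash size because bucket selection can then be a single mask instead of a modulo. A quick self-contained check of that equivalence, using the same power-of-two test:

#include <assert.h>

#define powerof2(x)     ((((x) - 1) & (x)) == 0)

int
main(void)
{
        unsigned int hashsize = 512;            /* the TCBHASHSIZE default */
        unsigned int mask = hashsize - 1;
        unsigned int h = 0xdeadbeefU;           /* any hash value */

        assert(powerof2(hashsize));
        /* With a power-of-two table size, masking equals the modulo. */
        assert((h & mask) == (h % hashsize));
        return (0);
}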
+ */ +struct tcptemp * +tcp_maketemplate(tp) + struct tcpcb *tp; +{ + struct mbuf *m; + struct tcptemp *n; + + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof(struct tcptemp); + n = mtod(m, struct tcptemp *); + + tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection. If flags are given then we send + * a message back to the TCP which originated the * segment ti, + * and discard the mbuf containing it and any other attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(tp, ipgen, th, m, ack, seq, flags) + struct tcpcb *tp; + void *ipgen; + register struct tcphdr *th; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + struct ip *ip; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + + if (tp) { + if (!(flags & TH_RST)) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + } +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ + ro = &tp->t_inpcb->inp_route; + } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { + ro = &sro; + bzero(ro, sizeof *ro); + } + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + tlen = 0; + m->m_data += max_linkhdr; +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. 
+ */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); +#undef xchg + } +#ifdef INET6 + if (isipv6) { + ip6->ip6_flow = 0; + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } else +#endif + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; + if (tp) + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); + else + nth->th_win = htons((u_short)win); + nth->th_urp = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = 0; + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? + ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + } +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); +#endif +#ifdef IPSEC + if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { + m_freem(m); + return; + } +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) { + RTFREE(ro6->ro_rt); + ro6->ro_rt = NULL; + } + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); + callout_init(tp->tt_persist = &it->inp_tp_persist, 0); + callout_init(tp->tt_keep = &it->inp_tp_keep, 0); + callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); + callout_init(tp->tt_delack = &it->inp_tp_delack, 0); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. 
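The initialization that follows picks t_rttvar so that the usual srtt + 4 * rttvar formula reproduces the base retransmit timeout while no RTT sample exists yet. A small arithmetic check of that invariant, with hz and the base value chosen only for illustration:

#include <assert.h>

int
main(void)
{
        int hz = 100;                           /* illustrative tick rate */
        int rtobase = 3 * hz;                   /* stands in for TCPTV_RTOBASE */
        int srtt = 0;                           /* TCPTV_SRTTBASE: no estimate yet */
        int rttvar = (rtobase - srtt) / 4;      /* mirrors the init, unscaled */

        /* Until a real sample arrives, srtt + 4 * rttvar recovers the
         * base retransmit timeout. */
        assert(srtt + 4 * rttvar == rtobase);
        return (0);
}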
+ */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + /* + * IPv4 TTL initialization is necessary for an IPv6 socket as well, + * because the socket may be bound to an IPv6 wildcard address, + * which may match an IPv4-mapped IPv6 address. + */ + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tseg_qent *q; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16) { + register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. 
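The caching described above and applied just below blends each saved metric as half the cached value and half the new sample, falling back to the sample when nothing was cached yet. The same filter in isolation, as an illustrative helper rather than a kernel routine:

static unsigned long
blend_metric(unsigned long cached, unsigned long sample)
{
        if (cached != 0 && sample != 0)
                return ((cached + sample) / 2); /* converge gradually */
        return (sample);                        /* first sample simply replaces */
}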
+ */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. + */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + no_valid_rt: + /* free the reassembly queue, if any */ + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + } + inp->inp_ppcb = NULL; + soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + if (do_tcpdrain) + { + struct inpcb *inpb; + struct tcpcb *tcpb; + struct tseg_qent *te; + + /* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * XXX: The "Net/3" implementation doesn't imply that the TCP + * reassembly queue should be flushed, but in a situation + * where we're really low on mbufs, this is potentially + * usefull. + */ + INP_INFO_RLOCK(&tcbinfo); + LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { + INP_LOCK(inpb); + if ((tcpb = intotcpcb(inpb))) { + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); + } + } + INP_UNLOCK(inpb); + } + INP_INFO_RUNLOCK(&tcbinfo); + } +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static struct inpcb * +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. 
This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return inp; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) { + tcp_drop(tp, error); + return (struct inpcb *)0; + } else { + tp->t_softerror = error; + return inp; + } +#if 0 + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) + inp_list[i++] = inp; + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + INP_UNLOCK(inp); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
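tcp_pcblist() brackets the copied-out entries with xinpgen records so userland can compare the generation counts and retry if the list changed underneath it. The consumer-side check amounts to the following, with genstamp as an illustrative type rather than the real sysctl layout:

struct genstamp {
        unsigned long gen;
        unsigned long count;
};

static int
snapshot_is_stable(const struct genstamp *head, const struct genstamp *tail)
{
        /* A changed generation means the PCB list moved while it was
         * being copied out; the caller should re-issue the request. */
        return (head->gen == tail->gen);
}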
+ */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser_cred(req->td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL) { + error = ENOENT; + goto outunlocked; + } else { + INP_LOCK(inp); + if (inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + } + + error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); + if (error) + goto out; + cru2x(inp->inp_socket->so_cred, &xuc); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + INP_UNLOCK(inp); +outunlocked: + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); + +#ifdef INET6 +static int +tcp6_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser_cred(req->td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL) { + error = ENOENT; + goto outunlocked; + } else { + INP_LOCK(inp); + if (inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + } + error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); + if (error) + goto out; + cru2x(inp->inp_socket->so_cred, &xuc); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + INP_UNLOCK(inp); +outunlocked: + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); +#endif + + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + tcp_seq icmp_seq; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == 
PRC_UNREACH_PORT) && ip) + notify = tcp_drop_syn_sent; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + INP_INFO_WLOCK(&tcbinfo); + inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL) { + INP_LOCK(inp); + if (inp->inp_socket != NULL) { + icmp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_seq, tp->snd_una) && + SEQ_LT(icmp_seq, tp->snd_max)) + inp = (*notify)(inp, inetctlerrmap[cmd]); + } + if (inp) + INP_UNLOCK(inp); + } else { + struct in_conninfo inc; + + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; +#ifdef INET6 + inc.inc_isipv6 = 0; +#endif + syncache_unreach(&inc, th); + } + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + } else + in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + struct tcphdr th; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct ip6_hdr *ip6; + struct mbuf *m; + struct ip6ctlparam *ip6cp = NULL; + const struct sockaddr_in6 *sa6_src = NULL; + int off; + struct tcp_portonly { + u_int16_t th_sport; + u_int16_t th_dport; + } *thp; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + sa6_src = ip6cp->ip6c_src; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + sa6_src = &sa6_any; + } + + if (ip6) { + struct in_conninfo inc; + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(*thp)) + return; + + bzero(&th, sizeof(th)); + m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + + in6_pcbnotify(&tcb, sa, th.th_dport, + (struct sockaddr *)ip6cp->ip6c_src, + th.th_sport, cmd, notify); + + inc.inc_fport = th.th_dport; + inc.inc_lport = th.th_sport; + inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; + inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_isipv6 = 1; + syncache_unreach(&inc, &th); + } else + in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, + 0, cmd, notify); +} +#endif /* INET6 */ + + +/* + * Following is where TCP initial sequence number generation occurs. + * + * There are two places where we must use initial sequence numbers: + * 1. In SYN-ACK packets. + * 2. In SYN packets. + * + * All ISNs for SYN-ACK packets are generated by the syncache. See + * tcp_syncache.c for details. + * + * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling + * depends on this property. In addition, these ISNs should be + * unguessable so as to prevent connection hijacking. To satisfy + * the requirements of this situation, the algorithm outlined in + * RFC 1948 is used to generate sequence numbers. 
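Concretely, the ISN is the first 32 bits of a keyed digest over the connection 4-tuple plus an offset that advances by ISN_BYTES_PER_SECOND. A sketch of that shape, where digest32 is a placeholder for the MD5 step and not a kernel function:

#include <stdint.h>

#define ISN_BYTES_PER_SECOND    1048576

static uint32_t
new_isn_sketch(uint32_t digest32, uint32_t ticks, uint32_t hz)
{
        /* digest32 stands for the first word of
         * MD5(lport, fport, laddr, faddr, secret). */
        return (digest32 + ticks * (ISN_BYTES_PER_SECOND / hz));
}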
+ * + * Implementation details: + * + * Time is based off the system timer, and is corrected so that it + * increases by one megabyte per second. This allows for proper + * recycling on high speed LANs while still leaving over an hour + * before rollover. + * + * net.inet.tcp.isn_reseed_interval controls the number of seconds + * between seeding of isn_secret. This is normally set to zero, + * as reseeding should not be necessary. + * + */ + +#define ISN_BYTES_PER_SECOND 1048576 + +u_char isn_secret[32]; +int isn_last_reseed; +MD5_CTX isn_ctx; + +tcp_seq +tcp_new_isn(tp) + struct tcpcb *tp; +{ + u_int32_t md5_buffer[4]; + tcp_seq new_isn; + + /* Seed if this is the first use, reseed if requested. */ + if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && + (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) + < (u_int)ticks))) { + read_random(&isn_secret, sizeof(isn_secret)); + isn_last_reseed = ticks; + } + + /* Compute the md5 hash and return the ISN. */ + MD5Init(&isn_ctx); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, + sizeof(struct in6_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, + sizeof(struct in6_addr)); + } else +#endif + { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, + sizeof(struct in_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, + sizeof(struct in_addr)); + } + MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); + MD5Final((u_char *) &md5_buffer, &isn_ctx); + new_isn = (tcp_seq) md5_buffer[0]; + new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); + return new_isn; +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +struct inpcb * +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; + return (inp); +} + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +struct inpcb * +tcp_drop_syn_sent(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp && tp->t_state == TCPS_SYN_SENT) { + tcp_drop(tp, errno); + return (struct inpcb *)0; + } + return inp; +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +struct inpcb * +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&inp->inp_inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(&inp->inp_inc); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return inp; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return inp; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } + return inp; +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inc) + struct in_conninfo *inc; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inc->inc_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inc->inc_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inc->inc_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inc) + struct in_conninfo *inc; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inc->inc6_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); + ro6->ro_dst.sin6_addr = inc->inc6_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. 
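Both route lookup routines above follow the same pattern: reuse the route cached in the connection if it is still up, otherwise fill in the destination and resolve a fresh one. The pattern in miniature, with a generic cache type rather than the kernel's struct route:

struct cache {
        void *entry;
        int valid;
};

static void *
lookup_cached(struct cache *c, void *(*resolve)(void))
{
        if (c->entry == NULL || !c->valid) {
                c->entry = resolve();           /* may fail and return NULL */
                c->valid = (c->entry != NULL);
        }
        return (c->entry);
}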
*/ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + tcp_fillheaders(tp, ip6, th); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + tcp_fillheaders(tp, ip, th); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + return hdrsiz; +} +#endif /*IPSEC*/ + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inc) + struct in_conninfo *inc; +{ + struct rtentry *rt; + +#ifdef INET6 + if (inc->inc_isipv6) + rt = tcp_rtlookup6(inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inc); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c new file mode 100644 index 0000000..0ef2f3d --- /dev/null +++ b/sys/netinet/tcp_syncache.c @@ -0,0 +1,1371 @@ +/*- + * Copyright (c) 2001 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jonathan Lemon + * and NAI Labs, the Security Research Division of Network Associates, Inc. + * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/md5.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/random.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#include <netkey/key.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> +#include <vm/uma.h> + +static int tcp_syncookies = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, + &tcp_syncookies, 0, + "Use TCP SYN cookies if the syncache overflows"); + +static void syncache_drop(struct syncache *, struct syncache_head *); +static void syncache_free(struct syncache *); +static void syncache_insert(struct syncache *, struct syncache_head *); +struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); +static int syncache_respond(struct syncache *, struct mbuf *); +static struct socket *syncache_socket(struct syncache *, struct socket *, + struct mbuf *m); +static void syncache_timer(void *); +static u_int32_t syncookie_generate(struct syncache *); +static struct syncache *syncookie_lookup(struct in_conninfo *, + struct tcphdr *, struct socket *); + +/* + * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. + * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds, + * the odds are that the user has given up attempting to connect by then. 
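As the comment above counts it, the initial SYN,ACK plus SYNCACHE_MAXREXMTS retransmits with a doubling interval add up to 15 seconds before the entry is abandoned:

#include <assert.h>

int
main(void)
{
        int interval = 1, total = 0, slot;

        /* Initial transmit plus three retransmits, doubling each time:
         * 1 + 2 + 4 + 8 seconds. */
        for (slot = 0; slot <= 3; slot++)
                total += interval << slot;
        assert(total == 15);
        return (0);
}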
+ */ +#define SYNCACHE_MAXREXMTS 3 + +/* Arbitrary values */ +#define TCP_SYNCACHE_HASHSIZE 512 +#define TCP_SYNCACHE_BUCKETLIMIT 30 + +struct tcp_syncache { + struct syncache_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + u_int rexmt_limit; + u_int hash_secret; + u_int next_reseed; + TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1]; + struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1]; +}; +static struct tcp_syncache tcp_syncache; + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); + +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD, + &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache"); + +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD, + &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache"); + +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, + &tcp_syncache.cache_count, 0, "Current number of entries in syncache"); + +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD, + &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable"); + +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, + &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions"); + +static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); + +#define SYNCACHE_HASH(inc, mask) \ + ((tcp_syncache.hash_secret ^ \ + (inc)->inc_faddr.s_addr ^ \ + ((inc)->inc_faddr.s_addr >> 16) ^ \ + (inc)->inc_fport ^ (inc)->inc_lport) & mask) + +#define SYNCACHE_HASH6(inc, mask) \ + ((tcp_syncache.hash_secret ^ \ + (inc)->inc6_faddr.s6_addr32[0] ^ \ + (inc)->inc6_faddr.s6_addr32[3] ^ \ + (inc)->inc_fport ^ (inc)->inc_lport) & mask) + +#define ENDPTS_EQ(a, b) ( \ + (a)->ie_fport == (b)->ie_fport && \ + (a)->ie_lport == (b)->ie_lport && \ + (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ + (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ +) + +#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) + +#define SYNCACHE_TIMEOUT(sc, slot) do { \ + sc->sc_rxtslot = slot; \ + sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[slot]; \ + TAILQ_INSERT_TAIL(&tcp_syncache.timerq[slot], sc, sc_timerq); \ + if (!callout_active(&tcp_syncache.tt_timerq[slot])) \ + callout_reset(&tcp_syncache.tt_timerq[slot], \ + TCPTV_RTOBASE * tcp_backoff[slot], \ + syncache_timer, (void *)((intptr_t)slot)); \ +} while (0) + +static void +syncache_free(struct syncache *sc) +{ + struct rtentry *rt; + + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) + rt = sc->sc_route6.ro_rt; + else +#endif + rt = sc->sc_route.ro_rt; + if (rt != NULL) { + /* + * If this is the only reference to a protocol cloned + * route, remove it immediately. 
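SYNCACHE_HASH() above mixes a per-boot secret with the foreign address and both ports, then masks down to a bucket index. Written out as a plain function over integers, with hashmask meaning hashsize - 1:

#include <stdint.h>

static uint32_t
syncache_bucket(uint32_t secret, uint32_t faddr, uint16_t fport,
    uint16_t lport, uint32_t hashmask)
{
        return ((secret ^ faddr ^ (faddr >> 16) ^ fport ^ lport) & hashmask);
}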
+ */ + if (rt->rt_flags & RTF_WASCLONED && + (sc->sc_flags & SCF_KEEPROUTE) == 0 && + rt->rt_refcnt == 1) + rtrequest(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, NULL); + RTFREE(rt); + } + uma_zfree(tcp_syncache.zone, sc); +} + +void +syncache_init(void) +{ + int i; + + tcp_syncache.cache_count = 0; + tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; + tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; + tcp_syncache.cache_limit = + tcp_syncache.hashsize * tcp_syncache.bucket_limit; + tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; + tcp_syncache.next_reseed = 0; + tcp_syncache.hash_secret = arc4random(); + + TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", + &tcp_syncache.hashsize); + TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", + &tcp_syncache.cache_limit); + TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", + &tcp_syncache.bucket_limit); + if (!powerof2(tcp_syncache.hashsize)) { + printf("WARNING: syncache hash size is not a power of 2.\n"); + tcp_syncache.hashsize = 512; /* safe default */ + } + tcp_syncache.hashmask = tcp_syncache.hashsize - 1; + + /* Allocate the hash table. */ + MALLOC(tcp_syncache.hashbase, struct syncache_head *, + tcp_syncache.hashsize * sizeof(struct syncache_head), + M_SYNCACHE, M_WAITOK); + + /* Initialize the hash buckets. */ + for (i = 0; i < tcp_syncache.hashsize; i++) { + TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket); + tcp_syncache.hashbase[i].sch_length = 0; + } + + /* Initialize the timer queues. */ + for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) { + TAILQ_INIT(&tcp_syncache.timerq[i]); + callout_init(&tcp_syncache.tt_timerq[i], 0); + } + + /* + * Allocate the syncache entries. Allow the zone to allocate one + * more entry than cache limit, so a new entry can bump out an + * older one. + */ + tcp_syncache.cache_limit -= 1; + tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit); +} + +static void +syncache_insert(sc, sch) + struct syncache *sc; + struct syncache_head *sch; +{ + struct syncache *sc2; + int s, i; + + /* + * Make sure that we don't overflow the per-bucket + * limit or the total cache size limit. + */ + s = splnet(); + if (sch->sch_length >= tcp_syncache.bucket_limit) { + /* + * The bucket is full, toss the oldest element. + */ + sc2 = TAILQ_FIRST(&sch->sch_bucket); + sc2->sc_tp->ts_recent = ticks; + syncache_drop(sc2, sch); + tcpstat.tcps_sc_bucketoverflow++; + } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) { + /* + * The cache is full. Toss the oldest entry in the + * entire cache. This is the front entry in the + * first non-empty timer queue with the largest + * timeout value. + */ + for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { + sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]); + if (sc2 != NULL) + break; + } + sc2->sc_tp->ts_recent = ticks; + syncache_drop(sc2, NULL); + tcpstat.tcps_sc_cacheoverflow++; + } + + /* Initialize the entry's timer. */ + SYNCACHE_TIMEOUT(sc, 0); + + /* Put it into the bucket. 
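[Editor's illustration] When syncache_insert() above finds the whole cache full, it evicts the front entry of the first non-empty timer queue, scanning from the highest retransmit slot downward, i.e. the entry that would otherwise linger longest. A minimal userland model of that victim selection (the queue lengths are made up):

#include <stdio.h>

#define EX_MAXREXMTS	3

int
main(void)
{
	/* Hypothetical queue lengths, indexed by retransmit slot. */
	int timerq_len[EX_MAXREXMTS + 1] = { 4, 0, 2, 0 };
	int slot;

	for (slot = EX_MAXREXMTS; slot >= 0; slot--)
		if (timerq_len[slot] != 0)
			break;
	printf("victim taken from timer queue %d\n", slot);	/* prints 2 */
	return (0);
}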
*/ + TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash); + sch->sch_length++; + tcp_syncache.cache_count++; + tcpstat.tcps_sc_added++; + splx(s); +} + +static void +syncache_drop(sc, sch) + struct syncache *sc; + struct syncache_head *sch; +{ + int s; + + if (sch == NULL) { +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + sch = &tcp_syncache.hashbase[ + SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)]; + } else +#endif + { + sch = &tcp_syncache.hashbase[ + SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)]; + } + } + + s = splnet(); + + TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); + sch->sch_length--; + tcp_syncache.cache_count--; + + TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq); + if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot])) + callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]); + splx(s); + + syncache_free(sc); +} + +/* + * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. + * If we have retransmitted an entry the maximum number of times, expire it. + */ +static void +syncache_timer(xslot) + void *xslot; +{ + intptr_t slot = (intptr_t)xslot; + struct syncache *sc, *nsc; + struct inpcb *inp; + int s; + + s = splnet(); + if (callout_pending(&tcp_syncache.tt_timerq[slot]) || + !callout_active(&tcp_syncache.tt_timerq[slot])) { + splx(s); + return; + } + callout_deactivate(&tcp_syncache.tt_timerq[slot]); + + nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]); + INP_INFO_RLOCK(&tcbinfo); + while (nsc != NULL) { + if (ticks < nsc->sc_rxttime) + break; + sc = nsc; + nsc = TAILQ_NEXT(sc, sc_timerq); + inp = sc->sc_tp->t_inpcb; + INP_LOCK(inp); + if (slot == SYNCACHE_MAXREXMTS || + slot >= tcp_syncache.rexmt_limit || + inp->inp_gencnt != sc->sc_inp_gencnt) { + syncache_drop(sc, NULL); + tcpstat.tcps_sc_stale++; + INP_UNLOCK(inp); + continue; + } + (void) syncache_respond(sc, NULL); + INP_UNLOCK(inp); + tcpstat.tcps_sc_retransmitted++; + TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq); + SYNCACHE_TIMEOUT(sc, slot + 1); + } + INP_INFO_RUNLOCK(&tcbinfo); + if (nsc != NULL) + callout_reset(&tcp_syncache.tt_timerq[slot], + nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot)); + splx(s); +} + +/* + * Find an entry in the syncache. + */ +struct syncache * +syncache_lookup(inc, schp) + struct in_conninfo *inc; + struct syncache_head **schp; +{ + struct syncache *sc; + struct syncache_head *sch; + int s; + +#ifdef INET6 + if (inc->inc_isipv6) { + sch = &tcp_syncache.hashbase[ + SYNCACHE_HASH6(inc, tcp_syncache.hashmask)]; + *schp = sch; + s = splnet(); + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { + if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { + splx(s); + return (sc); + } + } + splx(s); + } else +#endif + { + sch = &tcp_syncache.hashbase[ + SYNCACHE_HASH(inc, tcp_syncache.hashmask)]; + *schp = sch; + s = splnet(); + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) + continue; +#endif + if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) { + splx(s); + return (sc); + } + } + splx(s); + } + return (NULL); +} + +/* + * This function is called when we get a RST for a + * non-existent connection, so that we can see if the + * connection is in the syn cache. If it is, zap it. + */ +void +syncache_chkrst(inc, th) + struct in_conninfo *inc; + struct tcphdr *th; +{ + struct syncache *sc; + struct syncache_head *sch; + + sc = syncache_lookup(inc, &sch); + if (sc == NULL) + return; + /* + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. 
+ * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + */ + if (SEQ_GEQ(th->th_seq, sc->sc_irs) && + SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { + syncache_drop(sc, sch); + tcpstat.tcps_sc_reset++; + } +} + +void +syncache_badack(inc) + struct in_conninfo *inc; +{ + struct syncache *sc; + struct syncache_head *sch; + + sc = syncache_lookup(inc, &sch); + if (sc != NULL) { + syncache_drop(sc, sch); + tcpstat.tcps_sc_badack++; + } +} + +void +syncache_unreach(inc, th) + struct in_conninfo *inc; + struct tcphdr *th; +{ + struct syncache *sc; + struct syncache_head *sch; + + /* we are called at splnet() here */ + sc = syncache_lookup(inc, &sch); + if (sc == NULL) + return; + + /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ + if (ntohl(th->th_seq) != sc->sc_iss) + return; + + /* + * If we've rertransmitted 3 times and this is our second error, + * we remove the entry. Otherwise, we allow it to continue on. + * This prevents us from incorrectly nuking an entry during a + * spurious network outage. + * + * See tcp_notify(). + */ + if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) { + sc->sc_flags |= SCF_UNREACH; + return; + } + syncache_drop(sc, sch); + tcpstat.tcps_sc_unreach++; +} + +/* + * Build a new TCP socket structure from a syncache entry. + */ +static struct socket * +syncache_socket(sc, lso, m) + struct syncache *sc; + struct socket *lso; + struct mbuf *m; +{ + struct inpcb *inp = NULL; + struct socket *so; + struct tcpcb *tp; + + /* + * Ok, create the full blown connection, and set things up + * as they would have been set up if we had created the + * connection when the SYN arrived. If we can't create + * the connection, abort it. + */ + so = sonewconn(lso, SS_ISCONNECTED); + if (so == NULL) { + /* + * Drop the connection; we will send a RST if the peer + * retransmits the ACK, + */ + tcpstat.tcps_listendrop++; + goto abort; + } + + inp = sotoinpcb(so); + + /* + * Insert new socket into hash list. + */ + inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6; +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + inp->in6p_laddr = sc->sc_inc.inc6_laddr; + } else { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; +#endif + inp->inp_laddr = sc->sc_inc.inc_laddr; +#ifdef INET6 + } +#endif + inp->inp_lport = sc->sc_inc.inc_lport; + if (in_pcbinshash(inp) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) + inp->in6p_laddr = in6addr_any; + else +#endif + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + goto abort; + } +#ifdef IPSEC + /* copy old policy into new socket's */ + if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) + printf("syncache_expand: could not copy policy\n"); +#endif +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + struct inpcb *oinp = sotoinpcb(lso); + struct in6_addr laddr6; + struct sockaddr_in6 *sin6; + /* + * Inherit socket options from the listening socket. 
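[Editor's illustration] The RST validation in syncache_chkrst() above is an in-window test on 32-bit sequence numbers, which wrap; the SEQ_GEQ()/SEQ_LEQ() macros therefore compare signed differences. A standalone sketch of the same test (the EX_ names are stand-ins for the tcp_seq.h macros):

#include <stdint.h>
#include <stdio.h>

#define EX_SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
#define EX_SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)

static int
ex_rst_in_window(uint32_t seq, uint32_t irs, uint32_t wnd)
{
	return (EX_SEQ_GEQ(seq, irs) && EX_SEQ_LEQ(seq, irs + wnd));
}

int
main(void)
{
	uint32_t irs = 0xfffffff0u;	/* deliberately close to wrap */

	printf("%d %d\n",
	    ex_rst_in_window(0x00000004u, irs, 64),	/* 1: wrapped, in window */
	    ex_rst_in_window(0x00001000u, irs, 64));	/* 0: beyond the window */
	return (0);
}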
+ * Note that in6p_inputopts are not (and should not be) + * copied, since it stores previously received options and is + * used to detect if each new option is different than the + * previous one and hence should be passed to a user. + * If we copied in6p_inputopts, a user would not be able to + * receive options just after calling the accept system call. + */ + inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; + if (oinp->in6p_outputopts) + inp->in6p_outputopts = + ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); + inp->in6p_route = sc->sc_route6; + sc->sc_route6.ro_rt = NULL; + + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, + M_SONAME, M_NOWAIT | M_ZERO); + if (sin6 == NULL) + goto abort; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = sc->sc_inc.inc6_faddr; + sin6->sin6_port = sc->sc_inc.inc_fport; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = sc->sc_inc.inc6_laddr; + if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) { + inp->in6p_laddr = laddr6; + FREE(sin6, M_SONAME); + goto abort; + } + FREE(sin6, M_SONAME); + } else +#endif + { + struct in_addr laddr; + struct sockaddr_in *sin; + + inp->inp_options = ip_srcroute(); + if (inp->inp_options == NULL) { + inp->inp_options = sc->sc_ipopts; + sc->sc_ipopts = NULL; + } + inp->inp_route = sc->sc_route; + sc->sc_route.ro_rt = NULL; + + MALLOC(sin, struct sockaddr_in *, sizeof *sin, + M_SONAME, M_NOWAIT | M_ZERO); + if (sin == NULL) + goto abort; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = sc->sc_inc.inc_faddr; + sin->sin_port = sc->sc_inc.inc_fport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = sc->sc_inc.inc_laddr; + if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) { + inp->inp_laddr = laddr; + FREE(sin, M_SONAME); + goto abort; + } + FREE(sin, M_SONAME); + } + + tp = intotcpcb(inp); + tp->t_state = TCPS_SYN_RECEIVED; + tp->iss = sc->sc_iss; + tp->irs = sc->sc_irs; + tcp_rcvseqinit(tp); + tcp_sendseqinit(tp); + tp->snd_wl1 = sc->sc_irs; + tp->rcv_up = sc->sc_irs + 1; + tp->rcv_wnd = sc->sc_wnd; + tp->rcv_adv += tp->rcv_wnd; + + tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); + if (sc->sc_flags & SCF_NOOPT) + tp->t_flags |= TF_NOOPT; + if (sc->sc_flags & SCF_WINSCALE) { + tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; + tp->requested_s_scale = sc->sc_requested_s_scale; + tp->request_r_scale = sc->sc_request_r_scale; + } + if (sc->sc_flags & SCF_TIMESTAMP) { + tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; + tp->ts_recent = sc->sc_tsrecent; + tp->ts_recent_age = ticks; + } + if (sc->sc_flags & SCF_CC) { + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->t_flags |= TF_REQ_CC|TF_RCVD_CC; + tp->cc_send = sc->sc_cc_send; + tp->cc_recv = sc->sc_cc_recv; + } + + tcp_mss(tp, sc->sc_peer_mss); + + /* + * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. + */ + if (sc->sc_rxtslot != 0) + tp->snd_cwnd = tp->t_maxseg; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + + tcpstat.tcps_accepts++; + return (so); + +abort: + if (so != NULL) + (void) soabort(so); + return (NULL); +} + +/* + * This function gets called when we receive an ACK for a + * socket in the LISTEN state. 
We look up the connection + * in the syncache, and if its there, we pull it out of + * the cache and turn it into a full-blown connection in + * the SYN-RECEIVED state. + */ +int +syncache_expand(inc, th, sop, m) + struct in_conninfo *inc; + struct tcphdr *th; + struct socket **sop; + struct mbuf *m; +{ + struct syncache *sc; + struct syncache_head *sch; + struct socket *so; + + sc = syncache_lookup(inc, &sch); + if (sc == NULL) { + /* + * There is no syncache entry, so see if this ACK is + * a returning syncookie. To do this, first: + * A. See if this socket has had a syncache entry dropped in + * the past. We don't want to accept a bogus syncookie + * if we've never received a SYN. + * B. check that the syncookie is valid. If it is, then + * cobble up a fake syncache entry, and return. + */ + if (!tcp_syncookies) + return (0); + sc = syncookie_lookup(inc, th, *sop); + if (sc == NULL) + return (0); + sch = NULL; + tcpstat.tcps_sc_recvcookie++; + } + + /* + * If seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + if (th->th_ack != sc->sc_iss + 1) + return (0); + + so = syncache_socket(sc, *sop, m); + if (so == NULL) { +#if 0 +resetandabort: + /* XXXjlemon check this - is this correct? */ + (void) tcp_respond(NULL, m, m, th, + th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK); +#endif + m_freem(m); /* XXX only needed for above */ + tcpstat.tcps_sc_aborted++; + } else { + sc->sc_flags |= SCF_KEEPROUTE; + tcpstat.tcps_sc_completed++; + } + if (sch == NULL) + syncache_free(sc); + else + syncache_drop(sc, sch); + *sop = so; + return (1); +} + +/* + * Given a LISTEN socket and an inbound SYN request, add + * this to the syn cache, and send back a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * to the source. + * + * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. + * Doing so would require that we hold onto the data and deliver it + * to the application. However, if we are the target of a SYN-flood + * DoS attack, an attacker could send data which would eventually + * consume all available buffer space if it were ACKed. By not ACKing + * the data, we avoid this DoS scenario. + */ +int +syncache_add(inc, to, th, sop, m) + struct in_conninfo *inc; + struct tcpopt *to; + struct tcphdr *th; + struct socket **sop; + struct mbuf *m; +{ + struct tcpcb *tp; + struct socket *so; + struct syncache *sc = NULL; + struct syncache_head *sch; + struct mbuf *ipopts = NULL; + struct rmxp_tao *taop; + int i, s, win; + + so = *sop; + tp = sototcpcb(so); + + /* + * Remember the IP options, if any. + */ +#ifdef INET6 + if (!inc->inc_isipv6) +#endif + ipopts = ip_srcroute(); + + /* + * See if we already have an entry for this connection. + * If we do, resend the SYN,ACK, and reset the retransmit timer. + * + * XXX + * should the syncache be re-initialized with the contents + * of the new SYN here (which may have different options?) + */ + sc = syncache_lookup(inc, &sch); + if (sc != NULL) { + tcpstat.tcps_sc_dupsyn++; + if (ipopts) { + /* + * If we were remembering a previous source route, + * forget it and use the new one we've been given. + */ + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); + sc->sc_ipopts = ipopts; + } + /* + * Update timestamp if present. + */ + if (sc->sc_flags & SCF_TIMESTAMP) + sc->sc_tsrecent = to->to_tsval; + /* + * PCB may have changed, pick up new values. 
+ */ + sc->sc_tp = tp; + sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; + if (syncache_respond(sc, m) == 0) { + s = splnet(); + TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], + sc, sc_timerq); + SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot); + splx(s); + tcpstat.tcps_sndacks++; + tcpstat.tcps_sndtotal++; + } + *sop = NULL; + return (1); + } + + sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); + if (sc == NULL) { + /* + * The zone allocator couldn't provide more entries. + * Treat this as if the cache was full; drop the oldest + * entry and insert the new one. + */ + s = splnet(); + for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) { + sc = TAILQ_FIRST(&tcp_syncache.timerq[i]); + if (sc != NULL) + break; + } + sc->sc_tp->ts_recent = ticks; + syncache_drop(sc, NULL); + splx(s); + tcpstat.tcps_sc_zonefail++; + sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); + if (sc == NULL) { + if (ipopts) + (void) m_free(ipopts); + return (0); + } + } + + /* + * Fill in the syncache values. + */ + bzero(sc, sizeof(*sc)); + sc->sc_tp = tp; + sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt; + sc->sc_ipopts = ipopts; + sc->sc_inc.inc_fport = inc->inc_fport; + sc->sc_inc.inc_lport = inc->inc_lport; +#ifdef INET6 + sc->sc_inc.inc_isipv6 = inc->inc_isipv6; + if (inc->inc_isipv6) { + sc->sc_inc.inc6_faddr = inc->inc6_faddr; + sc->sc_inc.inc6_laddr = inc->inc6_laddr; + sc->sc_route6.ro_rt = NULL; + } else +#endif + { + sc->sc_inc.inc_faddr = inc->inc_faddr; + sc->sc_inc.inc_laddr = inc->inc_laddr; + sc->sc_route.ro_rt = NULL; + } + sc->sc_irs = th->th_seq; + if (tcp_syncookies) + sc->sc_iss = syncookie_generate(sc); + else + sc->sc_iss = arc4random(); + + /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */ + win = sbspace(&so->so_rcv); + win = imax(win, 0); + win = imin(win, TCP_MAXWIN); + sc->sc_wnd = win; + + sc->sc_flags = 0; + sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0; + if (tcp_do_rfc1323) { + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (to->to_flags & TOF_TS) { + sc->sc_tsrecent = to->to_tsval; + sc->sc_flags |= SCF_TIMESTAMP; + } + if (to->to_flags & TOF_SCALE) { + int wscale = 0; + + /* Compute proper scaling value from buffer space */ + while (wscale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat) + wscale++; + sc->sc_request_r_scale = wscale; + sc->sc_requested_s_scale = to->to_requested_s_scale; + sc->sc_flags |= SCF_WINSCALE; + } + } + if (tcp_do_rfc1644) { + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (to->to_flags & (TOF_CC|TOF_CCNEW)) { + sc->sc_cc_recv = to->to_cc; + sc->sc_cc_send = CC_INC(tcp_ccgen); + sc->sc_flags |= SCF_CC; + } + } + if (tp->t_flags & TF_NOOPT) + sc->sc_flags = SCF_NOOPT; + + /* + * XXX + * We have the option here of not doing TAO (even if the segment + * qualifies) and instead fall back to a normal 3WHS via the syncache. + * This allows us to apply synflood protection to TAO-qualifying SYNs + * also. However, there should be a hueristic to determine when to + * do this, and is not present at the moment. + */ + + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. 
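[Editor's illustration] The window-scale selection in syncache_add() above picks the smallest shift that lets a 16-bit window cover the receive buffer. The same loop in a userland sketch, with the constants written out (the EX_ names are assumptions):

#include <stdio.h>

#define EX_TCP_MAXWIN		65535UL
#define EX_TCP_MAX_WINSHIFT	14

static int
ex_wscale(unsigned long sb_hiwat)
{
	int wscale = 0;

	while (wscale < EX_TCP_MAX_WINSHIFT &&
	    (EX_TCP_MAXWIN << wscale) < sb_hiwat)
		wscale++;
	return (wscale);
}

int
main(void)
{
	/* Prints 3: 65535 << 2 is 262140, one byte short of 256 kB (262144). */
	printf("wscale = %d for a 256 kB buffer\n", ex_wscale(256UL * 1024));
	return (0);
}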
+ * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + taop = tcp_gettaocache(&sc->sc_inc); + if ((to->to_flags & TOF_CC) != 0) { + if (((tp->t_flags & TF_NOPUSH) != 0) && + sc->sc_flags & SCF_CC && + taop != NULL && taop->tao_cc != 0 && + CC_GT(to->to_cc, taop->tao_cc)) { + sc->sc_rxtslot = 0; + so = syncache_socket(sc, *sop, m); + if (so != NULL) { + sc->sc_flags |= SCF_KEEPROUTE; + taop->tao_cc = to->to_cc; + *sop = so; + } + syncache_free(sc); + return (so != NULL); + } + } else { + /* + * No CC option, but maybe CC.NEW: invalidate cached value. + */ + if (taop != NULL) + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + if (syncache_respond(sc, m) == 0) { + syncache_insert(sc, sch); + tcpstat.tcps_sndacks++; + tcpstat.tcps_sndtotal++; + } else { + syncache_free(sc); + tcpstat.tcps_sc_dropped++; + } + *sop = NULL; + return (1); +} + +static int +syncache_respond(sc, m) + struct syncache *sc; + struct mbuf *m; +{ + u_int8_t *optp; + int optlen, error; + u_int16_t tlen, hlen, mssopt; + struct ip *ip = NULL; + struct rtentry *rt; + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif + +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + rt = tcp_rtlookup6(&sc->sc_inc); + if (rt != NULL) + mssopt = rt->rt_ifp->if_mtu - + (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); + else + mssopt = tcp_v6mssdflt; + hlen = sizeof(struct ip6_hdr); + } else +#endif + { + rt = tcp_rtlookup(&sc->sc_inc); + if (rt != NULL) + mssopt = rt->rt_ifp->if_mtu - + (sizeof(struct ip) + sizeof(struct tcphdr)); + else + mssopt = tcp_mssdflt; + hlen = sizeof(struct ip); + } + + /* Compute the size of the TCP options. */ + if (sc->sc_flags & SCF_NOOPT) { + optlen = 0; + } else { + optlen = TCPOLEN_MAXSEG + + ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) + + ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) + + ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0); + } + tlen = hlen + sizeof(struct tcphdr) + optlen; + + /* + * XXX + * assume that the entire packet will fit in a header mbuf + */ + KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small")); + + /* + * XXX shouldn't this reuse the mbuf if possible ? + * Create the IP+TCP header from scratch. + */ + if (m) + m_freem(m); + + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (ENOBUFS); + m->m_data += max_linkhdr; + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = NULL; + +#ifdef IPSEC + /* use IPsec policy on listening socket to send SYN,ACK */ + if (ipsec_setsocket(m, sc->sc_tp->t_inpcb->inp_socket) != 0) { + m_freem(m); + return (ENOBUFS); + } +#endif + +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_src = sc->sc_inc.inc6_laddr; + ip6->ip6_dst = sc->sc_inc.inc6_faddr; + ip6->ip6_plen = htons(tlen - hlen); + /* ip6_hlim is set after checksum */ + /* ip6_flow = ??? 
*/ + + th = (struct tcphdr *)(ip6 + 1); + } else +#endif + { + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(struct ip) >> 2; + ip->ip_len = tlen; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = sc->sc_inc.inc_laddr; + ip->ip_dst = sc->sc_inc.inc_faddr; + ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl; /* XXX */ + ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos; /* XXX */ + + /* + * See if we should do MTU discovery. We do it only if the following + * are true: + * 1) we have a valid route to the destination + * 2) the MTU is not locked (if it is, then discovery has been + * disabled) + */ + if (path_mtu_discovery + && (rt != NULL) + && rt->rt_flags & RTF_UP + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + ip->ip_off |= IP_DF; + } + + th = (struct tcphdr *)(ip + 1); + } + th->th_sport = sc->sc_inc.inc_lport; + th->th_dport = sc->sc_inc.inc_fport; + + th->th_seq = htonl(sc->sc_iss); + th->th_ack = htonl(sc->sc_irs + 1); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + th->th_x2 = 0; + th->th_flags = TH_SYN|TH_ACK; + th->th_win = htons(sc->sc_wnd); + th->th_urp = 0; + + /* Tack on the TCP options. */ + if (optlen == 0) + goto no_options; + optp = (u_int8_t *)(th + 1); + *optp++ = TCPOPT_MAXSEG; + *optp++ = TCPOLEN_MAXSEG; + *optp++ = (mssopt >> 8) & 0xff; + *optp++ = mssopt & 0xff; + + if (sc->sc_flags & SCF_WINSCALE) { + *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | + sc->sc_request_r_scale); + optp += 4; + } + + if (sc->sc_flags & SCF_TIMESTAMP) { + u_int32_t *lp = (u_int32_t *)(optp); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(ticks); + *lp = htonl(sc->sc_tsrecent); + optp += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send CC and CC.echo if we received CC from our peer. + */ + if (sc->sc_flags & SCF_CC) { + u_int32_t *lp = (u_int32_t *)(optp); + + *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); + *lp++ = htonl(sc->sc_cc_send); + *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO)); + *lp = htonl(sc->sc_cc_recv); + optp += TCPOLEN_CC_APPA * 2; + } +no_options: + +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + struct route_in6 *ro6 = &sc->sc_route6; + + th->th_sum = 0; + th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); + ip6->ip6_hlim = in6_selecthlim(NULL, + ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL); + error = ip6_output(m, NULL, ro6, 0, NULL, NULL); + } else +#endif + { + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(tlen - hlen + IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL); + } + return (error); +} + +/* + * cookie layers: + * + * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .| + * | peer iss | + * | MD5(laddr,faddr,lport,fport,secret) |. . . . . . .| + * | 0 |(A)| | + * (A): peer mss index + */ + +/* + * The values below are chosen to minimize the size of the tcp_secret + * table, as well as providing roughly a 4 second lifetime for the cookie. 
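[Editor's illustration] syncache_respond() above sizes the option block as a fixed MSS option plus fixed increments per negotiated feature, keeping the total 4-byte aligned. A sketch of that arithmetic; the per-option byte counts below follow the usual NOP-padded encodings and are stated here as assumptions rather than taken from this file:

#include <stdio.h>

#define EX_OLEN_MAXSEG	4	/* kind, len, 16-bit MSS */
#define EX_OLEN_WINDOW	4	/* NOP + kind, len, shift count */
#define EX_OLEN_TSTAMP	12	/* 2 NOPs + kind, len, two 32-bit stamps */
#define EX_OLEN_CC	8	/* 2 NOPs + kind, len, 32-bit count */

int
main(void)
{
	int winscale = 1, tstamp = 1, cc = 0, optlen;

	optlen = EX_OLEN_MAXSEG +
	    (winscale ? EX_OLEN_WINDOW : 0) +
	    (tstamp ? EX_OLEN_TSTAMP : 0) +
	    (cc ? EX_OLEN_CC * 2 : 0);	/* CC and CC.echo */
	printf("options take %d bytes, TCP header is %d bytes\n",
	    optlen, 20 + optlen);	/* 20 + 4 + 4 + 12 = 40 */
	return (0);
}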
+ */ + +#define SYNCOOKIE_HASHSHIFT 2 /* log2(# of 32bit words from hash) */ +#define SYNCOOKIE_WNDBITS 7 /* exposed bits for window indexing */ +#define SYNCOOKIE_TIMESHIFT 5 /* scale ticks to window time units */ + +#define SYNCOOKIE_HASHMASK ((1 << SYNCOOKIE_HASHSHIFT) - 1) +#define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1) +#define SYNCOOKIE_NSECRETS (1 << (SYNCOOKIE_WNDBITS - SYNCOOKIE_HASHSHIFT)) +#define SYNCOOKIE_TIMEOUT \ + (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT)) +#define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK) + +static struct { + u_int32_t ts_secbits; + u_int ts_expire; +} tcp_secret[SYNCOOKIE_NSECRETS]; + +static int tcp_msstab[] = { 0, 536, 1460, 8960 }; + +static MD5_CTX syn_ctx; + +#define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v)) + +/* + * Consider the problem of a recreated (and retransmitted) cookie. If the + * original SYN was accepted, the connection is established. The second + * SYN is inflight, and if it arrives with an ISN that falls within the + * receive window, the connection is killed. + * + * However, since cookies have other problems, this may not be worth + * worrying about. + */ + +static u_int32_t +syncookie_generate(struct syncache *sc) +{ + u_int32_t md5_buffer[4]; + u_int32_t data; + int wnd, idx; + + wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK; + idx = wnd >> SYNCOOKIE_HASHSHIFT; + if (tcp_secret[idx].ts_expire < ticks) { + tcp_secret[idx].ts_secbits = arc4random(); + tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT; + } + for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--) + if (tcp_msstab[data] <= sc->sc_peer_mss) + break; + data = (data << SYNCOOKIE_WNDBITS) | wnd; + data ^= sc->sc_irs; /* peer's iss */ + MD5Init(&syn_ctx); +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) { + MD5Add(sc->sc_inc.inc6_laddr); + MD5Add(sc->sc_inc.inc6_faddr); + } else +#endif + { + MD5Add(sc->sc_inc.inc_laddr); + MD5Add(sc->sc_inc.inc_faddr); + } + MD5Add(sc->sc_inc.inc_lport); + MD5Add(sc->sc_inc.inc_fport); + MD5Add(tcp_secret[idx].ts_secbits); + MD5Final((u_char *)&md5_buffer, &syn_ctx); + data ^= (md5_buffer[wnd & SYNCOOKIE_HASHMASK] & ~SYNCOOKIE_WNDMASK); + return (data); +} + +static struct syncache * +syncookie_lookup(inc, th, so) + struct in_conninfo *inc; + struct tcphdr *th; + struct socket *so; +{ + u_int32_t md5_buffer[4]; + struct syncache *sc; + u_int32_t data; + int wnd, idx; + + data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */ + wnd = data & SYNCOOKIE_WNDMASK; + idx = wnd >> SYNCOOKIE_HASHSHIFT; + if (tcp_secret[idx].ts_expire < ticks || + sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) + return (NULL); + MD5Init(&syn_ctx); +#ifdef INET6 + if (inc->inc_isipv6) { + MD5Add(inc->inc6_laddr); + MD5Add(inc->inc6_faddr); + } else +#endif + { + MD5Add(inc->inc_laddr); + MD5Add(inc->inc_faddr); + } + MD5Add(inc->inc_lport); + MD5Add(inc->inc_fport); + MD5Add(tcp_secret[idx].ts_secbits); + MD5Final((u_char *)&md5_buffer, &syn_ctx); + data ^= md5_buffer[wnd & SYNCOOKIE_HASHMASK]; + if ((data & ~SYNCOOKIE_DATAMASK) != 0) + return (NULL); + data = data >> SYNCOOKIE_WNDBITS; + + sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT); + if (sc == NULL) + return (NULL); + /* + * Fill in the syncache values. 
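[Editor's illustration] A sketch of the syncookie round trip implemented by syncookie_generate()/syncookie_lookup() above: the MSS-table index sits above the low SYNCOOKIE_WNDBITS bits, the peer ISS and a keyed hash are XORed in, and validation reverses the XORs and rejects anything whose upper bits are non-zero. The time-window secret selection is elided, a fixed value stands in for the MD5 word, and all ex_ names are hypothetical:

#include <stdint.h>
#include <stdio.h>

#define EX_WNDBITS	7
#define EX_WNDMASK	((1U << EX_WNDBITS) - 1)
#define EX_DATAMASK	((3U << EX_WNDBITS) | EX_WNDMASK)

static uint32_t
ex_cookie_generate(uint32_t peer_iss, uint32_t wnd, uint32_t mssidx,
    uint32_t hash)
{
	uint32_t data;

	data = (mssidx << EX_WNDBITS) | wnd;	/* pack mss index + window */
	data ^= peer_iss;			/* mix in the peer's ISS */
	data ^= (hash & ~EX_WNDMASK);		/* keyed hash, high bits only */
	return (data);				/* becomes our ISS */
}

static int
ex_cookie_check(uint32_t our_iss, uint32_t peer_iss, uint32_t hash,
    uint32_t *mssidx)
{
	uint32_t data;

	data = our_iss ^ peer_iss;	/* (th_ack - 1) ^ (th_seq - 1) */
	data ^= hash;			/* low bits become don't-care */
	if ((data & ~EX_DATAMASK) != 0)
		return (0);		/* not a cookie we generated */
	*mssidx = data >> EX_WNDBITS;
	return (1);
}

int
main(void)
{
	uint32_t hash = 0x9e3779b9u;	/* stand-in for the MD5 word */
	uint32_t peer_iss = 0x12345678u;
	uint32_t our_iss, mssidx;

	our_iss = ex_cookie_generate(peer_iss, 42, 2, hash);
	if (ex_cookie_check(our_iss, peer_iss, hash, &mssidx))
		printf("cookie accepted, mss table index %u\n",
		    (unsigned)mssidx);	/* recovers index 2 */
	return (0);
}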
+ * XXX duplicate code from syncache_add + */ + sc->sc_ipopts = NULL; + sc->sc_inc.inc_fport = inc->inc_fport; + sc->sc_inc.inc_lport = inc->inc_lport; +#ifdef INET6 + sc->sc_inc.inc_isipv6 = inc->inc_isipv6; + if (inc->inc_isipv6) { + sc->sc_inc.inc6_faddr = inc->inc6_faddr; + sc->sc_inc.inc6_laddr = inc->inc6_laddr; + sc->sc_route6.ro_rt = NULL; + } else +#endif + { + sc->sc_inc.inc_faddr = inc->inc_faddr; + sc->sc_inc.inc_laddr = inc->inc_laddr; + sc->sc_route.ro_rt = NULL; + } + sc->sc_irs = th->th_seq - 1; + sc->sc_iss = th->th_ack - 1; + wnd = sbspace(&so->so_rcv); + wnd = imax(wnd, 0); + wnd = imin(wnd, TCP_MAXWIN); + sc->sc_wnd = wnd; + sc->sc_flags = 0; + sc->sc_rxtslot = 0; + sc->sc_peer_mss = tcp_msstab[data]; + return (sc); +} diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c new file mode 100644 index 0000000..82cf3c5 --- /dev/null +++ b/sys/netinet/tcp_timer.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +static int +sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) +{ + int error, s, tt; + + tt = *(int *)oidp->oid_arg1; + s = tt * 1000 / hz; + + error = sysctl_handle_int(oidp, &s, 0, req); + if (error || !req->newptr) + return (error); + + tt = s * hz / 1000; + if (tt < 1) + return (EINVAL); + + *(int *)oidp->oid_arg1 = tt; + return (0); +} + +int tcp_keepinit; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepidle; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepintvl; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_delacktime; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, + CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + "Time before a delayed ACK is sent"); + +int tcp_msl; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); + +static int always_keepalive = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, + &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + +static int tcp_keepcnt = TCPTV_KEEPCNT; + /* max idle probes */ +int tcp_maxpersistidle; + /* max idle time in persist */ +int tcp_maxidle; + +/* + * Tcp protocol timeout routine called every 500 ms. + * Updates timestamps used for TCP + * causes finite state machine actions if timers expire. + */ +void +tcp_slowtimo() +{ + int s; + + s = splnet(); + + tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + + splx(s); +} + +/* + * Cancel all timers for TCP tp. + */ +void +tcp_canceltimers(tp) + struct tcpcb *tp; +{ + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_rexmt); +} + +int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; + +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ + +/* + * TCP timer processing. 
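[Editor's illustration] sysctl_msec_to_ticks() above exports a tick count as milliseconds and converts user input back, rejecting values that round down to less than one tick. A userland sketch, assuming hz = 100:

#include <stdio.h>

static int ex_hz = 100;		/* hypothetical kernel tick rate */

static int
ex_msec_to_ticks(int msec, int *ticksp)
{
	int tt = msec * ex_hz / 1000;

	if (tt < 1)
		return (-1);	/* the real handler returns EINVAL */
	*ticksp = tt;
	return (0);
}

int
main(void)
{
	int t;

	if (ex_msec_to_ticks(250, &t) == 0)
		printf("250 ms -> %d ticks -> %d ms\n",
		    t, t * 1000 / ex_hz);	/* 25 ticks, 250 ms */
	if (ex_msec_to_ticks(5, &t) != 0)
		printf("5 ms rejected: under one tick at hz=%d\n", ex_hz);
	return (0);
}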
+ */ + +void +tcp_timer_delack(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + struct inpcb *inp; + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + inp = tp->t_inpcb; + INP_LOCK(inp); + INP_INFO_RUNLOCK(&tcbinfo); + if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { + INP_UNLOCK(inp); + splx(s); + return; + } + callout_deactivate(tp->tt_delack); + + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_delack++; + (void) tcp_output(tp); + INP_UNLOCK(inp); + splx(s); +} + +void +tcp_timer_2msl(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + struct inpcb *inp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + INP_INFO_WLOCK(&tcbinfo); + inp = tp->t_inpcb; + INP_LOCK(inp); + if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { + INP_UNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return; + } + callout_deactivate(tp->tt_2msl); + /* + * 2 MSL timeout in shutdown went off. If we're closed but + * still waiting for peer to close and connection has been idle + * too long, or if 2MSL time is up from TIME_WAIT, delete connection + * control block. Otherwise, check again in a bit. + */ + if (tp->t_state != TCPS_TIME_WAIT && + (ticks - tp->t_rcvtime) <= tcp_maxidle) + callout_reset(tp->tt_2msl, tcp_keepintvl, + tcp_timer_2msl, tp); + else + tp = tcp_close(tp); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp) + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); +} + +void +tcp_timer_keep(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + struct tcptemp *t_template; + int s; + struct inpcb *inp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + INP_INFO_WLOCK(&tcbinfo); + inp = tp->t_inpcb; + INP_LOCK(inp); + if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return; + } + callout_deactivate(tp->tt_keep); + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. 
+ */ + tcpstat.tcps_keepprobe++; + t_template = tcp_maketemplate(tp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + (void) m_free(dtom(t_template)); + } + callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + } else + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return; + +dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp) + INP_UNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); +} + +void +tcp_timer_persist(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + struct inpcb *inp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + INP_INFO_WLOCK(&tcbinfo); + inp = tp->t_inpcb; + INP_LOCK(inp); + if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return; + } + callout_deactivate(tp->tt_persist); + /* + * Persistance timer into zero window. + * Force a byte to be output, if possible. + */ + tcpstat.tcps_persisttimeo++; + /* + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || + (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + tcpstat.tcps_persistdrop++; + tp = tcp_drop(tp, ETIMEDOUT); + goto out; + } + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + +out: +#ifdef TCPDEBUG + if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp) + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); +} + +void +tcp_timer_rexmt(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + int rexmt; + int headlocked; + struct inpcb *inp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + INP_INFO_WLOCK(&tcbinfo); + headlocked = 1; + inp = tp->t_inpcb; + INP_LOCK(inp); + if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return; + } + callout_deactivate(tp->tt_rexmt); + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + tcpstat.tcps_timeoutdrop++; + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + goto out; + } + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can + * be recovered if this turns out to be a "bad" retransmit. 
+ * A retransmit is considered "bad" if an ACK for this + * segment is received within RTT/2 interval; the assumption + * here is that the ACK was already in flight. See + * "On Estimating End-to-End Network Path Properties" by + * Allman and Paxson for more details. + */ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + } + tcpstat.tcps_rexmttimeo++; + if (tp->t_state == TCPS_SYN_SENT) + rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + /* + * Disable rfc1323 and rfc1644 if we havn't got any response to + * our third SYN to work-around some broken terminal servers + * (most of which have hopefully been retired) that have bad VJ + * header compression code which trashes TCP segments containing + * unknown-to-them TCP options. + */ + if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) + tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + in6_losing(tp->t_inpcb); + else +#endif + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * Note: We overload snd_recover to function also as the + * snd_last variable described in RFC 2582 + */ + tp->snd_recover = tp->snd_max; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtttime = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) 
+ */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + (void) tcp_output(tp); + +out: +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp) + INP_UNLOCK(inp); + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); +} diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h new file mode 100644 index 0000000..ff86d2a --- /dev/null +++ b/sys/netinet/tcp_timer.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_TIMER_H_ +#define _NETINET_TCP_TIMER_H_ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. 
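[Editor's illustration] A sketch of the retransmit-timeout arithmetic above: the base RTO is scaled by tcp_backoff[] and clamped the way TCPT_RANGESET() does, and the sender then restarts slow start with a one-segment window and a threshold of half the flight size, never below two segments. The units and sample values below (seconds and bytes, ex_/EX_ names) are illustrative:

#include <stdio.h>

#define EX_MAXRXTSHIFT	12
#define EX_REXMTMAX	64	/* seconds, stands in for TCPTV_REXMTMAX/hz */
#define EX_RTTMIN	1

static const int ex_backoff[EX_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

int
main(void)
{
	int base_rto = 1, shift, rexmt;
	unsigned int snd_wnd = 48 * 1460, snd_cwnd = 20 * 1460;
	unsigned int maxseg = 1460, win;

	for (shift = 1; shift <= EX_MAXRXTSHIFT; shift++) {
		rexmt = base_rto * ex_backoff[shift];
		if (rexmt < EX_RTTMIN)		/* TCPT_RANGESET-style clamp */
			rexmt = EX_RTTMIN;
		else if (rexmt > EX_REXMTMAX)
			rexmt = EX_REXMTMAX;
		printf("retransmit %2d waits %2d s\n", shift, rexmt);
	}

	win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / maxseg;
	if (win < 2)
		win = 2;
	printf("cwnd restarts at %u bytes, ssthresh %u bytes (%u segments)\n",
	    maxseg, win * maxseg, win);
	return (0);
}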
When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. + * We do this at most as often as TCPT_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If an + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection. Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending: + * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK> + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ +#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */ +#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_MIN ( 1*hz) /* minimum allowable value */ +#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ + +#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */ + +#ifdef TCPTIMERS +static char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. 
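[Editor's illustration] With the default constants above, an established but silent connection with keepalives enabled is kept for TCPTV_KEEP_IDLE of idle time plus TCPTV_KEEPCNT probes spaced TCPTV_KEEPINTVL apart (the tcp_maxidle product). Expressed in seconds, where hz cancels out:

#include <stdio.h>

int
main(void)
{
	int keep_idle = 120 * 60;	/* TCPTV_KEEP_IDLE / hz */
	int keepintvl = 75;		/* TCPTV_KEEPINTVL / hz */
	int keepcnt = 8;		/* TCPTV_KEEPCNT */
	int maxidle = keepcnt * keepintvl;	/* as in tcp_slowtimo() */

	printf("idle connection dropped after %d s (%d min)\n",
	    keep_idle + maxidle, (keep_idle + maxidle) / 60);	/* 7800 s */
	return (0);
}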
+ */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ + (tv) = (value); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + else if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while(0) + +#ifdef _KERNEL +extern int tcp_keepinit; /* time to establish connection */ +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxpersistidle; +extern int tcp_msl; +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; + +void tcp_timer_2msl(void *xtp); +void tcp_timer_keep(void *xtp); +void tcp_timer_persist(void *xtp); +void tcp_timer_rexmt(void *xtp); +void tcp_timer_delack(void *xtp); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_TCP_TIMER_H_ */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c new file mode 100644 index 0000000..f7800d2 --- /dev/null +++ b/sys/netinet/tcp_timewait.c @@ -0,0 +1,1510 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/random.h> + +#include <vm/uma.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif +#include <netinet6/ip6protosw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> +#include <sys/md5.h> + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static int icmp_may_rst = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static int tcp_isn_reseed_interval = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, + &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); + +static void tcp_cleartaocache(void); +static struct inpcb *tcp_notify(struct inpcb *, int); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. 
Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). + */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize = TCBHASHSIZE; + + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + panic("tcp_init"); +#undef TCP_MINPROTOHDR + + syncache_init(); +} + +/* + * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. + * tcp_template used to store this data in mbufs, but we now recopy it out + * of the tcpcb each time to conserve mbufs. 
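The inp_tp layout and the hash sizing in tcp_init() above both lean on two small bit tricks: rounding a size up to a 32-byte boundary, and testing that a value is a power of two. A standalone userland sketch of the same expressions (the helper names are mine, not kernel interfaces):

#include <stdio.h>
#include <stddef.h>

#define ALIGNMENT 32
#define ALIGNM1   (ALIGNMENT - 1)

/* Same rounding used to pad struct inpcb inside the inp_tp union. */
static size_t
round_up(size_t len)
{
        return ((len + ALIGNM1) & ~(size_t)ALIGNM1);
}

/* Mirrors the intent of the powerof2() check on the TCB hash size. */
static int
is_pow2(unsigned long x)
{
        return (x != 0 && (x & (x - 1)) == 0);
}

int
main(void)
{
        printf("%zu\n", round_up(72));                  /* 72 -> 96 */
        printf("%d %d\n", is_pow2(512), is_pow2(500));  /* 1 0 */
        return (0);
}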
+ */ +void +tcp_fillheaders(tp, ip_ptr, tcp_ptr) + struct tcpcb *tp; + void *ip_ptr; + void *tcp_ptr; +{ + struct inpcb *inp = tp->t_inpcb; + struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)ip_ptr; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + tcp_hdr->th_sum = 0; + } else +#endif + { + struct ip *ip = (struct ip *) ip_ptr; + + ip->ip_vhl = IP_VHL_BORING; + ip->ip_tos = 0; + ip->ip_len = 0; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_ttl = 0; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP)); + } + + tcp_hdr->th_sport = inp->inp_lport; + tcp_hdr->th_dport = inp->inp_fport; + tcp_hdr->th_seq = 0; + tcp_hdr->th_ack = 0; + tcp_hdr->th_x2 = 0; + tcp_hdr->th_off = 5; + tcp_hdr->th_flags = 0; + tcp_hdr->th_win = 0; + tcp_hdr->th_urp = 0; +} + +/* + * Create template to be used to send tcp packets on a connection. + * Allocates an mbuf and fills in a skeletal tcp/ip header. The only + * use for this function is in keepalives, which use tcp_respond. + */ +struct tcptemp * +tcp_maketemplate(tp) + struct tcpcb *tp; +{ + struct mbuf *m; + struct tcptemp *n; + + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof(struct tcptemp); + n = mtod(m, struct tcptemp *); + + tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection. If flags are given then we send + * a message back to the TCP which originated the * segment ti, + * and discard the mbuf containing it and any other attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
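tcp_fillheaders() above seeds th_sum with in_pseudo() so that only the pseudo-header portion of the TCP checksum is precomputed; the rest is finished later on the output path. For reference, a plain userland version of the 16-bit ones'-complement sum the protocol uses (RFC 1071 style; this is the full fold-and-complement form, not the partial sum in_pseudo() returns):

#include <stdint.h>
#include <stddef.h>

/* Ones'-complement checksum over a byte buffer; "start" lets a caller
 * chain in a previously summed pseudo-header. Sketch only. */
static uint16_t
cksum16(const void *data, size_t len, uint32_t start)
{
        const uint8_t *p = data;
        uint32_t sum = start;

        while (len > 1) {
                sum += ((uint32_t)p[0] << 8) | p[1];
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint32_t)p[0] << 8;     /* pad the trailing odd byte */
        while (sum >> 16)                       /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return ((uint16_t)~sum);
}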
+ */ +void +tcp_respond(tp, ipgen, th, m, ack, seq, flags) + struct tcpcb *tp; + void *ipgen; + register struct tcphdr *th; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + struct ip *ip; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + + if (tp) { + if (!(flags & TH_RST)) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + } +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ + ro = &tp->t_inpcb->inp_route; + } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { + ro = &sro; + bzero(ro, sizeof *ro); + } + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + tlen = 0; + m->m_data += max_linkhdr; +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); +#undef xchg + } +#ifdef INET6 + if (isipv6) { + ip6->ip6_flow = 0; + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } else +#endif + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; + if (tp) + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); + else + nth->th_win = htons((u_short)win); + nth->th_urp = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = 0; + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? 
+ ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + } +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); +#endif +#ifdef IPSEC + if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) { + m_freem(m); + return; + } +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) { + RTFREE(ro6->ro_rt); + ro6->ro_rt = NULL; + } + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); + callout_init(tp->tt_persist = &it->inp_tp_persist, 0); + callout_init(tp->tt_keep = &it->inp_tp_keep, 0); + callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); + callout_init(tp->tt_delack = &it->inp_tp_delack, 0); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + /* + * IPv4 TTL initialization is necessary for an IPv6 socket as well, + * because the socket may be bound to an IPv6 wildcard address, + * which may match an IPv4-mapped IPv6 address. + */ + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. 
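The srtt/rttvar seeding in tcp_newtcpcb() above exists so that srtt + 4 * rttvar is sane before any samples arrive. The kernel keeps these values in scaled fixed point; the textbook estimator they approximate, written in floating point purely for illustration:

#include <math.h>

struct rtt_est {
        double srtt;            /* smoothed round-trip time, seconds */
        double rttvar;          /* smoothed mean deviation, seconds */
};

/* Feed one RTT measurement, return the resulting retransmit timeout. */
static double
rtt_update(struct rtt_est *e, double sample)
{
        if (e->srtt == 0.0) {                   /* first sample */
                e->srtt = sample;
                e->rttvar = sample / 2.0;
        } else {
                double delta = sample - e->srtt;

                e->srtt += delta / 8.0;                         /* gain 1/8 */
                e->rttvar += (fabs(delta) - e->rttvar) / 4.0;   /* gain 1/4 */
        }
        return (e->srtt + 4.0 * e->rttvar);
}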
+ */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tseg_qent *q; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16) { + register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. 
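When tcp_close() above writes the measured rtt, rttvar and ssthresh back into the route, each cached metric is blended half-and-half with the new sample rather than overwritten. The update rule in isolation (hypothetical helper, shown only to make the filter explicit):

/* Half old, half new, unless nothing useful is cached yet. */
static unsigned long
blend_metric(unsigned long cached, unsigned long sample)
{
        if (cached != 0 && sample != 0)
                return ((cached + sample) / 2);
        return (sample);
}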
+ */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + no_valid_rt: + /* free the reassembly queue, if any */ + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + } + inp->inp_ppcb = NULL; + soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + if (do_tcpdrain) + { + struct inpcb *inpb; + struct tcpcb *tcpb; + struct tseg_qent *te; + + /* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * XXX: The "Net/3" implementation doesn't imply that the TCP + * reassembly queue should be flushed, but in a situation + * where we're really low on mbufs, this is potentially + * usefull. + */ + INP_INFO_RLOCK(&tcbinfo); + LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { + INP_LOCK(inpb); + if ((tcpb = intotcpcb(inpb))) { + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); + } + } + INP_UNLOCK(inpb); + } + INP_INFO_RUNLOCK(&tcbinfo); + } +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static struct inpcb * +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return inp; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) { + tcp_drop(tp, error); + return (struct inpcb *)0; + } else { + tp->t_softerror = error; + return inp; + } +#if 0 + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. 
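That is why the handler below answers a sizing probe first: a consumer such as netstat(1) calls the sysctl twice, once with a NULL buffer to learn how much space to allocate and once to copy the records out. A minimal userland sketch of that pattern (error handling trimmed):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        size_t len = 0;
        char *buf;

        /* First pass: oldp == NULL, the handler only reports an estimate. */
        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1)
                return (1);
        if ((buf = malloc(len)) == NULL)
                return (1);
        /* Second pass: copy out the xinpgen header and xtcpcb records. */
        if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1)
                return (1);
        printf("pcblist: %zu bytes\n", len);
        free(buf);
        return (0);
}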
+ */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) + inp_list[i++] = inp; + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + INP_UNLOCK(inp); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
+ */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser_cred(req->td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL) { + error = ENOENT; + goto outunlocked; + } else { + INP_LOCK(inp); + if (inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + } + + error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); + if (error) + goto out; + cru2x(inp->inp_socket->so_cred, &xuc); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + INP_UNLOCK(inp); +outunlocked: + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); + +#ifdef INET6 +static int +tcp6_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser_cred(req->td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL) { + error = ENOENT; + goto outunlocked; + } else { + INP_LOCK(inp); + if (inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + } + error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); + if (error) + goto out; + cru2x(inp->inp_socket->so_cred, &xuc); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + INP_UNLOCK(inp); +outunlocked: + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); +#endif + + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + tcp_seq icmp_seq; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == 
PRC_UNREACH_PORT) && ip) + notify = tcp_drop_syn_sent; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + INP_INFO_WLOCK(&tcbinfo); + inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL) { + INP_LOCK(inp); + if (inp->inp_socket != NULL) { + icmp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_seq, tp->snd_una) && + SEQ_LT(icmp_seq, tp->snd_max)) + inp = (*notify)(inp, inetctlerrmap[cmd]); + } + if (inp) + INP_UNLOCK(inp); + } else { + struct in_conninfo inc; + + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; +#ifdef INET6 + inc.inc_isipv6 = 0; +#endif + syncache_unreach(&inc, th); + } + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + } else + in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + struct tcphdr th; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct ip6_hdr *ip6; + struct mbuf *m; + struct ip6ctlparam *ip6cp = NULL; + const struct sockaddr_in6 *sa6_src = NULL; + int off; + struct tcp_portonly { + u_int16_t th_sport; + u_int16_t th_dport; + } *thp; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + sa6_src = ip6cp->ip6c_src; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + sa6_src = &sa6_any; + } + + if (ip6) { + struct in_conninfo inc; + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(*thp)) + return; + + bzero(&th, sizeof(th)); + m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + + in6_pcbnotify(&tcb, sa, th.th_dport, + (struct sockaddr *)ip6cp->ip6c_src, + th.th_sport, cmd, notify); + + inc.inc_fport = th.th_dport; + inc.inc_lport = th.th_sport; + inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; + inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_isipv6 = 1; + syncache_unreach(&inc, &th); + } else + in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src, + 0, cmd, notify); +} +#endif /* INET6 */ + + +/* + * Following is where TCP initial sequence number generation occurs. + * + * There are two places where we must use initial sequence numbers: + * 1. In SYN-ACK packets. + * 2. In SYN packets. + * + * All ISNs for SYN-ACK packets are generated by the syncache. See + * tcp_syncache.c for details. + * + * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling + * depends on this property. In addition, these ISNs should be + * unguessable so as to prevent connection hijacking. To satisfy + * the requirements of this situation, the algorithm outlined in + * RFC 1948 is used to generate sequence numbers. 
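Before the implementation notes that follow, the shape of the RFC 1948 computation in one screen: hash the connection 4-tuple together with a secret, then add a clock that advances by ISN_BYTES_PER_SECOND. A userland sketch using libmd's MD5 (the struct layout and helper are illustrative, not the kernel's):

#include <md5.h>                /* libmd: link with -lmd */
#include <stdint.h>
#include <string.h>

#define ISN_BYTES_PER_SECOND 1048576

struct tuple4 {
        uint32_t laddr, faddr;
        uint16_t lport, fport;
};

static uint32_t
rfc1948_isn(const struct tuple4 *t, const unsigned char secret[32],
    uint32_t uptime_seconds)
{
        MD5_CTX ctx;
        unsigned char digest[16];
        uint32_t hash;

        MD5Init(&ctx);
        MD5Update(&ctx, (const void *)t, sizeof(*t));
        MD5Update(&ctx, secret, 32);
        MD5Final(digest, &ctx);
        memcpy(&hash, digest, sizeof(hash));

        /* Monotonic component: one megabyte of sequence space per second. */
        return (hash + uptime_seconds * ISN_BYTES_PER_SECOND);
}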
+ * + * Implementation details: + * + * Time is based off the system timer, and is corrected so that it + * increases by one megabyte per second. This allows for proper + * recycling on high speed LANs while still leaving over an hour + * before rollover. + * + * net.inet.tcp.isn_reseed_interval controls the number of seconds + * between seeding of isn_secret. This is normally set to zero, + * as reseeding should not be necessary. + * + */ + +#define ISN_BYTES_PER_SECOND 1048576 + +u_char isn_secret[32]; +int isn_last_reseed; +MD5_CTX isn_ctx; + +tcp_seq +tcp_new_isn(tp) + struct tcpcb *tp; +{ + u_int32_t md5_buffer[4]; + tcp_seq new_isn; + + /* Seed if this is the first use, reseed if requested. */ + if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) && + (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz) + < (u_int)ticks))) { + read_random(&isn_secret, sizeof(isn_secret)); + isn_last_reseed = ticks; + } + + /* Compute the md5 hash and return the ISN. */ + MD5Init(&isn_ctx); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, + sizeof(struct in6_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, + sizeof(struct in6_addr)); + } else +#endif + { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, + sizeof(struct in_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, + sizeof(struct in_addr)); + } + MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret)); + MD5Final((u_char *) &md5_buffer, &isn_ctx); + new_isn = (tcp_seq) md5_buffer[0]; + new_isn += ticks * (ISN_BYTES_PER_SECOND / hz); + return new_isn; +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +struct inpcb * +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; + return (inp); +} + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +struct inpcb * +tcp_drop_syn_sent(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp && tp->t_state == TCPS_SYN_SENT) { + tcp_drop(tp, errno); + return (struct inpcb *)0; + } + return inp; +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +struct inpcb * +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(&inp->inp_inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(&inp->inp_inc); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return inp; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return inp; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } + return inp; +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inc) + struct in_conninfo *inc; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inc->inc_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inc->inc_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inc->inc_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inc) + struct in_conninfo *inc; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inc->inc6_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); + ro6->ro_dst.sin6_addr = inc->inc6_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. 
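A side note on the MSS clamping in tcp_mtudisc() above: the two MCLBYTES-rounding branches are the same operation whenever MCLBYTES is a power of two, which is the usual case. A quick standalone check (the value 2048 is assumed here for illustration):

#include <assert.h>

#define MCLBYTES 2048                   /* assumed typical cluster size */

static int round_mask(int mss) { return (mss & ~(MCLBYTES - 1)); }
static int round_div(int mss)  { return (mss / MCLBYTES * MCLBYTES); }

int
main(void)
{
        int mss;

        for (mss = 1; mss < 8 * MCLBYTES; mss++)
                assert(round_mask(mss) == round_div(mss));
        return (0);
}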
*/ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + tcp_fillheaders(tp, ip6, th); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + tcp_fillheaders(tp, ip, th); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + return hdrsiz; +} +#endif /*IPSEC*/ + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inc) + struct in_conninfo *inc; +{ + struct rtentry *rt; + +#ifdef INET6 + if (inc->inc_isipv6) + rt = tcp_rtlookup6(inc); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inc); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c new file mode 100644 index 0000000..9c4b547 --- /dev/null +++ b/sys/netinet/tcp_usrreq.c @@ -0,0 +1,1252 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif /* INET6 */ +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/proc.h> +#include <sys/jail.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; /* XXX ??? */ + +static int tcp_attach(struct socket *, struct thread *td); +static int tcp_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#ifdef INET6 +static int tcp6_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#endif /* INET6 */ +static struct tcpcb * + tcp_disconnect(struct tcpcb *); +static struct tcpcb * + tcp_usrclosed(struct tcpcb *); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate = 0 +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, int proto, struct thread *td) +{ + int s = splnet(); + int error; + struct inpcb *inp; + struct tcpcb *tp = 0; + TCPDEBUG0; + + INP_INFO_WLOCK(&tcbinfo); + TCPDEBUG1(); + inp = sotoinpcb(so); + if (inp) { + error = EISCONN; + goto out; + } + + error = tcp_attach(so, td); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + + inp = sotoinpcb(so); + tp = intotcpcb(inp); +out: + TCPDEBUG2(PRU_ATTACH); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return error; +} + +/* + * pru_detach() detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a pru_disconnect(), + * which may finish later; embryonic TCB's can just + * be discarded here. 
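tcp_usr_attach() above quietly promotes a zero linger interval to TCP_LINGERTIME when SO_LINGER is already set on the socket. From userland the option is driven through setsockopt(2) with a struct linger; a minimal sketch (no error handling):

#include <sys/types.h>
#include <sys/socket.h>

static int
set_linger(int fd, int enable, int seconds)
{
        struct linger l;

        l.l_onoff = enable;
        l.l_linger = seconds;
        return (setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l)));
}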
+ */ +static int +tcp_usr_detach(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + TCPDEBUG0; + + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return EINVAL; /* XXX */ + } + INP_LOCK(inp); + tp = intotcpcb(inp); + TCPDEBUG1(); + tp = tcp_disconnect(tp); + + TCPDEBUG2(PRU_DETACH); + if (tp) + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + splx(s); + return error; +} + +#define INI_NOLOCK 0 +#define INI_READ 1 +#define INI_WRITE 2 + +#define COMMON_START() \ + TCPDEBUG0; \ + do { \ + if (inirw == INI_READ) \ + INP_INFO_RLOCK(&tcbinfo); \ + else if (inirw == INI_WRITE) \ + INP_INFO_WLOCK(&tcbinfo); \ + inp = sotoinpcb(so); \ + if (inp == 0) { \ + if (inirw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + else if (inirw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + splx(s); \ + return EINVAL; \ + } \ + INP_LOCK(inp); \ + if (inirw == INI_READ) \ + INP_INFO_RUNLOCK(&tcbinfo); \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ +} while(0) + +#define COMMON_END(req) \ +out: TCPDEBUG2(req); \ + do { \ + if (tp) \ + INP_UNLOCK(inp); \ + if (inirw == INI_WRITE) \ + INP_INFO_WUNLOCK(&tcbinfo); \ + splx(s); \ + return error; \ + goto out; \ +} while(0) + +/* + * Give the socket an address. + */ +static int +tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + struct sockaddr_in *sinp; + const int inirw = INI_READ; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + error = in_pcbbind(inp, nam, td); + if (error) + goto out; + COMMON_END(PRU_BIND); +} + +#ifdef INET6 +static int +tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + const int inirw = INI_READ; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if (ip6_mapped_addr_on && (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + inp->inp_vflag |= INP_IPV4; + else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + error = in_pcbbind(inp, (struct sockaddr *)&sin, td); + goto out; + } + } + error = in6_pcbbind(inp, nam, td); + if (error) + goto out; + COMMON_END(PRU_BIND); +} +#endif /* INET6 */ + +/* + * Prepare to accept connections. 
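Seen from userland, the multicast check in tcp_usr_bind()/tcp6_usr_bind() above means that binding a TCP socket to a group address simply fails. A small demonstration (IPv4 case, error handling trimmed):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct sockaddr_in sin;
        int s = socket(AF_INET, SOCK_STREAM, 0);

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_len = sizeof(sin);
        sin.sin_port = htons(8080);
        sin.sin_addr.s_addr = inet_addr("224.0.0.1");   /* multicast group */

        if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
                printf("bind: %s\n", strerror(errno));  /* expect EAFNOSUPPORT */
        close(s);
        return (0);
}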
+ */ +static int +tcp_usr_listen(struct socket *so, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_READ; + + COMMON_START(); + if (inp->inp_lport == 0) + error = in_pcbbind(inp, (struct sockaddr *)0, td); + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} + +#ifdef INET6 +static int +tcp6_usr_listen(struct socket *so, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_READ; + + COMMON_START(); + if (inp->inp_lport == 0) { + inp->inp_vflag &= ~INP_IPV4; + if (ip6_mapped_addr_on && + (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) + inp->inp_vflag |= INP_IPV4; + error = in6_pcbbind(inp, (struct sockaddr *)0, td); + } + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} +#endif /* INET6 */ + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + struct sockaddr_in *sinp; + const int inirw = INI_WRITE; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + + if (td && jailed(td->td_ucred)) + prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); + + if ((error = tcp_connect(tp, nam, td)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} + +#ifdef INET6 +static int +tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + const int inirw = INI_WRITE; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + + if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + if (!ip6_mapped_addr_on || + (inp->inp_flags & IN6P_IPV6_V6ONLY)) + return(EINVAL); + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) + goto out; + error = tcp_output(tp); + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + inp->inp_inc.inc_isipv6 = 1; + if ((error = tcp6_connect(tp, nam, td)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} +#endif /* INET6 */ + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 
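The v4-mapped branch of tcp6_usr_connect() above falls back to the IPv4 connect path after rewriting the destination address, which is what in6_sin6_2_sin() does for it. The conversion itself is simple enough to show standalone (illustrative helper, not the kernel routine):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static void
sin6_to_sin(struct sockaddr_in *sin, const struct sockaddr_in6 *sin6)
{
        memset(sin, 0, sizeof(*sin));
        sin->sin_family = AF_INET;
        sin->sin_len = sizeof(*sin);
        sin->sin_port = sin6->sin6_port;
        /* ::ffff:a.b.c.d carries the IPv4 address in the last four bytes. */
        memcpy(&sin->sin_addr, &sin6->sin6_addr.s6_addr[12],
            sizeof(sin->sin_addr));
}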
+ */ +static int +tcp_usr_disconnect(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_WRITE; + + COMMON_START(); + tp = tcp_disconnect(tp); + COMMON_END(PRU_DISCONNECT); +} + +/* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ +static int +tcp_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s; + int error = 0; + struct inpcb *inp = NULL; + struct tcpcb *tp = NULL; + struct sockaddr_in *sin; + const int inirw = INI_READ; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) { + error = ECONNABORTED; + goto out; + } + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + inp = sotoinpcb(so); + if (!inp) { + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + free(sin, M_SONAME); + return (EINVAL); + } + INP_LOCK(inp); + INP_INFO_RUNLOCK(&tcbinfo); + tp = intotcpcb(inp); + TCPDEBUG1(); + + /* + * We inline in_setpeeraddr here, because we have already done + * the locking and the malloc. + */ + sin->sin_port = inp->inp_fport; + sin->sin_addr = inp->inp_faddr; + *nam = (struct sockaddr *)sin; + + COMMON_END(PRU_ACCEPT); +} + +#ifdef INET6 +static int +tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s; + struct inpcb *inp = NULL; + int error = 0; + struct tcpcb *tp = NULL; + const int inirw = INI_READ; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) { + error = ECONNABORTED; + goto out; + } + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (EINVAL); + } + INP_LOCK(inp); + INP_INFO_RUNLOCK(&tcbinfo); + tp = intotcpcb(inp); + TCPDEBUG1(); + in6_mapped_peeraddr(so, nam); + COMMON_END(PRU_ACCEPT); +} +#endif /* INET6 */ + +/* + * This is the wrapper function for in_setsockaddr. We just pass down + * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking + * here because in_setsockaddr will call malloc and can block. + */ +static int +tcp_sockaddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setsockaddr(so, nam, &tcbinfo)); +} + +/* + * This is the wrapper function for in_setpeeraddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +tcp_peeraddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setpeeraddr(so, nam, &tcbinfo)); +} + +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_WRITE; + + COMMON_START(); + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + COMMON_END(PRU_SHUTDOWN); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_READ; + + COMMON_START(); + tcp_output(tp); + COMMON_END(PRU_RCVD); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. 
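tcp_usr_shutdown() above is the protocol half of shutdown(2) with SHUT_WR: the send side is marked closed and a FIN goes out, while the receive side keeps working until the peer finishes. A minimal userland sketch of that half-close pattern:

#include <sys/socket.h>
#include <unistd.h>

static void
half_close_and_drain(int fd)
{
        char buf[512];

        shutdown(fd, SHUT_WR);          /* no more writes; FIN is sent */
        while (read(fd, buf, sizeof(buf)) > 0)
                ;                       /* keep reading the peer's tail data */
        close(fd);
}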
The other pru_* routines + * generally are caller-frees. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct thread *td) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_WRITE; +#ifdef INET6 + int isipv6; +#endif + TCPDEBUG0; + + /* + * Need write lock here because this function might call + * tcp_connect or tcp_usrclosed. + * We really want to have to this function upgrade from read lock + * to write lock. XXX + */ + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + if (inp == NULL) { + /* + * OOPS! we lost a race, the TCP session got reset after + * we checked SS_CANTSENDMORE, eg: while doing uiomove or a + * network interrupt in the non-splnet() section of sosend(). + */ + if (m) + m_freem(m); + if (control) + m_freem(control); + error = ECONNRESET; /* XXX EPIPE? */ + tp = NULL; + TCPDEBUG1(); + goto out; + } + INP_LOCK(inp); +#ifdef INET6 + isipv6 = nam && nam->sa_family == AF_INET6; +#endif /* INET6 */ + tp = intotcpcb(inp); + TCPDEBUG1(); + if (control) { + /* TCP doesn't do control messages (rights, creds, etc) */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if (!(flags & PRUS_OOB)) { + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + socantsendmore(so); + tp = tcp_usrclosed(tp); + } + if (tp != NULL) { + if (flags & PRUS_MORETOCOME) + tp->t_flags |= TF_MORETOCOME; + error = tcp_output(tp); + if (flags & PRUS_MORETOCOME) + tp->t_flags &= ~TF_MORETOCOME; + } + } else { + if (sbspace(&so->so_snd) < -512) { + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_force = 1; + error = tcp_output(tp); + tp->t_force = 0; + } + COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); +} + +/* + * Abort the TCP. + */ +static int +tcp_usr_abort(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_WRITE; + + COMMON_START(); + tp = tcp_drop(tp, ECONNABORTED); + COMMON_END(PRU_ABORT); +} + +/* + * Receive out-of-band data. 
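The urgent-data branches above have direct userland counterparts: MSG_OOB on send(2) lands in the PRUS_OOB path of tcp_usr_send(), and recv(2) with MSG_OOB, on a socket without SO_OOBINLINE, ends up in tcp_usr_rcvoob() below. Sketch only:

#include <sys/socket.h>

static void
oob_example(int sender_fd, int receiver_fd)
{
        char c;

        send(sender_fd, "!", 1, MSG_OOB);       /* mark one byte urgent */
        recv(receiver_fd, &c, 1, MSG_OOB);      /* pull it out of band */
}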
+ */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp; + struct tcpcb *tp; + const int inirw = INI_READ; + + COMMON_START(); + if ((so->so_oobmark == 0 && + (so->so_state & SS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + COMMON_END(PRU_RCVOOB); +} + +/* xxx - should be const */ +struct pr_usrreqs tcp_usrreqs = { + tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind, + tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach, + tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + tcp_sockaddr, sosend, soreceive, sopoll +}; + +#ifdef INET6 +struct pr_usrreqs tcp6_usrreqs = { + tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind, + tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach, + tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + in6_mapped_sockaddr, sosend, soreceive, sopoll +}; +#endif /* INET6 */ + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbladdr to do the routing and to choose + * a local host address (interface). If there is an existing incarnation + * of the same connection in TIME-WAIT state and if the remote host was + * sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(tp, nam, td) + register struct tcpcb *tp; + struct sockaddr *nam; + struct thread *td; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *ifaddr; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, td); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in_pcbladdr(inp, nam, &ifaddr); + if (error) + return error; + oinp = in_pcblookup_hash(inp->inp_pcbinfo, + sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr + : ifaddr->sin_addr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ifaddr->sin_addr; + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + + /* Compute window scaling to request. 
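The loop that follows picks the smallest window-scale shift that lets the 16-bit advertised window cover the receive buffer, capped at TCP_MAX_WINSHIFT (14). The same computation in isolation, with the constants spelled out:

static int
pick_window_shift(unsigned long rcv_hiwat)
{
        int scale = 0;

        while (scale < 14 && (65535UL << scale) < rcv_hiwat)
                scale++;
        return (scale);
}

For example, a 256 kB receive buffer (262144 bytes) yields a shift of 3, since 65535 shifted left by 2 still falls just short.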
*/ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_new_isn(tp); + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} + +#ifdef INET6 +static int +tcp6_connect(tp, nam, td) + register struct tcpcb *tp; + struct sockaddr *nam; + struct thread *td; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr *addr6; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in6_pcbbind(inp, (struct sockaddr *)0, td); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in6_pcbladdr(inp, nam, &addr6); + if (error) + return error; + oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + &sin6->sin6_addr, sin6->sin6_port, + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) + ? addr6 + : &inp->in6p_laddr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = *addr6; + inp->in6p_faddr = sin6->sin6_addr; + inp->inp_fport = sin6->sin6_port; + if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != NULL) + inp->in6p_flowinfo = sin6->sin6_flowinfo; + in_pcbrehash(inp); + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_new_isn(tp); + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} +#endif /* INET6 */ + +/* + * The new sockopt interface makes it possible for us to block in the + * copyin/out step (if we take a page fault). Taking a page fault at + * splnet() is probably a Bad Thing. (Since sockets and pcbs both now + * use TSM, there probably isn't any need for this function to run at + * splnet() any more. This needs more examination.) 
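tcp_ctloutput() below is what a setsockopt(2) or getsockopt(2) call at level IPPROTO_TCP eventually reaches; disabling the Nagle algorithm is the most common use. A minimal userland sketch:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
disable_nagle(int fd)
{
        int one = 1;

        /* Hits the TCP_NODELAY case in tcp_ctloutput(). */
        return (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)));
}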
+ */ +int +tcp_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, opt, optval, s; + struct inpcb *inp; + struct tcpcb *tp; + + error = 0; + s = splnet(); /* XXX */ + INP_INFO_RLOCK(&tcbinfo); + inp = sotoinpcb(so); + if (inp == NULL) { + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + return (ECONNRESET); + } + INP_LOCK(inp); + INP_INFO_RUNLOCK(&tcbinfo); + if (sopt->sopt_level != IPPROTO_TCP) { +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + error = ip6_ctloutput(so, sopt); + else +#endif /* INET6 */ + error = ip_ctloutput(so, sopt); + INP_UNLOCK(inp); + splx(s); + return (error); + } + tp = intotcpcb(inp); + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + case TCP_NOOPT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case TCP_NODELAY: + opt = TF_NODELAY; + break; + case TCP_NOOPT: + opt = TF_NOOPT; + break; + default: + opt = 0; /* dead code to fool gcc */ + break; + } + + if (optval) + tp->t_flags |= opt; + else + tp->t_flags &= ~opt; + break; + + case TCP_NOPUSH: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + if (optval) + tp->t_flags |= TF_NOPUSH; + else { + tp->t_flags &= ~TF_NOPUSH; + error = tcp_output(tp); + } + break; + + case TCP_MAXSEG: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + if (optval > 0 && optval <= tp->t_maxseg) + tp->t_maxseg = optval; + else + error = EINVAL; + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + optval = tp->t_flags & TF_NODELAY; + break; + case TCP_MAXSEG: + optval = tp->t_maxseg; + break; + case TCP_NOOPT: + optval = tp->t_flags & TF_NOOPT; + break; + case TCP_NOPUSH: + optval = tp->t_flags & TF_NOPUSH; + break; + default: + error = ENOPROTOOPT; + break; + } + if (error == 0) + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + } + INP_UNLOCK(inp); + splx(s); + return (error); +} + +/* + * tcp_sendspace and tcp_recvspace are the default send and receive window + * sizes, respectively. These are obsolescent (this information should + * be set by the route). + */ +u_long tcp_sendspace = 1024*32; +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, + &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); +u_long tcp_recvspace = 1024*64; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * bufer space, and entering LISTEN state if to accept connections. 
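+ * (Added note: when the socket buffers have not yet been reserved,
+ * soreserve() below sizes them from the tcp_sendspace and tcp_recvspace
+ * defaults defined above; buffers that were already reserved are left
+ * untouched.)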
+ */ +static int +tcp_attach(so, td) + struct socket *so; + struct thread *td; +{ + register struct tcpcb *tp; + struct inpcb *inp; + int error; +#ifdef INET6 + int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL; +#endif + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + error = in_pcballoc(so, &tcbinfo, td); + if (error) + return (error); + inp = sotoinpcb(so); +#ifdef INET6 + if (isipv6) { + inp->inp_vflag |= INP_IPV6; + inp->in6p_hops = -1; /* use kernel default */ + } + else +#endif + inp->inp_vflag |= INP_IPV4; + tp = tcp_newtcpcb(inp); + if (tp == 0) { + int nofd = so->so_state & SS_NOFDREF; /* XXX */ + + so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ +#ifdef INET6 + if (isipv6) + in6_pcbdetach(inp); + else +#endif + in_pcbdetach(inp); + so->so_state |= nofd; + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +static struct tcpcb * +tcp_disconnect(tp) + register struct tcpcb *tp; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (tp->t_state < TCPS_ESTABLISHED) + tp = tcp_close(tp); + else if ((so->so_options & SO_LINGER) && so->so_linger == 0) + tp = tcp_drop(tp, 0); + else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tp = tcp_usrclosed(tp); + if (tp) + (void) tcp_output(tp); + } + return (tp); +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static struct tcpcb * +tcp_usrclosed(tp) + register struct tcpcb *tp; +{ + + switch (tp->t_state) { + + case TCPS_CLOSED: + case TCPS_LISTEN: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + break; + + case TCPS_SYN_SENT: + case TCPS_SYN_RECEIVED: + tp->t_flags |= TF_NEEDFIN; + break; + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { + soisdisconnected(tp->t_inpcb->inp_socket); + /* To prevent the connection hanging in FIN_WAIT_2 forever. */ + if (tp->t_state == TCPS_FIN_WAIT_2) + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + return (tp); +} + diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h new file mode 100644 index 0000000..a58bdf5 --- /dev/null +++ b/sys/netinet/tcp_var.h @@ -0,0 +1,491 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ + +#include <netinet/in_pcb.h> /* needed for in_conninfo, inp_gen_t */ +#include <netinet/tcp.h> + +/* + * Kernel variables for tcp. + */ +extern int tcp_do_rfc1323; +extern int tcp_do_rfc1644; + +/* TCP segment queue entry */ +struct tseg_qent { + LIST_ENTRY(tseg_qent) tqe_q; + int tqe_len; /* TCP segment data length */ + struct tcphdr *tqe_th; /* a pointer to tcp header */ + struct mbuf *tqe_m; /* mbuf contains packet */ +}; +LIST_HEAD(tsegqe_head, tseg_qent); +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_TSEGQ); +#endif + +struct tcptemp { + u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ + struct tcphdr tt_t; +}; + +#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ + +/* + * Tcp control block, one per tcp; fields: + * Organized for 16 byte cacheline efficiency. 
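+ * (Added note: the callout pointers below are the per-connection timers:
+ * retransmit, persist, keepalive, 2*MSL and delayed ACK; tcp_connect(),
+ * for example, arms tt_keep with callout_reset() as the connection enters
+ * SYN_SENT.)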
+ */ +struct tcpcb { + struct tsegqe_head t_segq; + int t_dupacks; /* consecutive dup acks recd */ + struct tcptemp *unused; /* unused */ + + struct callout *tt_rexmt; /* retransmit timer */ + struct callout *tt_persist; /* retransmit persistence */ + struct callout *tt_keep; /* keepalive */ + struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */ + struct callout *tt_delack; /* delayed ACK timer */ + + struct inpcb *t_inpcb; /* back pointer to internet pcb */ + int t_state; /* state of this connection */ + u_int t_flags; +#define TF_ACKNOW 0x00001 /* ack peer immediately */ +#define TF_DELACK 0x00002 /* ack, but try to delay it */ +#define TF_NODELAY 0x00004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x00008 /* don't use tcp options */ +#define TF_SENTFIN 0x00010 /* have sent FIN */ +#define TF_REQ_SCALE 0x00020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x00040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x00080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x00100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x00200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x00400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x00800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x01000 /* don't push */ +#define TF_REQ_CC 0x02000 /* have/will request CC */ +#define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x20000 /* listen queue overflow */ +#define TF_LASTIDLE 0x40000 /* connection was previously idle */ +#define TF_RXWIN0SENT 0x80000 /* sent a receiver win 0 in response */ + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + tcp_seq snd_recover; /* for use in fast recovery */ + + u_int t_maxopd; /* mss plus options */ + + u_long t_rcvtime; /* inactivity time */ + u_long t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_long t_rttupdated; /* number of times rtt sampled */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_long ts_recent; /* timestamp echo data */ + + u_long ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; +/* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ +/* experimental */ + u_long snd_cwnd_prev; /* cwnd prior to retransmit */ + u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_long t_badrxtwin; /* window for retransmit recovery */ +}; + +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions. + */ +struct tcpopt { + u_long to_flags; /* which options are present */ +#define TOF_TS 0x0001 /* timestamp */ +#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ +#define TOF_CCNEW 0x0004 +#define TOF_CCECHO 0x0008 +#define TOF_MSS 0x0010 +#define TOF_SCALE 0x0020 + u_int32_t to_tsval; + u_int32_t to_tsecr; + tcp_cc to_cc; /* holds CC or CCnew */ + tcp_cc to_ccecho; + u_int16_t to_mss; + u_int8_t to_requested_s_scale; + u_int8_t to_pad; +}; + +struct syncache { + inp_gen_t sc_inp_gencnt; /* pointer check */ + struct tcpcb *sc_tp; /* tcb for listening socket */ + struct mbuf *sc_ipopts; /* source route */ + struct in_conninfo sc_inc; /* addresses */ +#define sc_route sc_inc.inc_route +#define sc_route6 sc_inc.inc6_route + u_int32_t sc_tsrecent; + tcp_cc sc_cc_send; /* holds CC or CCnew */ + tcp_cc sc_cc_recv; + tcp_seq sc_irs; /* seq from peer */ + tcp_seq sc_iss; /* our ISS */ + u_long sc_rxttime; /* retransmit time */ + u_int16_t sc_rxtslot; /* retransmit counter */ + u_int16_t sc_peer_mss; /* peer's MSS */ + u_int16_t sc_wnd; /* advertised window */ + u_int8_t sc_requested_s_scale:4, + sc_request_r_scale:4; + u_int8_t sc_flags; +#define SCF_NOOPT 0x01 /* no TCP options */ +#define SCF_WINSCALE 0x02 /* negotiated window scaling */ +#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ +#define SCF_CC 0x08 /* negotiated CC */ +#define SCF_UNREACH 0x10 /* icmp unreachable received */ +#define SCF_KEEPROUTE 0x20 /* keep cloned route */ + TAILQ_ENTRY(syncache) sc_hash; + TAILQ_ENTRY(syncache) sc_timerq; +}; + +struct syncache_head { + TAILQ_HEAD(, syncache) sch_bucket; + u_int sch_length; +}; + +/* + * The TAO cache entry which is stored in the protocol family specific + * portion of the route metrics. 
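+ * (Background note, not from the original source: TAO, the TCP Accelerated
+ * Open mechanism of T/TCP (RFC 1644), caches in tao_cc the last CC value
+ * accepted from this peer so that a later SYN carrying a strictly larger CC
+ * can bypass the three-way handshake; tao_ccsent is what tcp_connect()
+ * updates after choosing cc_send.)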
+ */ +struct rmxp_tao { + tcp_cc tao_cc; /* latest CC in valid SYN */ + tcp_cc tao_ccsent; /* latest CC sent to peer */ + u_short tao_mssopt; /* peer's cached MSS */ +#ifdef notyet + u_short tao_flags; /* cache status flags */ +#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */ +#define TAOF_OK 0x0002 /* peer does understand rfc1644 */ +#define TAOF_UNDEF 0 /* we don't know yet */ +#endif /* notyet */ +}; +#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) + +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; + u_long tcps_cachedrtt; /* times cached RTT in route updated */ + u_long tcps_cachedrttvar; /* times cached rttvar updated */ + u_long tcps_cachedssthresh; /* times cached ssthresh updated */ + u_long tcps_usedrtt; /* times RTT initialized from route */ + u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ + u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + u_long tcps_persistdrop; /* timeout in persist state */ + u_long tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ + u_long tcps_mturesent; /* resends due to MTU discovery */ + u_long tcps_listendrop; /* listen queue overflows */ + + u_long tcps_sc_added; /* entry added to syncache */ + u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */ + u_long tcps_sc_dupsyn; /* duplicate SYN packet */ + u_long tcps_sc_dropped; /* could not reply to packet */ + u_long tcps_sc_completed; /* successful extraction of entry */ + u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ + u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */ + u_long tcps_sc_reset; /* RST removed entry from syncache */ + u_long tcps_sc_stale; /* timed out or listen socket gone */ + u_long tcps_sc_aborted; /* syncache entry aborted */ + u_long tcps_sc_badack; /* removed due to bad ACK */ + u_long tcps_sc_unreach; /* ICMP unreachable received */ + u_long tcps_sc_zonefail; /* zalloc() failed */ + u_long tcps_sc_sendcookie; /* SYN cookie sent */ + u_long tcps_sc_recvcookie; /* SYN cookie received */ +}; + +/* + * TCB structure exported to user-land via sysctl(3). + * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been + * included. Not all of our clients do. + */ +#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) +struct xtcpcb { + size_t xt_len; + struct inpcb xt_inp; + struct tcpcb xt_tp; + struct xsocket xt_socket; + u_quad_t xt_alignment_hack; +}; +#endif + +/* + * Names for TCP sysctl objects + */ +#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ +#define TCPCTL_DO_RFC1644 2 /* use RFC-1644 extensions */ +#define TCPCTL_MSSDFLT 3 /* MSS default */ +#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_RTTDFLT 5 /* default RTT estimate */ +#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ +#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ +#define TCPCTL_SENDSPACE 8 /* send buffer space */ +#define TCPCTL_RECVSPACE 9 /* receive buffer space */ +#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ +#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ +#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ +#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ +#define TCPCTL_MAXID 14 + +#define TCPCTL_NAMES { \ + { 0, 0 }, \ + { "rfc1323", CTLTYPE_INT }, \ + { "rfc1644", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "rttdflt", CTLTYPE_INT }, \ + { "keepidle", CTLTYPE_INT }, \ + { "keepintvl", CTLTYPE_INT }, \ + { "sendspace", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "keepinit", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ + { "delacktime", CTLTYPE_INT }, \ + { "v6mssdflt", CTLTYPE_INT }, \ +} + + +#ifdef _KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_tcp); +#endif + +extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ +extern struct inpcbinfo tcbinfo; +extern struct tcpstat tcpstat; /* tcp statistics */ +extern int tcp_mssdflt; /* XXX */ +extern int tcp_delack_enabled; +extern int tcp_do_newreno; +extern int path_mtu_discovery; +extern int ss_fltsz; +extern int ss_fltsz_local; + +void tcp_canceltimers(struct tcpcb *); +struct tcpcb * + tcp_close(struct tcpcb *); +void tcp_ctlinput(int, struct sockaddr *, void *); +int tcp_ctloutput(struct socket *, struct sockopt *); +struct tcpcb * + tcp_drop(struct tcpcb *, int); +void tcp_drain(void); +void tcp_fasttimo(void); +struct rmxp_tao * + tcp_gettaocache(struct in_conninfo *); +void tcp_init(void); +void tcp_input(struct mbuf *, int); +void tcp_mss(struct tcpcb *, int); +int 
tcp_mssopt(struct tcpcb *); +struct inpcb * + tcp_drop_syn_sent(struct inpcb *, int); +struct inpcb * + tcp_mtudisc(struct inpcb *, int); +struct tcpcb * + tcp_newtcpcb(struct inpcb *); +int tcp_output(struct tcpcb *); +struct inpcb * + tcp_quench(struct inpcb *, int); +void tcp_respond(struct tcpcb *, void *, + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); +struct rtentry * + tcp_rtlookup(struct in_conninfo *); +void tcp_setpersist(struct tcpcb *); +void tcp_slowtimo(void); +struct tcptemp * + tcp_maketemplate(struct tcpcb *); +void tcp_fillheaders(struct tcpcb *, void *, void *); +struct tcpcb * + tcp_timers(struct tcpcb *, int); +void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int); +void syncache_init(void); +void syncache_unreach(struct in_conninfo *, struct tcphdr *); +int syncache_expand(struct in_conninfo *, struct tcphdr *, + struct socket **, struct mbuf *); +int syncache_add(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct socket **, struct mbuf *); +void syncache_chkrst(struct in_conninfo *, struct tcphdr *); +void syncache_badack(struct in_conninfo *); + +extern struct pr_usrreqs tcp_usrreqs; +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; +tcp_seq tcp_new_isn(struct tcpcb *); + +#endif /* _KERNEL */ + +#endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet/tcpip.h b/sys/netinet/tcpip.h new file mode 100644 index 0000000..92189b9 --- /dev/null +++ b/sys/netinet/tcpip.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCPIP_H_ +#define _NETINET_TCPIP_H_ + +/* + * Tcp+ip header, after ip options removed. 
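+ * (Added note: ti_i is a struct ipovly overlaid on the real IP header;
+ * ih_pr, ih_src and ih_dst reuse the protocol and address fields already
+ * in place, while ih_x1 and ih_len are rewritten so a pseudo-header
+ * checksum can be computed in place over the same mbuf, as udp_input()
+ * does for UDP.)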
+ */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp + +#endif diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h new file mode 100644 index 0000000..635267f --- /dev/null +++ b/sys/netinet/udp.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * Udp protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +#endif diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c new file mode 100644 index 0000000..31fb251 --- /dev/null +++ b/sys/netinet/udp_usrreq.c @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/signalvar.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sx.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_icmp.h> +#include <netinet/icmp_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. 
+ */ +#ifndef COMPAT_42 +static int udpcksum = 1; +#else +static int udpcksum = 0; /* XXX */ +#endif +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, + &udpcksum, 0, ""); + +int log_in_vain = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming UDP packets"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send port unreachables for refused connects"); + +struct inpcbhead udb; /* from udp_var.h */ +#define udb6 udb /* for KAME src sync over BSD*'s */ +struct inpcbinfo udbinfo; + +#ifndef UDBHASHSIZE +#define UDBHASHSIZE 16 +#endif + +struct udpstat udpstat; /* from udp_var.h */ +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, + &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); + +static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; +#ifdef INET6 +struct udp_in6 { + struct sockaddr_in6 uin6_sin; + u_char uin6_init_done : 1; +} udp_in6 = { + { sizeof(udp_in6.uin6_sin), AF_INET6 }, + 0 +}; +struct udp_ip6 { + struct ip6_hdr uip6_ip6; + u_char uip6_init_done : 1; +} udp_ip6; +#endif /* INET6 */ + +static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, + int off); +#ifdef INET6 +static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); +#endif + +static int udp_detach(struct socket *so); +static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct thread *); + +void +udp_init() +{ + INP_INFO_LOCK_INIT(&udbinfo, "udp"); + LIST_INIT(&udb); + udbinfo.listhead = &udb; + udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); + udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, + &udbinfo.porthashmask); + udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(udbinfo.ipi_zone, maxsockets); +} + +void +udp_input(m, off) + register struct mbuf *m; + int off; +{ + int iphlen = off; + register struct ip *ip; + register struct udphdr *uh; + register struct inpcb *inp; + struct mbuf *opts = 0; + int len; + struct ip save_ip; + struct sockaddr *append_sa; + + udpstat.udps_ipackets++; + + /* + * Strip IP options, if any; should skip this, + * make available to user, and use on returned packets, + * but we don't yet have a way to check the checksum + * with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. + */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + udpstat.udps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* destination port of 0 is illegal, based on RFC768. */ + if (uh->uh_dport == 0) + goto badunlocked; + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len || len < sizeof(struct udphdr)) { + udpstat.udps_badlen++; + goto badunlocked; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + if (!blackhole) + save_ip = *ip; + + /* + * Checksum extended UDP header and data. 
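+	 * (Explanatory note, not in the original: the pseudo-header covers
+	 * the source and destination addresses, IPPROTO_UDP and the UDP
+	 * length.  When the driver has already summed the data
+	 * (CSUM_DATA_VALID) the partial sum is folded with in_pseudo();
+	 * otherwise the IP header is treated as a struct ipovly and its
+	 * fields rewritten so in_cksum() can run over pseudo-header, UDP
+	 * header and data in one pass.  A nonzero uh_sum afterwards means
+	 * the checksum failed.)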
+ */ + if (uh->uh_sum) { + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + uh->uh_sum = m->m_pkthdr.csum_data; + else + uh->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + uh->uh_sum ^= 0xffff; + } else { + char b[9]; + bcopy(((struct ipovly *)ip)->ih_x1, b, 9); + bzero(((struct ipovly *)ip)->ih_x1, 9); + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); + bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); + } + if (uh->uh_sum) { + udpstat.udps_badsum++; + m_freem(m); + return; + } + } else + udpstat.udps_nosum++; + + INP_INFO_RLOCK(&udbinfo); + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + struct inpcb *last; + /* + * Deliver a multicast or broadcast datagram to *all* sockets + * for which the local and remote addresses and ports match + * those of the incoming datagram. This allows more than + * one process to receive multi/broadcasts on the same port. + * (This really ought to be done for unicast datagrams as + * well, but that would cause problems with existing + * applications that open both address-specific sockets and + * a wildcard socket listening to the same port -- they would + * end up receiving duplicates of every unicast datagram. + * Those applications open the multiple sockets to overcome an + * inadequacy of the UDP socket interface, but for backwards + * compatibility we avoid the problem here rather than + * fixing the interface. Maybe 4.5BSD will remedy this?) + */ + + /* + * Construct sockaddr format source address. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + /* + * Locate pcb(s) for datagram. + * (Algorithm copied from raw_intr().) + */ + last = NULL; +#ifdef INET6 + udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; +#endif + LIST_FOREACH(inp, &udb, inp_list) { + INP_LOCK(inp); + if (inp->inp_lport != uh->uh_dport) { + docontinue: + INP_UNLOCK(inp); + continue; + } +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + goto docontinue; +#endif + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + goto docontinue; + } + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != + ip->ip_src.s_addr || + inp->inp_fport != uh->uh_sport) + goto docontinue; + } + + if (last != NULL) { + struct mbuf *n; + +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject_so(m, last->inp_socket)) + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + else +#endif /*IPSEC*/ + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) + udp_append(last, ip, n, + iphlen + + sizeof(struct udphdr)); + INP_UNLOCK(last); + } + last = inp; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids searching + * through all pcbs in the common case of a non-shared + * port. It * assumes that an application will never + * clear these options after setting them. + */ + if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. + * (No need to send an ICMP Port Unreachable + * for a broadcast or multicast datgram.) + */ + udpstat.udps_noportbcast++; + goto badheadlocked; + } +#ifdef IPSEC + /* check AH/ESP integrity. 
*/ + if (ipsec4_in_reject_so(m, last->inp_socket)) { + ipsecstat.in_polvio++; + goto badheadlocked; + } +#endif /*IPSEC*/ + INP_UNLOCK(last); + INP_INFO_RUNLOCK(&udbinfo); + udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); + return; + } + /* + * Locate pcb for datagram. + */ + inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + if (inp == NULL) { + if (log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), + ntohs(uh->uh_sport)); + } + udpstat.udps_noport++; + if (m->m_flags & (M_BCAST | M_MCAST)) { + udpstat.udps_noportbcast++; + goto badheadlocked; + } + if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) + goto badheadlocked; + if (blackhole) + goto badheadlocked; + *ip = save_ip; + ip->ip_len += iphlen; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + INP_INFO_RUNLOCK(&udbinfo); + return; + } + INP_LOCK(inp); + INP_INFO_RUNLOCK(&udbinfo); +#ifdef IPSEC + if (ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /*IPSEC*/ + + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + if (inp->inp_flags & INP_CONTROLOPTS + || inp->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + int savedflags; + + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + savedflags = inp->inp_flags; + inp->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); + inp->inp_flags = savedflags; + } else +#endif + ip_savecontrol(inp, &opts, ip, m); + } + m_adj(m, iphlen + sizeof(struct udphdr)); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + append_sa = (struct sockaddr *)&udp_in6; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(inp->inp_socket); + INP_UNLOCK(inp); + return; + +badheadlocked: + INP_INFO_RUNLOCK(&udbinfo); +bad: + if (inp) + INP_UNLOCK(inp); +badunlocked: + m_freem(m); + if (opts) + m_freem(opts); + return; +} + +#ifdef INET6 +static void +ip_2_ip6_hdr(ip6, ip) + struct ip6_hdr *ip6; + struct ip *ip; +{ + bzero(ip6, sizeof(*ip6)); + + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_plen = ip->ip_len; + ip6->ip6_nxt = ip->ip_p; + ip6->ip6_hlim = ip->ip_ttl; + ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = + IPV6_ADDR_INT32_SMP; + ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; + ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; +} +#endif + +/* + * subroutine of udp_input(), mainly for source code readability. + * caller must properly init udp_ip6 and udp_in6 beforehand. 
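+ * (Added note: in the broadcast/multicast loop above, every matching pcb
+ * except the final one is handed an m_copy() of the datagram through this
+ * routine; the last match receives the original mbuf chain once the list
+ * walk ends.)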
+ */ +static void +udp_append(last, ip, n, off) + struct inpcb *last; + struct ip *ip; + struct mbuf *n; + int off; +{ + struct sockaddr *append_sa; + struct mbuf *opts = 0; + + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + int savedflags; + + if (udp_ip6.uip6_init_done == 0) { + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + udp_ip6.uip6_init_done = 1; + } + savedflags = last->inp_flags; + last->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n); + last->inp_flags = savedflags; + } else +#endif + ip_savecontrol(last, &opts, ip, n); + } +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + if (udp_in6.uin6_init_done == 0) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + udp_in6.uin6_init_done = 1; + } + append_sa = (struct sockaddr *)&udp_in6.uin6_sin; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + m_adj(n, off); + if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { + m_freem(n); + if (opts) + m_freem(opts); + udpstat.udps_fullsock++; + } else + sorwakeup(last->inp_socket); +} + +/* + * Notify a udp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +struct inpcb * +udp_notify(inp, errno) + register struct inpcb *inp; + int errno; +{ + inp->inp_socket->so_error = errno; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); + return inp; +} + +void +udp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct udphdr *uh; + struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; + struct in_addr faddr; + struct inpcb *inp; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + INP_INFO_RLOCK(&udbinfo); + inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, 0, NULL); + if (inp != NULL) { + INP_LOCK(inp); + if(inp->inp_socket != NULL) { + (*notify)(inp, inetctlerrmap[cmd]); + } + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + } else + in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify); +} + +static int +udp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = udbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. 
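+	 * (Added note: the snapshot is taken in two passes: the matching
+	 * inpcb pointers are gathered under INP_INFO_RLOCK first, then each
+	 * is copied out to userland as a struct xinpcb; the trailing
+	 * xinpgen with the updated generation count lets the caller detect
+	 * that the pcb list changed underneath it and retry.)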
+ */ + s = splnet(); + gencnt = udbinfo.ipi_gencnt; + n = udbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + INP_INFO_RLOCK(&udbinfo); + for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) + inp_list[i++] = inp; + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + INP_UNLOCK(inp); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + INP_INFO_RLOCK(&udbinfo); + xig.xig_gen = udbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = udbinfo.ipi_count; + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + udp_pcblist, "S,xinpcb", "List of active UDP sockets"); + +static int +udp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser_cred(req->td->td_ucred, PRISON_ROOT); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + INP_INFO_RLOCK(&udbinfo); + inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); + if (error) + goto out; + cru2x(inp->inp_socket->so_cred, &xuc); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); + +static int +udp_output(inp, m, addr, control, td) + register struct inpcb *inp; + struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; + struct thread *td; +{ + register struct udpiphdr *ui; + register int len = m->m_pkthdr.len; + struct in_addr laddr; + struct sockaddr_in *sin; + int s = 0, error = 0; + + if (control) + m_freem(control); /* XXX */ + + if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { + error = EMSGSIZE; + goto release; + } + + if (addr) { + sin = (struct sockaddr_in *)addr; + if (td && jailed(td->td_ucred)) + prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); + laddr = inp->inp_laddr; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + /* + * Must block input while temporarily connected. 
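+	 * (Added note: this path handles sendto(2)-style output on an
+	 * unconnected socket.  in_pcbconnect() below installs the
+	 * destination just for this datagram; after ip_output() the code
+	 * undoes it with in_pcbdisconnect() and restores the saved local
+	 * address, and splnet() keeps input from matching the transient
+	 * connection in the meantime.)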
+ */ + s = splnet(); + error = in_pcbconnect(inp, addr, td); + if (error) { + splx(s); + goto release; + } + } else { + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + /* + * Calculate data length and get a mbuf + * for UDP and IP headers. + */ + M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); + if (m == 0) { + error = ENOBUFS; + if (addr) + splx(s); + goto release; + } + + /* + * Fill in mbuf with extended UDP header + * and addresses and length put into network format. + */ + ui = mtod(m, struct udpiphdr *); + bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ + ui->ui_pr = IPPROTO_UDP; + ui->ui_src = inp->inp_laddr; + ui->ui_dst = inp->inp_faddr; + ui->ui_sport = inp->inp_lport; + ui->ui_dport = inp->inp_fport; + ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + + /* + * Set up checksum and output datagram. + */ + if (udpcksum) { + ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr, + htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + } else { + ui->ui_sum = 0; + } + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ + udpstat.udps_opackets++; + +#ifdef IPSEC + if (ipsec_setsocket(m, inp->inp_socket) != 0) { + error = ENOBUFS; + goto release; + } +#endif /*IPSEC*/ + error = ip_output(m, inp->inp_options, &inp->inp_route, + (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)), + inp->inp_moptions); + + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; /* XXX rehash? */ + splx(s); + } + return (error); + +release: + m_freem(m); + return (error); +} + +u_long udp_sendspace = 9216; /* really max datagram size */ + /* 40 1K datagrams */ +SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, + &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); + +u_long udp_recvspace = 40 * (1024 + +#ifdef INET6 + sizeof(struct sockaddr_in6) +#else + sizeof(struct sockaddr_in) +#endif + ); +SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &udp_recvspace, 0, "Maximum incoming UDP datagram size"); + +static int +udp_abort(struct socket *so) +{ + struct inpcb *inp; + int s; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; /* ??? possible? panic instead? 
*/ + } + INP_LOCK(inp); + soisdisconnected(so); + s = splnet(); + in_pcbdetach(inp); + INP_INFO_WUNLOCK(&udbinfo); + splx(s); + return 0; +} + +static int +udp_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int s, error; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp != 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; + } + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) { + INP_INFO_WUNLOCK(&udbinfo); + return error; + } + s = splnet(); + error = in_pcballoc(so, &udbinfo, td); + splx(s); + if (error) + return error; + + inp = (struct inpcb *)so->so_pcb; + INP_LOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_ttl = ip_defttl; + INP_UNLOCK(inp); + return 0; +} + +static int +udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int s, error; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; + } + INP_LOCK(inp); + s = splnet(); + error = in_pcbbind(inp, nam, td); + splx(s); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + return error; +} + +static int +udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int s, error; + struct sockaddr_in *sin; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; + } + INP_LOCK(inp); + if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + return EISCONN; + } + s = splnet(); + sin = (struct sockaddr_in *)nam; + if (td && jailed(td->td_ucred)) + prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr); + error = in_pcbconnect(inp, nam, td); + splx(s); + if (error == 0) + soisconnected(so); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + return error; +} + +static int +udp_detach(struct socket *so) +{ + struct inpcb *inp; + int s; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; + } + INP_LOCK(inp); + s = splnet(); + in_pcbdetach(inp); + INP_INFO_WUNLOCK(&udbinfo); + splx(s); + return 0; +} + +static int +udp_disconnect(struct socket *so) +{ + struct inpcb *inp; + int s; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + return EINVAL; + } + INP_LOCK(inp); + if (inp->inp_faddr.s_addr == INADDR_ANY) { + INP_INFO_WUNLOCK(&udbinfo); + INP_UNLOCK(inp); + return ENOTCONN; + } + + s = splnet(); + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + splx(s); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + return 0; +} + +static int +udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct thread *td) +{ + struct inpcb *inp; + int ret; + + INP_INFO_WLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_WUNLOCK(&udbinfo); + m_freem(m); + return EINVAL; + } + INP_LOCK(inp); + ret = udp_output(inp, m, addr, control, td); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&udbinfo); + return ret; +} + +int +udp_shutdown(struct socket *so) +{ + struct inpcb *inp; + + INP_INFO_RLOCK(&udbinfo); + inp = sotoinpcb(so); + if (inp == 0) { + INP_INFO_RUNLOCK(&udbinfo); + return EINVAL; + } + INP_LOCK(inp); + INP_INFO_RUNLOCK(&udbinfo); + socantsendmore(so); + INP_UNLOCK(inp); + return 0; +} + +/* + * This is the wrapper function for in_setsockaddr. 
We just pass down + * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking + * here because in_setsockaddr will call malloc and might block. + */ +static int +udp_sockaddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setsockaddr(so, nam, &udbinfo)); +} + +/* + * This is the wrapper function for in_setpeeraddr. We just pass down + * the pcbinfo for in_setpeeraddr to lock. + */ +static int +udp_peeraddr(struct socket *so, struct sockaddr **nam) +{ + return (in_setpeeraddr(so, nam, &udbinfo)); +} + +struct pr_usrreqs udp_usrreqs = { + udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, + pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, + pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, + udp_sockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h new file mode 100644 index 0000000..66db23f --- /dev/null +++ b/sys/netinet/udp_var.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_VAR_H_ +#define _NETINET_UDP_VAR_H_ + +/* + * UDP kernel structures and variables. 
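+ * (Added note: as with tcpiphdr, ui_i is a struct ipovly overlaid on the
+ * IP header; udp_output() zeroes ui_x1 and fills ui_pr, ui_src and ui_dst
+ * so the checksum pseudo-header sits in place immediately ahead of the
+ * UDP header.)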
+ */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_nosum; /* no checksum */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ + u_long udps_fastout; /* output packets on fast path */ + /* of no socket on port, arrived as multicast */ + u_long udps_noportmcast; +}; + +/* + * Names for UDP sysctl objects + */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_STATS 2 /* statistics (read-only) */ +#define UDPCTL_MAXDGRAM 3 /* max datagram size */ +#define UDPCTL_RECVSPACE 4 /* default receive buffer space */ +#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ +#define UDPCTL_MAXID 6 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "maxdgram", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_udp); + +extern struct pr_usrreqs udp_usrreqs; +extern struct inpcbhead udb; +extern struct inpcbinfo udbinfo; +extern u_long udp_sendspace; +extern u_long udp_recvspace; +extern struct udpstat udpstat; +extern int log_in_vain; + +void udp_ctlinput(int, struct sockaddr *, void *); +void udp_init(void); +void udp_input(struct mbuf *, int); + +struct inpcb * + udp_notify(struct inpcb *inp, int errno); +int udp_shutdown(struct socket *so); +#endif + +#endif |