Diffstat (limited to 'sys/netinet')
-rw-r--r--  sys/netinet/accf_data.c                   67
-rw-r--r--  sys/netinet/accf_http.c                  360
-rw-r--r--  sys/netinet/icmp6.h                      740
-rw-r--r--  sys/netinet/icmp_var.h                    91
-rw-r--r--  sys/netinet/if_atm.c                     278
-rw-r--r--  sys/netinet/if_atm.h                      47
-rw-r--r--  sys/netinet/if_ether.c                   951
-rw-r--r--  sys/netinet/if_ether.h                   122
-rw-r--r--  sys/netinet/igmp.c                       487
-rw-r--r--  sys/netinet/igmp.h                        96
-rw-r--r--  sys/netinet/igmp_var.h                   109
-rw-r--r--  sys/netinet/in.c                         890
-rw-r--r--  sys/netinet/in.h                         550
-rw-r--r--  sys/netinet/in_cksum.c                   150
-rw-r--r--  sys/netinet/in_gif.c                     354
-rw-r--r--  sys/netinet/in_gif.h                      42
-rw-r--r--  sys/netinet/in_pcb.c                    1072
-rw-r--r--  sys/netinet/in_pcb.h                     352
-rw-r--r--  sys/netinet/in_proto.c                   230
-rw-r--r--  sys/netinet/in_rmx.c                     426
-rw-r--r--  sys/netinet/in_systm.h                    62
-rw-r--r--  sys/netinet/in_var.h                     243
-rw-r--r--  sys/netinet/ip.h                         190
-rw-r--r--  sys/netinet/ip6.h                        308
-rw-r--r--  sys/netinet/ip_divert.c                  562
-rw-r--r--  sys/netinet/ip_dummynet.c               1952
-rw-r--r--  sys/netinet/ip_dummynet.h                359
-rw-r--r--  sys/netinet/ip_ecn.c                     142
-rw-r--r--  sys/netinet/ip_ecn.h                      49
-rw-r--r--  sys/netinet/ip_encap.c                   522
-rw-r--r--  sys/netinet/ip_encap.h                    64
-rw-r--r--  sys/netinet/ip_flow.c                    333
-rw-r--r--  sys/netinet/ip_flow.h                     57
-rw-r--r--  sys/netinet/ip_fw.c                     2254
-rw-r--r--  sys/netinet/ip_fw.h                      359
-rw-r--r--  sys/netinet/ip_icmp.c                    871
-rw-r--r--  sys/netinet/ip_icmp.h                    192
-rw-r--r--  sys/netinet/ip_id.c                      210
-rw-r--r--  sys/netinet/ip_input.c                  1948
-rw-r--r--  sys/netinet/ip_mroute.c                 2257
-rw-r--r--  sys/netinet/ip_mroute.h                  263
-rw-r--r--  sys/netinet/ip_output.c                 2050
-rw-r--r--  sys/netinet/ip_var.h                     203
-rw-r--r--  sys/netinet/ipprotosw.h                  102
-rw-r--r--  sys/netinet/libalias/HISTORY             145
-rw-r--r--  sys/netinet/libalias/Makefile             13
-rw-r--r--  sys/netinet/libalias/alias.c            1574
-rw-r--r--  sys/netinet/libalias/alias.h             182
-rw-r--r--  sys/netinet/libalias/alias_cuseeme.c     121
-rw-r--r--  sys/netinet/libalias/alias_db.c         2812
-rw-r--r--  sys/netinet/libalias/alias_ftp.c         583
-rw-r--r--  sys/netinet/libalias/alias_irc.c         357
-rw-r--r--  sys/netinet/libalias/alias_local.h       229
-rw-r--r--  sys/netinet/libalias/alias_nbt.c         704
-rw-r--r--  sys/netinet/libalias/alias_pptp.c        369
-rw-r--r--  sys/netinet/libalias/alias_proxy.c       837
-rw-r--r--  sys/netinet/libalias/alias_smedia.c      433
-rw-r--r--  sys/netinet/libalias/alias_util.c        169
-rw-r--r--  sys/netinet/libalias/libalias.3          981
-rw-r--r--  sys/netinet/raw_ip.c                     707
-rw-r--r--  sys/netinet/tcp.h                        137
-rw-r--r--  sys/netinet/tcp_debug.c                  231
-rw-r--r--  sys/netinet/tcp_debug.h                   83
-rw-r--r--  sys/netinet/tcp_fsm.h                    114
-rw-r--r--  sys/netinet/tcp_input.c                 2785
-rw-r--r--  sys/netinet/tcp_output.c                 970
-rw-r--r--  sys/netinet/tcp_reass.c                 2785
-rw-r--r--  sys/netinet/tcp_seq.h                     84
-rw-r--r--  sys/netinet/tcp_subr.c                  1510
-rw-r--r--  sys/netinet/tcp_syncache.c              1371
-rw-r--r--  sys/netinet/tcp_timer.c                  529
-rw-r--r--  sys/netinet/tcp_timer.h                  139
-rw-r--r--  sys/netinet/tcp_timewait.c              1510
-rw-r--r--  sys/netinet/tcp_usrreq.c                1252
-rw-r--r--  sys/netinet/tcp_var.h                    491
-rw-r--r--  sys/netinet/tcpip.h                       63
-rw-r--r--  sys/netinet/udp.h                         51
-rw-r--r--  sys/netinet/udp_usrreq.c                1048
-rw-r--r--  sys/netinet/udp_var.h                    115
79 files changed, 48450 insertions, 0 deletions
diff --git a/sys/netinet/accf_data.c b/sys/netinet/accf_data.c
new file mode 100644
index 0000000..2058b06
--- /dev/null
+++ b/sys/netinet/accf_data.c
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2000 Alfred Perlstein <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define ACCEPT_FILTER_MOD
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+
+/* accept filter that holds a socket until data arrives */
+
+static void sohasdata(struct socket *so, void *arg, int waitflag);
+
+static struct accept_filter accf_data_filter = {
+ "dataready",
+ sohasdata,
+ NULL,
+ NULL
+};
+
+static moduledata_t accf_data_mod = {
+ "accf_data",
+ accept_filt_generic_mod_event,
+ &accf_data_filter
+};
+
+DECLARE_MODULE(accf_data, accf_data_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static void
+sohasdata(struct socket *so, void *arg, int waitflag)
+{
+
+ if (!soreadable(so))
+ return;
+
+ so->so_upcall = NULL;
+ so->so_rcv.sb_flags &= ~SB_UPCALL;
+ soisconnected(so);
+ return;
+}
diff --git a/sys/netinet/accf_http.c b/sys/netinet/accf_http.c
new file mode 100644
index 0000000..686e563
--- /dev/null
+++ b/sys/netinet/accf_http.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2000 Paycounter, Inc.
+ * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define ACCEPT_FILTER_MOD
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/socketvar.h>
+
+/* check for GET/HEAD */
+static void sohashttpget(struct socket *so, void *arg, int waitflag);
+/* check for HTTP/1.0 or HTTP/1.1 */
+static void soparsehttpvers(struct socket *so, void *arg, int waitflag);
+/* check for end of HTTP/1.x request */
+static void soishttpconnected(struct socket *so, void *arg, int waitflag);
+/* strcmp on an mbuf chain */
+static int mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp);
+/* strncmp on an mbuf chain */
+static int mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset,
+ int max, char *cmp);
+/* socketbuffer is full */
+static int sbfull(struct sockbuf *sb);
+
+static struct accept_filter accf_http_filter = {
+ "httpready",
+ sohashttpget,
+ NULL,
+ NULL
+};
+
+static moduledata_t accf_http_mod = {
+ "accf_http",
+ accept_filt_generic_mod_event,
+ &accf_http_filter
+};
+
+DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static int parse_http_version = 1;
+
+SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0,
+"HTTP accept filter");
+SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW,
+&parse_http_version, 1,
+"Parse http version so that non 1.x requests work");
+
+#ifdef ACCF_HTTP_DEBUG
+#define DPRINT(fmt, args...) \
+ do { \
+ printf("%s:%d: " fmt "\n", __func__, __LINE__, ##args); \
+ } while (0)
+#else
+#define DPRINT(fmt, args...)
+#endif
+
+static int
+sbfull(struct sockbuf *sb)
+{
+
+ DPRINT("sbfull, cc(%ld) >= hiwat(%ld): %d, "
+ "mbcnt(%ld) >= mbmax(%ld): %d",
+ sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat,
+ sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax);
+ return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax);
+}
+
+/*
+ * start at mbuf m, (must provide npkt if exists)
+ * starting at offset in m compare characters in mbuf chain for 'cmp'
+ */
+static int
+mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ n = npkt;
+ if (npkt)
+ npkt = npkt->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (; offset < m->m_len; offset++, cmp++) {
+ if (*cmp == '\0')
+ return (1);
+ else if (*cmp != *(mtod(m, char *) + offset))
+ return (0);
+ }
+ if (*cmp == '\0')
+ return (1);
+ offset = 0;
+ }
+ }
+ return (0);
+}
+
+/*
+ * start at mbuf m, (must provide npkt if exists)
+ * starting at offset in m compare characters in mbuf chain for 'cmp'
+ * stop at 'max' characters
+ */
+static int
+mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, int max, char *cmp)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ n = npkt;
+ if (npkt)
+ npkt = npkt->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (; offset < m->m_len; offset++, cmp++, max--) {
+ if (max == 0 || *cmp == '\0')
+ return (1);
+ else if (*cmp != *(mtod(m, char *) + offset))
+ return (0);
+ }
+ if (max == 0 || *cmp == '\0')
+ return (1);
+ offset = 0;
+ }
+ }
+ return (0);
+}
+
+#define STRSETUP(sptr, slen, str) \
+ do { \
+ sptr = str; \
+ slen = sizeof(str) - 1; \
+ } while(0)
+
+static void
+sohashttpget(struct socket *so, void *arg, int waitflag)
+{
+
+ if ((so->so_state & SS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) {
+ struct mbuf *m;
+ char *cmp;
+ int cmplen, cc;
+
+ m = so->so_rcv.sb_mb;
+ cc = so->so_rcv.sb_cc - 1;
+ if (cc < 1)
+ return;
+ switch (*mtod(m, char *)) {
+ case 'G':
+ STRSETUP(cmp, cmplen, "ET ");
+ break;
+ case 'H':
+ STRSETUP(cmp, cmplen, "EAD ");
+ break;
+ default:
+ goto fallout;
+ }
+ if (cc < cmplen) {
+ if (mbufstrncmp(m, m->m_nextpkt, 1, cc, cmp) == 1) {
+ DPRINT("short cc (%d) but mbufstrncmp ok", cc);
+ return;
+ } else {
+ DPRINT("short cc (%d) mbufstrncmp failed", cc);
+ goto fallout;
+ }
+ }
+ if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) {
+ DPRINT("mbufstrcmp ok");
+ if (parse_http_version == 0)
+ soishttpconnected(so, arg, waitflag);
+ else
+ soparsehttpvers(so, arg, waitflag);
+ return;
+ }
+ DPRINT("mbufstrcmp bad");
+ }
+
+fallout:
+ DPRINT("fallout");
+ so->so_upcall = NULL;
+ so->so_rcv.sb_flags &= ~SB_UPCALL;
+ soisconnected(so);
+ return;
+}
+
+static void
+soparsehttpvers(struct socket *so, void *arg, int waitflag)
+{
+ struct mbuf *m, *n;
+ int i, cc, spaces, inspaces;
+
+ if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv))
+ goto fallout;
+
+ m = so->so_rcv.sb_mb;
+ cc = so->so_rcv.sb_cc;
+ inspaces = spaces = 0;
+ for (m = so->so_rcv.sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ for (i = 0; i < m->m_len; i++, cc--) {
+ switch (*(mtod(m, char *) + i)) {
+ case ' ':
+ /* tabs? '\t' */
+ if (!inspaces) {
+ spaces++;
+ inspaces = 1;
+ }
+ break;
+ case '\r':
+ case '\n':
+ DPRINT("newline");
+ goto fallout;
+ default:
+ if (spaces != 2) {
+ inspaces = 0;
+ break;
+ }
+
+ /*
+ * if we don't have enough characters
+ * left (cc < sizeof("HTTP/1.0") - 1)
+ * then see if the remaining ones
+ * are a request we can parse.
+ */
+ if (cc < sizeof("HTTP/1.0") - 1) {
+ if (mbufstrncmp(m, n, i, cc,
+ "HTTP/1.") == 1) {
+ DPRINT("ok");
+ goto readmore;
+ } else {
+ DPRINT("bad");
+ goto fallout;
+ }
+ } else if (
+ mbufstrcmp(m, n, i, "HTTP/1.0") ||
+ mbufstrcmp(m, n, i, "HTTP/1.1")) {
+ DPRINT("ok");
+ soishttpconnected(so,
+ arg, waitflag);
+ return;
+ } else {
+ DPRINT("bad");
+ goto fallout;
+ }
+ }
+ }
+ }
+ }
+readmore:
+ DPRINT("readmore");
+ /*
+ * if we get here we have not yet seen a newline or anything
+ * we don't understand, so wait for more data
+ */
+ so->so_upcall = soparsehttpvers;
+ so->so_rcv.sb_flags |= SB_UPCALL;
+ return;
+
+fallout:
+ DPRINT("fallout");
+ so->so_upcall = NULL;
+ so->so_rcv.sb_flags &= ~SB_UPCALL;
+ soisconnected(so);
+ return;
+}
+
+
+#define NCHRS 3
+
+static void
+soishttpconnected(struct socket *so, void *arg, int waitflag)
+{
+ char a, b, c;
+ struct mbuf *m, *n;
+ int ccleft, copied;
+
+ DPRINT("start");
+ if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv))
+ goto gotit;
+
+ /*
+ * Walk the socketbuffer and copy the last NCHRS (3) into a, b, and c
+ * copied - how much we've copied so far
+ * ccleft - how many bytes remaining in the socketbuffer
+ * just loop over the mbufs subtracting from 'ccleft' until we only
+ * have NCHRS left
+ */
+ copied = 0;
+ ccleft = so->so_rcv.sb_cc;
+ if (ccleft < NCHRS)
+ goto readmore;
+ a = b = c = '\0';
+ for (m = so->so_rcv.sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ ccleft -= m->m_len;
+ if (ccleft <= NCHRS) {
+ char *src;
+ int tocopy;
+
+ tocopy = (NCHRS - ccleft) - copied;
+ src = mtod(m, char *) + (m->m_len - tocopy);
+
+ while (tocopy--) {
+ switch (copied++) {
+ case 0:
+ a = *src++;
+ break;
+ case 1:
+ b = *src++;
+ break;
+ case 2:
+ c = *src++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (c == '\n' && (b == '\n' || (b == '\r' && a == '\n'))) {
+ /* we have all request headers */
+ goto gotit;
+ }
+
+readmore:
+ so->so_upcall = soishttpconnected;
+ so->so_rcv.sb_flags |= SB_UPCALL;
+ return;
+
+gotit:
+ so->so_upcall = NULL;
+ so->so_rcv.sb_flags &= ~SB_UPCALL;
+ soisconnected(so);
+ return;
+}
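
For context on how the two accept filters above are used: a userland server enables them on a listening socket through the SO_ACCEPTFILTER socket option (see accept_filter(9) and accf_http(9)), and the kernel then defers completing accept(2) until the filter's condition is met. A minimal sketch, assuming a socket that is already bound and listening; the helper name is illustrative, not part of this patch:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <string.h>

    /*
     * Attach the "httpready" (or "dataready") accept filter to a
     * listening socket; connections are then held in the kernel until
     * a full HTTP request (or any data) has arrived.
     */
    static int
    attach_accept_filter(int lsock, const char *name)
    {
            struct accept_filter_arg afa;

            memset(&afa, 0, sizeof(afa));
            strlcpy(afa.af_name, name, sizeof(afa.af_name));
            return (setsockopt(lsock, SOL_SOCKET, SO_ACCEPTFILTER,
                &afa, sizeof(afa)));
    }
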
diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h
new file mode 100644
index 0000000..2b1f529
--- /dev/null
+++ b/sys/netinet/icmp6.h
@@ -0,0 +1,740 @@
+/* $FreeBSD$ */
+/* $KAME: icmp6.h,v 1.46 2001/04/27 15:09:48 itojun Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93
+ */
+
+#ifndef _NETINET_ICMP6_H_
+#define _NETINET_ICMP6_H_
+
+#define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr)
+ - sizeof(struct icmp6_hdr) */
+
+struct icmp6_hdr {
+ u_int8_t icmp6_type; /* type field */
+ u_int8_t icmp6_code; /* code field */
+ u_int16_t icmp6_cksum; /* checksum field */
+ union {
+ u_int32_t icmp6_un_data32[1]; /* type-specific field */
+ u_int16_t icmp6_un_data16[2]; /* type-specific field */
+ u_int8_t icmp6_un_data8[4]; /* type-specific field */
+ } icmp6_dataun;
+} __attribute__((__packed__));
+
+#define icmp6_data32 icmp6_dataun.icmp6_un_data32
+#define icmp6_data16 icmp6_dataun.icmp6_un_data16
+#define icmp6_data8 icmp6_dataun.icmp6_un_data8
+#define icmp6_pptr icmp6_data32[0] /* parameter prob */
+#define icmp6_mtu icmp6_data32[0] /* packet too big */
+#define icmp6_id icmp6_data16[0] /* echo request/reply */
+#define icmp6_seq icmp6_data16[1] /* echo request/reply */
+#define icmp6_maxdelay icmp6_data16[0] /* mcast group membership */
+
+#define ICMP6_DST_UNREACH 1 /* dest unreachable, codes: */
+#define ICMP6_PACKET_TOO_BIG 2 /* packet too big */
+#define ICMP6_TIME_EXCEEDED 3 /* time exceeded, code: */
+#define ICMP6_PARAM_PROB 4 /* ip6 header bad */
+
+#define ICMP6_ECHO_REQUEST 128 /* echo service */
+#define ICMP6_ECHO_REPLY 129 /* echo reply */
+#define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */
+#define MLD_LISTENER_QUERY 130 /* multicast listener query */
+#define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */
+#define MLD_LISTENER_REPORT 131 /* multicast listener report */
+#define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */
+#define MLD_LISTENER_DONE 132 /* multicast listener done */
+
+#ifndef _KERNEL
+/* the following are for backward compatibility to old KAME apps. */
+#define MLD6_LISTENER_QUERY MLD_LISTENER_QUERY
+#define MLD6_LISTENER_REPORT MLD_LISTENER_REPORT
+#define MLD6_LISTENER_DONE MLD_LISTENER_DONE
+#endif
+
+#define ND_ROUTER_SOLICIT 133 /* router solicitation */
+#define ND_ROUTER_ADVERT 134 /* router advertisement */
+#define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */
+#define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisement */
+#define ND_REDIRECT 137 /* redirect */
+
+#define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */
+
+#define ICMP6_WRUREQUEST 139 /* who are you request */
+#define ICMP6_WRUREPLY 140 /* who are you reply */
+#define ICMP6_FQDN_QUERY 139 /* FQDN query */
+#define ICMP6_FQDN_REPLY 140 /* FQDN reply */
+#define ICMP6_NI_QUERY 139 /* node information request */
+#define ICMP6_NI_REPLY 140 /* node information reply */
+
+/* The definitions below are experimental. TBA */
+#define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */
+#define MLD_MTRACE 201 /* mtrace messages */
+
+#define ICMP6_HADISCOV_REQUEST 202 /* XXX To be defined */
+#define ICMP6_HADISCOV_REPLY 203 /* XXX To be defined */
+
+#ifndef _KERNEL
+#define MLD6_MTRACE_RESP MLD_MTRACE_RESP
+#define MLD6_MTRACE MLD_MTRACE
+#endif
+
+#define ICMP6_MAXTYPE 203
+
+#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */
+#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */
+#define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 /* not a neighbor(obsolete) */
+#define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */
+#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */
+#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */
+
+#define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */
+#define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */
+
+#define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */
+#define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */
+#define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */
+
+#define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */
+
+#define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */
+#define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */
+#define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */
+
+#define ICMP6_NI_SUCCESS 0 /* node information successful reply */
+#define ICMP6_NI_REFUSED 1 /* node information request is refused */
+#define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */
+
+#define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */
+#define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */
+#define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */
+
+/* Used in kernel only */
+#define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */
+#define ND_REDIRECT_ROUTER 1 /* redirect to a better router */
+
+/*
+ * Multicast Listener Discovery
+ */
+struct mld_hdr {
+ struct icmp6_hdr mld_icmp6_hdr;
+ struct in6_addr mld_addr; /* multicast address */
+} __attribute__((__packed__));
+
+/* definitions to provide backward compatibility to old KAME applications */
+#ifndef _KERNEL
+#define mld6_hdr mld_hdr
+#define mld6_type mld_type
+#define mld6_code mld_code
+#define mld6_cksum mld_cksum
+#define mld6_maxdelay mld_maxdelay
+#define mld6_reserved mld_reserved
+#define mld6_addr mld_addr
+#endif
+
+/* shortcut macro definitions */
+#define mld_type mld_icmp6_hdr.icmp6_type
+#define mld_code mld_icmp6_hdr.icmp6_code
+#define mld_cksum mld_icmp6_hdr.icmp6_cksum
+#define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0]
+#define mld_reserved mld_icmp6_hdr.icmp6_data16[1]
+
+/*
+ * Neighbor Discovery
+ */
+
+struct nd_router_solicit { /* router solicitation */
+ struct icmp6_hdr nd_rs_hdr;
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+#define nd_rs_type nd_rs_hdr.icmp6_type
+#define nd_rs_code nd_rs_hdr.icmp6_code
+#define nd_rs_cksum nd_rs_hdr.icmp6_cksum
+#define nd_rs_reserved nd_rs_hdr.icmp6_data32[0]
+
+struct nd_router_advert { /* router advertisement */
+ struct icmp6_hdr nd_ra_hdr;
+ u_int32_t nd_ra_reachable; /* reachable time */
+ u_int32_t nd_ra_retransmit; /* retransmit timer */
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+#define nd_ra_type nd_ra_hdr.icmp6_type
+#define nd_ra_code nd_ra_hdr.icmp6_code
+#define nd_ra_cksum nd_ra_hdr.icmp6_cksum
+#define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0]
+#define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1]
+#define ND_RA_FLAG_MANAGED 0x80
+#define ND_RA_FLAG_OTHER 0x40
+#define ND_RA_FLAG_HA 0x20
+
+/*
+ * Router preference values based on draft-draves-ipngwg-router-selection-01.
+ * These are non-standard definitions.
+ */
+#define ND_RA_FLAG_RTPREF_MASK 0x18 /* 00011000 */
+
+#define ND_RA_FLAG_RTPREF_HIGH 0x08 /* 00001000 */
+#define ND_RA_FLAG_RTPREF_MEDIUM 0x00 /* 00000000 */
+#define ND_RA_FLAG_RTPREF_LOW 0x18 /* 00011000 */
+#define ND_RA_FLAG_RTPREF_RSV 0x10 /* 00010000 */
+
+#define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1]
+
+struct nd_neighbor_solicit { /* neighbor solicitation */
+ struct icmp6_hdr nd_ns_hdr;
+ struct in6_addr nd_ns_target; /*target address */
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+#define nd_ns_type nd_ns_hdr.icmp6_type
+#define nd_ns_code nd_ns_hdr.icmp6_code
+#define nd_ns_cksum nd_ns_hdr.icmp6_cksum
+#define nd_ns_reserved nd_ns_hdr.icmp6_data32[0]
+
+struct nd_neighbor_advert { /* neighbor advertisement */
+ struct icmp6_hdr nd_na_hdr;
+ struct in6_addr nd_na_target; /* target address */
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+#define nd_na_type nd_na_hdr.icmp6_type
+#define nd_na_code nd_na_hdr.icmp6_code
+#define nd_na_cksum nd_na_hdr.icmp6_cksum
+#define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0]
+#if BYTE_ORDER == BIG_ENDIAN
+#define ND_NA_FLAG_ROUTER 0x80000000
+#define ND_NA_FLAG_SOLICITED 0x40000000
+#define ND_NA_FLAG_OVERRIDE 0x20000000
+#else
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define ND_NA_FLAG_ROUTER 0x80
+#define ND_NA_FLAG_SOLICITED 0x40
+#define ND_NA_FLAG_OVERRIDE 0x20
+#endif
+#endif
+
+struct nd_redirect { /* redirect */
+ struct icmp6_hdr nd_rd_hdr;
+ struct in6_addr nd_rd_target; /* target address */
+ struct in6_addr nd_rd_dst; /* destination address */
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+#define nd_rd_type nd_rd_hdr.icmp6_type
+#define nd_rd_code nd_rd_hdr.icmp6_code
+#define nd_rd_cksum nd_rd_hdr.icmp6_cksum
+#define nd_rd_reserved nd_rd_hdr.icmp6_data32[0]
+
+struct nd_opt_hdr { /* Neighbor discovery option header */
+ u_int8_t nd_opt_type;
+ u_int8_t nd_opt_len;
+ /* followed by option specific data*/
+} __attribute__((__packed__));
+
+#define ND_OPT_SOURCE_LINKADDR 1
+#define ND_OPT_TARGET_LINKADDR 2
+#define ND_OPT_PREFIX_INFORMATION 3
+#define ND_OPT_REDIRECTED_HEADER 4
+#define ND_OPT_MTU 5
+
+#define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */
+
+struct nd_opt_prefix_info { /* prefix information */
+ u_int8_t nd_opt_pi_type;
+ u_int8_t nd_opt_pi_len;
+ u_int8_t nd_opt_pi_prefix_len;
+ u_int8_t nd_opt_pi_flags_reserved;
+ u_int32_t nd_opt_pi_valid_time;
+ u_int32_t nd_opt_pi_preferred_time;
+ u_int32_t nd_opt_pi_reserved2;
+ struct in6_addr nd_opt_pi_prefix;
+} __attribute__((__packed__));
+
+#define ND_OPT_PI_FLAG_ONLINK 0x80
+#define ND_OPT_PI_FLAG_AUTO 0x40
+
+struct nd_opt_rd_hdr { /* redirected header */
+ u_int8_t nd_opt_rh_type;
+ u_int8_t nd_opt_rh_len;
+ u_int16_t nd_opt_rh_reserved1;
+ u_int32_t nd_opt_rh_reserved2;
+ /* followed by IP header and data */
+} __attribute__((__packed__));
+
+struct nd_opt_mtu { /* MTU option */
+ u_int8_t nd_opt_mtu_type;
+ u_int8_t nd_opt_mtu_len;
+ u_int16_t nd_opt_mtu_reserved;
+ u_int32_t nd_opt_mtu_mtu;
+} __attribute__((__packed__));
+
+struct nd_opt_route_info { /* route info */
+ u_int8_t nd_opt_rti_type;
+ u_int8_t nd_opt_rti_len;
+ u_int8_t nd_opt_rti_prefixlen;
+ u_int8_t nd_opt_rti_flags;
+ u_int32_t nd_opt_rti_lifetime;
+ /* prefix follows */
+} __attribute__((__packed__));
+
+/*
+ * icmp6 namelookup
+ */
+
+struct icmp6_namelookup {
+ struct icmp6_hdr icmp6_nl_hdr;
+ u_int8_t icmp6_nl_nonce[8];
+ int32_t icmp6_nl_ttl;
+#if 0
+ u_int8_t icmp6_nl_len;
+ u_int8_t icmp6_nl_name[3];
+#endif
+ /* could be followed by options */
+} __attribute__((__packed__));
+
+/*
+ * icmp6 node information
+ */
+struct icmp6_nodeinfo {
+ struct icmp6_hdr icmp6_ni_hdr;
+ u_int8_t icmp6_ni_nonce[8];
+ /* could be followed by reply data */
+} __attribute__((__packed__));
+
+#define ni_type icmp6_ni_hdr.icmp6_type
+#define ni_code icmp6_ni_hdr.icmp6_code
+#define ni_cksum icmp6_ni_hdr.icmp6_cksum
+#define ni_qtype icmp6_ni_hdr.icmp6_data16[0]
+#define ni_flags icmp6_ni_hdr.icmp6_data16[1]
+
+#define NI_QTYPE_NOOP 0 /* NOOP */
+#define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */
+#define NI_QTYPE_FQDN 2 /* FQDN (draft 04) */
+#define NI_QTYPE_DNSNAME 2 /* DNS Name */
+#define NI_QTYPE_NODEADDR 3 /* Node Addresses */
+#define NI_QTYPE_IPV4ADDR 4 /* IPv4 Addresses */
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define NI_SUPTYPE_FLAG_COMPRESS 0x1
+#define NI_FQDN_FLAG_VALIDTTL 0x1
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define NI_SUPTYPE_FLAG_COMPRESS 0x0100
+#define NI_FQDN_FLAG_VALIDTTL 0x0100
+#endif
+
+#ifdef NAME_LOOKUPS_04
+#if BYTE_ORDER == BIG_ENDIAN
+#define NI_NODEADDR_FLAG_LINKLOCAL 0x1
+#define NI_NODEADDR_FLAG_SITELOCAL 0x2
+#define NI_NODEADDR_FLAG_GLOBAL 0x4
+#define NI_NODEADDR_FLAG_ALL 0x8
+#define NI_NODEADDR_FLAG_TRUNCATE 0x10
+#define NI_NODEADDR_FLAG_ANYCAST 0x20 /* just experimental. not in spec */
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define NI_NODEADDR_FLAG_LINKLOCAL 0x0100
+#define NI_NODEADDR_FLAG_SITELOCAL 0x0200
+#define NI_NODEADDR_FLAG_GLOBAL 0x0400
+#define NI_NODEADDR_FLAG_ALL 0x0800
+#define NI_NODEADDR_FLAG_TRUNCATE 0x1000
+#define NI_NODEADDR_FLAG_ANYCAST 0x2000 /* just experimental. not in spec */
+#endif
+#else /* draft-ietf-ipngwg-icmp-name-lookups-05 (and later?) */
+#if BYTE_ORDER == BIG_ENDIAN
+#define NI_NODEADDR_FLAG_TRUNCATE 0x1
+#define NI_NODEADDR_FLAG_ALL 0x2
+#define NI_NODEADDR_FLAG_COMPAT 0x4
+#define NI_NODEADDR_FLAG_LINKLOCAL 0x8
+#define NI_NODEADDR_FLAG_SITELOCAL 0x10
+#define NI_NODEADDR_FLAG_GLOBAL 0x20
+#define NI_NODEADDR_FLAG_ANYCAST 0x40 /* just experimental. not in spec */
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define NI_NODEADDR_FLAG_TRUNCATE 0x0100
+#define NI_NODEADDR_FLAG_ALL 0x0200
+#define NI_NODEADDR_FLAG_COMPAT 0x0400
+#define NI_NODEADDR_FLAG_LINKLOCAL 0x0800
+#define NI_NODEADDR_FLAG_SITELOCAL 0x1000
+#define NI_NODEADDR_FLAG_GLOBAL 0x2000
+#define NI_NODEADDR_FLAG_ANYCAST 0x4000 /* just experimental. not in spec */
+#endif
+#endif
+
+struct ni_reply_fqdn {
+ u_int32_t ni_fqdn_ttl; /* TTL */
+ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */
+ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */
+} __attribute__((__packed__));
+
+/*
+ * Router Renumbering. as router-renum-08.txt
+ */
+struct icmp6_router_renum { /* router renumbering header */
+ struct icmp6_hdr rr_hdr;
+ u_int8_t rr_segnum;
+ u_int8_t rr_flags;
+ u_int16_t rr_maxdelay;
+ u_int32_t rr_reserved;
+} __attribute__((__packed__));
+
+#define ICMP6_RR_FLAGS_TEST 0x80
+#define ICMP6_RR_FLAGS_REQRESULT 0x40
+#define ICMP6_RR_FLAGS_FORCEAPPLY 0x20
+#define ICMP6_RR_FLAGS_SPECSITE 0x10
+#define ICMP6_RR_FLAGS_PREVDONE 0x08
+
+#define rr_type rr_hdr.icmp6_type
+#define rr_code rr_hdr.icmp6_code
+#define rr_cksum rr_hdr.icmp6_cksum
+#define rr_seqnum rr_hdr.icmp6_data32[0]
+
+struct rr_pco_match { /* match prefix part */
+ u_int8_t rpm_code;
+ u_int8_t rpm_len;
+ u_int8_t rpm_ordinal;
+ u_int8_t rpm_matchlen;
+ u_int8_t rpm_minlen;
+ u_int8_t rpm_maxlen;
+ u_int16_t rpm_reserved;
+ struct in6_addr rpm_prefix;
+} __attribute__((__packed__));
+
+#define RPM_PCO_ADD 1
+#define RPM_PCO_CHANGE 2
+#define RPM_PCO_SETGLOBAL 3
+#define RPM_PCO_MAX 4
+
+struct rr_pco_use { /* use prefix part */
+ u_int8_t rpu_uselen;
+ u_int8_t rpu_keeplen;
+ u_int8_t rpu_ramask;
+ u_int8_t rpu_raflags;
+ u_int32_t rpu_vltime;
+ u_int32_t rpu_pltime;
+ u_int32_t rpu_flags;
+ struct in6_addr rpu_prefix;
+} __attribute__((__packed__));
+#define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80
+#define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80000000
+#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40000000
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80
+#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40
+#endif
+
+struct rr_result { /* router renumbering result message */
+ u_int16_t rrr_flags;
+ u_int8_t rrr_ordinal;
+ u_int8_t rrr_matchedlen;
+ u_int32_t rrr_ifid;
+ struct in6_addr rrr_prefix;
+} __attribute__((__packed__));
+#if BYTE_ORDER == BIG_ENDIAN
+#define ICMP6_RR_RESULT_FLAGS_OOB 0x0002
+#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define ICMP6_RR_RESULT_FLAGS_OOB 0x0200
+#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0100
+#endif
+
+/*
+ * icmp6 filter structures.
+ */
+
+struct icmp6_filter {
+ u_int32_t icmp6_filt[8];
+};
+
+#ifdef _KERNEL
+#define ICMP6_FILTER_SETPASSALL(filterp) \
+do { \
+ int i; u_char *p; \
+ p = (u_char *)filterp; \
+ for (i = 0; i < sizeof(struct icmp6_filter); i++) \
+ p[i] = 0xff; \
+} while (0)
+#define ICMP6_FILTER_SETBLOCKALL(filterp) \
+ bzero(filterp, sizeof(struct icmp6_filter))
+#else /* _KERNEL */
+#define ICMP6_FILTER_SETPASSALL(filterp) \
+ memset(filterp, 0xff, sizeof(struct icmp6_filter))
+#define ICMP6_FILTER_SETBLOCKALL(filterp) \
+ memset(filterp, 0x00, sizeof(struct icmp6_filter))
+#endif /* _KERNEL */
+
+#define ICMP6_FILTER_SETPASS(type, filterp) \
+ (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31)))
+#define ICMP6_FILTER_SETBLOCK(type, filterp) \
+ (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31)))
+#define ICMP6_FILTER_WILLPASS(type, filterp) \
+ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0)
+#define ICMP6_FILTER_WILLBLOCK(type, filterp) \
+ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0)
+
+/*
+ * Variables related to this implementation
+ * of the internet control message protocol version 6.
+ */
+struct icmp6errstat {
+ u_quad_t icp6errs_dst_unreach_noroute;
+ u_quad_t icp6errs_dst_unreach_admin;
+ u_quad_t icp6errs_dst_unreach_beyondscope;
+ u_quad_t icp6errs_dst_unreach_addr;
+ u_quad_t icp6errs_dst_unreach_noport;
+ u_quad_t icp6errs_packet_too_big;
+ u_quad_t icp6errs_time_exceed_transit;
+ u_quad_t icp6errs_time_exceed_reassembly;
+ u_quad_t icp6errs_paramprob_header;
+ u_quad_t icp6errs_paramprob_nextheader;
+ u_quad_t icp6errs_paramprob_option;
+ u_quad_t icp6errs_redirect; /* we regard redirect as an error here */
+ u_quad_t icp6errs_unknown;
+};
+
+struct icmp6stat {
+/* statistics related to icmp6 packets generated */
+ u_quad_t icp6s_error; /* # of calls to icmp6_error */
+ u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */
+ u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */
+ u_quad_t icp6s_outhist[256];
+/* statistics related to input message processed */
+ u_quad_t icp6s_badcode; /* icmp6_code out of range */
+ u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */
+ u_quad_t icp6s_checksum; /* bad checksum */
+ u_quad_t icp6s_badlen; /* calculated bound mismatch */
+ /*
+ * number of responses: this member is inherited from netinet code, but
+ * for netinet6 code, it is already available in icp6s_outhist[].
+ */
+ u_quad_t icp6s_reflect;
+ u_quad_t icp6s_inhist[256];
+ u_quad_t icp6s_nd_toomanyopt; /* too many ND options */
+ struct icmp6errstat icp6s_outerrhist;
+#define icp6s_odst_unreach_noroute \
+ icp6s_outerrhist.icp6errs_dst_unreach_noroute
+#define icp6s_odst_unreach_admin icp6s_outerrhist.icp6errs_dst_unreach_admin
+#define icp6s_odst_unreach_beyondscope \
+ icp6s_outerrhist.icp6errs_dst_unreach_beyondscope
+#define icp6s_odst_unreach_addr icp6s_outerrhist.icp6errs_dst_unreach_addr
+#define icp6s_odst_unreach_noport icp6s_outerrhist.icp6errs_dst_unreach_noport
+#define icp6s_opacket_too_big icp6s_outerrhist.icp6errs_packet_too_big
+#define icp6s_otime_exceed_transit \
+ icp6s_outerrhist.icp6errs_time_exceed_transit
+#define icp6s_otime_exceed_reassembly \
+ icp6s_outerrhist.icp6errs_time_exceed_reassembly
+#define icp6s_oparamprob_header icp6s_outerrhist.icp6errs_paramprob_header
+#define icp6s_oparamprob_nextheader \
+ icp6s_outerrhist.icp6errs_paramprob_nextheader
+#define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option
+#define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect
+#define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown
+ u_quad_t icp6s_pmtuchg; /* path MTU changes */
+ u_quad_t icp6s_nd_badopt; /* bad ND options */
+ u_quad_t icp6s_badns; /* bad neighbor solicitation */
+ u_quad_t icp6s_badna; /* bad neighbor advertisement */
+ u_quad_t icp6s_badrs; /* bad router solicitation */
+ u_quad_t icp6s_badra; /* bad router advertisement */
+ u_quad_t icp6s_badredirect; /* bad redirect message */
+};
+
+/*
+ * Names for ICMP sysctl objects
+ */
+#define ICMPV6CTL_STATS 1
+#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */
+#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */
+#if 0 /*obsoleted*/
+#define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */
+#endif
+#define ICMPV6CTL_ND6_PRUNE 6
+#define ICMPV6CTL_ND6_DELAY 8
+#define ICMPV6CTL_ND6_UMAXTRIES 9
+#define ICMPV6CTL_ND6_MMAXTRIES 10
+#define ICMPV6CTL_ND6_USELOOPBACK 11
+/*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */
+#define ICMPV6CTL_NODEINFO 13
+#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */
+#define ICMPV6CTL_ND6_MAXNUDHINT 15
+#define ICMPV6CTL_MTUDISC_HIWAT 16
+#define ICMPV6CTL_MTUDISC_LOWAT 17
+#define ICMPV6CTL_ND6_DEBUG 18
+#define ICMPV6CTL_ND6_DRLIST 19
+#define ICMPV6CTL_ND6_PRLIST 20
+#define ICMPV6CTL_MAXID 21
+
+#define ICMPV6CTL_NAMES { \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "rediraccept", CTLTYPE_INT }, \
+ { "redirtimeout", CTLTYPE_INT }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "nd6_prune", CTLTYPE_INT }, \
+ { 0, 0 }, \
+ { "nd6_delay", CTLTYPE_INT }, \
+ { "nd6_umaxtries", CTLTYPE_INT }, \
+ { "nd6_mmaxtries", CTLTYPE_INT }, \
+ { "nd6_useloopback", CTLTYPE_INT }, \
+ { 0, 0 }, \
+ { "nodeinfo", CTLTYPE_INT }, \
+ { "errppslimit", CTLTYPE_INT }, \
+ { "nd6_maxnudhint", CTLTYPE_INT }, \
+ { "mtudisc_hiwat", CTLTYPE_INT }, \
+ { "mtudisc_lowat", CTLTYPE_INT }, \
+ { "nd6_debug", CTLTYPE_INT }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+}
+
+#define RTF_PROBEMTU RTF_PROTO1
+
+#ifdef _KERNEL
+# ifdef __STDC__
+struct rtentry;
+struct rttimer;
+struct in6_multi;
+# endif
+void icmp6_init(void);
+void icmp6_paramerror(struct mbuf *, int);
+void icmp6_error(struct mbuf *, int, int, int);
+int icmp6_input(struct mbuf **, int *, int);
+void icmp6_fasttimo(void);
+void icmp6_reflect(struct mbuf *, size_t);
+void icmp6_prepare(struct mbuf *);
+void icmp6_redirect_input(struct mbuf *, int);
+void icmp6_redirect_output(struct mbuf *, struct rtentry *);
+
+struct ip6ctlparam;
+void icmp6_mtudisc_update(struct ip6ctlparam *, int);
+
+/* XXX: is this the right place for these macros? */
+#define icmp6_ifstat_inc(ifp, tag) \
+do { \
+ if ((ifp) && (ifp)->if_index <= if_index \
+ && (ifp)->if_index < icmp6_ifstatmax \
+ && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \
+ icmp6_ifstat[(ifp)->if_index]->tag++; \
+ } \
+} while (0)
+
+#define icmp6_ifoutstat_inc(ifp, type, code) \
+do { \
+ icmp6_ifstat_inc(ifp, ifs6_out_msg); \
+ if (type < ICMP6_INFOMSG_MASK) \
+ icmp6_ifstat_inc(ifp, ifs6_out_error); \
+ switch(type) { \
+ case ICMP6_DST_UNREACH: \
+ icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \
+ if (code == ICMP6_DST_UNREACH_ADMIN) \
+ icmp6_ifstat_inc(ifp, ifs6_out_adminprohib); \
+ break; \
+ case ICMP6_PACKET_TOO_BIG: \
+ icmp6_ifstat_inc(ifp, ifs6_out_pkttoobig); \
+ break; \
+ case ICMP6_TIME_EXCEEDED: \
+ icmp6_ifstat_inc(ifp, ifs6_out_timeexceed); \
+ break; \
+ case ICMP6_PARAM_PROB: \
+ icmp6_ifstat_inc(ifp, ifs6_out_paramprob); \
+ break; \
+ case ICMP6_ECHO_REQUEST: \
+ icmp6_ifstat_inc(ifp, ifs6_out_echo); \
+ break; \
+ case ICMP6_ECHO_REPLY: \
+ icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \
+ break; \
+ case MLD_LISTENER_QUERY: \
+ icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \
+ break; \
+ case MLD_LISTENER_REPORT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \
+ break; \
+ case MLD_LISTENER_DONE: \
+ icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \
+ break; \
+ case ND_ROUTER_SOLICIT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_routersolicit); \
+ break; \
+ case ND_ROUTER_ADVERT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_routeradvert); \
+ break; \
+ case ND_NEIGHBOR_SOLICIT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); \
+ break; \
+ case ND_NEIGHBOR_ADVERT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); \
+ break; \
+ case ND_REDIRECT: \
+ icmp6_ifstat_inc(ifp, ifs6_out_redirect); \
+ break; \
+ } \
+} while (0)
+
+extern int icmp6_rediraccept; /* accept/process redirects */
+extern int icmp6_redirtimeout; /* cache time for redirect routes */
+#endif /* _KERNEL */
+
+#endif /* not _NETINET_ICMP6_H_ */
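
The struct icmp6_filter and ICMP6_FILTER_* macros defined above implement the per-socket ICMPv6 type filter of the advanced sockets API (RFC 2292/3542): each bit of icmp6_filt selects whether one of the 256 ICMPv6 types is delivered on a raw socket. A minimal userland sketch, assuming a raw ICMPv6 socket and the standard ICMP6_FILTER socket option; the function name is illustrative, not part of this patch:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <string.h>
    #include <netinet/in.h>
    #include <netinet/icmp6.h>

    /*
     * Block every ICMPv6 type except echo replies on a raw socket --
     * the usual setup for a ping6-style utility.
     */
    static int
    pass_only_echo_replies(int rawsock)
    {
            struct icmp6_filter filt;

            ICMP6_FILTER_SETBLOCKALL(&filt);                /* start closed */
            ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
            return (setsockopt(rawsock, IPPROTO_ICMPV6, ICMP6_FILTER,
                &filt, sizeof(filt)));
    }
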
diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h
new file mode 100644
index 0000000..92e23c4
--- /dev/null
+++ b/sys/netinet/icmp_var.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_ICMP_VAR_H_
+#define _NETINET_ICMP_VAR_H_
+
+
+/*
+ * Variables related to this implementation
+ * of the internet control message protocol.
+ */
+struct icmpstat {
+/* statistics related to icmp packets generated */
+ u_long icps_error; /* # of calls to icmp_error */
+ u_long icps_oldshort; /* no error 'cuz old ip too short */
+ u_long icps_oldicmp; /* no error 'cuz old was icmp */
+ u_long icps_outhist[ICMP_MAXTYPE + 1];
+/* statistics related to input messages processed */
+ u_long icps_badcode; /* icmp_code out of range */
+ u_long icps_tooshort; /* packet < ICMP_MINLEN */
+ u_long icps_checksum; /* bad checksum */
+ u_long icps_badlen; /* calculated bound mismatch */
+ u_long icps_reflect; /* number of responses */
+ u_long icps_inhist[ICMP_MAXTYPE + 1];
+ u_long icps_bmcastecho; /* b/mcast echo requests dropped */
+ u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */
+ u_long icps_badaddr; /* bad return address */
+ u_long icps_noroute; /* no route back */
+};
+
+/*
+ * Names for ICMP sysctl objects
+ */
+#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */
+#define ICMPCTL_STATS 2 /* statistics (read-only) */
+#define ICMPCTL_ICMPLIM 3
+#define ICMPCTL_MAXID 4
+
+#define ICMPCTL_NAMES { \
+ { 0, 0 }, \
+ { "maskrepl", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "icmplim", CTLTYPE_INT }, \
+}
+
+#ifdef _KERNEL
+SYSCTL_DECL(_net_inet_icmp);
+extern int badport_bandlim(int);
+#define BANDLIM_UNLIMITED -1
+#define BANDLIM_ICMP_UNREACH 0
+#define BANDLIM_ICMP_ECHO 1
+#define BANDLIM_ICMP_TSTAMP 2
+#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
+#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
+#define BANDLIM_MAX 4
+#endif
+
+#endif
diff --git a/sys/netinet/if_atm.c b/sys/netinet/if_atm.c
new file mode 100644
index 0000000..934309b
--- /dev/null
+++ b/sys/netinet/if_atm.c
@@ -0,0 +1,278 @@
+/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */
+
+/*
+ *
+ * Copyright (c) 1996 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * IP <=> ATM address resolution.
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_natm.h"
+
+#if defined(INET) || defined(INET6)
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/if_atm.h>
+
+#include <netinet/in.h>
+#include <netinet/if_atm.h>
+
+#ifdef NATM
+#include <netnatm/natm.h>
+#endif
+
+
+#define SDL(s) ((struct sockaddr_dl *)s)
+
+/*
+ * atm_rtrequest: handle ATM rt request (in support of generic code)
+ * inputs: "req" = request code
+ * "rt" = route entry
+ * "info" = rt_addrinfo
+ */
+
+void
+atm_rtrequest(req, rt, info)
+ int req;
+ register struct rtentry *rt;
+ struct rt_addrinfo *info;
+{
+ register struct sockaddr *gate = rt->rt_gateway;
+ struct atm_pseudoioctl api;
+#ifdef NATM
+ struct sockaddr_in *sin;
+ struct natmpcb *npcb = NULL;
+ struct atm_pseudohdr *aph;
+#endif
+ static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
+
+ if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */
+ return;
+
+ switch (req) {
+
+ case RTM_RESOLVE: /* resolve: only happens when cloning */
+ printf("atm_rtrequest: RTM_RESOLVE request detected?\n");
+ break;
+
+ case RTM_ADD:
+
+ /*
+ * route added by a command (e.g. ifconfig, route, arp...).
+ *
+ * first check to see if this is not a host route, in which
+ * case we are being called via "ifconfig" to set the address.
+ */
+
+ if ((rt->rt_flags & RTF_HOST) == 0) {
+ rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl);
+ gate = rt->rt_gateway;
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+ break;
+ }
+
+ if ((rt->rt_flags & RTF_CLONING) != 0) {
+ printf("atm_rtrequest: cloning route detected?\n");
+ break;
+ }
+ if (gate->sa_family != AF_LINK ||
+ gate->sa_len < sizeof(null_sdl)) {
+ log(LOG_DEBUG, "atm_rtrequest: bad gateway value");
+ break;
+ }
+
+ KASSERT(rt->rt_ifp->if_ioctl != NULL,
+ ("atm_rtrequest: null ioctl"));
+#ifdef NATM
+ /*
+ * let native ATM know we are using this VCI/VPI
+ * (i.e. reserve it)
+ */
+ sin = (struct sockaddr_in *) rt_key(rt);
+ if (sin->sin_family != AF_INET)
+ goto failed;
+ aph = (struct atm_pseudohdr *) LLADDR(SDL(gate));
+ npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph),
+ ATM_PH_VPI(aph));
+ if (npcb == NULL)
+ goto failed;
+ npcb->npcb_flags |= NPCB_IP;
+ npcb->ipaddr.s_addr = sin->sin_addr.s_addr;
+ /* XXX: move npcb to llinfo when ATM ARP is ready */
+ rt->rt_llinfo = (caddr_t) npcb;
+ rt->rt_flags |= RTF_LLINFO;
+#endif
+ /*
+ * let the lower level know this circuit is active
+ */
+ bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph));
+ api.rxhand = NULL;
+ if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMENA,
+ (caddr_t)&api) != 0) {
+ printf("atm: couldn't add VC\n");
+ goto failed;
+ }
+
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+
+ break;
+
+failed:
+#ifdef NATM
+ if (npcb) {
+ npcb_free(npcb, NPCB_DESTROY);
+ rt->rt_llinfo = NULL;
+ rt->rt_flags &= ~RTF_LLINFO;
+ }
+#endif
+ rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0,
+ rt_mask(rt), 0, (struct rtentry **) 0);
+ break;
+
+ case RTM_DELETE:
+
+#ifdef NATM
+ /*
+ * tell native ATM we are done with this VC
+ */
+
+ if (rt->rt_flags & RTF_LLINFO) {
+ npcb_free((struct natmpcb *)rt->rt_llinfo,
+ NPCB_DESTROY);
+ rt->rt_llinfo = NULL;
+ rt->rt_flags &= ~RTF_LLINFO;
+ }
+#endif
+ /*
+ * tell the lower layer to disable this circuit
+ */
+
+ bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph));
+ api.rxhand = NULL;
+ (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMDIS,
+ (caddr_t)&api);
+
+ break;
+ }
+}
+
+/*
+ * atmresolve:
+ * inputs:
+ * [1] "rt" = the link level route to use (or null if need to look one up)
+ * [2] "m" = mbuf containing the data to be sent
+ * [3] "dst" = sockaddr_in (IP) address of dest.
+ * output:
+ * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info
+ * return:
+ * 0 == resolve FAILED; note that "m" gets m_freem'd in this case
+ * 1 == resolve OK; desten contains result
+ *
+ * XXX: will need more work if we wish to support ATMARP in the kernel,
+ * but this is enough for PVCs entered via the "route" command.
+ */
+
+int
+atmresolve(rt, m, dst, desten)
+
+register struct rtentry *rt;
+struct mbuf *m;
+register struct sockaddr *dst;
+register struct atm_pseudohdr *desten; /* OUT */
+
+{
+ struct sockaddr_dl *sdl;
+
+ if (m->m_flags & (M_BCAST|M_MCAST)) {
+ log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped");
+ goto bad;
+ }
+
+ if (rt == NULL) {
+ rt = RTALLOC1(dst, 0);
+ if (rt == NULL) goto bad; /* failed */
+ rt->rt_refcnt--; /* don't keep LL references */
+ if ((rt->rt_flags & RTF_GATEWAY) != 0 ||
+ (rt->rt_flags & RTF_LLINFO) == 0 ||
+ /* XXX: are we using LLINFO? */
+ rt->rt_gateway->sa_family != AF_LINK) {
+ goto bad;
+ }
+ }
+
+ /*
+ * note that rt_gateway is a sockaddr_dl which contains the
+ * atm_pseudohdr data structure for this route. we currently
+ * don't need any rt_llinfo info (but will if we want to support
+ * ATM ARP [c.f. if_ether.c]).
+ */
+
+ sdl = SDL(rt->rt_gateway);
+
+ /*
+ * Check that the address family and length are valid and the address
+ * is resolved; otherwise fall through and drop the packet.
+ */
+
+
+ if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) {
+ bcopy(LLADDR(sdl), desten, sdl->sdl_alen);
+ return(1); /* ok, go for it! */
+ }
+
+ /*
+ * we got an entry, but it doesn't have valid link address
+ * info in it (it is prob. the interface route, which has
+ * sdl_alen == 0). dump packet. (fall through to "bad").
+ */
+
+bad:
+ m_freem(m);
+ return(0);
+}
+#endif /* INET */
diff --git a/sys/netinet/if_atm.h b/sys/netinet/if_atm.h
new file mode 100644
index 0000000..b8cddf6
--- /dev/null
+++ b/sys/netinet/if_atm.h
@@ -0,0 +1,47 @@
+/* $FreeBSD$ */
+/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */
+
+/*
+ *
+ * Copyright (c) 1996 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Charles D. Cranor and
+ * Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * if_atm.h
+ */
+
+struct atm_pseudohdr;
+struct mbuf;
+struct rtentry;
+struct sockaddr;
+
+void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *,
+ struct atm_pseudohdr *);
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c
new file mode 100644
index 0000000..77eee3c
--- /dev/null
+++ b/sys/netinet/if_ether.c
@@ -0,0 +1,951 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)if_ether.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+/*
+ * Ethernet address resolution protocol.
+ * TODO:
+ * add "inuse/lock" bit (or ref. count) along with valid bit
+ */
+
+#include "opt_inet.h"
+#include "opt_bdg.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/netisr.h>
+#include <net/if_llc.h>
+#ifdef BRIDGE
+#include <net/ethernet.h>
+#include <net/bridge.h>
+#endif
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/if_ether.h>
+
+#include <net/if_arc.h>
+#include <net/iso88025.h>
+
+#define SIN(s) ((struct sockaddr_in *)s)
+#define SDL(s) ((struct sockaddr_dl *)s)
+
+SYSCTL_DECL(_net_link_ether);
+SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
+
+/* timer values */
+static int arpt_prune = (5*60*1); /* walk list every 5 minutes */
+static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */
+static int arpt_down = 20; /* once declared down, don't send for 20 sec */
+
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW,
+ &arpt_prune, 0, "");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
+ &arpt_keep, 0, "");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW,
+ &arpt_down, 0, "");
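The three SYSCTL_INT() declarations above hang off the `inet` node created earlier, so the timers surface to userland as net.link.ether.inet.prune_intvl, net.link.ether.inet.max_age and net.link.ether.inet.host_down_time. A minimal userland sketch (not part of this patch) that reads one of them through the standard sysctlbyname(3) interface:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int max_age;
	size_t len = sizeof(max_age);

	/* net.link.ether.inet.max_age is the arpt_keep value above. */
	if (sysctlbyname("net.link.ether.inet.max_age", &max_age, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("resolved ARP entries stay valid for %d seconds\n", max_age);
	return (0);
}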
+
+#define rt_expire rt_rmx.rmx_expire
+
+struct llinfo_arp {
+ LIST_ENTRY(llinfo_arp) la_le;
+ struct rtentry *la_rt;
+ struct mbuf *la_hold; /* last packet until resolved/timeout */
+ long la_asked; /* last time we QUERIED for this addr */
+#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */
+};
+
+static LIST_HEAD(, llinfo_arp) llinfo_arp;
+
+struct ifqueue arpintrq;
+static int arp_inuse, arp_allocated, arpinit_done;
+
+static int arp_maxtries = 5;
+static int useloopback = 1; /* use loopback interface for local traffic */
+static int arp_proxyall = 0;
+
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
+ &arp_maxtries, 0, "");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
+ &useloopback, 0, "");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
+ &arp_proxyall, 0, "");
+
+static void arp_init(void);
+static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+static void arprequest(struct ifnet *,
+ struct in_addr *, struct in_addr *, u_char *);
+static void arpintr(void);
+static void arptfree(struct llinfo_arp *);
+static void arptimer(void *);
+static struct llinfo_arp
+ *arplookup(u_long, int, int);
+#ifdef INET
+static void in_arpinput(struct mbuf *);
+#endif
+
+/*
+ * Timeout routine. Age arp_tab entries periodically.
+ */
+/* ARGSUSED */
+static void
+arptimer(ignored_arg)
+ void *ignored_arg;
+{
+ int s = splnet();
+ register struct llinfo_arp *la = LIST_FIRST(&llinfo_arp);
+ struct llinfo_arp *ola;
+
+ timeout(arptimer, (caddr_t)0, arpt_prune * hz);
+ while ((ola = la) != 0) {
+ register struct rtentry *rt = la->la_rt;
+ la = LIST_NEXT(la, la_le);
+ if (rt->rt_expire && rt->rt_expire <= time_second)
+ arptfree(ola); /* timer has expired, clear */
+ }
+ splx(s);
+}
+
+/*
+ * Parallel to llc_rtrequest.
+ */
+static void
+arp_rtrequest(req, rt, info)
+ int req;
+ register struct rtentry *rt;
+ struct rt_addrinfo *info;
+{
+ register struct sockaddr *gate = rt->rt_gateway;
+ register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo;
+ static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
+
+ if (!arpinit_done) {
+ arpinit_done = 1;
+ timeout(arptimer, (caddr_t)0, hz);
+ }
+ if (rt->rt_flags & RTF_GATEWAY)
+ return;
+ switch (req) {
+
+ case RTM_ADD:
+ /*
+		 * XXX: If this is a manually added route to an interface,
+		 * such as an older version of routed or gated might provide,
+		 * restore the cloning bit.
+ */
+ if ((rt->rt_flags & RTF_HOST) == 0 &&
+ SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff)
+ rt->rt_flags |= RTF_CLONING;
+ if (rt->rt_flags & RTF_CLONING) {
+ /*
+ * Case 1: This route should come from a route to iface.
+ */
+ rt_setgate(rt, rt_key(rt),
+ (struct sockaddr *)&null_sdl);
+ gate = rt->rt_gateway;
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+ rt->rt_expire = time_second;
+ break;
+ }
+ /* Announce a new entry if requested. */
+ if (rt->rt_flags & RTF_ANNOUNCE)
+ arprequest(rt->rt_ifp,
+ &SIN(rt_key(rt))->sin_addr,
+ &SIN(rt_key(rt))->sin_addr,
+ (u_char *)LLADDR(SDL(gate)));
+ /*FALLTHROUGH*/
+ case RTM_RESOLVE:
+ if (gate->sa_family != AF_LINK ||
+ gate->sa_len < sizeof(null_sdl)) {
+ log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n");
+ break;
+ }
+ SDL(gate)->sdl_type = rt->rt_ifp->if_type;
+ SDL(gate)->sdl_index = rt->rt_ifp->if_index;
+ if (la != 0)
+ break; /* This happens on a route change */
+ /*
+ * Case 2: This route may come from cloning, or a manual route
+ * add with a LL address.
+ */
+ R_Malloc(la, struct llinfo_arp *, sizeof(*la));
+ rt->rt_llinfo = (caddr_t)la;
+ if (la == 0) {
+ log(LOG_DEBUG, "arp_rtrequest: malloc failed\n");
+ break;
+ }
+ arp_inuse++, arp_allocated++;
+ Bzero(la, sizeof(*la));
+ la->la_rt = rt;
+ rt->rt_flags |= RTF_LLINFO;
+ LIST_INSERT_HEAD(&llinfo_arp, la, la_le);
+
+#ifdef INET
+ /*
+ * This keeps the multicast addresses from showing up
+ * in `arp -a' listings as unresolved. It's not actually
+		 * functional.  The same is then done for broadcast addresses.
+ */
+ if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr)) &&
+ rt->rt_ifp->if_type != IFT_ARCNET) {
+ ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr,
+ LLADDR(SDL(gate)));
+ SDL(gate)->sdl_alen = 6;
+ rt->rt_expire = 0;
+ }
+ if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) {
+ memcpy(LLADDR(SDL(gate)), rt->rt_ifp->if_broadcastaddr,
+ rt->rt_ifp->if_addrlen);
+ SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen;
+ rt->rt_expire = 0;
+ }
+#endif
+
+ if (SIN(rt_key(rt))->sin_addr.s_addr ==
+ (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) {
+ /*
+ * This test used to be
+ * if (loif.if_flags & IFF_UP)
+ * It allowed local traffic to be forced
+ * through the hardware by configuring the loopback down.
+ * However, it causes problems during network configuration
+ * for boards that can't receive packets they send.
+ * It is now necessary to clear "useloopback" and remove
+ * the route to force traffic out to the hardware.
+ */
+ rt->rt_expire = 0;
+ Bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)),
+ SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen);
+ if (useloopback)
+ rt->rt_ifp = loif;
+
+ }
+ break;
+
+ case RTM_DELETE:
+ if (la == 0)
+ break;
+ arp_inuse--;
+ LIST_REMOVE(la, la_le);
+ rt->rt_llinfo = 0;
+ rt->rt_flags &= ~RTF_LLINFO;
+ if (la->la_hold)
+ m_freem(la->la_hold);
+ Free((caddr_t)la);
+ }
+}
+
+/*
+ * Broadcast an ARP request. Caller specifies:
+ * - arp header source ip address
+ * - arp header target ip address
+ * - arp header source ethernet address
+ */
+static void
+arprequest(ifp, sip, tip, enaddr)
+ register struct ifnet *ifp;
+ register struct in_addr *sip, *tip;
+ register u_char *enaddr;
+{
+ register struct mbuf *m;
+ register struct ether_header *eh;
+ register struct arc_header *arh;
+ register struct arphdr *ah;
+ struct sockaddr sa;
+ static u_char llcx[] = { 0x82, 0x40, LLC_SNAP_LSAP, LLC_SNAP_LSAP,
+ LLC_UI, 0x00, 0x00, 0x00, 0x08, 0x06 };
+ u_short ar_hrd;
+
+ if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
+ return;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ switch (ifp->if_type) {
+ case IFT_ARCNET:
+ ar_hrd = htons(ARPHRD_ARCNET);
+
+ m->m_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
+ m->m_pkthdr.len = m->m_len;
+ MH_ALIGN(m, m->m_len);
+
+ arh = (struct arc_header *)sa.sa_data;
+ arh->arc_dhost = *ifp->if_broadcastaddr;
+ arh->arc_type = ARCTYPE_ARP;
+
+ ah = mtod(m, struct arphdr *);
+ break;
+
+ case IFT_ISO88025:
+ ar_hrd = htons(ARPHRD_IEEE802);
+
+ m->m_len = sizeof(llcx) +
+ arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
+ m->m_pkthdr.len = m->m_len;
+ MH_ALIGN(m, m->m_len);
+
+ (void)memcpy(mtod(m, caddr_t), llcx, sizeof(llcx));
+ (void)memcpy(sa.sa_data, ifp->if_broadcastaddr, 6);
+ (void)memcpy(sa.sa_data + 6, enaddr, 6);
+ sa.sa_data[6] |= TR_RII;
+ sa.sa_data[12] = TR_AC;
+ sa.sa_data[13] = TR_LLC_FRAME;
+
+ ah = (struct arphdr *)(mtod(m, char *) + sizeof(llcx));
+ break;
+ case IFT_FDDI:
+ case IFT_ETHER:
+ /*
+ * This may not be correct for types not explicitly
+ * listed, but this is our best guess
+ */
+ default:
+ ar_hrd = htons(ARPHRD_ETHER);
+
+ m->m_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
+ m->m_pkthdr.len = m->m_len;
+ MH_ALIGN(m, m->m_len);
+
+ eh = (struct ether_header *)sa.sa_data;
+ /* if_output will not swap */
+ eh->ether_type = htons(ETHERTYPE_ARP);
+ (void)memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
+ sizeof(eh->ether_dhost));
+
+ ah = mtod(m, struct arphdr *);
+ break;
+ }
+
+ ah->ar_hrd = ar_hrd;
+ ah->ar_pro = htons(ETHERTYPE_IP);
+ ah->ar_hln = ifp->if_addrlen; /* hardware address length */
+ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */
+ ah->ar_op = htons(ARPOP_REQUEST);
+ (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
+ (void)memcpy(ar_spa(ah), sip, ah->ar_pln);
+ (void)memcpy(ar_tpa(ah), tip, ah->ar_pln);
+
+ sa.sa_family = AF_UNSPEC;
+ sa.sa_len = sizeof(sa);
+ (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0);
+}
+
+/*
+ * Resolve an IP address into an ethernet address. If success,
+ * desten is filled in. If there is no entry in arptab,
+ * set one up and broadcast a request for the IP address.
+ * Hold onto this mbuf and resend it once the address
+ * is finally resolved. A return value of 1 indicates
+ * that desten has been filled in and the packet should be sent
+ * normally; a 0 return indicates that the packet has been
+ * taken over here, either now or for later transmission.
+ */
+int
+arpresolve(ifp, rt, m, dst, desten, rt0)
+ register struct ifnet *ifp;
+ register struct rtentry *rt;
+ struct mbuf *m;
+ register struct sockaddr *dst;
+ register u_char *desten;
+ struct rtentry *rt0;
+{
+ struct llinfo_arp *la = 0;
+ struct sockaddr_dl *sdl;
+
+ if (m->m_flags & M_BCAST) { /* broadcast */
+ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen);
+ return (1);
+ }
+ if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {/* multicast */
+ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
+ return(1);
+ }
+ if (rt)
+ la = (struct llinfo_arp *)rt->rt_llinfo;
+ if (la == 0) {
+ la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0);
+ if (la)
+ rt = la->la_rt;
+ }
+ if (la == 0 || rt == 0) {
+ log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s%s%s\n",
+ inet_ntoa(SIN(dst)->sin_addr), la ? "la" : "",
+ rt ? "rt" : "");
+ m_freem(m);
+ return (0);
+ }
+ sdl = SDL(rt->rt_gateway);
+ /*
+	 * If the entry has not expired and carries a valid link-level
+	 * address (AF_LINK family, non-zero length), use it; otherwise,
+	 * try to resolve the address.
+ */
+ if ((rt->rt_expire == 0 || rt->rt_expire > time_second) &&
+ sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) {
+ /*
+ * If entry has an expiry time and it is approaching,
+ * see if we need to send an ARP request within this
+ * arpt_down interval.
+ */
+ if ((rt->rt_expire != 0) &&
+ (time_second + (arp_maxtries - la->la_asked) * arpt_down >
+ rt->rt_expire)) {
+ arprequest(ifp,
+ &SIN(rt->rt_ifa->ifa_addr)->sin_addr,
+ &SIN(dst)->sin_addr,
+ IF_LLADDR(ifp));
+ la->la_asked++;
+ }
+
+ bcopy(LLADDR(sdl), desten, sdl->sdl_alen);
+ return 1;
+ }
+ /*
+ * If ARP is disabled on this interface, stop.
+ * XXX
+ * Probably should not allocate empty llinfo struct if we are
+ * not going to be sending out an arp request.
+ */
+ if (ifp->if_flags & IFF_NOARP) {
+ m_freem(m);
+ return (0);
+ }
+ /*
+ * There is an arptab entry, but no ethernet address
+ * response yet. Replace the held mbuf with this
+ * latest one.
+ */
+ if (la->la_hold)
+ m_freem(la->la_hold);
+ la->la_hold = m;
+ if (rt->rt_expire) {
+ rt->rt_flags &= ~RTF_REJECT;
+ if (la->la_asked == 0 || rt->rt_expire != time_second) {
+ rt->rt_expire = time_second;
+ if (la->la_asked++ < arp_maxtries)
+ arprequest(ifp,
+ &SIN(rt->rt_ifa->ifa_addr)->sin_addr,
+ &SIN(dst)->sin_addr,
+ IF_LLADDR(ifp));
+ else {
+ rt->rt_flags |= RTF_REJECT;
+ rt->rt_expire += arpt_down;
+ la->la_asked = 0;
+ }
+
+ }
+ }
+ return (0);
+}
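The block comment above arpresolve() defines its contract: a return of 1 means desten has been filled in and the frame can be transmitted now, while 0 means the mbuf has been taken over (held for retransmission or freed) and must not be touched again. A minimal caller-side sketch of that contract, assuming an ether_output()-style routine whose locals (ifp, rt0, m, dst, eh) are hypothetical stand-ins for the caller's state:

	u_char edst[ETHER_ADDR_LEN];

	/* Sketch only: honor the 0/1 return convention described above. */
	if (!arpresolve(ifp, rt0, m, dst, edst, rt0))
		return (0);		/* mbuf held or freed by the ARP code */
	/* Resolved: fill in the link-level destination and keep going. */
	(void)memcpy(eh->ether_dhost, edst, sizeof(eh->ether_dhost));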
+
+/*
+ * Common length and type checks are done here,
+ * then the protocol-specific routine is called.
+ */
+static void
+arpintr()
+{
+ register struct mbuf *m;
+ register struct arphdr *ar;
+ int s;
+
+ if (!arpinit_done) {
+ arpinit_done = 1;
+ timeout(arptimer, (caddr_t)0, hz);
+ }
+ while (arpintrq.ifq_head) {
+ s = splimp();
+ IF_DEQUEUE(&arpintrq, m);
+ splx(s);
+ if (m == 0 || (m->m_flags & M_PKTHDR) == 0)
+ panic("arpintr");
+
+ if (m->m_len < sizeof(struct arphdr) &&
+ ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
+ log(LOG_ERR, "arp: runt packet -- m_pullup failed\n");
+ continue;
+ }
+ ar = mtod(m, struct arphdr *);
+
+ if (ntohs(ar->ar_hrd) != ARPHRD_ETHER
+ && ntohs(ar->ar_hrd) != ARPHRD_IEEE802
+ && ntohs(ar->ar_hrd) != ARPHRD_ARCNET) {
+ log(LOG_ERR,
+ "arp: unknown hardware address format (0x%2D)\n",
+ (unsigned char *)&ar->ar_hrd, "");
+ m_freem(m);
+ continue;
+ }
+
+ if (m->m_pkthdr.len < arphdr_len(ar) &&
+ (m = m_pullup(m, arphdr_len(ar))) == NULL) {
+ log(LOG_ERR, "arp: runt packet\n");
+ m_freem(m);
+ continue;
+ }
+
+ switch (ntohs(ar->ar_pro)) {
+#ifdef INET
+ case ETHERTYPE_IP:
+ in_arpinput(m);
+ continue;
+#endif
+ }
+ m_freem(m);
+ }
+}
+
+#ifdef INET
+/*
+ * ARP for Internet protocols on 10 Mb/s Ethernet.
+ * Algorithm is that given in RFC 826.
+ * In addition, a sanity check is performed on the sender
+ * protocol address, to catch impersonators.
+ * We no longer handle negotiations for use of trailer protocol:
+ * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
+ * along with IP replies if we wanted trailers sent to us,
+ * and also sent them in response to IP replies.
+ * This allowed either end to announce the desire to receive
+ * trailer packets.
+ * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
+ * but formerly didn't normally send requests.
+ */
+static int log_arp_wrong_iface = 1;
+static int log_arp_movements = 1;
+
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
+ &log_arp_wrong_iface, 0,
+ "log arp packets arriving on the wrong interface");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
+ &log_arp_movements, 0,
+ "log arp replies from MACs different than the one in the cache");
+
+
+static void
+in_arpinput(m)
+ struct mbuf *m;
+{
+ register struct arphdr *ah;
+ register struct ifnet *ifp = m->m_pkthdr.rcvif;
+ struct ether_header *eh;
+ struct arc_header *arh;
+ struct iso88025_header *th = (struct iso88025_header *)0;
+ struct iso88025_sockaddr_dl_data *trld;
+ register struct llinfo_arp *la = 0;
+ register struct rtentry *rt;
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ struct sockaddr_dl *sdl;
+ struct sockaddr sa;
+ struct in_addr isaddr, itaddr, myaddr;
+ int op, rif_len;
+ int req_len;
+
+ req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
+ if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
+ log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n");
+ return;
+ }
+
+ ah = mtod(m, struct arphdr *);
+ op = ntohs(ah->ar_op);
+ (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
+ (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
+#ifdef BRIDGE
+#define BRIDGE_TEST (do_bridge)
+#else
+#define BRIDGE_TEST (0) /* cc will optimise the test away */
+#endif
+ /*
+ * For a bridge, we want to check the address irrespective
+ * of the receive interface. (This will change slightly
+ * when we have clusters of interfaces).
+ */
+ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash)
+ if ((BRIDGE_TEST || (ia->ia_ifp == ifp)) &&
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr)
+ goto match;
+ LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
+ if ((BRIDGE_TEST || (ia->ia_ifp == ifp)) &&
+ isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)
+ goto match;
+ /*
+ * No match, use the first inet address on the receive interface
+ * as a dummy address for the rest of the function.
+ */
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
+ ia = ifatoia(ifa);
+ goto match;
+ }
+ /*
+ * If bridging, fall back to using any inet address.
+ */
+ if (!BRIDGE_TEST ||
+ (ia = TAILQ_FIRST(&in_ifaddrhead)) == NULL) {
+ m_freem(m);
+ return;
+ }
+match:
+ myaddr = ia->ia_addr.sin_addr;
+ if (!bcmp(ar_sha(ah), IF_LLADDR(ifp), ifp->if_addrlen)) {
+ m_freem(m); /* it's from me, ignore it. */
+ return;
+ }
+ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
+ log(LOG_ERR,
+ "arp: link address is broadcast for IP address %s!\n",
+ inet_ntoa(isaddr));
+ m_freem(m);
+ return;
+ }
+ if (isaddr.s_addr == myaddr.s_addr) {
+ log(LOG_ERR,
+ "arp: %*D is using my IP address %s!\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr));
+ itaddr = myaddr;
+ goto reply;
+ }
+ la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0);
+ if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) {
+ /* the following is not an error when doing bridging */
+ if (!BRIDGE_TEST && rt->rt_ifp != ifp) {
+ if (log_arp_wrong_iface)
+ log(LOG_ERR, "arp: %s is on %s%d but got reply from %*D on %s%d\n",
+ inet_ntoa(isaddr),
+ rt->rt_ifp->if_name, rt->rt_ifp->if_unit,
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_name, ifp->if_unit);
+ goto reply;
+ }
+ if (sdl->sdl_alen &&
+ bcmp(ar_sha(ah), LLADDR(sdl), sdl->sdl_alen)) {
+ if (rt->rt_expire) {
+ if (log_arp_movements)
+ log(LOG_INFO, "arp: %s moved from %*D to %*D on %s%d\n",
+ inet_ntoa(isaddr),
+ ifp->if_addrlen, (u_char *)LLADDR(sdl), ":",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_name, ifp->if_unit);
+ } else {
+ log(LOG_ERR,
+ "arp: %*D attempts to modify permanent entry for %s on %s%d\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_name, ifp->if_unit);
+ goto reply;
+ }
+ }
+ /*
+ * sanity check for the address length.
+ * XXX this does not work for protocols with variable address
+ * length. -is
+ */
+ if (sdl->sdl_alen &&
+ sdl->sdl_alen != ah->ar_hln) {
+ log(LOG_WARNING,
+ "arp from %*D: new addr len %d, was %d",
+ ifp->if_addrlen, (u_char *) ar_sha(ah), ":",
+ ah->ar_hln, sdl->sdl_alen);
+ }
+ if (ifp->if_addrlen != ah->ar_hln) {
+ log(LOG_WARNING,
+ "arp from %*D: addr len: new %d, i/f %d (ignored)",
+ ifp->if_addrlen, (u_char *) ar_sha(ah), ":",
+ ah->ar_hln, ifp->if_addrlen);
+ goto reply;
+ }
+ (void)memcpy(LLADDR(sdl), ar_sha(ah),
+ sdl->sdl_alen = ah->ar_hln);
+ /*
+ * If we receive an arp from a token-ring station over
+ * a token-ring nic then try to save the source
+ * routing info.
+ */
+ if (ifp->if_type == IFT_ISO88025) {
+ th = (struct iso88025_header *)m->m_pkthdr.header;
+ trld = SDL_ISO88025(sdl);
+ rif_len = TR_RCF_RIFLEN(th->rcf);
+ if ((th->iso88025_shost[0] & TR_RII) &&
+ (rif_len > 2)) {
+ trld->trld_rcf = th->rcf;
+ trld->trld_rcf ^= htons(TR_RCF_DIR);
+ memcpy(trld->trld_route, th->rd, rif_len - 2);
+ trld->trld_rcf &= ~htons(TR_RCF_BCST_MASK);
+ /*
+ * Set up source routing information for
+ * reply packet (XXX)
+ */
+ m->m_data -= rif_len;
+ m->m_len += rif_len;
+ m->m_pkthdr.len += rif_len;
+ } else {
+ th->iso88025_shost[0] &= ~TR_RII;
+ trld->trld_rcf = 0;
+ }
+ m->m_data -= 8;
+ m->m_len += 8;
+ m->m_pkthdr.len += 8;
+ th->rcf = trld->trld_rcf;
+ }
+ if (rt->rt_expire)
+ rt->rt_expire = time_second + arpt_keep;
+ rt->rt_flags &= ~RTF_REJECT;
+ la->la_asked = 0;
+ if (la->la_hold) {
+ (*ifp->if_output)(ifp, la->la_hold,
+ rt_key(rt), rt);
+ la->la_hold = 0;
+ }
+ }
+reply:
+ if (op != ARPOP_REQUEST) {
+ m_freem(m);
+ return;
+ }
+ if (itaddr.s_addr == myaddr.s_addr) {
+ /* I am the target */
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln);
+ } else {
+ la = arplookup(itaddr.s_addr, 0, SIN_PROXY);
+ if (la == NULL) {
+ struct sockaddr_in sin;
+
+ if (!arp_proxyall) {
+ m_freem(m);
+ return;
+ }
+
+ bzero(&sin, sizeof sin);
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof sin;
+ sin.sin_addr = itaddr;
+
+ rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL);
+ if (!rt) {
+ m_freem(m);
+ return;
+ }
+ /*
+ * Don't send proxies for nodes on the same interface
+ * as this one came out of, or we'll get into a fight
+ * over who claims what Ether address.
+ */
+ if (rt->rt_ifp == ifp) {
+ rtfree(rt);
+ m_freem(m);
+ return;
+ }
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), IF_LLADDR(ifp), ah->ar_hln);
+ rtfree(rt);
+
+ /*
+ * Also check that the node which sent the ARP packet
+			 * is on the interface we expect it to be on.  This
+ * avoids ARP chaos if an interface is connected to the
+ * wrong network.
+ */
+ sin.sin_addr = isaddr;
+
+ rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL);
+ if (!rt) {
+ m_freem(m);
+ return;
+ }
+ if (rt->rt_ifp != ifp) {
+ log(LOG_INFO, "arp_proxy: ignoring request"
+ " from %s via %s%d, expecting %s%d\n",
+ inet_ntoa(isaddr), ifp->if_name,
+ ifp->if_unit, rt->rt_ifp->if_name,
+ rt->rt_ifp->if_unit);
+ rtfree(rt);
+ m_freem(m);
+ return;
+ }
+ rtfree(rt);
+
+#ifdef DEBUG_PROXY
+ printf("arp: proxying for %s\n",
+ inet_ntoa(itaddr));
+#endif
+ } else {
+ rt = la->la_rt;
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ sdl = SDL(rt->rt_gateway);
+ (void)memcpy(ar_sha(ah), LLADDR(sdl), ah->ar_hln);
+ }
+ }
+
+ (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
+ (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
+ ah->ar_op = htons(ARPOP_REPLY);
+ ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
+ switch (ifp->if_type) {
+ case IFT_ARCNET:
+ arh = (struct arc_header *)sa.sa_data;
+ arh->arc_dhost = *ar_tha(ah);
+ arh->arc_type = ARCTYPE_ARP;
+ break;
+
+ case IFT_ISO88025:
+ /* Re-arrange the source/dest address */
+ memcpy(th->iso88025_dhost, th->iso88025_shost,
+ sizeof(th->iso88025_dhost));
+ memcpy(th->iso88025_shost, IF_LLADDR(ifp),
+ sizeof(th->iso88025_shost));
+		/* Set the source routing bit if necessary */
+ if (th->iso88025_dhost[0] & TR_RII) {
+ th->iso88025_dhost[0] &= ~TR_RII;
+ if (TR_RCF_RIFLEN(th->rcf) > 2)
+ th->iso88025_shost[0] |= TR_RII;
+ }
+ /* Copy the addresses, ac and fc into sa_data */
+ memcpy(sa.sa_data, th->iso88025_dhost,
+ sizeof(th->iso88025_dhost) * 2);
+ sa.sa_data[(sizeof(th->iso88025_dhost) * 2)] = TR_AC;
+ sa.sa_data[(sizeof(th->iso88025_dhost) * 2) + 1] = TR_LLC_FRAME;
+ break;
+ case IFT_ETHER:
+ case IFT_FDDI:
+ /*
+		 * May not be correct for types not explicitly
+ * listed, but it is our best guess.
+ */
+ default:
+ eh = (struct ether_header *)sa.sa_data;
+ (void)memcpy(eh->ether_dhost, ar_tha(ah),
+ sizeof(eh->ether_dhost));
+ eh->ether_type = htons(ETHERTYPE_ARP);
+ break;
+ }
+ sa.sa_family = AF_UNSPEC;
+ sa.sa_len = sizeof(sa);
+ (*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0);
+ return;
+}
+#endif
+
+/*
+ * Free an arp entry.
+ */
+static void
+arptfree(la)
+ register struct llinfo_arp *la;
+{
+ register struct rtentry *rt = la->la_rt;
+ register struct sockaddr_dl *sdl;
+ if (rt == 0)
+ panic("arptfree");
+ if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) &&
+ sdl->sdl_family == AF_LINK) {
+ sdl->sdl_alen = 0;
+ la->la_asked = 0;
+ rt->rt_flags &= ~RTF_REJECT;
+ return;
+ }
+ rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt),
+ 0, (struct rtentry **)0);
+}
+/*
+ * Lookup or enter a new address in arptab.
+ */
+static struct llinfo_arp *
+arplookup(addr, create, proxy)
+ u_long addr;
+ int create, proxy;
+{
+ register struct rtentry *rt;
+ static struct sockaddr_inarp sin = {sizeof(sin), AF_INET };
+ const char *why = 0;
+
+ sin.sin_addr.s_addr = addr;
+ sin.sin_other = proxy ? SIN_PROXY : 0;
+ rt = rtalloc1((struct sockaddr *)&sin, create, 0UL);
+ if (rt == 0)
+ return (0);
+ rt->rt_refcnt--;
+
+ if (rt->rt_flags & RTF_GATEWAY)
+ why = "host is not on local network";
+ else if ((rt->rt_flags & RTF_LLINFO) == 0)
+ why = "could not allocate llinfo";
+ else if (rt->rt_gateway->sa_family != AF_LINK)
+ why = "gateway route is not ours";
+
+ if (why && create) {
+ log(LOG_DEBUG, "arplookup %s failed: %s\n",
+ inet_ntoa(sin.sin_addr), why);
+ return 0;
+ } else if (why) {
+ return 0;
+ }
+ return ((struct llinfo_arp *)rt->rt_llinfo);
+}
+
+void
+arp_ifinit(ifp, ifa)
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+{
+ if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
+ arprequest(ifp, &IA_SIN(ifa)->sin_addr,
+ &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
+ ifa->ifa_rtrequest = arp_rtrequest;
+ ifa->ifa_flags |= RTF_CLONING;
+}
+
+static void
+arp_init(void)
+{
+
+ arpintrq.ifq_maxlen = 50;
+ mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF);
+ LIST_INIT(&llinfo_arp);
+ register_netisr(NETISR_ARP, arpintr);
+}
+
+SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h
new file mode 100644
index 0000000..6b31758
--- /dev/null
+++ b/sys/netinet/if_ether.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)if_ether.h 8.3 (Berkeley) 5/2/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IF_ETHER_H_
+#define _NETINET_IF_ETHER_H_
+
+#include <net/ethernet.h>
+#include <net/if_arp.h>
+
+/*
+ * Macro to map an IP multicast address to an Ethernet multicast address.
+ * The high-order 25 bits of the Ethernet address are statically assigned,
+ * and the low-order 23 bits are taken from the low end of the IP address.
+ */
+#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \
+ /* struct in_addr *ipaddr; */ \
+ /* u_char enaddr[ETHER_ADDR_LEN]; */ \
+{ \
+ (enaddr)[0] = 0x01; \
+ (enaddr)[1] = 0x00; \
+ (enaddr)[2] = 0x5e; \
+ (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \
+ (enaddr)[4] = ((u_char *)ipaddr)[2]; \
+ (enaddr)[5] = ((u_char *)ipaddr)[3]; \
+}
+/*
+ * Macro to map an IP6 multicast address to an Ethernet multicast address.
+ * The high-order 16 bits of the Ethernet address are statically assigned,
+ * and the low-order 32 bits are taken from the low end of the IP6 address.
+ */
+#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \
+/* struct in6_addr *ip6addr; */ \
+/* u_char enaddr[ETHER_ADDR_LEN]; */ \
+{ \
+ (enaddr)[0] = 0x33; \
+ (enaddr)[1] = 0x33; \
+ (enaddr)[2] = ((u_char *)ip6addr)[12]; \
+ (enaddr)[3] = ((u_char *)ip6addr)[13]; \
+ (enaddr)[4] = ((u_char *)ip6addr)[14]; \
+ (enaddr)[5] = ((u_char *)ip6addr)[15]; \
+}
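A worked example of the IPv4 mapping above (the group addresses are arbitrary illustrations): for 224.1.2.3 the low 23 bits land in the last three octets, so the macro yields 01:00:5e:01:02:03; for 239.129.2.3 the second octet (129, 0x81) is masked to 0x01 by the `& 0x7f`, yielding the same 01:00:5e:01:02:03. Five bits of the group address are discarded, which is why 32 distinct IPv4 groups share each Ethernet multicast address.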
+
+/*
+ * Ethernet Address Resolution Protocol.
+ *
+ * See RFC 826 for protocol description. Structure below is adapted
+ * to resolving internet addresses. Field names used correspond to
+ * RFC 826.
+ */
+struct ether_arp {
+ struct arphdr ea_hdr; /* fixed-size header */
+ u_char arp_sha[ETHER_ADDR_LEN]; /* sender hardware address */
+ u_char arp_spa[4]; /* sender protocol address */
+ u_char arp_tha[ETHER_ADDR_LEN]; /* target hardware address */
+ u_char arp_tpa[4]; /* target protocol address */
+};
+#define arp_hrd ea_hdr.ar_hrd
+#define arp_pro ea_hdr.ar_pro
+#define arp_hln ea_hdr.ar_hln
+#define arp_pln ea_hdr.ar_pln
+#define arp_op ea_hdr.ar_op
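struct ether_arp and the accessor macros above pin down the 28-byte RFC 826 packet body for the Ethernet/IPv4 case (8-byte arphdr plus 6+4+6+4 bytes of addresses). A hedged userland sketch, not part of this patch, of filling one in as a request; the function name and its arguments are hypothetical, and the constants come from the headers this file already includes:

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

static void
fill_arp_request(struct ether_arp *ea, const u_char *src_mac,
    struct in_addr src_ip, struct in_addr target_ip)
{
	memset(ea, 0, sizeof(*ea));		/* arp_tha stays zero */
	ea->arp_hrd = htons(ARPHRD_ETHER);
	ea->arp_pro = htons(ETHERTYPE_IP);
	ea->arp_hln = ETHER_ADDR_LEN;		/* 6-byte hardware address */
	ea->arp_pln = sizeof(struct in_addr);	/* 4-byte protocol address */
	ea->arp_op = htons(ARPOP_REQUEST);
	memcpy(ea->arp_sha, src_mac, ETHER_ADDR_LEN);
	memcpy(ea->arp_spa, &src_ip, sizeof(src_ip));
	memcpy(ea->arp_tpa, &target_ip, sizeof(target_ip));
}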
+
+struct sockaddr_inarp {
+ u_char sin_len;
+ u_char sin_family;
+ u_short sin_port;
+ struct in_addr sin_addr;
+ struct in_addr sin_srcaddr;
+ u_short sin_tos;
+ u_short sin_other;
+#define SIN_PROXY 1
+};
+/*
+ * IP and ethernet specific routing flags
+ */
+#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */
+#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */
+
+#ifdef _KERNEL
+extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN];
+extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN];
+extern struct ifqueue arpintrq;
+
+int arpresolve(struct ifnet *, struct rtentry *, struct mbuf *,
+ struct sockaddr *, u_char *, struct rtentry *);
+void arp_ifinit(struct ifnet *, struct ifaddr *);
+#endif
+
+#endif
diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c
new file mode 100644
index 0000000..12a1552
--- /dev/null
+++ b/sys/netinet/igmp.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 1988 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)igmp.c 8.1 (Berkeley) 7/19/93
+ * $FreeBSD$
+ */
+
+/*
+ * Internet Group Management Protocol (IGMP) routines.
+ *
+ * Written by Steve Deering, Stanford, May 1988.
+ * Modified by Rosen Sharma, Stanford, Aug 1994.
+ * Modified by Bill Fenner, Xerox PARC, Feb 1995.
+ * Modified to fully comply with IGMPv2 by Bill Fenner, Oct 1995.
+ *
+ * MULTICAST Revision: 3.5.1.4
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/igmp.h>
+#include <netinet/igmp_var.h>
+
+#include <machine/in_cksum.h>
+
+static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
+
+static struct router_info *
+ find_rti(struct ifnet *ifp);
+
+static struct igmpstat igmpstat;
+
+SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
+ &igmpstat, igmpstat, "");
+
+static int igmp_timers_are_running;
+static u_long igmp_all_hosts_group;
+static u_long igmp_all_rtrs_group;
+static struct mbuf *router_alert;
+static struct router_info *Head;
+
+static void igmp_sendpkt(struct in_multi *, int, unsigned long);
+
+void
+igmp_init()
+{
+ struct ipoption *ra;
+
+ /*
+ * To avoid byte-swapping the same value over and over again.
+ */
+ igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP);
+ igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP);
+
+ igmp_timers_are_running = 0;
+
+ /*
+ * Construct a Router Alert option to use in outgoing packets
+ */
+ MGET(router_alert, M_DONTWAIT, MT_DATA);
+ ra = mtod(router_alert, struct ipoption *);
+ ra->ipopt_dst.s_addr = 0;
+ ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */
+ ra->ipopt_list[1] = 0x04; /* 4 bytes long */
+ ra->ipopt_list[2] = 0x00;
+ ra->ipopt_list[3] = 0x00;
+ router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1];
+
+ Head = (struct router_info *) 0;
+}
+
+static struct router_info *
+find_rti(ifp)
+ struct ifnet *ifp;
+{
+ register struct router_info *rti = Head;
+
+#ifdef IGMP_DEBUG
+ printf("[igmp.c, _find_rti] --> entering \n");
+#endif
+ while (rti) {
+ if (rti->rti_ifp == ifp) {
+#ifdef IGMP_DEBUG
+ printf("[igmp.c, _find_rti] --> found old entry \n");
+#endif
+ return rti;
+ }
+ rti = rti->rti_next;
+ }
+ MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT);
+ rti->rti_ifp = ifp;
+ rti->rti_type = IGMP_V2_ROUTER;
+ rti->rti_time = 0;
+ rti->rti_next = Head;
+ Head = rti;
+#ifdef IGMP_DEBUG
+ printf("[igmp.c, _find_rti] --> created an entry \n");
+#endif
+ return rti;
+}
+
+void
+igmp_input(m, off)
+ register struct mbuf *m;
+ int off;
+{
+ register int iphlen = off;
+ register struct igmp *igmp;
+ register struct ip *ip;
+ register int igmplen;
+ register struct ifnet *ifp = m->m_pkthdr.rcvif;
+ register int minlen;
+ register struct in_multi *inm;
+ register struct in_ifaddr *ia;
+ struct in_multistep step;
+ struct router_info *rti;
+
+ int timer; /** timer value in the igmp query header **/
+
+ ++igmpstat.igps_rcv_total;
+
+ ip = mtod(m, struct ip *);
+ igmplen = ip->ip_len;
+
+ /*
+ * Validate lengths
+ */
+ if (igmplen < IGMP_MINLEN) {
+ ++igmpstat.igps_rcv_tooshort;
+ m_freem(m);
+ return;
+ }
+ minlen = iphlen + IGMP_MINLEN;
+ if ((m->m_flags & M_EXT || m->m_len < minlen) &&
+ (m = m_pullup(m, minlen)) == 0) {
+ ++igmpstat.igps_rcv_tooshort;
+ return;
+ }
+
+ /*
+ * Validate checksum
+ */
+ m->m_data += iphlen;
+ m->m_len -= iphlen;
+ igmp = mtod(m, struct igmp *);
+ if (in_cksum(m, igmplen)) {
+ ++igmpstat.igps_rcv_badsum;
+ m_freem(m);
+ return;
+ }
+ m->m_data -= iphlen;
+ m->m_len += iphlen;
+
+ ip = mtod(m, struct ip *);
+ timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
+ if (timer == 0)
+ timer = 1;
+ rti = find_rti(ifp);
+
+ /*
+ * In the IGMPv2 specification, there are 3 states and a flag.
+ *
+ * In Non-Member state, we simply don't have a membership record.
+ * In Delaying Member state, our timer is running (inm->inm_timer)
+ * In Idle Member state, our timer is not running (inm->inm_timer==0)
+ *
+ * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if
+ * we have heard a report from another member, or IGMP_IREPORTEDLAST
+ * if I sent the last report.
+ */
+ switch (igmp->igmp_type) {
+
+ case IGMP_MEMBERSHIP_QUERY:
+ ++igmpstat.igps_rcv_queries;
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ break;
+
+ if (igmp->igmp_code == 0) {
+ /*
+ * Old router. Remember that the querier on this
+ * interface is old, and set the timer to the
+ * value in RFC 1112.
+ */
+
+ rti->rti_type = IGMP_V1_ROUTER;
+ rti->rti_time = 0;
+
+ timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ;
+
+ if (ip->ip_dst.s_addr != igmp_all_hosts_group ||
+ igmp->igmp_group.s_addr != 0) {
+ ++igmpstat.igps_rcv_badqueries;
+ m_freem(m);
+ return;
+ }
+ } else {
+ /*
+ * New router. Simply do the new validity check.
+ */
+
+ if (igmp->igmp_group.s_addr != 0 &&
+ !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) {
+ ++igmpstat.igps_rcv_badqueries;
+ m_freem(m);
+ return;
+ }
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+		 *   which the query arrived, excluding those that belong
+ * to the "all-hosts" group (224.0.0.1).
+ * - Restart any timer that is already running but has
+ * a value longer than the requested timeout.
+ * - Use the value specified in the query message as
+ * the maximum timeout.
+ */
+ IN_FIRST_MULTI(step, inm);
+ while (inm != NULL) {
+ if (inm->inm_ifp == ifp &&
+ inm->inm_addr.s_addr != igmp_all_hosts_group &&
+ (igmp->igmp_group.s_addr == 0 ||
+ igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) {
+ if (inm->inm_timer == 0 ||
+ inm->inm_timer > timer) {
+ inm->inm_timer =
+ IGMP_RANDOM_DELAY(timer);
+ igmp_timers_are_running = 1;
+ }
+ }
+ IN_NEXT_MULTI(step, inm);
+ }
+
+ break;
+
+ case IGMP_V1_MEMBERSHIP_REPORT:
+ case IGMP_V2_MEMBERSHIP_REPORT:
+ /*
+ * For fast leave to work, we have to know that we are the
+ * last person to send a report for this group. Reports
+ * can potentially get looped back if we are a multicast
+ * router, so discard reports sourced by me.
+ */
+ IFP_TO_IA(ifp, ia);
+ if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr)
+ break;
+
+ ++igmpstat.igps_rcv_reports;
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ break;
+
+ if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) {
+ ++igmpstat.igps_rcv_badreports;
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * KLUDGE: if the IP source address of the report has an
+ * unspecified (i.e., zero) subnet number, as is allowed for
+ * a booting host, replace it with the correct subnet number
+ * so that a process-level multicast routing daemon can
+ * determine which subnet it arrived from. This is necessary
+ * to compensate for the lack of any way for a process to
+ * determine the arrival interface of an incoming packet.
+ */
+ if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0)
+ if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet);
+
+ /*
+ * If we belong to the group being reported, stop
+ * our timer for that group.
+ */
+ IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm);
+
+ if (inm != NULL) {
+ inm->inm_timer = 0;
+ ++igmpstat.igps_rcv_ourreports;
+
+ inm->inm_state = IGMP_OTHERMEMBER;
+ }
+
+ break;
+ }
+
+ /*
+ * Pass all valid IGMP packets up to any process(es) listening
+ * on a raw IGMP socket.
+ */
+ rip_input(m, off);
+}
+
+void
+igmp_joingroup(inm)
+ struct in_multi *inm;
+{
+ int s = splnet();
+
+ if (inm->inm_addr.s_addr == igmp_all_hosts_group
+ || inm->inm_ifp->if_flags & IFF_LOOPBACK) {
+ inm->inm_timer = 0;
+ inm->inm_state = IGMP_OTHERMEMBER;
+ } else {
+ inm->inm_rti = find_rti(inm->inm_ifp);
+ igmp_sendpkt(inm, inm->inm_rti->rti_type, 0);
+ inm->inm_timer = IGMP_RANDOM_DELAY(
+ IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ);
+ inm->inm_state = IGMP_IREPORTEDLAST;
+ igmp_timers_are_running = 1;
+ }
+ splx(s);
+}
+
+void
+igmp_leavegroup(inm)
+ struct in_multi *inm;
+{
+ if (inm->inm_state == IGMP_IREPORTEDLAST &&
+ inm->inm_addr.s_addr != igmp_all_hosts_group &&
+ !(inm->inm_ifp->if_flags & IFF_LOOPBACK) &&
+ inm->inm_rti->rti_type != IGMP_V1_ROUTER)
+ igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group);
+}
+
+void
+igmp_fasttimo()
+{
+ register struct in_multi *inm;
+ struct in_multistep step;
+ int s;
+
+ /*
+ * Quick check to see if any work needs to be done, in order
+ * to minimize the overhead of fasttimo processing.
+ */
+
+ if (!igmp_timers_are_running)
+ return;
+
+ s = splnet();
+ igmp_timers_are_running = 0;
+ IN_FIRST_MULTI(step, inm);
+ while (inm != NULL) {
+ if (inm->inm_timer == 0) {
+ /* do nothing */
+ } else if (--inm->inm_timer == 0) {
+ igmp_sendpkt(inm, inm->inm_rti->rti_type, 0);
+ inm->inm_state = IGMP_IREPORTEDLAST;
+ } else {
+ igmp_timers_are_running = 1;
+ }
+ IN_NEXT_MULTI(step, inm);
+ }
+ splx(s);
+}
+
+void
+igmp_slowtimo()
+{
+ int s = splnet();
+ register struct router_info *rti = Head;
+
+#ifdef IGMP_DEBUG
+ printf("[igmp.c,_slowtimo] -- > entering \n");
+#endif
+ while (rti) {
+ if (rti->rti_type == IGMP_V1_ROUTER) {
+ rti->rti_time++;
+ if (rti->rti_time >= IGMP_AGE_THRESHOLD) {
+ rti->rti_type = IGMP_V2_ROUTER;
+ }
+ }
+ rti = rti->rti_next;
+ }
+#ifdef IGMP_DEBUG
+ printf("[igmp.c,_slowtimo] -- > exiting \n");
+#endif
+ splx(s);
+}
+
+static struct route igmprt;
+
+static void
+igmp_sendpkt(inm, type, addr)
+ struct in_multi *inm;
+ int type;
+ unsigned long addr;
+{
+ struct mbuf *m;
+ struct igmp *igmp;
+ struct ip *ip;
+ struct ip_moptions imo;
+
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return;
+
+ m->m_pkthdr.rcvif = loif;
+ m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN;
+ MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip));
+ m->m_data += sizeof(struct ip);
+ m->m_len = IGMP_MINLEN;
+ igmp = mtod(m, struct igmp *);
+ igmp->igmp_type = type;
+ igmp->igmp_code = 0;
+ igmp->igmp_group = inm->inm_addr;
+ igmp->igmp_cksum = 0;
+ igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN);
+
+ m->m_data -= sizeof(struct ip);
+ m->m_len += sizeof(struct ip);
+ ip = mtod(m, struct ip *);
+ ip->ip_tos = 0;
+ ip->ip_len = sizeof(struct ip) + IGMP_MINLEN;
+ ip->ip_off = 0;
+ ip->ip_p = IPPROTO_IGMP;
+ ip->ip_src.s_addr = INADDR_ANY;
+ ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr;
+
+ imo.imo_multicast_ifp = inm->inm_ifp;
+ imo.imo_multicast_ttl = 1;
+ imo.imo_multicast_vif = -1;
+ /*
+ * Request loopback of the report if we are acting as a multicast
+ * router, so that the process-level routing daemon can hear it.
+ */
+ imo.imo_multicast_loop = (ip_mrouter != NULL);
+
+ /*
+ * XXX
+ * Do we have to worry about reentrancy here? Don't think so.
+ */
+ ip_output(m, router_alert, &igmprt, 0, &imo);
+
+ ++igmpstat.igps_snd_reports;
+}
diff --git a/sys/netinet/igmp.h b/sys/netinet/igmp.h
new file mode 100644
index 0000000..7d943d6
--- /dev/null
+++ b/sys/netinet/igmp.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 1988 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)igmp.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IGMP_H_
+#define _NETINET_IGMP_H_
+
+/*
+ * Internet Group Management Protocol (IGMP) definitions.
+ *
+ * Written by Steve Deering, Stanford, May 1988.
+ *
+ * MULTICAST Revision: 3.5.1.2
+ */
+
+/*
+ * IGMP packet format.
+ */
+struct igmp {
+ u_char igmp_type; /* version & type of IGMP message */
+ u_char igmp_code; /* subtype for routing msgs */
+ u_short igmp_cksum; /* IP-style checksum */
+ struct in_addr igmp_group; /* group address being reported */
+}; /* (zero for queries) */
+
+#define IGMP_MINLEN 8
+
+/*
+ * Message types, including version number.
+ */
+#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */
+#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */
+#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */
+#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */
+
+#define IGMP_DVMRP 0x13 /* DVMRP routing message */
+#define IGMP_PIM 0x14 /* PIM routing message */
+
+#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/
+#define IGMP_MTRACE 0x1f /* mcast traceroute messages */
+
+#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */
+ /* query (in seconds) according */
+ /* to RFC1112 */
+
+
+#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */
+					/* specifies time in tenths of a second */
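For example, igmp_input() converts this field with `timer = igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE`, so an IGMPv2 query advertising a maximum response time of 10 seconds carries igmp_code 100 and, with PR_FASTHZ fast timeouts per second (5 in this tree's <sys/protosw.h>), yields 100 * 5 / 10 = 50 fast-timeout ticks.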
+
+/*
+ * The following four definitions are for backwards compatibility.
+ * They should be removed as soon as all applications are updated to
+ * use the new constant names.
+ */
+#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY
+#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT
+#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT
+#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP
+
+#endif /* _NETINET_IGMP_H_ */
diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h
new file mode 100644
index 0000000..d6451a7
--- /dev/null
+++ b/sys/netinet/igmp_var.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 1988 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IGMP_VAR_H_
+#define _NETINET_IGMP_VAR_H_
+
+/*
+ * Internet Group Management Protocol (IGMP),
+ * implementation-specific definitions.
+ *
+ * Written by Steve Deering, Stanford, May 1988.
+ *
+ * MULTICAST Revision: 3.5.1.3
+ */
+
+struct igmpstat {
+ u_int igps_rcv_total; /* total IGMP messages received */
+ u_int igps_rcv_tooshort; /* received with too few bytes */
+ u_int igps_rcv_badsum; /* received with bad checksum */
+ u_int igps_rcv_queries; /* received membership queries */
+ u_int igps_rcv_badqueries; /* received invalid queries */
+ u_int igps_rcv_reports; /* received membership reports */
+ u_int igps_rcv_badreports; /* received invalid reports */
+ u_int igps_rcv_ourreports; /* received reports for our groups */
+ u_int igps_snd_reports; /* sent membership reports */
+};
+
+#ifdef _KERNEL
+#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1)
+
+/*
+ * States for IGMPv2's leave processing
+ */
+#define IGMP_OTHERMEMBER 0
+#define IGMP_IREPORTEDLAST 1
+
+/*
+ * We must remember what version the subnet's querier is.
+ * We conveniently use the IGMP message type for the proper
+ * membership report to keep this state.
+ */
+#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT
+#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT
+
+/*
+ * Revert to new router if we haven't heard from an old router in
+ * this amount of time.
+ */
+#define IGMP_AGE_THRESHOLD 540
+
+void igmp_init(void);
+void igmp_input(struct mbuf *, int);
+void igmp_joingroup(struct in_multi *);
+void igmp_leavegroup(struct in_multi *);
+void igmp_fasttimo(void);
+void igmp_slowtimo(void);
+
+SYSCTL_DECL(_net_inet_igmp);
+
+#endif
+
+/*
+ * Names for IGMP sysctl objects
+ */
+#define IGMPCTL_STATS 1 /* statistics (read-only) */
+#define IGMPCTL_MAXID 2
+
+#define IGMPCTL_NAMES { \
+ { 0, 0 }, \
+ { "stats", CTLTYPE_STRUCT }, \
+}
+#endif
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
new file mode 100644
index 0000000..5f4179d
--- /dev/null
+++ b/sys/netinet/in.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in.c 8.4 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sockio.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+
+#include <netinet/igmp_var.h>
+
+static MALLOC_DEFINE(M_IPMADDR, "in_multi", "internet multicast address");
+
+static int in_mask2len(struct in_addr *);
+static void in_len2mask(struct in_addr *, int);
+static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
+ struct ifnet *, struct thread *);
+
+static void in_socktrim(struct sockaddr_in *);
+static int in_ifinit(struct ifnet *,
+ struct in_ifaddr *, struct sockaddr_in *, int);
+
+static int subnetsarelocal = 0;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
+ &subnetsarelocal, 0, "");
+
+struct in_multihead in_multihead; /* XXX BSS initialization */
+
+extern struct inpcbinfo ripcbinfo;
+extern struct inpcbinfo udbinfo;
+
+/*
+ * Return 1 if an internet address is for a ``local'' host
+ * (one to which we have a connection). If subnetsarelocal
+ * is true, this includes other subnets of the local net.
+ * Otherwise, it includes only the directly-connected (sub)nets.
+ */
+int
+in_localaddr(in)
+ struct in_addr in;
+{
+ register u_long i = ntohl(in.s_addr);
+ register struct in_ifaddr *ia;
+
+ if (subnetsarelocal) {
+ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
+ if ((i & ia->ia_netmask) == ia->ia_net)
+ return (1);
+ } else {
+ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
+ if ((i & ia->ia_subnetmask) == ia->ia_subnet)
+ return (1);
+ }
+ return (0);
+}
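A concrete example of the distinction (the addresses are arbitrary): with an interface configured as 10.1.2.3 netmask 255.255.255.0, ia_subnet/ia_subnetmask describe 10.1.2.0/24 while ia_net/ia_netmask describe the classful 10.0.0.0/8. in_localaddr() on 10.1.9.9 therefore returns 0 by default, but returns 1 once net.inet.ip.subnets_are_local (the subnetsarelocal knob above) is set.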
+
+/*
+ * Determine whether an IP address is in a reserved set of addresses
+ * that may not be forwarded, or whether datagrams to that destination
+ * may be forwarded.
+ */
+int
+in_canforward(in)
+ struct in_addr in;
+{
+ register u_long i = ntohl(in.s_addr);
+ register u_long net;
+
+ if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i))
+ return (0);
+ if (IN_CLASSA(i)) {
+ net = i & IN_CLASSA_NET;
+ if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
+ return (0);
+ }
+ return (1);
+}
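For instance, in_canforward() rejects 240.1.2.3 (experimental class E), 224.0.0.1 (multicast), 0.1.2.3 (class A net 0) and 127.0.0.1 (the class A loopback net), while an ordinary unicast address such as 192.0.2.1 passes and returns 1.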
+
+/*
+ * Trim a mask in a sockaddr
+ */
+static void
+in_socktrim(ap)
+struct sockaddr_in *ap;
+{
+ register char *cplim = (char *) &ap->sin_addr;
+ register char *cp = (char *) (&ap->sin_addr + 1);
+
+ ap->sin_len = 0;
+ while (--cp >= cplim)
+ if (*cp) {
+ (ap)->sin_len = cp - (char *) (ap) + 1;
+ break;
+ }
+}
+
+static int
+in_mask2len(mask)
+ struct in_addr *mask;
+{
+ int x, y;
+ u_char *p;
+
+ p = (u_char *)mask;
+ for (x = 0; x < sizeof(*mask); x++) {
+ if (p[x] != 0xff)
+ break;
+ }
+ y = 0;
+ if (x < sizeof(*mask)) {
+ for (y = 0; y < 8; y++) {
+ if ((p[x] & (0x80 >> y)) == 0)
+ break;
+ }
+ }
+ return x * 8 + y;
+}
+
+static void
+in_len2mask(mask, len)
+ struct in_addr *mask;
+ int len;
+{
+ int i;
+ u_char *p;
+
+ p = (u_char *)mask;
+ bzero(mask, sizeof(*mask));
+ for (i = 0; i < len / 8; i++)
+ p[i] = 0xff;
+ if (len % 8)
+ p[i] = (0xff00 >> (len % 8)) & 0xff;
+}
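Worked examples of the two helpers: in_mask2len() on 255.255.255.192 counts three full 0xff octets plus the two leading one-bits of 0xc0 and returns 26; in_len2mask() with len 20 writes two 0xff octets and then (0xff00 >> 4) & 0xff == 0xf0, i.e. 255.255.240.0.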
+
+static int in_interfaces; /* number of external internet interfaces */
+
+/*
+ * Generic internet control operations (ioctl's).
+ * Ifp is 0 if not an interface-specific ioctl.
+ */
+/* ARGSUSED */
+int
+in_control(so, cmd, data, ifp, td)
+ struct socket *so;
+ u_long cmd;
+ caddr_t data;
+ register struct ifnet *ifp;
+ struct thread *td;
+{
+ register struct ifreq *ifr = (struct ifreq *)data;
+ register struct in_ifaddr *ia = 0, *iap;
+ register struct ifaddr *ifa;
+ struct in_addr dst;
+ struct in_ifaddr *oia;
+ struct in_aliasreq *ifra = (struct in_aliasreq *)data;
+ struct sockaddr_in oldaddr;
+ int error, hostIsNew, iaIsNew, maskIsNew, s;
+
+ iaIsNew = 0;
+
+ switch (cmd) {
+ case SIOCALIFADDR:
+ case SIOCDLIFADDR:
+ if (td && (error = suser(td)) != 0)
+ return error;
+ /*fall through*/
+ case SIOCGLIFADDR:
+ if (!ifp)
+ return EINVAL;
+ return in_lifaddr_ioctl(so, cmd, data, ifp, td);
+ }
+
+ /*
+ * Find address for this interface, if it exists.
+ *
+ * If an alias address was specified, find that one instead of
+ * the first one on the interface, if possible.
+ */
+ if (ifp) {
+ dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
+ LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash)
+ if (iap->ia_ifp == ifp &&
+ iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
+ ia = iap;
+ break;
+ }
+ if (ia == NULL)
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ iap = ifatoia(ifa);
+ if (iap->ia_addr.sin_family == AF_INET) {
+ ia = iap;
+ break;
+ }
+ }
+ }
+
+ switch (cmd) {
+
+ case SIOCAIFADDR:
+ case SIOCDIFADDR:
+ if (ifp == 0)
+ return (EADDRNOTAVAIL);
+ if (ifra->ifra_addr.sin_family == AF_INET) {
+ for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
+ if (ia->ia_ifp == ifp &&
+ ia->ia_addr.sin_addr.s_addr ==
+ ifra->ifra_addr.sin_addr.s_addr)
+ break;
+ }
+ if ((ifp->if_flags & IFF_POINTOPOINT)
+ && (cmd == SIOCAIFADDR)
+ && (ifra->ifra_dstaddr.sin_addr.s_addr
+ == INADDR_ANY)) {
+ return EDESTADDRREQ;
+ }
+ }
+ if (cmd == SIOCDIFADDR && ia == 0)
+ return (EADDRNOTAVAIL);
+ /* FALLTHROUGH */
+ case SIOCSIFADDR:
+ case SIOCSIFNETMASK:
+ case SIOCSIFDSTADDR:
+ if (td && (error = suser(td)) != 0)
+ return error;
+
+ if (ifp == 0)
+ return (EADDRNOTAVAIL);
+ if (ia == (struct in_ifaddr *)0) {
+ ia = (struct in_ifaddr *)
+ malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
+ if (ia == (struct in_ifaddr *)NULL)
+ return (ENOBUFS);
+ /*
+ * Protect from ipintr() traversing address list
+ * while we're modifying it.
+ */
+ s = splnet();
+
+ TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link);
+ ifa = &ia->ia_ifa;
+ TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
+
+ ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
+ ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
+ ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
+ ia->ia_sockmask.sin_len = 8;
+ ia->ia_sockmask.sin_family = AF_INET;
+ if (ifp->if_flags & IFF_BROADCAST) {
+ ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
+ ia->ia_broadaddr.sin_family = AF_INET;
+ }
+ ia->ia_ifp = ifp;
+ if (!(ifp->if_flags & IFF_LOOPBACK))
+ in_interfaces++;
+ splx(s);
+ iaIsNew = 1;
+ }
+ break;
+
+ case SIOCSIFBRDADDR:
+ if (td && (error = suser(td)) != 0)
+ return error;
+ /* FALLTHROUGH */
+
+ case SIOCGIFADDR:
+ case SIOCGIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ if (ia == (struct in_ifaddr *)0)
+ return (EADDRNOTAVAIL);
+ break;
+ }
+ switch (cmd) {
+
+ case SIOCGIFADDR:
+ *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
+ return (0);
+
+ case SIOCGIFBRDADDR:
+ if ((ifp->if_flags & IFF_BROADCAST) == 0)
+ return (EINVAL);
+ *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
+ return (0);
+
+ case SIOCGIFDSTADDR:
+ if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
+ return (EINVAL);
+ *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
+ return (0);
+
+ case SIOCGIFNETMASK:
+ *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
+ return (0);
+
+ case SIOCSIFDSTADDR:
+ if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
+ return (EINVAL);
+ oldaddr = ia->ia_dstaddr;
+ ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
+ if (ifp->if_ioctl && (error = (*ifp->if_ioctl)
+ (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) {
+ ia->ia_dstaddr = oldaddr;
+ return (error);
+ }
+ if (ia->ia_flags & IFA_ROUTE) {
+ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
+ rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
+ ia->ia_ifa.ifa_dstaddr =
+ (struct sockaddr *)&ia->ia_dstaddr;
+ rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
+ }
+ return (0);
+
+ case SIOCSIFBRDADDR:
+ if ((ifp->if_flags & IFF_BROADCAST) == 0)
+ return (EINVAL);
+ ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
+ return (0);
+
+ case SIOCSIFADDR:
+ error = in_ifinit(ifp, ia,
+ (struct sockaddr_in *) &ifr->ifr_addr, 1);
+ if (error != 0 && iaIsNew)
+ break;
+ return (0);
+
+ case SIOCSIFNETMASK:
+ ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
+ ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ return (0);
+
+ case SIOCAIFADDR:
+ maskIsNew = 0;
+ hostIsNew = 1;
+ error = 0;
+ if (ia->ia_addr.sin_family == AF_INET) {
+ if (ifra->ifra_addr.sin_len == 0) {
+ ifra->ifra_addr = ia->ia_addr;
+ hostIsNew = 0;
+ } else if (ifra->ifra_addr.sin_addr.s_addr ==
+ ia->ia_addr.sin_addr.s_addr)
+ hostIsNew = 0;
+ }
+ if (ifra->ifra_mask.sin_len) {
+ in_ifscrub(ifp, ia);
+ ia->ia_sockmask = ifra->ifra_mask;
+ ia->ia_sockmask.sin_family = AF_INET;
+ ia->ia_subnetmask =
+ ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ maskIsNew = 1;
+ }
+ if ((ifp->if_flags & IFF_POINTOPOINT) &&
+ (ifra->ifra_dstaddr.sin_family == AF_INET)) {
+ in_ifscrub(ifp, ia);
+ ia->ia_dstaddr = ifra->ifra_dstaddr;
+ maskIsNew = 1; /* We lie; but the effect's the same */
+ }
+ if (ifra->ifra_addr.sin_family == AF_INET &&
+ (hostIsNew || maskIsNew))
+ error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
+ if (error != 0 && iaIsNew)
+ break;
+
+ if ((ifp->if_flags & IFF_BROADCAST) &&
+ (ifra->ifra_broadaddr.sin_family == AF_INET))
+ ia->ia_broadaddr = ifra->ifra_broadaddr;
+ return (error);
+
+ case SIOCDIFADDR:
+ /*
+ * in_ifscrub kills the interface route.
+ */
+ in_ifscrub(ifp, ia);
+ /*
+ * in_ifadown gets rid of all the rest of
+ * the routes. This is not quite the right
+ * thing to do, but at least if we are running
+ * a routing process they will come back.
+ */
+ in_ifadown(&ia->ia_ifa, 1);
+ /*
+ * XXX horrible hack to detect that we are being called
+ * from if_detach()
+ */
+ if (ifaddr_byindex(ifp->if_index) != NULL) {
+ in_pcbpurgeif0(&ripcbinfo, ifp);
+ in_pcbpurgeif0(&udbinfo, ifp);
+ }
+ error = 0;
+ break;
+
+ default:
+ if (ifp == 0 || ifp->if_ioctl == 0)
+ return (EOPNOTSUPP);
+ return ((*ifp->if_ioctl)(ifp, cmd, data));
+ }
+
+ /*
+ * Protect from ipintr() traversing address list while we're modifying
+ * it.
+ */
+ s = splnet();
+ TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link);
+ LIST_REMOVE(ia, ia_hash);
+ IFAFREE(&ia->ia_ifa);
+ splx(s);
+
+ return (error);
+}
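
A hedged userland sketch of reaching the SIOCAIFADDR case above through ioctl(2): FreeBSD-style headers are assumed, the interface name "em0" and the address 192.0.2.1/24 are placeholders, root privilege is required, and error handling is minimal.

    #include <sys/param.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <sys/sockio.h>
    #include <net/if.h>
    #include <net/if_var.h>
    #include <netinet/in.h>
    #include <netinet/in_var.h>
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        struct in_aliasreq ifra;
        int s = socket(AF_INET, SOCK_DGRAM, 0);   /* any INET socket works */

        if (s < 0)
            return 1;
        memset(&ifra, 0, sizeof(ifra));
        strncpy(ifra.ifra_name, "em0", sizeof(ifra.ifra_name));

        ifra.ifra_addr.sin_family = AF_INET;
        ifra.ifra_addr.sin_len = sizeof(struct sockaddr_in);
        inet_aton("192.0.2.1", &ifra.ifra_addr.sin_addr);

        ifra.ifra_mask.sin_family = AF_INET;
        ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
        inet_aton("255.255.255.0", &ifra.ifra_mask.sin_addr);

        /* lands in in_control() via the SIOCAIFADDR case */
        if (ioctl(s, SIOCAIFADDR, &ifra) < 0)
            perror("SIOCAIFADDR");
        close(s);
        return 0;
    }
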
+
+/*
+ * SIOC[GAD]LIFADDR.
+ * SIOCGLIFADDR: get first address. (?!?)
+ * SIOCGLIFADDR with IFLR_PREFIX:
+ * get first address that matches the specified prefix.
+ * SIOCALIFADDR: add the specified address.
+ * SIOCALIFADDR with IFLR_PREFIX:
+ * EINVAL since we can't deduce hostid part of the address.
+ * SIOCDLIFADDR: delete the specified address.
+ * SIOCDLIFADDR with IFLR_PREFIX:
+ * delete the first address that matches the specified prefix.
+ * return values:
+ * EINVAL on invalid parameters
+ * EADDRNOTAVAIL on prefix match failed/specified address not found
+ * other values may be returned from in_ioctl()
+ */
+static int
+in_lifaddr_ioctl(so, cmd, data, ifp, td)
+ struct socket *so;
+ u_long cmd;
+ caddr_t data;
+ struct ifnet *ifp;
+ struct thread *td;
+{
+ struct if_laddrreq *iflr = (struct if_laddrreq *)data;
+ struct ifaddr *ifa;
+
+ /* sanity checks */
+ if (!data || !ifp) {
+ panic("invalid argument to in_lifaddr_ioctl");
+		/*NOTREACHED*/
+ }
+
+ switch (cmd) {
+ case SIOCGLIFADDR:
+ /* address must be specified on GET with IFLR_PREFIX */
+ if ((iflr->flags & IFLR_PREFIX) == 0)
+ break;
+ /*FALLTHROUGH*/
+ case SIOCALIFADDR:
+ case SIOCDLIFADDR:
+ /* address must be specified on ADD and DELETE */
+ if (iflr->addr.ss_family != AF_INET)
+ return EINVAL;
+ if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
+ return EINVAL;
+ /* XXX need improvement */
+ if (iflr->dstaddr.ss_family
+ && iflr->dstaddr.ss_family != AF_INET)
+ return EINVAL;
+ if (iflr->dstaddr.ss_family
+ && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
+ return EINVAL;
+ break;
+ default: /*shouldn't happen*/
+ return EOPNOTSUPP;
+ }
+ if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
+ return EINVAL;
+
+ switch (cmd) {
+ case SIOCALIFADDR:
+ {
+ struct in_aliasreq ifra;
+
+ if (iflr->flags & IFLR_PREFIX)
+ return EINVAL;
+
+ /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
+ bzero(&ifra, sizeof(ifra));
+ bcopy(iflr->iflr_name, ifra.ifra_name,
+ sizeof(ifra.ifra_name));
+
+ bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
+
+ if (iflr->dstaddr.ss_family) { /*XXX*/
+ bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
+ iflr->dstaddr.ss_len);
+ }
+
+ ifra.ifra_mask.sin_family = AF_INET;
+ ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
+ in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
+
+ return in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td);
+ }
+ case SIOCGLIFADDR:
+ case SIOCDLIFADDR:
+ {
+ struct in_ifaddr *ia;
+ struct in_addr mask, candidate, match;
+ struct sockaddr_in *sin;
+ int cmp;
+
+ bzero(&mask, sizeof(mask));
+ if (iflr->flags & IFLR_PREFIX) {
+ /* lookup a prefix rather than address. */
+ in_len2mask(&mask, iflr->prefixlen);
+
+ sin = (struct sockaddr_in *)&iflr->addr;
+ match.s_addr = sin->sin_addr.s_addr;
+ match.s_addr &= mask.s_addr;
+
+ /* if you set extra bits, that's wrong */
+ if (match.s_addr != sin->sin_addr.s_addr)
+ return EINVAL;
+
+ cmp = 1;
+ } else {
+ if (cmd == SIOCGLIFADDR) {
+ /* on getting an address, take the 1st match */
+ cmp = 0; /*XXX*/
+ } else {
+ /* on deleting an address, do exact match */
+ in_len2mask(&mask, 32);
+ sin = (struct sockaddr_in *)&iflr->addr;
+ match.s_addr = sin->sin_addr.s_addr;
+
+ cmp = 1;
+ }
+ }
+
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ if (!cmp)
+ break;
+			candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr;
+ candidate.s_addr &= mask.s_addr;
+ if (candidate.s_addr == match.s_addr)
+ break;
+ }
+ if (!ifa)
+ return EADDRNOTAVAIL;
+ ia = (struct in_ifaddr *)ifa;
+
+ if (cmd == SIOCGLIFADDR) {
+ /* fill in the if_laddrreq structure */
+ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
+
+ if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+ bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
+ ia->ia_dstaddr.sin_len);
+ } else
+ bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
+
+ iflr->prefixlen =
+ in_mask2len(&ia->ia_sockmask.sin_addr);
+
+ iflr->flags = 0; /*XXX*/
+
+ return 0;
+ } else {
+ struct in_aliasreq ifra;
+
+ /* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
+ bzero(&ifra, sizeof(ifra));
+ bcopy(iflr->iflr_name, ifra.ifra_name,
+ sizeof(ifra.ifra_name));
+
+ bcopy(&ia->ia_addr, &ifra.ifra_addr,
+ ia->ia_addr.sin_len);
+ if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+ bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
+ ia->ia_dstaddr.sin_len);
+ }
+			bcopy(&ia->ia_sockmask, &ifra.ifra_mask,
+				ia->ia_sockmask.sin_len);
+
+ return in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
+ ifp, td);
+ }
+ }
+ }
+
+ return EOPNOTSUPP; /*just for safety*/
+}
+
+/*
+ * Delete any existing route for an interface.
+ */
+void
+in_ifscrub(ifp, ia)
+ register struct ifnet *ifp;
+ register struct in_ifaddr *ia;
+{
+
+ if ((ia->ia_flags & IFA_ROUTE) == 0)
+ return;
+ if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
+ rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
+ else
+ rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0);
+ ia->ia_flags &= ~IFA_ROUTE;
+}
+
+/*
+ * Initialize an interface's internet address
+ * and routing table entry.
+ */
+static int
+in_ifinit(ifp, ia, sin, scrub)
+ register struct ifnet *ifp;
+ register struct in_ifaddr *ia;
+ struct sockaddr_in *sin;
+ int scrub;
+{
+ register u_long i = ntohl(sin->sin_addr.s_addr);
+ struct sockaddr_in oldaddr;
+ int s = splimp(), flags = RTF_UP, error = 0;
+
+ oldaddr = ia->ia_addr;
+ ia->ia_addr = *sin;
+ /*
+ * Give the interface a chance to initialize
+ * if this is its first address,
+ * and to validate the address if necessary.
+ */
+ if (ifp->if_ioctl &&
+ (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) {
+ splx(s);
+ ia->ia_addr = oldaddr;
+ return (error);
+ }
+ if (oldaddr.sin_family == AF_INET)
+ LIST_REMOVE(ia, ia_hash);
+ if (ia->ia_addr.sin_family == AF_INET)
+ LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
+ ia, ia_hash);
+ splx(s);
+ if (scrub) {
+ ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
+ in_ifscrub(ifp, ia);
+ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+ }
+ if (IN_CLASSA(i))
+ ia->ia_netmask = IN_CLASSA_NET;
+ else if (IN_CLASSB(i))
+ ia->ia_netmask = IN_CLASSB_NET;
+ else
+ ia->ia_netmask = IN_CLASSC_NET;
+ /*
+ * The subnet mask usually includes at least the standard network part,
+ * but may be smaller in the case of supernetting.
+ * If it is set, we believe it.
+ */
+ if (ia->ia_subnetmask == 0) {
+ ia->ia_subnetmask = ia->ia_netmask;
+ ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
+ } else
+ ia->ia_netmask &= ia->ia_subnetmask;
+ ia->ia_net = i & ia->ia_netmask;
+ ia->ia_subnet = i & ia->ia_subnetmask;
+ in_socktrim(&ia->ia_sockmask);
+ /*
+ * Add route for the network.
+ */
+ ia->ia_ifa.ifa_metric = ifp->if_metric;
+ if (ifp->if_flags & IFF_BROADCAST) {
+ ia->ia_broadaddr.sin_addr.s_addr =
+ htonl(ia->ia_subnet | ~ia->ia_subnetmask);
+ ia->ia_netbroadcast.s_addr =
+ htonl(ia->ia_net | ~ ia->ia_netmask);
+ } else if (ifp->if_flags & IFF_LOOPBACK) {
+ ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr;
+ flags |= RTF_HOST;
+ } else if (ifp->if_flags & IFF_POINTOPOINT) {
+ if (ia->ia_dstaddr.sin_family != AF_INET)
+ return (0);
+ flags |= RTF_HOST;
+ }
+
+ /*-
+ * Don't add host routes for interface addresses of
+ * 0.0.0.0 --> 0.255.255.255 netmask 255.0.0.0. This makes it
+ * possible to assign several such address pairs with consistent
+ * results (no host route) and is required by BOOTP.
+ *
+ * XXX: This is ugly ! There should be a way for the caller to
+ * say that they don't want a host route.
+ */
+ if (ia->ia_addr.sin_addr.s_addr != INADDR_ANY ||
+ ia->ia_netmask != IN_CLASSA_NET ||
+ ia->ia_dstaddr.sin_addr.s_addr != htonl(IN_CLASSA_HOST)) {
+ if ((error = rtinit(&ia->ia_ifa, (int)RTM_ADD, flags)) != 0) {
+ ia->ia_addr = oldaddr;
+ return (error);
+ }
+ ia->ia_flags |= IFA_ROUTE;
+ }
+
+ /*
+ * If the interface supports multicast, join the "all hosts"
+ * multicast group on that interface.
+ */
+ if (ifp->if_flags & IFF_MULTICAST) {
+ struct in_addr addr;
+
+ addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
+ in_addmulti(&addr, ifp);
+ }
+ return (error);
+}
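
The broadcast address stored above is plain bit arithmetic on the subnet and its mask; a quick standalone check (192.0.2.0/24 is only an example) shows the expected result:

    #include <stdio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int main(void)
    {
        struct in_addr bcast;
        in_addr_t subnet = ntohl(inet_addr("192.0.2.0"));
        in_addr_t mask   = ntohl(inet_addr("255.255.255.0"));
        char buf[INET_ADDRSTRLEN];

        /* same expression in_ifinit() stores in ia_broadaddr */
        bcast.s_addr = htonl(subnet | ~mask);
        /* prints 192.0.2.255 */
        printf("%s\n", inet_ntop(AF_INET, &bcast, buf, sizeof(buf)));
        return 0;
    }
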
+
+
+/*
+ * Return 1 if the address might be a local broadcast address.
+ */
+int
+in_broadcast(in, ifp)
+ struct in_addr in;
+ struct ifnet *ifp;
+{
+ register struct ifaddr *ifa;
+ u_long t;
+
+ if (in.s_addr == INADDR_BROADCAST ||
+ in.s_addr == INADDR_ANY)
+ return 1;
+ if ((ifp->if_flags & IFF_BROADCAST) == 0)
+ return 0;
+ t = ntohl(in.s_addr);
+ /*
+ * Look through the list of addresses for a match
+ * with a broadcast address.
+ */
+#define ia ((struct in_ifaddr *)ifa)
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_INET &&
+ (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
+ in.s_addr == ia->ia_netbroadcast.s_addr ||
+ /*
+ * Check for old-style (host 0) broadcast.
+ */
+ t == ia->ia_subnet || t == ia->ia_net) &&
+ /*
+ * Check for an all one subnetmask. These
+ * only exist when an interface gets a secondary
+ * address.
+ */
+ ia->ia_subnetmask != (u_long)0xffffffff)
+ return 1;
+ return (0);
+#undef ia
+}
+/*
+ * Add an address to the list of IP multicast addresses for a given interface.
+ */
+struct in_multi *
+in_addmulti(ap, ifp)
+ register struct in_addr *ap;
+ register struct ifnet *ifp;
+{
+ register struct in_multi *inm;
+ int error;
+ struct sockaddr_in sin;
+ struct ifmultiaddr *ifma;
+ int s = splnet();
+
+ /*
+ * Call generic routine to add membership or increment
+ * refcount. It wants addresses in the form of a sockaddr,
+ * so we build one here (being careful to zero the unused bytes).
+ */
+ bzero(&sin, sizeof sin);
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof sin;
+ sin.sin_addr = *ap;
+ error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma);
+ if (error) {
+ splx(s);
+ return 0;
+ }
+
+ /*
+ * If ifma->ifma_protospec is null, then if_addmulti() created
+ * a new record. Otherwise, we are done.
+ */
+ if (ifma->ifma_protospec != 0) {
+ splx(s);
+ return ifma->ifma_protospec;
+ }
+
+ /* XXX - if_addmulti uses M_WAITOK. Can this really be called
+ at interrupt time? If so, need to fix if_addmulti. XXX */
+ inm = (struct in_multi *)malloc(sizeof(*inm), M_IPMADDR,
+ M_NOWAIT | M_ZERO);
+ if (inm == NULL) {
+ splx(s);
+ return (NULL);
+ }
+
+ inm->inm_addr = *ap;
+ inm->inm_ifp = ifp;
+ inm->inm_ifma = ifma;
+ ifma->ifma_protospec = inm;
+ LIST_INSERT_HEAD(&in_multihead, inm, inm_link);
+
+ /*
+ * Let IGMP know that we have joined a new IP multicast group.
+ */
+ igmp_joingroup(inm);
+ splx(s);
+ return (inm);
+}
+
+/*
+ * Delete a multicast address record.
+ */
+void
+in_delmulti(inm)
+ register struct in_multi *inm;
+{
+ struct ifmultiaddr *ifma = inm->inm_ifma;
+ struct in_multi my_inm;
+ int s = splnet();
+
+ my_inm.inm_ifp = NULL ; /* don't send the leave msg */
+ if (ifma->ifma_refcount == 1) {
+ /*
+ * No remaining claims to this record; let IGMP know that
+ * we are leaving the multicast group.
+ * But do it after the if_delmulti() which might reset
+ * the interface and nuke the packet.
+ */
+ my_inm = *inm ;
+ ifma->ifma_protospec = 0;
+ LIST_REMOVE(inm, inm_link);
+ free(inm, M_IPMADDR);
+ }
+ /* XXX - should be separate API for when we have an ifma? */
+ if_delmulti(ifma->ifma_ifp, ifma->ifma_addr);
+ if (my_inm.inm_ifp != NULL)
+ igmp_leavegroup(&my_inm);
+ splx(s);
+}
diff --git a/sys/netinet/in.h b/sys/netinet/in.h
new file mode 100644
index 0000000..282415d
--- /dev/null
+++ b/sys/netinet/in.h
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in.h 8.3 (Berkeley) 1/3/94
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_H_
+#define _NETINET_IN_H_
+
+#include <sys/cdefs.h>
+#include <sys/_types.h>
+#include <machine/endian.h>
+
+/* Protocols common to RFC 1700, POSIX, and X/Open. */
+#define IPPROTO_IP 0 /* dummy for IP */
+#define IPPROTO_ICMP 1 /* control message protocol */
+#define IPPROTO_TCP 6 /* tcp */
+#define IPPROTO_UDP 17 /* user datagram protocol */
+
+#define INADDR_ANY (u_int32_t)0x00000000
+#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */
+
+#ifndef _UINT8_T_DECLARED
+typedef __uint8_t uint8_t;
+#define _UINT8_T_DECLARED
+#endif
+
+#ifndef _UINT16_T_DECLARED
+typedef __uint16_t uint16_t;
+#define _UINT16_T_DECLARED
+#endif
+
+#ifndef _UINT32_T_DECLARED
+typedef __uint32_t uint32_t;
+#define _UINT32_T_DECLARED
+#endif
+
+#ifndef _IN_ADDR_T_DECLARED
+typedef uint32_t in_addr_t;
+#define _IN_ADDR_T_DECLARED
+#endif
+
+#ifndef _IN_PORT_T_DECLARED
+typedef uint16_t in_port_t;
+#define _IN_PORT_T_DECLARED
+#endif
+
+#ifdef _BSD_SA_FAMILY_T_
+typedef _BSD_SA_FAMILY_T_ sa_family_t;
+#undef _BSD_SA_FAMILY_T_
+#endif
+
+/* Internet address (a structure for historical reasons). */
+#ifndef _STRUCT_IN_ADDR_DECLARED
+struct in_addr {
+ in_addr_t s_addr;
+};
+#define _STRUCT_IN_ADDR_DECLARED
+#endif
+
+/* Socket address, internet style. */
+struct sockaddr_in {
+ uint8_t sin_len;
+ sa_family_t sin_family;
+ in_port_t sin_port;
+ struct in_addr sin_addr;
+ char sin_zero[8];
+};
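
A minimal sketch of filling this structure for a connect(2) on a BSD system where sin_len exists; the address 192.0.2.7 and port 80 are placeholders and error handling is omitted:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in sin;
        int s = socket(AF_INET, SOCK_STREAM, 0);

        memset(&sin, 0, sizeof(sin));          /* clears sin_zero too */
        sin.sin_len = sizeof(sin);
        sin.sin_family = AF_INET;
        sin.sin_port = htons(80);              /* ports are network byte order */
        inet_aton("192.0.2.7", &sin.sin_addr); /* already network byte order */

        (void)connect(s, (struct sockaddr *)&sin, sizeof(sin));
        close(s);
        return 0;
    }
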
+
+#ifndef _KERNEL
+
+#ifndef _BYTEORDER_PROTOTYPED
+#define _BYTEORDER_PROTOTYPED
+__BEGIN_DECLS
+uint32_t htonl(uint32_t);
+uint16_t htons(uint16_t);
+uint32_t ntohl(uint32_t);
+uint16_t ntohs(uint16_t);
+__END_DECLS
+#endif
+
+#ifndef _BYTEORDER_FUNC_DEFINED
+#define _BYTEORDER_FUNC_DEFINED
+#define htonl(x) __htonl(x)
+#define htons(x) __htons(x)
+#define ntohl(x) __ntohl(x)
+#define ntohs(x) __ntohs(x)
+#endif
+
+#endif /* !_KERNEL */
+
+#if __POSIX_VISIBLE >= 200112
+#define IPPROTO_RAW 255 /* raw IP packet */
+#define INET_ADDRSTRLEN 16
+#endif
+
+#if __BSD_VISIBLE
+/*
+ * Constants and structures defined by the internet system,
+ * Per RFC 790, September 1981, and numerous additions.
+ */
+
+/*
+ * Protocols (RFC 1700)
+ */
+#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */
+#define IPPROTO_IGMP 2 /* group mgmt protocol */
+#define IPPROTO_GGP 3 /* gateway^2 (deprecated) */
+#define IPPROTO_IPV4 4 /* IPv4 encapsulation */
+#define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */
+#define IPPROTO_ST 7 /* Stream protocol II */
+#define IPPROTO_EGP 8 /* exterior gateway protocol */
+#define IPPROTO_PIGP 9 /* private interior gateway */
+#define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */
+#define IPPROTO_NVPII 11 /* network voice protocol*/
+#define IPPROTO_PUP 12 /* pup */
+#define IPPROTO_ARGUS 13 /* Argus */
+#define IPPROTO_EMCON 14 /* EMCON */
+#define IPPROTO_XNET 15 /* Cross Net Debugger */
+#define IPPROTO_CHAOS 16 /* Chaos*/
+#define IPPROTO_MUX 18 /* Multiplexing */
+#define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */
+#define IPPROTO_HMP 20 /* Host Monitoring */
+#define IPPROTO_PRM 21 /* Packet Radio Measurement */
+#define IPPROTO_IDP 22 /* xns idp */
+#define IPPROTO_TRUNK1 23 /* Trunk-1 */
+#define IPPROTO_TRUNK2 24 /* Trunk-2 */
+#define IPPROTO_LEAF1 25 /* Leaf-1 */
+#define IPPROTO_LEAF2 26 /* Leaf-2 */
+#define IPPROTO_RDP 27 /* Reliable Data */
+#define IPPROTO_IRTP 28 /* Reliable Transaction */
+#define IPPROTO_TP 29 /* tp-4 w/ class negotiation */
+#define IPPROTO_BLT 30 /* Bulk Data Transfer */
+#define IPPROTO_NSP 31 /* Network Services */
+#define IPPROTO_INP 32 /* Merit Internodal */
+#define IPPROTO_SEP 33 /* Sequential Exchange */
+#define IPPROTO_3PC 34 /* Third Party Connect */
+#define IPPROTO_IDPR 35 /* InterDomain Policy Routing */
+#define IPPROTO_XTP 36 /* XTP */
+#define IPPROTO_DDP 37 /* Datagram Delivery */
+#define IPPROTO_CMTP 38 /* Control Message Transport */
+#define IPPROTO_TPXX 39 /* TP++ Transport */
+#define IPPROTO_IL 40 /* IL transport protocol */
+#define IPPROTO_IPV6 41 /* IP6 header */
+#define IPPROTO_SDRP 42 /* Source Demand Routing */
+#define IPPROTO_ROUTING 43 /* IP6 routing header */
+#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */
+#define IPPROTO_IDRP 45 /* InterDomain Routing*/
+#define IPPROTO_RSVP 46 /* resource reservation */
+#define IPPROTO_GRE 47 /* General Routing Encap. */
+#define IPPROTO_MHRP 48 /* Mobile Host Routing */
+#define IPPROTO_BHA 49 /* BHA */
+#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */
+#define IPPROTO_AH 51 /* IP6 Auth Header */
+#define IPPROTO_INLSP 52 /* Integ. Net Layer Security */
+#define IPPROTO_SWIPE 53 /* IP with encryption */
+#define IPPROTO_NHRP 54 /* Next Hop Resolution */
+#define IPPROTO_MOBILE 55 /* IP Mobility */
+#define IPPROTO_TLSP 56 /* Transport Layer Security */
+#define IPPROTO_SKIP 57 /* SKIP */
+#define IPPROTO_ICMPV6 58 /* ICMP6 */
+#define IPPROTO_NONE 59 /* IP6 no next header */
+#define IPPROTO_DSTOPTS 60 /* IP6 destination option */
+#define IPPROTO_AHIP 61 /* any host internal protocol */
+#define IPPROTO_CFTP 62 /* CFTP */
+#define IPPROTO_HELLO 63 /* "hello" routing protocol */
+#define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */
+#define IPPROTO_KRYPTOLAN 65 /* Kryptolan */
+#define IPPROTO_RVD 66 /* Remote Virtual Disk */
+#define IPPROTO_IPPC 67 /* Pluribus Packet Core */
+#define IPPROTO_ADFS 68 /* Any distributed FS */
+#define IPPROTO_SATMON 69 /* Satnet Monitoring */
+#define IPPROTO_VISA 70 /* VISA Protocol */
+#define IPPROTO_IPCV 71 /* Packet Core Utility */
+#define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */
+#define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */
+#define IPPROTO_WSN 74 /* Wang Span Network */
+#define IPPROTO_PVP 75 /* Packet Video Protocol */
+#define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */
+#define IPPROTO_ND 77 /* Sun net disk proto (temp.) */
+#define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */
+#define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */
+#define IPPROTO_EON 80 /* ISO cnlp */
+#define IPPROTO_VMTP 81 /* VMTP */
+#define IPPROTO_SVMTP 82 /* Secure VMTP */
+#define	IPPROTO_VINES		83		/* Banyan VINES */
+#define IPPROTO_TTP 84 /* TTP */
+#define IPPROTO_IGP 85 /* NSFNET-IGP */
+#define IPPROTO_DGP 86 /* dissimilar gateway prot. */
+#define IPPROTO_TCF 87 /* TCF */
+#define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */
+#define IPPROTO_OSPFIGP 89 /* OSPFIGP */
+#define	IPPROTO_SRPC		90		/* Sprite RPC protocol */
+#define	IPPROTO_LARP		91		/* Locus Address Resolution */
+#define IPPROTO_MTP 92 /* Multicast Transport */
+#define IPPROTO_AX25 93 /* AX.25 Frames */
+#define IPPROTO_IPEIP 94 /* IP encapsulated in IP */
+#define IPPROTO_MICP 95 /* Mobile Int.ing control */
+#define IPPROTO_SCCSP 96 /* Semaphore Comm. security */
+#define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */
+#define IPPROTO_ENCAP 98 /* encapsulation header */
+#define IPPROTO_APES 99 /* any private encr. scheme */
+#define IPPROTO_GMTP 100 /* GMTP*/
+#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */
+/* 101-254: Partly Unassigned */
+#define IPPROTO_PIM 103 /* Protocol Independent Mcast */
+#define IPPROTO_PGM 113 /* PGM */
+/* 255: Reserved */
+/* BSD Private, local use, namespace incursion */
+#define IPPROTO_DIVERT 254 /* divert pseudo-protocol */
+#define IPPROTO_MAX 256
+
+/* last return value of *_input(), meaning "all work for this packet is done". */
+#define IPPROTO_DONE 257
+
+/*
+ * Local port number conventions:
+ *
+ * When a user does a bind(2) or connect(2) with a port number of zero,
+ * a non-conflicting local port address is chosen.
+ * The default range is IPPORT_HIFIRSTAUTO through
+ * IPPORT_HILASTAUTO, although that is settable by sysctl.
+ *
+ * A user may set the IPPROTO_IP option IP_PORTRANGE to change this
+ * default assignment range.
+ *
+ * The value IP_PORTRANGE_DEFAULT causes the default behavior.
+ *
+ * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers
+ * into the "high" range. These are reserved for client outbound connections
+ * which do not want to be filtered by any firewalls. Note that by default
+ * this is the same as IP_PORTRANGE_DEFAULT.
+ *
+ * The value IP_PORTRANGE_LOW changes the range to the "low" area,
+ * which is (by convention) restricted to privileged processes.  This
+ * convention is based on "vouchsafe" principles only. It is only secure
+ * if you trust the remote host to restrict these ports.
+ *
+ * The default range of ports and the high range can be changed by
+ * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto)
+ *
+ * Changing those values has bad security implications if you are
+ * using a stateless firewall that allows packets outside of that
+ * range in order to permit transparent outgoing connections.
+ *
+ * Such a firewall configuration will generally depend on the use of these
+ * default values. If you change them, you may find your Security
+ * Administrator looking for you with a heavy object.
+ *
+ * For a slightly more orthodox text view on this:
+ *
+ * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers
+ *
+ * port numbers are divided into three ranges:
+ *
+ * 0 - 1023 Well Known Ports
+ * 1024 - 49151 Registered Ports
+ * 49152 - 65535 Dynamic and/or Private Ports
+ *
+ */
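
To watch the "port number of zero" behaviour described above, a small sketch binds an unspecified port and asks getsockname(2) what the kernel picked (normally something in the high range unless IP_PORTRANGE says otherwise):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in sin;
        socklen_t len = sizeof(sin);
        int s = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&sin, 0, sizeof(sin));
        sin.sin_len = sizeof(sin);
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_ANY);
        sin.sin_port = 0;                      /* let the kernel choose */

        if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == 0 &&
            getsockname(s, (struct sockaddr *)&sin, &len) == 0)
            printf("kernel chose port %u\n", ntohs(sin.sin_port));
        close(s);
        return 0;
    }
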
+
+/*
+ * Ports < IPPORT_RESERVED are reserved for
+ * privileged processes (e.g. root). (IP_PORTRANGE_LOW)
+ */
+#define IPPORT_RESERVED 1024
+
+/*
+ * Default local port range, used by both IP_PORTRANGE_DEFAULT
+ * and IP_PORTRANGE_HIGH.
+ */
+#define IPPORT_HIFIRSTAUTO 49152
+#define IPPORT_HILASTAUTO 65535
+
+/*
+ * Scanning for a free reserved port returns a value below IPPORT_RESERVED,
+ * but higher than IPPORT_RESERVEDSTART.  Traditionally the start value was
+ * 512, but that conflicts with some well-known services that firewalls are
+ * likely to object to if we use them.
+ */
+#define IPPORT_RESERVEDSTART 600
+
+#define IPPORT_MAX 65535
+
+/*
+ * Definitions of bits in internet address integers.
+ * On subnets, the decomposition of addresses to host and net parts
+ * is done according to subnet mask, not the masks here.
+ */
+#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0)
+#define IN_CLASSA_NET 0xff000000
+#define IN_CLASSA_NSHIFT 24
+#define IN_CLASSA_HOST 0x00ffffff
+#define IN_CLASSA_MAX 128
+
+#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000)
+#define IN_CLASSB_NET 0xffff0000
+#define IN_CLASSB_NSHIFT 16
+#define IN_CLASSB_HOST 0x0000ffff
+#define IN_CLASSB_MAX 65536
+
+#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000)
+#define IN_CLASSC_NET 0xffffff00
+#define IN_CLASSC_NSHIFT 8
+#define IN_CLASSC_HOST 0x000000ff
+
+#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000)
+#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */
+#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */
+#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */
+#define IN_MULTICAST(i) IN_CLASSD(i)
+
+#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
+#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
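
The class macros above expect the address in host byte order, hence the ntohl() in the sketch below (10.1.2.3 and 224.0.0.1 are arbitrary test values):

    #include <stdio.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int main(void)
    {
        in_addr_t a = ntohl(inet_addr("10.1.2.3"));   /* class A */
        in_addr_t d = ntohl(inet_addr("224.0.0.1"));  /* class D */

        printf("10.1.2.3:  classA=%d multicast=%d\n",
            IN_CLASSA(a) != 0, IN_MULTICAST(a) != 0);
        printf("224.0.0.1: classA=%d multicast=%d\n",
            IN_CLASSA(d) != 0, IN_MULTICAST(d) != 0);
        return 0;
    }
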
+
+#define INADDR_LOOPBACK (u_int32_t)0x7f000001
+#ifndef _KERNEL
+#define INADDR_NONE 0xffffffff /* -1 return */
+#endif
+
+#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */
+#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */
+#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */
+#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */
+
+#define IN_LOOPBACKNET 127 /* official! */
+
+/*
+ * Options for use with [gs]etsockopt at the IP level.
+ * First word of comment is data type; bool is stored in int.
+ */
+#define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */
+#define IP_HDRINCL 2 /* int; header is included with data */
+#define IP_TOS 3 /* int; IP type of service and preced. */
+#define IP_TTL 4 /* int; IP time to live */
+#define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */
+#define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */
+#define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */
+#define IP_RETOPTS 8 /* ip_opts; set/get IP options */
+#define IP_MULTICAST_IF 9 /* u_char; set/get IP multicast i/f */
+#define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */
+#define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */
+#define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */
+#define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */
+#define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */
+#define IP_RSVP_ON 15 /* enable RSVP in kernel */
+#define IP_RSVP_OFF 16 /* disable RSVP in kernel */
+#define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */
+#define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */
+#define IP_PORTRANGE 19 /* int; range to choose for unspec port */
+#define IP_RECVIF 20 /* bool; receive reception if w/dgram */
+/* for IPSEC */
+#define IP_IPSEC_POLICY 21 /* int; set/get security policy */
+#define IP_FAITH 22 /* bool; accept FAITH'ed connections */
+
+#define IP_FW_ADD 50 /* add a firewall rule to chain */
+#define IP_FW_DEL 51 /* delete a firewall rule from chain */
+#define IP_FW_FLUSH 52 /* flush firewall rule chain */
+#define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */
+#define IP_FW_GET 54 /* get entire firewall rule chain */
+#define IP_FW_RESETLOG 55 /* reset logging counters */
+
+#define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */
+#define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */
+#define IP_DUMMYNET_FLUSH 62 /* flush dummynet */
+#define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */
+
+/*
+ * Defaults and limits for options
+ */
+#define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */
+#define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */
+#define IP_MAX_MEMBERSHIPS 20 /* per socket */
+
+/*
+ * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP.
+ */
+struct ip_mreq {
+ struct in_addr imr_multiaddr; /* IP multicast address of group */
+ struct in_addr imr_interface; /* local IP address of interface */
+};
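
A hedged sketch of using this structure to join a group with setsockopt(2); the group 239.1.1.1 is a placeholder, and leaving imr_interface as INADDR_ANY lets the kernel pick the outgoing interface:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        struct ip_mreq mreq;
        int s = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&mreq, 0, sizeof(mreq));
        inet_aton("239.1.1.1", &mreq.imr_multiaddr);
        mreq.imr_interface.s_addr = htonl(INADDR_ANY);

        /* ends up in in_addmulti()/igmp_joingroup() in the kernel */
        if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
            &mreq, sizeof(mreq)) < 0)
            perror("IP_ADD_MEMBERSHIP");
        close(s);
        return 0;
    }
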
+
+/*
+ * Argument for IP_PORTRANGE:
+ * - which range to search when port is unspecified at bind() or connect()
+ */
+#define IP_PORTRANGE_DEFAULT 0 /* default range */
+#define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */
+#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */
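
A brief sketch of asking for the privileged range before an implicit bind; the setsockopt itself needs no privilege, but a later bind of a low port does, and error handling is minimal:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int s = socket(AF_INET, SOCK_STREAM, 0);
        int range = IP_PORTRANGE_LOW;   /* ports below IPPORT_RESERVED */

        if (setsockopt(s, IPPROTO_IP, IP_PORTRANGE,
            &range, sizeof(range)) < 0)
            perror("IP_PORTRANGE");
        /* a bind()/connect() with port 0 now draws from the low range */
        close(s);
        return 0;
    }
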
+
+/*
+ * Definitions for inet sysctl operations.
+ *
+ * Third level is protocol number.
+ * Fourth level is desired variable within that protocol.
+ */
+#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */
+
+#define CTL_IPPROTO_NAMES { \
+ { "ip", CTLTYPE_NODE }, \
+ { "icmp", CTLTYPE_NODE }, \
+ { "igmp", CTLTYPE_NODE }, \
+ { "ggp", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "tcp", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { "egp", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "pup", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "udp", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "idp", CTLTYPE_NODE }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { 0, 0 }, \
+ { "ipsec", CTLTYPE_NODE }, \
+}
+
+/*
+ * Names for IP sysctl objects
+ */
+#define IPCTL_FORWARDING 1 /* act as router */
+#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */
+#define IPCTL_DEFTTL 3 /* default TTL */
+#ifdef notyet
+#define IPCTL_DEFMTU 4 /* default MTU */
+#endif
+#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */
+#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */
+#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */
+#define IPCTL_SOURCEROUTE 8 /* may perform source routes */
+#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */
+#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */
+#define IPCTL_INTRQDROPS 11 /* number of netisr q drops */
+#define IPCTL_STATS 12 /* ipstat structure */
+#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */
+#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */
+#define	IPCTL_KEEPFAITH		15	/* FAITH IPv4->IPv6 translator ctl */
+#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */
+#define IPCTL_MAXID 17
+
+#define IPCTL_NAMES { \
+ { 0, 0 }, \
+ { "forwarding", CTLTYPE_INT }, \
+ { "redirect", CTLTYPE_INT }, \
+ { "ttl", CTLTYPE_INT }, \
+ { "mtu", CTLTYPE_INT }, \
+ { "rtexpire", CTLTYPE_INT }, \
+ { "rtminexpire", CTLTYPE_INT }, \
+ { "rtmaxcache", CTLTYPE_INT }, \
+ { "sourceroute", CTLTYPE_INT }, \
+ { "directed-broadcast", CTLTYPE_INT }, \
+ { "intr-queue-maxlen", CTLTYPE_INT }, \
+ { "intr-queue-drops", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "accept_sourceroute", CTLTYPE_INT }, \
+ { "fastforwarding", CTLTYPE_INT }, \
+}
+
+#endif /* __BSD_VISIBLE */
+
+#ifdef _KERNEL
+
+struct ifnet; struct mbuf; /* forward declarations for Standard C */
+
+int in_broadcast(struct in_addr, struct ifnet *);
+int in_canforward(struct in_addr);
+int in_localaddr(struct in_addr);
+char *inet_ntoa(struct in_addr); /* in libkern */
+char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */
+
+#define satosin(sa) ((struct sockaddr_in *)(sa))
+#define sintosa(sin) ((struct sockaddr *)(sin))
+#define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
+
+#endif /* _KERNEL */
+
+/* INET6 stuff */
+#if __POSIX_VISIBLE >= 200112
+#define __KAME_NETINET_IN_H_INCLUDED_
+#include <netinet6/in6.h>
+#undef __KAME_NETINET_IN_H_INCLUDED_
+#endif
+
+#endif /* !_NETINET_IN_H_*/
diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c
new file mode 100644
index 0000000..eaf1493
--- /dev/null
+++ b/sys/netinet/in_cksum.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 1988, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ */
+
+#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+int
+in_cksum(m, len)
+ register struct mbuf *m;
+ register int len;
+{
+ register u_short *w;
+ register int sum = 0;
+ register int mlen = 0;
+ int byte_swapped = 0;
+
+ union {
+ char c[2];
+ u_short s;
+ } s_util;
+ union {
+ u_short s[2];
+ long l;
+ } l_util;
+
+ for (;m && len; m = m->m_next) {
+ if (m->m_len == 0)
+ continue;
+ w = mtod(m, u_short *);
+ if (mlen == -1) {
+ /*
+ * The first byte of this mbuf is the continuation
+ * of a word spanning between this mbuf and the
+ * last mbuf.
+ *
+ * s_util.c[0] is already saved when scanning previous
+ * mbuf.
+ */
+ s_util.c[1] = *(char *)w;
+ sum += s_util.s;
+ w = (u_short *)((char *)w + 1);
+ mlen = m->m_len - 1;
+ len--;
+ } else
+ mlen = m->m_len;
+ if (len < mlen)
+ mlen = len;
+ len -= mlen;
+ /*
+ * Force to even boundary.
+ */
+ if ((1 & (int) w) && (mlen > 0)) {
+ REDUCE;
+ sum <<= 8;
+ s_util.c[0] = *(u_char *)w;
+ w = (u_short *)((char *)w + 1);
+ mlen--;
+ byte_swapped = 1;
+ }
+ /*
+ * Unroll the loop to make overhead from
+ * branches &c small.
+ */
+ while ((mlen -= 32) >= 0) {
+ sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+ sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+ sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+ sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+ w += 16;
+ }
+ mlen += 32;
+ while ((mlen -= 8) >= 0) {
+ sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+ w += 4;
+ }
+ mlen += 8;
+ if (mlen == 0 && byte_swapped == 0)
+ continue;
+ REDUCE;
+ while ((mlen -= 2) >= 0) {
+ sum += *w++;
+ }
+ if (byte_swapped) {
+ REDUCE;
+ sum <<= 8;
+ byte_swapped = 0;
+ if (mlen == -1) {
+ s_util.c[1] = *(char *)w;
+ sum += s_util.s;
+ mlen = 0;
+ } else
+ mlen = -1;
+ } else if (mlen == -1)
+ s_util.c[0] = *(char *)w;
+ }
+ if (len)
+ printf("cksum: out of data\n");
+ if (mlen == -1) {
+ /* The last mbuf has odd # of bytes. Follow the
+ standard (the odd byte may be shifted left by 8 bits
+ or not as determined by endian-ness of the machine) */
+ s_util.c[1] = 0;
+ sum += s_util.s;
+ }
+ REDUCE;
+ return (~sum & 0xffff);
+}
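
For comparison with the mbuf-walking version above, a straightforward reference of the same ones'-complement sum over a flat, even-length buffer (a standalone sketch that ignores the odd-trailing-byte and mbuf-boundary handling the kernel code needs):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* ones'-complement sum of 16-bit words, carries folded back in */
    static uint16_t cksum_flat(const void *buf, size_t len)
    {
        const uint16_t *w = buf;
        uint32_t sum = 0;

        for (; len >= 2; len -= 2)
            sum += *w++;
        sum = (sum >> 16) + (sum & 0xffff);   /* fold, like REDUCE */
        sum += (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint16_t data[4] = { 0x1122, 0x3344, 0x5566, 0x7788 };

        /* prints 0xeeaa */
        printf("checksum: 0x%04x\n", cksum_flat(data, sizeof(data)));
        return 0;
    }
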
diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c
new file mode 100644
index 0000000..b7a1cec
--- /dev/null
+++ b/sys/netinet/in_gif.c
@@ -0,0 +1,354 @@
+/* $FreeBSD$ */
+/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_mrouting.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <sys/malloc.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/in_gif.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_encap.h>
+#include <netinet/ip_ecn.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#ifdef MROUTING
+#include <netinet/ip_mroute.h>
+#endif /* MROUTING */
+
+#include <net/if_gif.h>
+
+#include <net/net_osdep.h>
+
+static int ip_gif_ttl = GIF_TTL;
+SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
+ &ip_gif_ttl, 0, "");
+
+int
+in_gif_output(ifp, family, m, rt)
+ struct ifnet *ifp;
+ int family;
+ struct mbuf *m;
+ struct rtentry *rt;
+{
+ struct gif_softc *sc = (struct gif_softc*)ifp;
+ struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst;
+ struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc;
+ struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst;
+ struct ip iphdr; /* capsule IP header, host byte ordered */
+ int proto, error;
+ u_int8_t tos;
+
+ if (sin_src == NULL || sin_dst == NULL ||
+ sin_src->sin_family != AF_INET ||
+ sin_dst->sin_family != AF_INET) {
+ m_freem(m);
+ return EAFNOSUPPORT;
+ }
+
+ switch (family) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct ip *ip;
+
+ proto = IPPROTO_IPV4;
+ if (m->m_len < sizeof(*ip)) {
+ m = m_pullup(m, sizeof(*ip));
+ if (!m)
+ return ENOBUFS;
+ }
+ ip = mtod(m, struct ip *);
+ tos = ip->ip_tos;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct ip6_hdr *ip6;
+ proto = IPPROTO_IPV6;
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m)
+ return ENOBUFS;
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif /* INET6 */
+ default:
+#ifdef DEBUG
+ printf("in_gif_output: warning: unknown family %d passed\n",
+ family);
+#endif
+ m_freem(m);
+ return EAFNOSUPPORT;
+ }
+
+ bzero(&iphdr, sizeof(iphdr));
+ iphdr.ip_src = sin_src->sin_addr;
+ /* bidirectional configured tunnel mode */
+ if (sin_dst->sin_addr.s_addr != INADDR_ANY)
+ iphdr.ip_dst = sin_dst->sin_addr;
+ else {
+ m_freem(m);
+ return ENETUNREACH;
+ }
+ iphdr.ip_p = proto;
+ /* version will be set in ip_output() */
+ iphdr.ip_ttl = ip_gif_ttl;
+ iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip);
+ if (ifp->if_flags & IFF_LINK1)
+ ip_ecn_ingress(ECN_ALLOWED, &iphdr.ip_tos, &tos);
+ else
+ ip_ecn_ingress(ECN_NOCARE, &iphdr.ip_tos, &tos);
+
+ /* prepend new IP header */
+ M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
+ if (m && m->m_len < sizeof(struct ip))
+ m = m_pullup(m, sizeof(struct ip));
+ if (m == NULL) {
+ printf("ENOBUFS in in_gif_output %d\n", __LINE__);
+ return ENOBUFS;
+ }
+ bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip));
+
+ if (dst->sin_family != sin_dst->sin_family ||
+ dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) {
+ /* cache route doesn't match */
+ dst->sin_family = sin_dst->sin_family;
+ dst->sin_len = sizeof(struct sockaddr_in);
+ dst->sin_addr = sin_dst->sin_addr;
+ if (sc->gif_ro.ro_rt) {
+ RTFREE(sc->gif_ro.ro_rt);
+ sc->gif_ro.ro_rt = NULL;
+ }
+#if 0
+ sc->gif_if.if_mtu = GIF_MTU;
+#endif
+ }
+
+ if (sc->gif_ro.ro_rt == NULL) {
+ rtalloc(&sc->gif_ro);
+ if (sc->gif_ro.ro_rt == NULL) {
+ m_freem(m);
+ return ENETUNREACH;
+ }
+
+ /* if it constitutes infinite encapsulation, punt. */
+ if (sc->gif_ro.ro_rt->rt_ifp == ifp) {
+ m_freem(m);
+ return ENETUNREACH; /* XXX */
+ }
+#if 0
+ ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu
+ - sizeof(struct ip);
+#endif
+ }
+
+ error = ip_output(m, NULL, &sc->gif_ro, 0, NULL);
+ return(error);
+}
+
+void
+in_gif_input(m, off)
+ struct mbuf *m;
+ int off;
+{
+ struct ifnet *gifp = NULL;
+ struct ip *ip;
+ int af;
+ u_int8_t otos;
+ int proto;
+
+ ip = mtod(m, struct ip *);
+ proto = ip->ip_p;
+
+ gifp = (struct ifnet *)encap_getarg(m);
+
+ if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) {
+ m_freem(m);
+ ipstat.ips_nogif++;
+ return;
+ }
+
+ otos = ip->ip_tos;
+ m_adj(m, off);
+
+ switch (proto) {
+#ifdef INET
+ case IPPROTO_IPV4:
+ {
+ struct ip *ip;
+ af = AF_INET;
+ if (m->m_len < sizeof(*ip)) {
+ m = m_pullup(m, sizeof(*ip));
+ if (!m)
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ if (gifp->if_flags & IFF_LINK1)
+ ip_ecn_egress(ECN_ALLOWED, &otos, &ip->ip_tos);
+ else
+ ip_ecn_egress(ECN_NOCARE, &otos, &ip->ip_tos);
+ break;
+ }
+#endif
+#ifdef INET6
+ case IPPROTO_IPV6:
+ {
+ struct ip6_hdr *ip6;
+ u_int8_t itos;
+ af = AF_INET6;
+ if (m->m_len < sizeof(*ip6)) {
+ m = m_pullup(m, sizeof(*ip6));
+ if (!m)
+ return;
+ }
+ ip6 = mtod(m, struct ip6_hdr *);
+ itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ if (gifp->if_flags & IFF_LINK1)
+ ip_ecn_egress(ECN_ALLOWED, &otos, &itos);
+ else
+ ip_ecn_egress(ECN_NOCARE, &otos, &itos);
+ ip6->ip6_flow &= ~htonl(0xff << 20);
+ ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
+ break;
+ }
+#endif /* INET6 */
+ default:
+ ipstat.ips_nogif++;
+ m_freem(m);
+ return;
+ }
+ gif_input(m, af, gifp);
+ return;
+}
+
+/*
+ * we know that we are in IFF_UP, outer address available, and outer family
+ * matched the physical addr family. see gif_encapcheck().
+ */
+int
+gif_encapcheck4(m, off, proto, arg)
+ const struct mbuf *m;
+ int off;
+ int proto;
+ void *arg;
+{
+ struct ip ip;
+ struct gif_softc *sc;
+ struct sockaddr_in *src, *dst;
+ int addrmatch;
+ struct in_ifaddr *ia4;
+
+ /* sanity check done in caller */
+ sc = (struct gif_softc *)arg;
+ src = (struct sockaddr_in *)sc->gif_psrc;
+ dst = (struct sockaddr_in *)sc->gif_pdst;
+
+ /* LINTED const cast */
+ m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
+
+ /* check for address match */
+ addrmatch = 0;
+ if (src->sin_addr.s_addr == ip.ip_dst.s_addr)
+ addrmatch |= 1;
+ if (dst->sin_addr.s_addr == ip.ip_src.s_addr)
+ addrmatch |= 2;
+ if (addrmatch != 3)
+ return 0;
+
+ /* martian filters on outer source - NOT done in ip_input! */
+ if (IN_MULTICAST(ntohl(ip.ip_src.s_addr)))
+ return 0;
+ switch ((ntohl(ip.ip_src.s_addr) & 0xff000000) >> 24) {
+ case 0: case 127: case 255:
+ return 0;
+ }
+ /* reject packets with broadcast on source */
+ TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link)
+ {
+ if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
+ continue;
+ if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr)
+ return 0;
+ }
+
+ /* ingress filters on outer source */
+ if ((sc->gif_if.if_flags & IFF_LINK2) == 0 &&
+ (m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) {
+ struct sockaddr_in sin;
+ struct rtentry *rt;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = ip.ip_src;
+ rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL);
+ if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) {
+#if 0
+ log(LOG_WARNING, "%s: packet from 0x%x dropped "
+ "due to ingress filter\n", if_name(&sc->gif_if),
+ (u_int32_t)ntohl(sin.sin_addr.s_addr));
+#endif
+ if (rt)
+ rtfree(rt);
+ return 0;
+ }
+ rtfree(rt);
+ }
+
+ return 32 * 2;
+}
diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h
new file mode 100644
index 0000000..262d9ba
--- /dev/null
+++ b/sys/netinet/in_gif.h
@@ -0,0 +1,42 @@
+/* $FreeBSD$ */
+/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_IN_GIF_H_
+#define _NETINET_IN_GIF_H_
+
+#define GIF_TTL 30
+
+void in_gif_input(struct mbuf *, int off);
+int in_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *);
+int gif_encapcheck4(const struct mbuf *, int, int, void *);
+
+#endif /*_NETINET_IN_GIF_H_*/
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
new file mode 100644
index 0000000..744cfc2
--- /dev/null
+++ b/sys/netinet/in_pcb.c
@@ -0,0 +1,1072 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_ipsec.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <machine/limits.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif /* INET6 */
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netkey/key.h>
+#endif /* IPSEC */
+
+struct in_addr zeroin_addr;
+
+/*
+ * These configure the range of local port addresses assigned to
+ * "unspecified" outgoing connections/packets/whatever.
+ */
+int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
+int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
+int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
+int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
+int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
+int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
+
+#define RANGECHK(var, min, max) \
+ if ((var) < (min)) { (var) = (min); } \
+ else if ((var) > (max)) { (var) = (max); }
+
+static int
+sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
+{
+ int error = sysctl_handle_int(oidp,
+ oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error) {
+ RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
+ RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
+ RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
+ RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
+ RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
+ RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
+ }
+ return error;
+}
+
+#undef RANGECHK
+
+SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
+
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
+ &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
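+
+/*
+ * The six variables above appear under net.inet.ip.portrange and may be
+ * tuned at run time, e.g. (illustrative):
+ *
+ *	sysctl -w net.inet.ip.portrange.first=10000
+ *
+ * sysctl_net_ipport_check() clamps any value written there back into a
+ * sane range.
+ */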
+
+/*
+ * in_pcb.c: manage the Protocol Control Blocks.
+ *
+ * NOTE: It is assumed that most of these functions will be called at
+ * splnet(). XXX - There are, unfortunately, a few exceptions to this
+ * rule that should be fixed.
+ */
+
+/*
+ * Allocate a PCB and associate it with the socket.
+ */
+int
+in_pcballoc(so, pcbinfo, td)
+ struct socket *so;
+ struct inpcbinfo *pcbinfo;
+ struct thread *td;
+{
+ register struct inpcb *inp;
+#ifdef IPSEC
+ int error;
+#endif
+
+ inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+ if (inp == NULL)
+ return (ENOBUFS);
+ bzero((caddr_t)inp, sizeof(*inp));
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ inp->inp_pcbinfo = pcbinfo;
+ inp->inp_socket = so;
+#ifdef IPSEC
+ error = ipsec_init_policy(so, &inp->inp_sp);
+ if (error != 0) {
+ uma_zfree(pcbinfo->ipi_zone, inp);
+ return error;
+ }
+#endif /*IPSEC*/
+#if defined(INET6)
+ if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on)
+ inp->inp_flags |= IN6P_IPV6_V6ONLY;
+#endif
+ LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
+ pcbinfo->ipi_count++;
+ so->so_pcb = (caddr_t)inp;
+ INP_LOCK_INIT(inp, "inp");
+#ifdef INET6
+ if (ip6_auto_flowlabel)
+ inp->inp_flags |= IN6P_AUTOFLOWLABEL;
+#endif
+ return (0);
+}
+
+int
+in_pcbbind(inp, nam, td)
+ register struct inpcb *inp;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ register struct socket *so = inp->inp_socket;
+ unsigned short *lastport;
+ struct sockaddr_in *sin;
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ u_short lport = 0;
+ int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
+ int error, prison = 0;
+
+ if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
+ return (EADDRNOTAVAIL);
+ if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
+ return (EINVAL);
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ wild = 1;
+ if (nam) {
+ sin = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+#ifdef notdef
+ /*
+ * We should check the family, but old programs
+ * incorrectly fail to initialize it.
+ */
+ if (sin->sin_family != AF_INET)
+ return (EAFNOSUPPORT);
+#endif
+ if (sin->sin_addr.s_addr != INADDR_ANY)
+ if (prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
+ return(EINVAL);
+ lport = sin->sin_port;
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
+ /*
+ * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
+ * allow complete duplication of binding if
+ * SO_REUSEPORT is set, or if SO_REUSEADDR is set
+ * and a multicast address is bound on both
+ * new and duplicated sockets.
+ */
+ if (so->so_options & SO_REUSEADDR)
+ reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ } else if (sin->sin_addr.s_addr != INADDR_ANY) {
+ sin->sin_port = 0; /* yech... */
+ bzero(&sin->sin_zero, sizeof(sin->sin_zero));
+ if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
+ return (EADDRNOTAVAIL);
+ }
+ if (lport) {
+ struct inpcb *t;
+ /* GROSS */
+ if (ntohs(lport) < IPPORT_RESERVED && td &&
+ suser_cred(td->td_ucred, PRISON_ROOT))
+ return (EACCES);
+ if (td && jailed(td->td_ucred))
+ prison = 1;
+ if (so->so_cred->cr_uid != 0 &&
+ !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
+ t = in_pcblookup_local(inp->inp_pcbinfo,
+ sin->sin_addr, lport,
+ prison ? 0 : INPLOOKUP_WILDCARD);
+ if (t &&
+ (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
+ ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
+ (t->inp_socket->so_options &
+ SO_REUSEPORT) == 0) &&
+ (so->so_cred->cr_uid !=
+ t->inp_socket->so_cred->cr_uid)) {
+#if defined(INET6)
+ if (ntohl(sin->sin_addr.s_addr) !=
+ INADDR_ANY ||
+ ntohl(t->inp_laddr.s_addr) !=
+ INADDR_ANY ||
+ INP_SOCKAF(so) ==
+ INP_SOCKAF(t->inp_socket))
+#endif /* defined(INET6) */
+ return (EADDRINUSE);
+ }
+ }
+ if (prison &&
+ prison_ip(td->td_ucred, 0, &sin->sin_addr.s_addr))
+ return (EADDRNOTAVAIL);
+ t = in_pcblookup_local(pcbinfo, sin->sin_addr,
+ lport, prison ? 0 : wild);
+ if (t &&
+ (reuseport & t->inp_socket->so_options) == 0) {
+#if defined(INET6)
+ if (ntohl(sin->sin_addr.s_addr) !=
+ INADDR_ANY ||
+ ntohl(t->inp_laddr.s_addr) !=
+ INADDR_ANY ||
+ INP_SOCKAF(so) ==
+ INP_SOCKAF(t->inp_socket))
+#endif /* defined(INET6) */
+ return (EADDRINUSE);
+ }
+ }
+ inp->inp_laddr = sin->sin_addr;
+ }
+ if (lport == 0) {
+ ushort first, last;
+ int count;
+
+ if (inp->inp_laddr.s_addr != INADDR_ANY)
+ if (prison_ip(td->td_ucred, 0, &inp->inp_laddr.s_addr )) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ return (EINVAL);
+ }
+ inp->inp_flags |= INP_ANONPORT;
+
+ if (inp->inp_flags & INP_HIGHPORT) {
+ first = ipport_hifirstauto; /* sysctl */
+ last = ipport_hilastauto;
+ lastport = &pcbinfo->lasthi;
+ } else if (inp->inp_flags & INP_LOWPORT) {
+ if (td && (error = suser_cred(td->td_ucred, PRISON_ROOT))) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ return error;
+ }
+ first = ipport_lowfirstauto; /* 1023 */
+ last = ipport_lowlastauto; /* 600 */
+ lastport = &pcbinfo->lastlow;
+ } else {
+ first = ipport_firstauto; /* sysctl */
+ last = ipport_lastauto;
+ lastport = &pcbinfo->lastport;
+ }
+ /*
+ * Simple check to ensure all ports are not used up causing
+ * a deadlock here.
+ *
+ * We split the two cases (up and down) so that the direction
+ * is not being tested on each round of the loop.
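+ *
+ * *lastport remembers the most recently assigned port for this range,
+ * so that successive allocations cycle through the range rather than
+ * always restarting at 'first'; it is reset to 'first' whenever it
+ * wanders outside the configured bounds.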
+ */
+ if (first > last) {
+ /*
+ * counting down
+ */
+ count = first - last;
+
+ do {
+ if (count-- < 0) { /* completely used? */
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ return (EADDRNOTAVAIL);
+ }
+ --*lastport;
+ if (*lastport > first || *lastport < last)
+ *lastport = first;
+ lport = htons(*lastport);
+ } while (in_pcblookup_local(pcbinfo,
+ inp->inp_laddr, lport, wild));
+ } else {
+ /*
+ * counting up
+ */
+ count = last - first;
+
+ do {
+ if (count-- < 0) { /* completely used? */
+ /*
+ * Undo any address bind that may have
+ * occurred above.
+ */
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ return (EADDRNOTAVAIL);
+ }
+ ++*lastport;
+ if (*lastport < first || *lastport > last)
+ *lastport = first;
+ lport = htons(*lastport);
+ } while (in_pcblookup_local(pcbinfo,
+ inp->inp_laddr, lport, wild));
+ }
+ }
+ inp->inp_lport = lport;
+ if (prison_ip(td->td_ucred, 0, &inp->inp_laddr.s_addr)) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ return (EINVAL);
+ }
+ if (in_pcbinshash(inp) != 0) {
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ return (EAGAIN);
+ }
+ return (0);
+}
+
+/*
+ * Transform old in_pcbconnect() into an inner subroutine for new
+ * in_pcbconnect(): Do some validity-checking on the remote
+ * address (in 'nam') and then determine the local host address
+ * (i.e., which interface) to use to access that remote host.
+ *
+ * This preserves definition of in_pcbconnect(), while supporting a
+ * slightly different version for T/TCP. (This is more than
+ * a bit of a kludge, but cleaning up the internal interfaces would
+ * have forced minor changes in every protocol).
+ */
+
+int
+in_pcbladdr(inp, nam, plocal_sin)
+ register struct inpcb *inp;
+ struct sockaddr *nam;
+ struct sockaddr_in **plocal_sin;
+{
+ struct in_ifaddr *ia;
+ register struct sockaddr_in *sin = (struct sockaddr_in *)nam;
+
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+ if (sin->sin_family != AF_INET)
+ return (EAFNOSUPPORT);
+ if (sin->sin_port == 0)
+ return (EADDRNOTAVAIL);
+ if (!TAILQ_EMPTY(&in_ifaddrhead)) {
+ /*
+ * If the destination address is INADDR_ANY,
+ * use the primary local address.
+ * If the supplied address is INADDR_BROADCAST,
+ * and the primary interface supports broadcast,
+ * choose the broadcast address for that interface.
+ */
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
+ else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
+ (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST))
+ sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr;
+ }
+ if (inp->inp_laddr.s_addr == INADDR_ANY) {
+ register struct route *ro;
+
+ ia = (struct in_ifaddr *)0;
+ /*
+ * If route is known or can be allocated now,
+ * our src addr is taken from the i/f, else punt.
+ * Note that we should check the address family of the cached
+ * destination, in case of sharing the cache with IPv6.
+ */
+ ro = &inp->inp_route;
+ if (ro->ro_rt &&
+ (ro->ro_dst.sa_family != AF_INET ||
+ satosin(&ro->ro_dst)->sin_addr.s_addr !=
+ sin->sin_addr.s_addr ||
+ inp->inp_socket->so_options & SO_DONTROUTE)) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = (struct rtentry *)0;
+ }
+ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
+ (ro->ro_rt == (struct rtentry *)0 ||
+ ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
+ /* No route yet, so try to acquire one */
+ bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
+ ro->ro_dst.sa_family = AF_INET;
+ ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
+ ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
+ sin->sin_addr;
+ rtalloc(ro);
+ }
+ /*
+ * If we found a route, use the address
+ * corresponding to the outgoing interface
+ * unless it is the loopback (in case a route
+ * to our address on another net goes to loopback).
+ */
+ if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
+ ia = ifatoia(ro->ro_rt->rt_ifa);
+ if (ia == 0) {
+ u_short fport = sin->sin_port;
+
+ sin->sin_port = 0;
+ ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
+ if (ia == 0)
+ ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
+ sin->sin_port = fport;
+ if (ia == 0)
+ ia = TAILQ_FIRST(&in_ifaddrhead);
+ if (ia == 0)
+ return (EADDRNOTAVAIL);
+ }
+ /*
+ * If the destination address is multicast and an outgoing
+ * interface has been set as a multicast option, use the
+ * address of that interface as our source address.
+ */
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
+ inp->inp_moptions != NULL) {
+ struct ip_moptions *imo;
+ struct ifnet *ifp;
+
+ imo = inp->inp_moptions;
+ if (imo->imo_multicast_ifp != NULL) {
+ ifp = imo->imo_multicast_ifp;
+ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
+ if (ia->ia_ifp == ifp)
+ break;
+ if (ia == 0)
+ return (EADDRNOTAVAIL);
+ }
+ }
+ /*
+ * Don't do pcblookup call here; return interface in plocal_sin
+ * and exit to caller, that will do the lookup.
+ */
+ *plocal_sin = &ia->ia_addr;
+
+ }
+ return(0);
+}
+
+/*
+ * Outer subroutine:
+ * Connect from a socket to a specified address.
+ * Both address and port must be specified in argument sin.
+ * If we don't have a local address for this socket yet,
+ * then pick one.
+ */
+int
+in_pcbconnect(inp, nam, td)
+ register struct inpcb *inp;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ struct sockaddr_in *ifaddr;
+ struct sockaddr_in *sin = (struct sockaddr_in *)nam;
+ struct sockaddr_in sa;
+ struct ucred *cred;
+ int error;
+
+ cred = inp->inp_socket->so_cred;
+ if (inp->inp_laddr.s_addr == INADDR_ANY && jailed(cred)) {
+ bzero(&sa, sizeof (sa));
+ sa.sin_addr.s_addr = htonl(prison_getip(cred));
+ sa.sin_len=sizeof (sa);
+ sa.sin_family = AF_INET;
+ error = in_pcbbind(inp, (struct sockaddr *)&sa, td);
+ if (error)
+ return (error);
+ }
+ /*
+ * Call inner routine, to assign local interface address.
+ */
+ if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0)
+ return(error);
+
+ if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
+ inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr,
+ inp->inp_lport, 0, NULL) != NULL) {
+ return (EADDRINUSE);
+ }
+ if (inp->inp_laddr.s_addr == INADDR_ANY) {
+ if (inp->inp_lport == 0) {
+ error = in_pcbbind(inp, (struct sockaddr *)0, td);
+ if (error)
+ return (error);
+ }
+ inp->inp_laddr = ifaddr->sin_addr;
+ }
+ inp->inp_faddr = sin->sin_addr;
+ inp->inp_fport = sin->sin_port;
+ in_pcbrehash(inp);
+ return (0);
+}
+
+void
+in_pcbdisconnect(inp)
+ struct inpcb *inp;
+{
+
+ inp->inp_faddr.s_addr = INADDR_ANY;
+ inp->inp_fport = 0;
+ in_pcbrehash(inp);
+ if (inp->inp_socket->so_state & SS_NOFDREF)
+ in_pcbdetach(inp);
+}
+
+void
+in_pcbdetach(inp)
+ struct inpcb *inp;
+{
+ struct socket *so = inp->inp_socket;
+ struct inpcbinfo *ipi = inp->inp_pcbinfo;
+
+#ifdef IPSEC
+ ipsec4_delete_pcbpolicy(inp);
+#endif /*IPSEC*/
+ inp->inp_gencnt = ++ipi->ipi_gencnt;
+ in_pcbremlists(inp);
+ so->so_pcb = 0;
+ sotryfree(so);
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+ if (inp->inp_route.ro_rt)
+ rtfree(inp->inp_route.ro_rt);
+ ip_freemoptions(inp->inp_moptions);
+ inp->inp_vflag = 0;
+ INP_LOCK_DESTROY(inp);
+ uma_zfree(ipi->ipi_zone, inp);
+}
+
+/*
+ * The wrapper function will pass down the pcbinfo for this function to lock.
+ * The socket must have a valid
+ * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
+ * except through a kernel programming error, so it is acceptable to panic
+ * (or in this case trap) if the PCB is invalid. (Actually, we don't trap
+ * because there actually /is/ a programming error somewhere... XXX)
+ */
+int
+in_setsockaddr(so, nam, pcbinfo)
+ struct socket *so;
+ struct sockaddr **nam;
+ struct inpcbinfo *pcbinfo;
+{
+ int s;
+ register struct inpcb *inp;
+ register struct sockaddr_in *sin;
+
+ /*
+ * Do the malloc first in case it blocks.
+ */
+ MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
+ M_WAITOK | M_ZERO);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+
+ s = splnet();
+ INP_INFO_RLOCK(pcbinfo);
+ inp = sotoinpcb(so);
+ if (!inp) {
+ INP_INFO_RUNLOCK(pcbinfo);
+ splx(s);
+ free(sin, M_SONAME);
+ return ECONNRESET;
+ }
+ INP_LOCK(inp);
+ sin->sin_port = inp->inp_lport;
+ sin->sin_addr = inp->inp_laddr;
+ INP_UNLOCK(inp);
+ INP_INFO_RUNLOCK(pcbinfo);
+ splx(s);
+
+ *nam = (struct sockaddr *)sin;
+ return 0;
+}
+
+/*
+ * The wrapper function will pass down the pcbinfo for this function to lock.
+ */
+int
+in_setpeeraddr(so, nam, pcbinfo)
+ struct socket *so;
+ struct sockaddr **nam;
+ struct inpcbinfo *pcbinfo;
+{
+ int s;
+ register struct inpcb *inp;
+ register struct sockaddr_in *sin;
+
+ /*
+ * Do the malloc first in case it blocks.
+ */
+ MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
+ M_WAITOK | M_ZERO);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+
+ s = splnet();
+ INP_INFO_RLOCK(pcbinfo);
+ inp = sotoinpcb(so);
+ if (!inp) {
+ INP_INFO_RUNLOCK(pcbinfo);
+ splx(s);
+ free(sin, M_SONAME);
+ return ECONNRESET;
+ }
+ INP_LOCK(inp);
+ sin->sin_port = inp->inp_fport;
+ sin->sin_addr = inp->inp_faddr;
+ INP_UNLOCK(inp);
+ INP_INFO_RUNLOCK(pcbinfo);
+ splx(s);
+
+ *nam = (struct sockaddr *)sin;
+ return 0;
+}
+
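+/*
+ * Apply 'notify' to every PCB on this protocol's list whose foreign
+ * address matches 'faddr', passing it the error code in 'errno'.
+ * Typically invoked from ICMP error processing so that each affected
+ * connection learns about the failure.
+ */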
+void
+in_pcbnotifyall(pcbinfo, faddr, errno, notify)
+ struct inpcbinfo *pcbinfo;
+ struct in_addr faddr;
+ int errno;
+ struct inpcb *(*notify)(struct inpcb *, int);
+{
+ struct inpcb *inp, *ninp;
+ struct inpcbhead *head;
+ int s;
+
+ s = splnet();
+ INP_INFO_RLOCK(pcbinfo);
+ head = pcbinfo->listhead;
+ for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) {
+ INP_LOCK(inp);
+ ninp = LIST_NEXT(inp, inp_list);
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0) {
+ INP_UNLOCK(inp);
+ continue;
+ }
+#endif
+ if (inp->inp_faddr.s_addr != faddr.s_addr ||
+ inp->inp_socket == NULL) {
+ INP_UNLOCK(inp);
+ continue;
+ }
+ (*notify)(inp, errno);
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(pcbinfo);
+ splx(s);
+}
+
+void
+in_pcbpurgeif0(pcbinfo, ifp)
+ struct inpcbinfo *pcbinfo;
+ struct ifnet *ifp;
+{
+ struct inpcb *inp;
+ struct ip_moptions *imo;
+ int i, gap;
+
+ /* why no splnet here? XXX */
+ INP_INFO_RLOCK(pcbinfo);
+ LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
+ INP_LOCK(inp);
+ imo = inp->inp_moptions;
+ if ((inp->inp_vflag & INP_IPV4) &&
+ imo != NULL) {
+ /*
+ * Unselect the outgoing interface if it is being
+ * detached.
+ */
+ if (imo->imo_multicast_ifp == ifp)
+ imo->imo_multicast_ifp = NULL;
+
+ /*
+ * Drop multicast group membership if we joined
+ * through the interface being detached.
+ */
+ for (i = 0, gap = 0; i < imo->imo_num_memberships;
+ i++) {
+ if (imo->imo_membership[i]->inm_ifp == ifp) {
+ in_delmulti(imo->imo_membership[i]);
+ gap++;
+ } else if (gap != 0)
+ imo->imo_membership[i - gap] =
+ imo->imo_membership[i];
+ }
+ imo->imo_num_memberships -= gap;
+ }
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(pcbinfo);
+}
+
+/*
+ * Check for alternatives when higher level complains
+ * about service problems. For now, invalidate cached
+ * routing information. If the route was created dynamically
+ * (by a redirect), time to try a default gateway again.
+ */
+void
+in_losing(inp)
+ struct inpcb *inp;
+{
+ register struct rtentry *rt;
+ struct rt_addrinfo info;
+
+ if ((rt = inp->inp_route.ro_rt)) {
+ bzero((caddr_t)&info, sizeof(info));
+ info.rti_flags = rt->rt_flags;
+ info.rti_info[RTAX_DST] = rt_key(rt);
+ info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+ info.rti_info[RTAX_NETMASK] = rt_mask(rt);
+ rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
+ if (rt->rt_flags & RTF_DYNAMIC)
+ (void) rtrequest1(RTM_DELETE, &info, NULL);
+ inp->inp_route.ro_rt = NULL;
+ rtfree(rt);
+ /*
+ * A new route can be allocated
+ * the next time output is attempted.
+ */
+ }
+}
+
+/*
+ * After a routing change, flush old routing
+ * and allocate a (hopefully) better one.
+ */
+struct inpcb *
+in_rtchange(inp, errno)
+ register struct inpcb *inp;
+ int errno;
+{
+ if (inp->inp_route.ro_rt) {
+ rtfree(inp->inp_route.ro_rt);
+ inp->inp_route.ro_rt = 0;
+ /*
+ * A new route can be allocated the next time
+ * output is attempted.
+ */
+ }
+ return inp;
+}
+
+/*
+ * Lookup a PCB based on the local address and port.
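+ *
+ * If 'wild_okay' is clear, only an unconnected PCB exactly matching
+ * { laddr, lport } is returned.  Otherwise a best-fit search is made:
+ * each candidate sharing the local port is scored by how many of its
+ * address components (foreign, local) are wildcarded, and the PCB with
+ * the fewest wildcards wins; a score of zero ends the search early.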
+ */
+struct inpcb *
+in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay)
+ struct inpcbinfo *pcbinfo;
+ struct in_addr laddr;
+ u_int lport_arg;
+ int wild_okay;
+{
+ register struct inpcb *inp;
+ int matchwild = 3, wildcard;
+ u_short lport = lport_arg;
+
+ if (!wild_okay) {
+ struct inpcbhead *head;
+ /*
+ * Look for an unconnected (wildcard foreign addr) PCB that
+ * matches the local address and port we're looking for.
+ */
+ head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == INADDR_ANY &&
+ inp->inp_laddr.s_addr == laddr.s_addr &&
+ inp->inp_lport == lport) {
+ /*
+ * Found.
+ */
+ return (inp);
+ }
+ }
+ /*
+ * Not found.
+ */
+ return (NULL);
+ } else {
+ struct inpcbporthead *porthash;
+ struct inpcbport *phd;
+ struct inpcb *match = NULL;
+ /*
+ * Best fit PCB lookup.
+ *
+ * First see if this local port is in use by looking on the
+ * port hash list.
+ */
+ porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
+ pcbinfo->porthashmask)];
+ LIST_FOREACH(phd, porthash, phd_hash) {
+ if (phd->phd_port == lport)
+ break;
+ }
+ if (phd != NULL) {
+ /*
+ * Port is in use by one or more PCBs. Look for best
+ * fit.
+ */
+ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
+ wildcard = 0;
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr != INADDR_ANY)
+ wildcard++;
+ if (inp->inp_laddr.s_addr != INADDR_ANY) {
+ if (laddr.s_addr == INADDR_ANY)
+ wildcard++;
+ else if (inp->inp_laddr.s_addr != laddr.s_addr)
+ continue;
+ } else {
+ if (laddr.s_addr != INADDR_ANY)
+ wildcard++;
+ }
+ if (wildcard < matchwild) {
+ match = inp;
+ matchwild = wildcard;
+ if (matchwild == 0) {
+ break;
+ }
+ }
+ }
+ }
+ return (match);
+ }
+}
+
+/*
+ * Lookup PCB in hash list.
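+ *
+ * An exact { faddr, fport, laddr, lport } match is preferred; when
+ * 'wildcard' is non-zero a second pass accepts unconnected PCBs bound
+ * to the local port, favouring a specific local address over
+ * INADDR_ANY.  A typical caller is a transport protocol's input path,
+ * for example (arguments illustrative only):
+ *
+ *	inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
+ *	    ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);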
+ */
+struct inpcb *
+in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard,
+ ifp)
+ struct inpcbinfo *pcbinfo;
+ struct in_addr faddr, laddr;
+ u_int fport_arg, lport_arg;
+ int wildcard;
+ struct ifnet *ifp;
+{
+ struct inpcbhead *head;
+ register struct inpcb *inp;
+ u_short fport = fport_arg, lport = lport_arg;
+
+ /*
+ * First look for an exact match.
+ */
+ head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == faddr.s_addr &&
+ inp->inp_laddr.s_addr == laddr.s_addr &&
+ inp->inp_fport == fport &&
+ inp->inp_lport == lport) {
+ /*
+ * Found.
+ */
+ return (inp);
+ }
+ }
+ if (wildcard) {
+ struct inpcb *local_wild = NULL;
+#if defined(INET6)
+ struct inpcb *local_wild_mapped = NULL;
+#endif /* defined(INET6) */
+
+ head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
+ LIST_FOREACH(inp, head, inp_hash) {
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == INADDR_ANY &&
+ inp->inp_lport == lport) {
+ if (ifp && ifp->if_type == IFT_FAITH &&
+ (inp->inp_flags & INP_FAITH) == 0)
+ continue;
+ if (inp->inp_laddr.s_addr == laddr.s_addr)
+ return (inp);
+ else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#if defined(INET6)
+ if (INP_CHECK_SOCKAF(inp->inp_socket,
+ AF_INET6))
+ local_wild_mapped = inp;
+ else
+#endif /* defined(INET6) */
+ local_wild = inp;
+ }
+ }
+ }
+#if defined(INET6)
+ if (local_wild == NULL)
+ return (local_wild_mapped);
+#endif /* defined(INET6) */
+ return (local_wild);
+ }
+
+ /*
+ * Not found.
+ */
+ return (NULL);
+}
+
+/*
+ * Insert PCB onto various hash lists.
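+ *
+ * The PCB is added both to the { faddr, fport, lport } hash used by
+ * in_pcblookup_hash() and to a per-local-port list (an inpcbport head,
+ * allocated here on first use) used by in_pcblookup_local().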
+ */
+int
+in_pcbinshash(inp)
+ struct inpcb *inp;
+{
+ struct inpcbhead *pcbhash;
+ struct inpcbporthead *pcbporthash;
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ struct inpcbport *phd;
+ u_int32_t hashkey_faddr;
+
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6)
+ hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ else
+#endif /* INET6 */
+ hashkey_faddr = inp->inp_faddr.s_addr;
+
+ pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
+ inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
+
+ pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
+ pcbinfo->porthashmask)];
+
+ /*
+ * Go through port list and look for a head for this lport.
+ */
+ LIST_FOREACH(phd, pcbporthash, phd_hash) {
+ if (phd->phd_port == inp->inp_lport)
+ break;
+ }
+ /*
+ * If none exists, malloc one and tack it on.
+ */
+ if (phd == NULL) {
+ MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+ if (phd == NULL) {
+ return (ENOBUFS); /* XXX */
+ }
+ phd->phd_port = inp->inp_lport;
+ LIST_INIT(&phd->phd_pcblist);
+ LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
+ }
+ inp->inp_phd = phd;
+ LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
+ LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+ return (0);
+}
+
+/*
+ * Move PCB to the proper hash bucket when { faddr, fport } have been
+ * changed. NOTE: This does not handle the case of the lport changing (the
+ * hashed port list would have to be updated as well), so the lport must
+ * not change after in_pcbinshash() has been called.
+ */
+void
+in_pcbrehash(inp)
+ struct inpcb *inp;
+{
+ struct inpcbhead *head;
+ u_int32_t hashkey_faddr;
+
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6)
+ hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
+ else
+#endif /* INET6 */
+ hashkey_faddr = inp->inp_faddr.s_addr;
+
+ head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
+ inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)];
+
+ LIST_REMOVE(inp, inp_hash);
+ LIST_INSERT_HEAD(head, inp, inp_hash);
+}
+
+/*
+ * Remove PCB from various lists.
+ */
+void
+in_pcbremlists(inp)
+ struct inpcb *inp;
+{
+ inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
+ if (inp->inp_lport) {
+ struct inpcbport *phd = inp->inp_phd;
+
+ LIST_REMOVE(inp, inp_hash);
+ LIST_REMOVE(inp, inp_portlist);
+ if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ LIST_REMOVE(phd, phd_hash);
+ free(phd, M_PCB);
+ }
+ }
+ LIST_REMOVE(inp, inp_list);
+ inp->inp_pcbinfo->ipi_count--;
+}
+
+int
+prison_xinpcb(struct thread *td, struct inpcb *inp)
+{
+ if (!jailed(td->td_ucred))
+ return (0);
+ if (ntohl(inp->inp_laddr.s_addr) == prison_getip(td->td_ucred))
+ return (0);
+ return (1);
+}
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
new file mode 100644
index 0000000..1d9a21d
--- /dev/null
+++ b/sys/netinet/in_pcb.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_PCB_H_
+#define _NETINET_IN_PCB_H_
+
+#include <sys/queue.h>
+
+#include <net/route.h>
+#include <netinet6/ipsec.h> /* for IPSEC */
+#include <vm/uma.h>
+
+#define in6pcb inpcb /* for KAME src sync over BSD*'s */
+#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */
+
+/*
+ * Common structure pcb for internet protocol implementation.
+ * Here are stored pointers to local and foreign host table
+ * entries, local and foreign socket numbers, and pointers
+ * up (to a socket structure) and down (to a protocol-specific)
+ * control block.
+ */
+LIST_HEAD(inpcbhead, inpcb);
+LIST_HEAD(inpcbporthead, inpcbport);
+typedef u_quad_t inp_gen_t;
+
+/*
+ * A PCB bound to the AF_INET6 null (unspecified) laddr can receive
+ * AF_INET input packets.  The AF_INET6 null laddr is therefore also
+ * used as the AF_INET null laddr, via the following structure.
+ * (This ends up being the same approach as INRIA's.)
+ */
+struct in_addr_4in6 {
+ u_int32_t ia46_pad32[3];
+ struct in_addr ia46_addr4;
+};
+
+/*
+ * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.
+ * in_conninfo has some extra padding to accomplish this.
+ */
+struct in_endpoints {
+ u_int16_t ie_fport; /* foreign port */
+ u_int16_t ie_lport; /* local port */
+ /* protocol dependent part, local and foreign addr */
+ union {
+ /* foreign host table entry */
+ struct in_addr_4in6 ie46_foreign;
+ struct in6_addr ie6_foreign;
+ } ie_dependfaddr;
+ union {
+ /* local host table entry */
+ struct in_addr_4in6 ie46_local;
+ struct in6_addr ie6_local;
+ } ie_dependladdr;
+#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4
+#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4
+#define ie6_faddr ie_dependfaddr.ie6_foreign
+#define ie6_laddr ie_dependladdr.ie6_local
+};
+
+/*
+ * XXX
+ * At some point struct route should possibly change to:
+ * struct rtentry *rt
+ * struct in_endpoints *ie;
+ */
+struct in_conninfo {
+ u_int8_t inc_flags;
+ u_int8_t inc_len;
+ u_int16_t inc_pad; /* XXX alignment for in_endpoints */
+ /* protocol dependent part; cached route */
+ struct in_endpoints inc_ie;
+ union {
+ /* placeholder for routing entry */
+ struct route inc4_route;
+ struct route_in6 inc6_route;
+ } inc_dependroute;
+};
+#define inc_isipv6 inc_flags /* temp compatibility */
+#define inc_fport inc_ie.ie_fport
+#define inc_lport inc_ie.ie_lport
+#define inc_faddr inc_ie.ie_faddr
+#define inc_laddr inc_ie.ie_laddr
+#define inc_route inc_dependroute.inc4_route
+#define inc6_faddr inc_ie.ie6_faddr
+#define inc6_laddr inc_ie.ie6_laddr
+#define inc6_route inc_dependroute.inc6_route
+
+struct icmp6_filter;
+
+struct inpcb {
+ LIST_ENTRY(inpcb) inp_hash; /* hash list */
+ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */
+ u_int32_t inp_flow;
+
+ /* local and foreign ports, local and foreign addr */
+ struct in_conninfo inp_inc;
+
+ caddr_t inp_ppcb; /* pointer to per-protocol pcb */
+ struct inpcbinfo *inp_pcbinfo; /* PCB list info */
+ struct socket *inp_socket; /* back pointer to socket */
+ /* list for this PCB's local port */
+ int inp_flags; /* generic IP/datagram flags */
+
+ struct inpcbpolicy *inp_sp; /* for IPSEC */
+ u_char inp_vflag; /* IP version flag (v4/v6) */
+#define INP_IPV4 0x1
+#define INP_IPV6 0x2
+ u_char inp_ip_ttl; /* time to live proto */
+ u_char inp_ip_p; /* protocol proto */
+
+ /* protocol dependent part; options */
+ struct {
+ u_char inp4_ip_tos; /* type of service proto */
+ struct mbuf *inp4_options; /* IP options */
+ struct ip_moptions *inp4_moptions; /* IP multicast options */
+ } inp_depend4;
+#define inp_fport inp_inc.inc_fport
+#define inp_lport inp_inc.inc_lport
+#define inp_faddr inp_inc.inc_faddr
+#define inp_laddr inp_inc.inc_laddr
+#define inp_route inp_inc.inc_route
+#define inp_ip_tos inp_depend4.inp4_ip_tos
+#define inp_options inp_depend4.inp4_options
+#define inp_moptions inp_depend4.inp4_moptions
+ struct {
+ /* IP options */
+ struct mbuf *inp6_options;
+ /* IP6 options for outgoing packets */
+ struct ip6_pktopts *inp6_outputopts;
+ /* IP multicast options */
+ struct ip6_moptions *inp6_moptions;
+ /* ICMPv6 code type filter */
+ struct icmp6_filter *inp6_icmp6filt;
+ /* IPV6_CHECKSUM setsockopt */
+ int inp6_cksum;
+ u_short inp6_ifindex;
+ short inp6_hops;
+ u_int8_t inp6_hlim;
+ } inp_depend6;
+ LIST_ENTRY(inpcb) inp_portlist;
+ struct inpcbport *inp_phd; /* head of this list */
+ inp_gen_t inp_gencnt; /* generation count of this instance */
+ struct mtx inp_mtx;
+
+#define in6p_faddr inp_inc.inc6_faddr
+#define in6p_laddr inp_inc.inc6_laddr
+#define in6p_route inp_inc.inc6_route
+#define in6p_ip6_hlim inp_depend6.inp6_hlim
+#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
+#define in6p_ip6_nxt inp_ip_p
+#define in6p_flowinfo inp_flow
+#define in6p_vflag inp_vflag
+#define in6p_options inp_depend6.inp6_options
+#define in6p_outputopts inp_depend6.inp6_outputopts
+#define in6p_moptions inp_depend6.inp6_moptions
+#define in6p_icmp6filt inp_depend6.inp6_icmp6filt
+#define in6p_cksum inp_depend6.inp6_cksum
+#define inp6_ifindex inp_depend6.inp6_ifindex
+#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */
+#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */
+#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */
+#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */
+#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */
+};
+/*
+ * The range of the generation count, as used in this implementation,
+ * is about 1.8e19 (2^64). We would have to create nearly 600 billion
+ * connections per second for this number to roll over in a year. This
+ * seems sufficiently unlikely that we simply don't concern ourselves
+ * with that possibility.
+ */
+
+/*
+ * Interface exported to userland by various protocols which use
+ * inpcbs. Hack alert -- only define if struct xsocket is in scope.
+ */
+#ifdef _SYS_SOCKETVAR_H_
+struct xinpcb {
+ size_t xi_len; /* length of this structure */
+ struct inpcb xi_inp;
+ struct xsocket xi_socket;
+ u_quad_t xi_alignment_hack;
+};
+
+struct xinpgen {
+ size_t xig_len; /* length of this structure */
+ u_int xig_count; /* number of PCBs at this time */
+ inp_gen_t xig_gen; /* generation count at this time */
+ so_gen_t xig_sogen; /* socket generation count at this time */
+};
+#endif /* _SYS_SOCKETVAR_H_ */
+
+struct inpcbport {
+ LIST_ENTRY(inpcbport) phd_hash;
+ struct inpcbhead phd_pcblist;
+ u_short phd_port;
+};
+
+struct inpcbinfo { /* XXX documentation, prefixes */
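+	/*
+	 * hashbase/hashmask cover the connection (4-tuple) hash,
+	 * porthashbase/porthashmask the per-local-port lists used by
+	 * in_pcblookup_local(), and listhead the linear list of every
+	 * PCB belonging to this protocol.
+	 */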
+ struct inpcbhead *hashbase;
+ u_long hashmask;
+ struct inpcbporthead *porthashbase;
+ u_long porthashmask;
+ struct inpcbhead *listhead;
+ u_short lastport;
+ u_short lastlow;
+ u_short lasthi;
+ uma_zone_t ipi_zone; /* zone to allocate pcbs from */
+ u_int ipi_count; /* number of pcbs in this list */
+ u_quad_t ipi_gencnt; /* current generation count */
+ struct mtx ipi_mtx;
+};
+
+#define INP_LOCK_INIT(inp, d) \
+ mtx_init(&(inp)->inp_mtx, (d), NULL, MTX_DEF | MTX_RECURSE)
+#define INP_LOCK_DESTROY(inp) mtx_destroy(&(inp)->inp_mtx)
+#define INP_LOCK(inp) mtx_lock(&(inp)->inp_mtx)
+#define INP_UNLOCK(inp) mtx_unlock(&(inp)->inp_mtx)
+
+#define INP_INFO_LOCK_INIT(ipi, d) \
+ mtx_init(&(ipi)->ipi_mtx, (d), NULL, MTX_DEF | MTX_RECURSE)
+#define INP_INFO_RLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx)
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_mtx)
+#define INP_INFO_RUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx)
+#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_mtx)
+
+#define INP_PCBHASH(faddr, lport, fport, mask) \
+ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
+#define INP_PCBPORTHASH(lport, mask) \
+ (ntohs((lport)) & (mask))
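+/*
+ * Both masks are expected to be of the form 2^n - 1 (as produced by
+ * hashinit()).  Wildcard lookups hash with INADDR_ANY and a zero
+ * foreign port to reach the bucket holding unconnected PCBs.
+ */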
+
+/* flags in inp_flags: */
+#define INP_RECVOPTS 0x01 /* receive incoming IP options */
+#define INP_RECVRETOPTS 0x02 /* receive IP options for reply */
+#define INP_RECVDSTADDR 0x04 /* receive IP dst address */
+#define INP_HDRINCL 0x08 /* user supplies entire IP header */
+#define INP_HIGHPORT 0x10 /* user wants "high" port binding */
+#define INP_LOWPORT 0x20 /* user wants "low" port binding */
+#define INP_ANONPORT 0x40 /* port chosen for user */
+#define INP_RECVIF 0x80 /* receive incoming interface */
+#define INP_MTUDISC 0x100 /* user can do MTU discovery */
+#define INP_FAITH 0x200 /* accept FAITH'ed connections */
+
+#define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */
+
+#define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */
+#define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */
+#define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */
+#define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */
+#define IN6P_RTHDR 0x100000 /* receive routing header */
+#define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */
+#define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */
+#define IN6P_BINDV6ONLY 0x10000000 /* do not grab IPv4 traffic */
+
+#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
+ INP_RECVIF|\
+ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
+ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
+ IN6P_AUTOFLOWLABEL)
+#define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\
+ IN6P_AUTOFLOWLABEL)
+
+ /* for KAME src sync over BSD*'s */
+#define IN6P_HIGHPORT INP_HIGHPORT
+#define IN6P_LOWPORT INP_LOWPORT
+#define IN6P_ANONPORT INP_ANONPORT
+#define IN6P_RECVIF INP_RECVIF
+#define IN6P_MTUDISC INP_MTUDISC
+#define IN6P_FAITH INP_FAITH
+#define IN6P_CONTROLOPTS INP_CONTROLOPTS
+ /*
+ * The socket's address family is either the same as, or a superset
+ * of, the actual datagram's address family.
+ */
+
+#define INPLOOKUP_WILDCARD 1
+#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
+#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */
+
+#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
+
+#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
+
+#ifdef _KERNEL
+extern int ipport_lowfirstauto;
+extern int ipport_lowlastauto;
+extern int ipport_firstauto;
+extern int ipport_lastauto;
+extern int ipport_hifirstauto;
+extern int ipport_hilastauto;
+
+void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
+void in_losing(struct inpcb *);
+struct inpcb *
+ in_rtchange(struct inpcb *, int);
+int in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *);
+int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *);
+int in_pcbconnect(struct inpcb *, struct sockaddr *, struct thread *);
+void in_pcbdetach(struct inpcb *);
+void in_pcbdisconnect(struct inpcb *);
+int in_pcbinshash(struct inpcb *);
+int in_pcbladdr(struct inpcb *, struct sockaddr *,
+ struct sockaddr_in **);
+struct inpcb *
+ in_pcblookup_local(struct inpcbinfo *,
+ struct in_addr, u_int, int);
+struct inpcb *
+ in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
+ struct in_addr, u_int, int, struct ifnet *);
+void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
+ int, struct inpcb *(*)(struct inpcb *, int));
+void in_pcbrehash(struct inpcb *);
+int in_setpeeraddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo);
+int in_setsockaddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo);
+void in_pcbremlists(struct inpcb *inp);
+int prison_xinpcb(struct thread *td, struct inpcb *inp);
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_IN_PCB_H_ */
diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c
new file mode 100644
index 0000000..b522652
--- /dev/null
+++ b/sys/netinet/in_proto.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_proto.c 8.2 (Berkeley) 2/9/95
+ * $FreeBSD$
+ */
+
+#include "opt_ipdivert.h"
+#include "opt_ipx.h"
+#include "opt_ipsec.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/igmp_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/ip_encap.h>
+
+/*
+ * TCP/IP protocol family: IP, ICMP, UDP, TCP.
+ */
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netinet6/ah.h>
+#ifdef IPSEC_ESP
+#include <netinet6/esp.h>
+#endif
+#include <netinet6/ipcomp.h>
+#endif /* IPSEC */
+
+#ifdef IPXIP
+#include <netipx/ipx_ip.h>
+#endif
+
+#ifdef NSIP
+#include <netns/ns.h>
+#include <netns/ns_if.h>
+#endif
+
+extern struct domain inetdomain;
+static struct pr_usrreqs nousrreqs;
+
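+/*
+ * Each initializer below fills in, in order: pr_type, pr_domain,
+ * pr_protocol and pr_flags; the pr_input, pr_output, pr_ctlinput and
+ * pr_ctloutput hooks; the obsolete old-style usrreq slot; pr_init,
+ * pr_fasttimo, pr_slowtimo and pr_drain; and finally the pr_usrreqs
+ * switch.
+ */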
+struct protosw inetsw[] = {
+{ 0, &inetdomain, 0, 0,
+ 0, 0, 0, 0,
+ 0,
+ ip_init, 0, ip_slowtimo, ip_drain,
+ &nousrreqs
+},
+{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR,
+ udp_input, 0, udp_ctlinput, ip_ctloutput,
+ 0,
+ udp_init, 0, 0, 0,
+ &udp_usrreqs
+},
+{ SOCK_STREAM, &inetdomain, IPPROTO_TCP,
+ PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+ tcp_input, 0, tcp_ctlinput, tcp_ctloutput,
+ 0,
+ tcp_init, 0, tcp_slowtimo, tcp_drain,
+ &tcp_usrreqs
+},
+{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR,
+ rip_input, 0, rip_ctlinput, rip_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+},
+{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ icmp_input, 0, 0, rip_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+},
+{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ igmp_input, 0, 0, rip_ctloutput,
+ 0,
+ igmp_init, igmp_fasttimo, igmp_slowtimo, 0,
+ &rip_usrreqs
+},
+{ SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ rsvp_input, 0, 0, rip_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+},
+#ifdef IPSEC
+{ SOCK_RAW, &inetdomain, IPPROTO_AH, PR_ATOMIC|PR_ADDR,
+ ah4_input, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &nousrreqs
+},
+#ifdef IPSEC_ESP
+{ SOCK_RAW, &inetdomain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR,
+ esp4_input, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &nousrreqs
+},
+#endif
+{ SOCK_RAW, &inetdomain, IPPROTO_IPCOMP, PR_ATOMIC|PR_ADDR,
+ ipcomp4_input, 0, 0, 0,
+ 0,
+ 0, 0, 0, 0,
+ &nousrreqs
+},
+#endif /* IPSEC */
+{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ encap4_input, 0, 0, rip_ctloutput,
+ 0,
+ encap_init, 0, 0, 0,
+ &rip_usrreqs
+},
+#ifdef INET6
+{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ encap4_input, 0, 0, rip_ctloutput,
+ 0,
+ encap_init, 0, 0, 0,
+ &rip_usrreqs
+},
+#endif
+#ifdef IPDIVERT
+{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR,
+ div_input, 0, 0, ip_ctloutput,
+ 0,
+ div_init, 0, 0, 0,
+ &div_usrreqs,
+},
+#endif
+#ifdef IPXIP
+{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ ipxip_input, 0, ipxip_ctlinput, 0,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+},
+#endif
+#ifdef NSIP
+{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ idpip_input, 0, nsip_ctlinput, 0,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+},
+#endif
+ /* raw wildcard */
+{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR,
+ rip_input, 0, 0, rip_ctloutput,
+ 0,
+ rip_init, 0, 0, 0,
+ &rip_usrreqs
+},
+};
+
+extern int in_inithead(void **, int);
+
+struct domain inetdomain =
+ { AF_INET, "internet", 0, 0, 0,
+ inetsw,
+ &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0,
+ in_inithead, 32, sizeof(struct sockaddr_in)
+ };
+
+DOMAIN_SET(inet);
+
+SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0,
+ "Internet Family");
+
+SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP");
+SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP");
+SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP");
+SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP");
+SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP");
+#ifdef IPSEC
+SYSCTL_NODE(_net_inet, IPPROTO_AH, ipsec, CTLFLAG_RW, 0, "IPSEC");
+#endif /* IPSEC */
+SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
+#ifdef IPDIVERT
+SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT");
+#endif
+
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
new file mode 100644
index 0000000..08052fb
--- /dev/null
+++ b/sys/netinet/in_rmx.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright 1994, 1995 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This code does two things necessary for the enhanced TCP metrics to
+ * function in a useful manner:
+ * 1) It marks all non-host routes as `cloning', thus ensuring that
+ * every actual reference to such a route actually gets turned
+ * into a reference to a host route to the specific destination
+ * requested.
+ * 2) When such routes lose all their references, it arranges for them
+ * to be deleted in some random collection of circumstances, so that
+ * a large quantity of stale routing data is not kept in kernel memory
+ * indefinitely. See in_rtqtimo() below for the exact mechanism.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+
+extern int in_inithead(void **head, int off);
+
+#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
+
+/*
+ * Do what we need to do when inserting a route.
+ */
+static struct radix_node *
+in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
+ struct radix_node *treenodes)
+{
+ struct rtentry *rt = (struct rtentry *)treenodes;
+ struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
+ struct radix_node *ret;
+
+ /*
+ * For IP, all unicast non-host routes are automatically cloning.
+ */
+ if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ rt->rt_flags |= RTF_MULTICAST;
+
+ if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
+ rt->rt_flags |= RTF_PRCLONING;
+ }
+
+ /*
+ * A little bit of help for both IP output and input:
+ * For host routes, we make sure that RTF_BROADCAST
+ * is set for anything that looks like a broadcast address.
+ * This way, we can avoid an expensive call to in_broadcast()
+ * in ip_output() most of the time (because the route passed
+ * to ip_output() is almost always a host route).
+ *
+ * We also do the same for local addresses, with the thought
+ * that this might one day be used to speed up ip_input().
+ *
+ * We also mark routes to multicast addresses as such, because
+ * it's easy to do and might be useful (but this is much more
+ * dubious since it's so easy to inspect the address). (This
+ * is done above.)
+ */
+ if (rt->rt_flags & RTF_HOST) {
+ if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
+ rt->rt_flags |= RTF_BROADCAST;
+ } else {
+ if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr
+ == sin->sin_addr.s_addr)
+ rt->rt_flags |= RTF_LOCAL;
+ }
+ }
+
+ if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
+ && rt->rt_ifp)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+
+ ret = rn_addroute(v_arg, n_arg, head, treenodes);
+ if (ret == NULL && rt->rt_flags & RTF_HOST) {
+ struct rtentry *rt2;
+ /*
+ * We are trying to add a host route, but can't.
+ * Find out if it is because of an
+ * ARP entry and delete it if so.
+ */
+ rt2 = rtalloc1((struct sockaddr *)sin, 0,
+ RTF_CLONING | RTF_PRCLONING);
+ if (rt2) {
+ if (rt2->rt_flags & RTF_LLINFO &&
+ rt2->rt_flags & RTF_HOST &&
+ rt2->rt_gateway &&
+ rt2->rt_gateway->sa_family == AF_LINK) {
+ rtrequest(RTM_DELETE,
+ (struct sockaddr *)rt_key(rt2),
+ rt2->rt_gateway,
+ rt_mask(rt2), rt2->rt_flags, 0);
+ ret = rn_addroute(v_arg, n_arg, head,
+ treenodes);
+ }
+ RTFREE(rt2);
+ }
+ }
+
+ /*
+ * If the new route was created successfully, and we are forwarding,
+ * and there is a cached route, free it. Otherwise, we may end
+ * up using the wrong route.
+ */
+ if (ret != NULL && ipforwarding && ipforward_rt.ro_rt) {
+ RTFREE(ipforward_rt.ro_rt);
+ ipforward_rt.ro_rt = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * This code is the inverse of in_clsroute: on first reference, if we
+ * were managing the route, stop doing so and set the expiration timer
+ * back off again.
+ */
+static struct radix_node *
+in_matroute(void *v_arg, struct radix_node_head *head)
+{
+ struct radix_node *rn = rn_match(v_arg, head);
+ struct rtentry *rt = (struct rtentry *)rn;
+
+ if(rt && rt->rt_refcnt == 0) { /* this is first reference */
+ if(rt->rt_flags & RTPRF_OURS) {
+ rt->rt_flags &= ~RTPRF_OURS;
+ rt->rt_rmx.rmx_expire = 0;
+ }
+ }
+ return rn;
+}
+
+static int rtq_reallyold = 60*60;
+ /* one hour is ``really old'' */
+SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
+ &rtq_reallyold , 0,
+ "Default expiration time on dynamically learned routes");
+
+static int rtq_minreallyold = 10;
+ /* never automatically crank down to less */
+SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
+ &rtq_minreallyold , 0,
+ "Minimum time to attempt to hold onto dynamically learned routes");
+
+static int rtq_toomany = 128;
+ /* 128 cached routes is ``too many'' */
+SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
+ &rtq_toomany , 0, "Upper limit on dynamically learned routes");
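+
+/*
+ * These knobs appear as net.inet.ip.rtexpire, net.inet.ip.rtminexpire
+ * and net.inet.ip.rtmaxcache and may be tuned at run time, e.g.
+ * (illustrative):
+ *
+ *	sysctl -w net.inet.ip.rtexpire=300
+ */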
+
+/*
+ * On last reference drop, mark the route as belonging to us so that it
+ * can be timed out.
+ */
+static void
+in_clsroute(struct radix_node *rn, struct radix_node_head *head)
+{
+ struct rtentry *rt = (struct rtentry *)rn;
+
+ if(!(rt->rt_flags & RTF_UP))
+ return; /* prophylactic measures */
+
+ if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
+ return;
+
+ if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS))
+ != RTF_WASCLONED)
+ return;
+
+ /*
+ * As requested by David Greenman:
+ * If rtq_reallyold is 0, just delete the route without
+ * waiting for a timeout cycle to kill it.
+ */
+ if(rtq_reallyold != 0) {
+ rt->rt_flags |= RTPRF_OURS;
+ rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
+ } else {
+ rtrequest(RTM_DELETE,
+ (struct sockaddr *)rt_key(rt),
+ rt->rt_gateway, rt_mask(rt),
+ rt->rt_flags, 0);
+ }
+}
+
+struct rtqk_arg {
+ struct radix_node_head *rnh;
+ int draining;
+ int killed;
+ int found;
+ int updating;
+ time_t nextstop;
+};
+
+/*
+ * Get rid of old routes. When draining, this deletes everything, even when
+ * the timeout is not expired yet. When updating, this makes sure that
+ * nothing has a timeout longer than the current value of rtq_reallyold.
+ */
+static int
+in_rtqkill(struct radix_node *rn, void *rock)
+{
+ struct rtqk_arg *ap = rock;
+ struct rtentry *rt = (struct rtentry *)rn;
+ int err;
+
+ if(rt->rt_flags & RTPRF_OURS) {
+ ap->found++;
+
+ if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
+ if(rt->rt_refcnt > 0)
+ panic("rtqkill route really not free");
+
+ err = rtrequest(RTM_DELETE,
+ (struct sockaddr *)rt_key(rt),
+ rt->rt_gateway, rt_mask(rt),
+ rt->rt_flags, 0);
+ if(err) {
+ log(LOG_WARNING, "in_rtqkill: error %d\n", err);
+ } else {
+ ap->killed++;
+ }
+ } else {
+ if(ap->updating
+ && (rt->rt_rmx.rmx_expire - time_second
+ > rtq_reallyold)) {
+ rt->rt_rmx.rmx_expire = time_second
+ + rtq_reallyold;
+ }
+ ap->nextstop = lmin(ap->nextstop,
+ rt->rt_rmx.rmx_expire);
+ }
+ }
+
+ return 0;
+}
+
+#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
+static int rtq_timeout = RTQ_TIMEOUT;
+
+static void
+in_rtqtimo(void *rock)
+{
+ struct radix_node_head *rnh = rock;
+ struct rtqk_arg arg;
+ struct timeval atv;
+ static time_t last_adjusted_timeout = 0;
+ int s;
+
+ arg.found = arg.killed = 0;
+ arg.rnh = rnh;
+ arg.nextstop = time_second + rtq_timeout;
+ arg.draining = arg.updating = 0;
+ s = splnet();
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ splx(s);
+
+ /*
+ * Attempt to be somewhat dynamic about this:
+ * If there are ``too many'' routes sitting around taking up space,
+ * then crank down the timeout, and see if we can't make some more
+ * go away. However, we make sure that we will never adjust more
+ * than once in rtq_timeout seconds, to keep from cranking down too
+ * hard.
+ */
+ if((arg.found - arg.killed > rtq_toomany)
+ && (time_second - last_adjusted_timeout >= rtq_timeout)
+ && rtq_reallyold > rtq_minreallyold) {
+ rtq_reallyold = 2*rtq_reallyold / 3;
+ if(rtq_reallyold < rtq_minreallyold) {
+ rtq_reallyold = rtq_minreallyold;
+ }
+
+ last_adjusted_timeout = time_second;
+#ifdef DIAGNOSTIC
+ log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
+ rtq_reallyold);
+#endif
+ arg.found = arg.killed = 0;
+ arg.updating = 1;
+ s = splnet();
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ splx(s);
+ }
+
+ atv.tv_usec = 0;
+ atv.tv_sec = arg.nextstop - time_second;
+ timeout(in_rtqtimo, rock, tvtohz(&atv));
+}
+
+void
+in_rtqdrain(void)
+{
+ struct radix_node_head *rnh = rt_tables[AF_INET];
+ struct rtqk_arg arg;
+ int s;
+ arg.found = arg.killed = 0;
+ arg.rnh = rnh;
+ arg.nextstop = 0;
+ arg.draining = 1;
+ arg.updating = 0;
+ s = splnet();
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ splx(s);
+}
+
+/*
+ * Initialize our routing tree.
+ */
+int
+in_inithead(void **head, int off)
+{
+ struct radix_node_head *rnh;
+
+ if(!rn_inithead(head, off))
+ return 0;
+
+ if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */
+ return 1; /* only do this for the real routing table */
+
+ rnh = *head;
+ rnh->rnh_addaddr = in_addroute;
+ rnh->rnh_matchaddr = in_matroute;
+ rnh->rnh_close = in_clsroute;
+ in_rtqtimo(rnh); /* kick off timeout first time */
+ return 1;
+}
+
+
+/*
+ * This zaps old routes when the interface goes down or an interface
+ * address is deleted.  In the latter case, it deletes static routes
+ * that point to this address. If we don't do this, we may end up
+ * using the old address in the future. The ones we always want to
+ * get rid of are things like ARP entries, since the user might down
+ * the interface, walk over to a completely different network, and
+ * plug back in.
+ */
+struct in_ifadown_arg {
+ struct radix_node_head *rnh;
+ struct ifaddr *ifa;
+ int del;
+};
+
+static int
+in_ifadownkill(struct radix_node *rn, void *xap)
+{
+ struct in_ifadown_arg *ap = xap;
+ struct rtentry *rt = (struct rtentry *)rn;
+ int err;
+
+ if (rt->rt_ifa == ap->ifa &&
+ (ap->del || !(rt->rt_flags & RTF_STATIC))) {
+ /*
+ * We need to disable the automatic prune that happens
+ * in this case in rtrequest() because it will blow
+ * away the pointers that rn_walktree() needs in order
+ * to continue our descent.  We will end up deleting all
+ * the routes that rtrequest() would have in any case,
+ * so that behavior is not needed there.
+ */
+ rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
+ err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt),
+ rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
+ if (err) {
+ log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
+ }
+ }
+ return 0;
+}
+
+int
+in_ifadown(struct ifaddr *ifa, int delete)
+{
+ struct in_ifadown_arg arg;
+ struct radix_node_head *rnh;
+
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ return 1;
+
+ arg.rnh = rnh = rt_tables[AF_INET];
+ arg.ifa = ifa;
+ arg.del = delete;
+ rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
+ ifa->ifa_flags &= ~IFA_ROUTE;
+ return 0;
+}
diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h
new file mode 100644
index 0000000..4282752
--- /dev/null
+++ b/sys/netinet/in_systm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_systm.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_SYSTM_H_
+#define _NETINET_IN_SYSTM_H_
+
+/*
+ * Miscellaneous internetwork
+ * definitions for kernel.
+ */
+
+/*
+ * Network types.
+ *
+ * Internally the system keeps counters in the headers with the bytes
+ * swapped so that VAX instructions will work on them. It reverses
+ * the bytes before transmission at each protocol level. The n_ types
+ * represent the types with the bytes in ``high-ender'' order.
+ */
+typedef u_int16_t n_short; /* short as received from the net */
+typedef u_int32_t n_long; /* long as received from the net */
+
+typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */
+
+#ifdef _KERNEL
+n_time iptime(void);
+#endif
+
+#endif
diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h
new file mode 100644
index 0000000..4ccc4cd
--- /dev/null
+++ b/sys/netinet/in_var.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 1985, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_var.h 8.2 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IN_VAR_H_
+#define _NETINET_IN_VAR_H_
+
+#include <sys/queue.h>
+#include <sys/fnv_hash.h>
+
+/*
+ * Interface address, Internet version. One of these structures
+ * is allocated for each Internet address on an interface.
+ * The ifaddr structure contains the protocol-independent part
+ * of the structure and is assumed to be first.
+ */
+struct in_ifaddr {
+ struct ifaddr ia_ifa; /* protocol-independent info */
+#define ia_ifp ia_ifa.ifa_ifp
+#define ia_flags ia_ifa.ifa_flags
+ /* ia_{,sub}net{,mask} in host order */
+ u_long ia_net; /* network number of interface */
+ u_long ia_netmask; /* mask of net part */
+ u_long ia_subnet; /* subnet number, including net */
+ u_long ia_subnetmask; /* mask of subnet part */
+ struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
+ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
+ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */
+ struct sockaddr_in ia_addr; /* reserve space for interface name */
+ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
+#define ia_broadaddr ia_dstaddr
+ struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
+};
+
+struct in_aliasreq {
+ char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ struct sockaddr_in ifra_addr;
+ struct sockaddr_in ifra_broadaddr;
+#define ifra_dstaddr ifra_broadaddr
+ struct sockaddr_in ifra_mask;
+};
+/*
+ * Given a pointer to an in_ifaddr (ifaddr),
+ * return a pointer to the addr as a sockaddr_in.
+ */
+#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
+#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr))
+
+#define IN_LNAOF(in, ifa) \
+	(ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask))
+
+
+#ifdef _KERNEL
+extern struct ifqueue ipintrq; /* ip packet input queue */
+extern struct in_addr zeroin_addr;
+extern u_char inetctlerrmap[];
+
+/*
+ * Hash table for IP addresses.
+ */
+extern LIST_HEAD(in_ifaddrhashhead, in_ifaddr) *in_ifaddrhashtbl;
+extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead;
+extern u_long in_ifaddrhmask; /* mask for hash table */
+
+#define INADDR_NHASH_LOG2 9
+#define INADDR_NHASH (1 << INADDR_NHASH_LOG2)
+#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT)
+#define INADDR_HASH(x) \
+ (&in_ifaddrhashtbl[INADDR_HASHVAL(x) & in_ifaddrhmask])
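+
+/*
+ * A minimal usage sketch for the hash machinery above (illustrative only;
+ * the function name is not part of the original header).  The address
+ * bytes are run through FNV-1 by INADDR_HASHVAL() and masked down to one
+ * of the INADDR_NHASH buckets; the INADDR_TO_IFP() macro below wraps the
+ * same loop to recover the interface.
+ */
+static __inline struct in_ifaddr *
+in_ifaddr_lookup_example(struct in_addr addr)
+{
+	struct in_ifaddr *ia;
+
+	LIST_FOREACH(ia, INADDR_HASH(addr.s_addr), ia_hash)
+		if (IA_SIN(ia)->sin_addr.s_addr == addr.s_addr)
+			break;
+	return (ia);		/* NULL if addr is not one of ours */
+}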
+
+
+/*
+ * Macro for finding the interface (ifnet structure) corresponding to one
+ * of our IP addresses.
+ */
+#define INADDR_TO_IFP(addr, ifp) \
+ /* struct in_addr addr; */ \
+ /* struct ifnet *ifp; */ \
+{ \
+ struct in_ifaddr *ia; \
+\
+ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \
+ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \
+ break; \
+ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \
+}
+
+/*
+ * Macro for finding the internet address structure (in_ifaddr) corresponding
+ * to a given interface (ifnet structure).
+ */
+#define IFP_TO_IA(ifp, ia) \
+ /* struct ifnet *ifp; */ \
+ /* struct in_ifaddr *ia; */ \
+{ \
+ for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \
+ (ia) != NULL && (ia)->ia_ifp != (ifp); \
+ (ia) = TAILQ_NEXT((ia), ia_link)) \
+ continue; \
+}
+#endif
+
+/*
+ * This information should be part of the ifnet structure but we don't wish
+ * to change that - as it might break a number of things
+ */
+
+struct router_info {
+ struct ifnet *rti_ifp;
+ int rti_type; /* type of router which is querier on this interface */
+ int rti_time; /* # of slow timeouts since last old query */
+ struct router_info *rti_next;
+};
+
+/*
+ * Internet multicast address structure. There is one of these for each IP
+ * multicast group to which this host belongs on a given network interface.
+ * For every entry on the interface's if_multiaddrs list which represents
+ * an IP multicast group, there is one of these structures. They are also
+ * kept on a system-wide list to make it easier to keep our legacy IGMP code
+ * compatible with the rest of the world (see IN_FIRST_MULTI et al, below).
+ */
+struct in_multi {
+ LIST_ENTRY(in_multi) inm_link; /* queue macro glue */
+ struct in_addr inm_addr; /* IP multicast address, convenience */
+ struct ifnet *inm_ifp; /* back pointer to ifnet */
+ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */
+ u_int inm_timer; /* IGMP membership report timer */
+ u_int inm_state; /* state of the membership */
+ struct router_info *inm_rti; /* router info*/
+};
+
+#ifdef _KERNEL
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_raw);
+#endif
+
+extern LIST_HEAD(in_multihead, in_multi) in_multihead;
+
+/*
+ * Structure used by macros below to remember position when stepping through
+ * all of the in_multi records.
+ */
+struct in_multistep {
+ struct in_multi *i_inm;
+};
+
+/*
+ * Macro for looking up the in_multi record for a given IP multicast address
+ * on a given interface. If no matching record is found, "inm" is set null.
+ */
+#define IN_LOOKUP_MULTI(addr, ifp, inm) \
+ /* struct in_addr addr; */ \
+ /* struct ifnet *ifp; */ \
+ /* struct in_multi *inm; */ \
+do { \
+ struct ifmultiaddr *ifma; \
+\
+ TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \
+ if (ifma->ifma_addr->sa_family == AF_INET \
+ && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \
+ (addr).s_addr) \
+ break; \
+ } \
+ (inm) = ifma ? ifma->ifma_protospec : 0; \
+} while(0)
+
+/*
+ * Macro to step through all of the in_multi records, one at a time.
+ * The current position is remembered in "step", which the caller must
+ * provide. IN_FIRST_MULTI(), below, must be called to initialize "step"
+ * and get the first record. Both macros return a NULL "inm" when there
+ * are no remaining records.
+ */
+#define IN_NEXT_MULTI(step, inm) \
+ /* struct in_multistep step; */ \
+ /* struct in_multi *inm; */ \
+do { \
+ if (((inm) = (step).i_inm) != NULL) \
+ (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \
+} while(0)
+
+#define IN_FIRST_MULTI(step, inm) \
+ /* struct in_multistep step; */ \
+ /* struct in_multi *inm; */ \
+do { \
+ (step).i_inm = LIST_FIRST(&in_multihead); \
+ IN_NEXT_MULTI((step), (inm)); \
+} while(0)
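+
+/*
+ * A minimal usage sketch for the two stepping macros above (illustrative
+ * only; the function is not part of the original header).  It walks every
+ * membership record in the system, here simply counting them.
+ */
+static __inline int
+in_multi_count_example(void)
+{
+	struct in_multistep step;
+	struct in_multi *inm;
+	int count = 0;
+
+	IN_FIRST_MULTI(step, inm);
+	while (inm != NULL) {
+		count++;
+		IN_NEXT_MULTI(step, inm);
+	}
+	return (count);
+}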
+
+struct route;
+struct in_multi *in_addmulti(struct in_addr *, struct ifnet *);
+void in_delmulti(struct in_multi *);
+int in_control(struct socket *, u_long, caddr_t, struct ifnet *,
+ struct thread *);
+void in_rtqdrain(void);
+void ip_input(struct mbuf *);
+int in_ifadown(struct ifaddr *ifa, int);
+void in_ifscrub(struct ifnet *, struct in_ifaddr *);
+int ipflow_fastforward(struct mbuf *);
+void ipflow_create(const struct route *, struct mbuf *);
+void ipflow_slowtimo(void);
+
+#endif /* _KERNEL */
+
+/* INET6 stuff */
+#include <netinet6/in6_var.h>
+
+#endif /* _NETINET_IN_VAR_H_ */
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
new file mode 100644
index 0000000..7bb0988
--- /dev/null
+++ b/sys/netinet/ip.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip.h 8.2 (Berkeley) 6/1/94
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_H_
+#define _NETINET_IP_H_
+
+/*
+ * Definitions for internet protocol version 4.
+ * Per RFC 791, September 1981.
+ */
+#define IPVERSION 4
+
+/*
+ * Structure of an internet header, naked of options.
+ */
+struct ip {
+#ifdef _IP_VHL
+ u_char ip_vhl; /* version << 4 | header length >> 2 */
+#else
+#if BYTE_ORDER == LITTLE_ENDIAN
+ u_int ip_hl:4, /* header length */
+ ip_v:4; /* version */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+ u_int ip_v:4, /* version */
+ ip_hl:4; /* header length */
+#endif
+#endif /* not _IP_VHL */
+ u_char ip_tos; /* type of service */
+ u_short ip_len; /* total length */
+ u_short ip_id; /* identification */
+ u_short ip_off; /* fragment offset field */
+#define IP_RF 0x8000 /* reserved fragment flag */
+#define IP_DF 0x4000 /* dont fragment flag */
+#define IP_MF 0x2000 /* more fragments flag */
+#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */
+ u_char ip_ttl; /* time to live */
+ u_char ip_p; /* protocol */
+ u_short ip_sum; /* checksum */
+ struct in_addr ip_src,ip_dst; /* source and dest address */
+};
+
+#ifdef _IP_VHL
+#define IP_MAKE_VHL(v, hl) ((v) << 4 | (hl))
+#define IP_VHL_HL(vhl) ((vhl) & 0x0f)
+#define IP_VHL_V(vhl) ((vhl) >> 4)
+#define IP_VHL_BORING 0x45
+#endif
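+
+/*
+ * Usage sketch for the combined field (illustrative, not part of the
+ * original header): the version sits in the high nibble and the header
+ * length, counted in 32-bit words, in the low nibble, so IP_VHL_BORING
+ * (0x45) is a plain 20-byte IPv4 header with no options:
+ *
+ *	hlen = IP_VHL_HL(ip->ip_vhl) << 2;	header length in bytes
+ *	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
+ */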
+
+#define IP_MAXPACKET 65535 /* maximum packet size */
+
+/*
+ * Definitions for IP type of service (ip_tos)
+ */
+#define IPTOS_LOWDELAY 0x10
+#define IPTOS_THROUGHPUT 0x08
+#define IPTOS_RELIABILITY 0x04
+#define IPTOS_MINCOST 0x02
+/* ECN bits proposed by Sally Floyd */
+#define IPTOS_CE 0x01 /* congestion experienced */
+#define IPTOS_ECT 0x02 /* ECN-capable transport */
+
+
+/*
+ * Definitions for IP precedence (also in ip_tos) (hopefully unused)
+ */
+#define IPTOS_PREC_NETCONTROL 0xe0
+#define IPTOS_PREC_INTERNETCONTROL 0xc0
+#define IPTOS_PREC_CRITIC_ECP 0xa0
+#define IPTOS_PREC_FLASHOVERRIDE 0x80
+#define IPTOS_PREC_FLASH 0x60
+#define IPTOS_PREC_IMMEDIATE 0x40
+#define IPTOS_PREC_PRIORITY 0x20
+#define IPTOS_PREC_ROUTINE 0x00
+
+/*
+ * Definitions for options.
+ */
+#define IPOPT_COPIED(o) ((o)&0x80)
+#define IPOPT_CLASS(o) ((o)&0x60)
+#define IPOPT_NUMBER(o) ((o)&0x1f)
+
+#define IPOPT_CONTROL 0x00
+#define IPOPT_RESERVED1 0x20
+#define IPOPT_DEBMEAS 0x40
+#define IPOPT_RESERVED2 0x60
+
+#define IPOPT_EOL 0 /* end of option list */
+#define IPOPT_NOP 1 /* no operation */
+
+#define IPOPT_RR 7 /* record packet route */
+#define IPOPT_TS 68 /* timestamp */
+#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */
+#define IPOPT_LSRR 131 /* loose source route */
+#define IPOPT_ESO 133 /* extended security */
+#define	IPOPT_CIPSO		134		/* commercial security */
+#define IPOPT_SATID 136 /* satnet id */
+#define IPOPT_SSRR 137 /* strict source route */
+#define IPOPT_RA 148 /* router alert */
+
+/*
+ * Offsets to fields in options other than EOL and NOP.
+ */
+#define IPOPT_OPTVAL 0 /* option ID */
+#define IPOPT_OLEN 1 /* option length */
+#define IPOPT_OFFSET 2 /* offset within option */
+#define IPOPT_MINOFF 4 /* min value of above */
+
+/*
+ * Time stamp option structure.
+ */
+struct ip_timestamp {
+ u_char ipt_code; /* IPOPT_TS */
+ u_char ipt_len; /* size of structure (variable) */
+ u_char ipt_ptr; /* index of current entry */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ u_int ipt_flg:4, /* flags, see below */
+ ipt_oflw:4; /* overflow counter */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+ u_int ipt_oflw:4, /* overflow counter */
+ ipt_flg:4; /* flags, see below */
+#endif
+ union ipt_timestamp {
+ n_long ipt_time[1];
+ struct ipt_ta {
+ struct in_addr ipt_addr;
+ n_long ipt_time;
+ } ipt_ta[1];
+ } ipt_timestamp;
+};
+
+/* flag bits for ipt_flg */
+#define IPOPT_TS_TSONLY 0 /* timestamps only */
+#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */
+#define IPOPT_TS_PRESPEC 3 /* specified modules only */
+
+/* bits for security (not byte swapped) */
+#define IPOPT_SECUR_UNCLASS 0x0000
+#define IPOPT_SECUR_CONFID 0xf135
+#define IPOPT_SECUR_EFTO 0x789a
+#define IPOPT_SECUR_MMMM 0xbc4d
+#define IPOPT_SECUR_RESTR 0xaf13
+#define IPOPT_SECUR_SECRET 0xd788
+#define IPOPT_SECUR_TOPSECRET 0x6bc5
+
+/*
+ * Internet implementation parameters.
+ */
+#define MAXTTL 255 /* maximum time to live (seconds) */
+#define IPDEFTTL 64 /* default ttl, from RFC 1340 */
+#define IPFRAGTTL 60 /* time to live for frags, slowhz */
+#define IPTTLDEC 1 /* subtracted when forwarding */
+
+#define IP_MSS 576 /* default maximum segment size */
+
+#endif
diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h
new file mode 100644
index 0000000..ec2c216
--- /dev/null
+++ b/sys/netinet/ip6.h
@@ -0,0 +1,308 @@
+/* $FreeBSD$ */
+/* $KAME: ip6.h,v 1.18 2001/03/29 05:34:30 itojun Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip.h 8.1 (Berkeley) 6/10/93
+ */
+
+#ifndef _NETINET_IP6_H_
+#define _NETINET_IP6_H_
+
+/*
+ * Definition for internet protocol version 6.
+ * RFC 2460
+ */
+
+struct ip6_hdr {
+ union {
+ struct ip6_hdrctl {
+ u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */
+ u_int16_t ip6_un1_plen; /* payload length */
+ u_int8_t ip6_un1_nxt; /* next header */
+ u_int8_t ip6_un1_hlim; /* hop limit */
+ } ip6_un1;
+ u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */
+ } ip6_ctlun;
+ struct in6_addr ip6_src; /* source address */
+ struct in6_addr ip6_dst; /* destination address */
+} __attribute__((__packed__));
+
+#define ip6_vfc ip6_ctlun.ip6_un2_vfc
+#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow
+#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen
+#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt
+#define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim
+#define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim
+
+#define IPV6_VERSION 0x60
+#define IPV6_VERSION_MASK 0xf0
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */
+#define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */
+#else
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */
+#define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */
+#endif /* LITTLE_ENDIAN */
+#endif
+#if 1
+/* ECN bits proposed by Sally Floyd */
+#define IP6TOS_CE 0x01 /* congestion experienced */
+#define IP6TOS_ECT 0x02 /* ECN-capable transport */
+#endif
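+
+/*
+ * Usage sketch (illustrative, not part of the original header).  Both the
+ * version check and the flow-label extraction operate on the raw, network
+ * byte order words, which is why the masks above are byte-swapped on
+ * little-endian machines:
+ *
+ *	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
+ *		drop the packet, it is not IPv6;
+ *	flowlabel = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;   still in net order
+ */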
+
+/*
+ * Extension Headers
+ */
+
+struct ip6_ext {
+ u_int8_t ip6e_nxt;
+ u_int8_t ip6e_len;
+} __attribute__((__packed__));
+
+/* Hop-by-Hop options header */
+/* XXX should we pad it to force alignment on an 8-byte boundary? */
+struct ip6_hbh {
+ u_int8_t ip6h_nxt; /* next header */
+ u_int8_t ip6h_len; /* length in units of 8 octets */
+ /* followed by options */
+} __attribute__((__packed__));
+
+/* Destination options header */
+/* XXX should we pad it to force alignment on an 8-byte boundary? */
+struct ip6_dest {
+ u_int8_t ip6d_nxt; /* next header */
+ u_int8_t ip6d_len; /* length in units of 8 octets */
+ /* followed by options */
+} __attribute__((__packed__));
+
+/* Option types and related macros */
+#define IP6OPT_PAD1 0x00 /* 00 0 00000 */
+#define IP6OPT_PADN 0x01 /* 00 0 00001 */
+#define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */
+#define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */
+#define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */
+#define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */
+
+#define IP6OPT_RTALERT_LEN 4
+#define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */
+#define IP6OPT_RTALERT_RSVP 1 /* Datagram contains an RSVP message */
+#define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */
+#define IP6OPT_MINLEN 2
+
+#define IP6OPT_BINDING_UPDATE 0xc6 /* 11 0 00110 */
+#define IP6OPT_BINDING_ACK 0x07 /* 00 0 00111 */
+#define IP6OPT_BINDING_REQ 0x08 /* 00 0 01000 */
+#define IP6OPT_HOME_ADDRESS 0xc9 /* 11 0 01001 */
+#define IP6OPT_EID 0x8a /* 10 0 01010 */
+
+#define IP6OPT_TYPE(o) ((o) & 0xC0)
+#define IP6OPT_TYPE_SKIP 0x00
+#define IP6OPT_TYPE_DISCARD 0x40
+#define IP6OPT_TYPE_FORCEICMP 0x80
+#define IP6OPT_TYPE_ICMP 0xC0
+
+#define IP6OPT_MUTABLE 0x20
+
+#define IP6OPT_JUMBO_LEN 6
+
+/* Routing header */
+struct ip6_rthdr {
+ u_int8_t ip6r_nxt; /* next header */
+ u_int8_t ip6r_len; /* length in units of 8 octets */
+ u_int8_t ip6r_type; /* routing type */
+ u_int8_t ip6r_segleft; /* segments left */
+ /* followed by routing type specific data */
+} __attribute__((__packed__));
+
+/* Type 0 Routing header */
+struct ip6_rthdr0 {
+ u_int8_t ip6r0_nxt; /* next header */
+ u_int8_t ip6r0_len; /* length in units of 8 octets */
+ u_int8_t ip6r0_type; /* always zero */
+ u_int8_t ip6r0_segleft; /* segments left */
+ u_int8_t ip6r0_reserved; /* reserved field */
+ u_int8_t ip6r0_slmap[3]; /* strict/loose bit map */
+ struct in6_addr ip6r0_addr[1]; /* up to 23 addresses */
+} __attribute__((__packed__));
+
+/* Fragment header */
+struct ip6_frag {
+ u_int8_t ip6f_nxt; /* next header */
+ u_int8_t ip6f_reserved; /* reserved field */
+ u_int16_t ip6f_offlg; /* offset, reserved, and flag */
+ u_int32_t ip6f_ident; /* identification */
+} __attribute__((__packed__));
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */
+#define IP6F_RESERVED_MASK 0x0006 /* reserved bits in ip6f_offlg */
+#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */
+#else /* BYTE_ORDER == LITTLE_ENDIAN */
+#define IP6F_OFF_MASK 0xf8ff /* mask out offset from _offlg */
+#define IP6F_RESERVED_MASK 0x0600 /* reserved bits in ip6f_offlg */
+#define IP6F_MORE_FRAG 0x0100 /* more-fragments flag */
+#endif /* BYTE_ORDER == LITTLE_ENDIAN */
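+
+/*
+ * Usage sketch for the fragment header (illustrative only).  Because the
+ * fragment offset occupies the top 13 bits of ip6f_offlg, masking and then
+ * byte-swapping yields the offset already scaled to bytes:
+ *
+ *	offset = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);   offset in bytes
+ *	more   = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) != 0;
+ */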
+
+/*
+ * Internet implementation parameters.
+ */
+#define IPV6_MAXHLIM	255	/* maximum hop limit */
+#define IPV6_DEFHLIM	64	/* default hop limit */
+#define IPV6_FRAGTTL	120	/* ttl for fragment packets, in slowtimo ticks */
+#define IPV6_HLIMDEC	1	/* subtracted when forwarding */
+
+#define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */
+#define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/
+
+#ifdef _KERNEL
+/*
+ * IP6_EXTHDR_CHECK ensures that the region between the IPv6 header and the
+ * target header (including the IPv6 header itself, extension headers and
+ * TCP/UDP/ICMP6 headers) is contiguous.  KAME requires drivers
+ * to store incoming data in one internal mbuf or in one or more external
+ * mbufs (never in two or more internal mbufs).  Thus, the third case is
+ * never supposed to match, but it is handled just in case.
+ */
+
+#define IP6_EXTHDR_CHECK(m, off, hlen, ret) \
+do { \
+ if ((m)->m_next != NULL) { \
+ if (((m)->m_flags & M_LOOP) && \
+ ((m)->m_len < (off) + (hlen)) && \
+ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \
+ ip6stat.ip6s_exthdrtoolong++; \
+ return ret; \
+ } else if ((m)->m_flags & M_EXT) { \
+ if ((m)->m_len < (off) + (hlen)) { \
+ ip6stat.ip6s_exthdrtoolong++; \
+ m_freem(m); \
+ return ret; \
+ } \
+ } else { \
+ if ((m)->m_len < (off) + (hlen)) { \
+ ip6stat.ip6s_exthdrtoolong++; \
+ m_freem(m); \
+ return ret; \
+ } \
+ } \
+ } else { \
+ if ((m)->m_len < (off) + (hlen)) { \
+ ip6stat.ip6s_tooshort++; \
+ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \
+ m_freem(m); \
+ return ret; \
+ } \
+ } \
+} while (0)
+
+/*
+ * IP6_EXTHDR_GET ensures that the intermediate protocol header (starting
+ * at "off", for "len" bytes) is located in a single mbuf, in a contiguous
+ * memory region.  A pointer to the region is returned in the pointer
+ * variable "val", with type "typ".
+ * IP6_EXTHDR_GET0 does the same, except that it aligns the structure at the
+ * very top of the mbuf; GET0 is therefore more likely to copy memory than GET.
+ *
+ * XXX we're now testing this, needs m_pulldown()
+ */
+#define IP6_EXTHDR_GET(val, typ, m, off, len) \
+do { \
+ struct mbuf *t; \
+ int tmp; \
+ if ((m)->m_len >= (off) + (len)) \
+ (val) = (typ)(mtod((m), caddr_t) + (off)); \
+ else { \
+ t = m_pulldown((m), (off), (len), &tmp); \
+ if (t) { \
+ if (t->m_len < tmp + (len)) \
+ panic("m_pulldown malfunction"); \
+ (val) = (typ)(mtod(t, caddr_t) + tmp); \
+ } else { \
+ (val) = (typ)NULL; \
+ (m) = NULL; \
+ } \
+ } \
+} while (0)
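+
+/*
+ * Typical calling sketch (illustrative; the names are not from this file).
+ * When m_pulldown() fails it frees the mbuf chain, and the macro then
+ * clears both "val" and "m", so the caller simply bails out:
+ *
+ *	struct ip6_frag *ip6f;
+ *
+ *	IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, off, sizeof(*ip6f));
+ *	if (ip6f == NULL)
+ *		return (IPPROTO_DONE);		the chain is already freed
+ */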
+
+#define IP6_EXTHDR_GET0(val, typ, m, off, len) \
+do { \
+ struct mbuf *t; \
+ if ((off) == 0) \
+ (val) = (typ)mtod(m, caddr_t); \
+ else { \
+ t = m_pulldown((m), (off), (len), NULL); \
+ if (t) { \
+ if (t->m_len < (len)) \
+ panic("m_pulldown malfunction"); \
+ (val) = (typ)mtod(t, caddr_t); \
+ } else { \
+ (val) = (typ)NULL; \
+ (m) = NULL; \
+ } \
+ } \
+} while (0)
+#endif /*_KERNEL*/
+
+#endif /* not _NETINET_IP6_H_ */
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
new file mode 100644
index 0000000..50e939b
--- /dev/null
+++ b/sys/netinet/ip_divert.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipsec.h"
+
+#ifndef INET
+#error "IPDIVERT requires INET."
+#endif
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+
+/*
+ * Divert sockets
+ */
+
+/*
+ * Allocate enough space to hold a full IP packet
+ */
+#define DIVSNDQ (65536 + 100)
+#define DIVRCVQ (65536 + 100)
+
+/*
+ * Divert sockets work in conjunction with ipfw, see the divert(4)
+ * manpage for features.
+ * Internally, packets selected by ipfw in ip_input() or ip_output(),
+ * and never diverted before, are passed to the input queue of the
+ * divert socket with a given 'divert_port' number (as specified in
+ * the matching ipfw rule), and they are tagged with a 16 bit cookie
+ * (representing the rule number of the matching ipfw rule), which
+ * is passed to the process reading from the socket.
+ *
+ * Packets written to the divert socket are again tagged with a cookie
+ * (usually the same as above) and a destination address.
+ * If the destination address is INADDR_ANY then the packet is
+ * treated as outgoing and sent to ip_output(), otherwise it is
+ * treated as incoming and sent to ip_input().
+ * In both cases, the packet is tagged with the cookie.
+ *
+ * On reinjection, processing in ip_input() and ip_output()
+ * will be exactly the same as for the original packet, except that
+ * ipfw processing will start at the rule number after the one
+ * written in the cookie (so, tagging a packet with a cookie of 0
+ * will cause it to be effectively considered as a standard packet).
+ */
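+
+/*
+ * Rough userland sketch of the usage described above (illustrative only,
+ * error handling omitted; 8668 is an arbitrary port that must match an
+ * ipfw "divert 8668 ..." rule):
+ *
+ *	int fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
+ *	struct sockaddr_in sin = { sizeof(sin), AF_INET };
+ *	sin.sin_port = htons(8668);
+ *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
+ *
+ *	n = recvfrom(fd, buf, sizeof(buf), 0,
+ *	    (struct sockaddr *)&sin, &sinlen);	sin now carries the tag info
+ *	... inspect or rewrite the IP packet in buf ...
+ *	sendto(fd, buf, n, 0, (struct sockaddr *)&sin, sinlen);	reinject
+ */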
+
+/* Internal variables */
+static struct inpcbhead divcb;
+static struct inpcbinfo divcbinfo;
+
+static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
+static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
+
+/* Optimization: have this preinitialized */
+static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET };
+
+/*
+ * Initialize divert connection block queue.
+ */
+void
+div_init(void)
+{
+ INP_INFO_LOCK_INIT(&divcbinfo, "div");
+ LIST_INIT(&divcb);
+ divcbinfo.listhead = &divcb;
+ /*
+ * XXX We don't use the hash list for divert IP, but it's easier
+ * to allocate a one entry hash list than it is to check all
+ * over the place for hashbase == NULL.
+ */
+ divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask);
+ divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask);
+ divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(divcbinfo.ipi_zone, maxsockets);
+}
+
+/*
+ * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets
+ * with that protocol number to enter the system from the outside.
+ */
+void
+div_input(struct mbuf *m, int off)
+{
+ ipstat.ips_noproto++;
+ m_freem(m);
+}
+
+/*
+ * Divert a packet by passing it up to the divert socket at port 'port'.
+ *
+ * Setup generic address and protocol structures for div_input routine,
+ * then pass them along with mbuf chain.
+ */
+void
+divert_packet(struct mbuf *m, int incoming, int port, int rule)
+{
+ struct ip *ip;
+ struct inpcb *inp;
+ struct socket *sa;
+ u_int16_t nport;
+
+ /* Sanity check */
+ KASSERT(port != 0, ("%s: port=0", __func__));
+
+ divsrc.sin_port = rule; /* record matching rule */
+
+ /* Assure header */
+ if (m->m_len < sizeof(struct ip) &&
+ (m = m_pullup(m, sizeof(struct ip))) == 0)
+ return;
+ ip = mtod(m, struct ip *);
+
+ /*
+ * Record receive interface address, if any.
+ * But only for incoming packets.
+ */
+ divsrc.sin_addr.s_addr = 0;
+ if (incoming) {
+ struct ifaddr *ifa;
+
+ /* Sanity check */
+ KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __func__));
+
+ /* Find IP address for receive interface */
+ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL)
+ continue;
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ divsrc.sin_addr =
+ ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
+ break;
+ }
+ }
+ /*
+ * Record the incoming interface name whenever we have one.
+ */
+ bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero));
+ if (m->m_pkthdr.rcvif) {
+ /*
+ * Hide the actual interface name in there in the
+ * sin_zero array. XXX This needs to be moved to a
+ * different sockaddr type for divert, e.g.
+ * sockaddr_div with multiple fields like
+ * sockaddr_dl. Presently we have only 7 bytes
+		 * but that will do for now, as most interface names
+		 * are 4 or fewer characters plus 2 or fewer for the unit.
+ * There is probably a faster way of doing this,
+ * possibly taking it from the sockaddr_dl on the iface.
+ * This solves the problem of a P2P link and a LAN interface
+ * having the same address, which can result in the wrong
+ * interface being assigned to the packet when fed back
+ * into the divert socket. Theoretically if the daemon saves
+ * and re-uses the sockaddr_in as suggested in the man pages,
+ * this iface name will come along for the ride.
+ * (see div_output for the other half of this.)
+ */
+ snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero),
+ "%s%d", m->m_pkthdr.rcvif->if_name,
+ m->m_pkthdr.rcvif->if_unit);
+ }
+
+ /* Put packet on socket queue, if any */
+ sa = NULL;
+ nport = htons((u_int16_t)port);
+ LIST_FOREACH(inp, &divcb, inp_list) {
+ if (inp->inp_lport == nport)
+ sa = inp->inp_socket;
+ }
+ if (sa) {
+ if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc,
+ m, (struct mbuf *)0) == 0)
+ m_freem(m);
+ else
+ sorwakeup(sa);
+ } else {
+ m_freem(m);
+ ipstat.ips_noproto++;
+ ipstat.ips_delivered--;
+ }
+}
+
+/*
+ * Deliver packet back into the IP processing machinery.
+ *
+ * If no address specified, or address is 0.0.0.0, send to ip_output();
+ * otherwise, send to ip_input() and mark as having been received on
+ * the interface with that address.
+ */
+static int
+div_output(struct socket *so, struct mbuf *m,
+ struct sockaddr_in *sin, struct mbuf *control)
+{
+ int error = 0;
+ struct m_hdr divert_tag;
+
+ /*
+ * Prepare the tag for divert info. Note that a packet
+ * with a 0 tag in mh_data is effectively untagged,
+ * so we could optimize that case.
+ */
+ divert_tag.mh_type = MT_TAG;
+ divert_tag.mh_flags = PACKET_TAG_DIVERT;
+ divert_tag.mh_next = m;
+ divert_tag.mh_data = 0; /* the matching rule # */
+ m->m_pkthdr.rcvif = NULL; /* XXX is it necessary ? */
+
+ if (control)
+ m_freem(control); /* XXX */
+
+ /* Loopback avoidance and state recovery */
+ if (sin) {
+ int i;
+
+ divert_tag.mh_data = (caddr_t)(int)sin->sin_port;
+ /*
+ * Find receive interface with the given name, stuffed
+ * (if it exists) in the sin_zero[] field.
+ * The name is user supplied data so don't trust its size
+ * or that it is zero terminated.
+ */
+		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
+			;
+		if (i > 0 && i < sizeof(sin->sin_zero))
+ m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
+ }
+
+ /* Reinject packet into the system as incoming or outgoing */
+ if (!sin || sin->sin_addr.s_addr == 0) {
+ struct inpcb *const inp = sotoinpcb(so);
+ struct ip *const ip = mtod(m, struct ip *);
+
+ /*
+		 * Don't allow both in-packet (user supplied) and setsockopt
+		 * IP options, and don't allow packet lengths larger than the
+		 * mbuf chain, which would crash
+		 */
+ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
+ ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
+ error = EINVAL;
+ goto cantsend;
+ }
+
+ /* Convert fields to host order for ip_output() */
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+
+ /* Send packet to output processing */
+ ipstat.ips_rawout++; /* XXX */
+ error = ip_output((struct mbuf *)&divert_tag,
+ inp->inp_options, &inp->inp_route,
+ (so->so_options & SO_DONTROUTE) |
+ IP_ALLOWBROADCAST | IP_RAWOUTPUT,
+ inp->inp_moptions);
+ } else {
+ if (m->m_pkthdr.rcvif == NULL) {
+ /*
+ * No luck with the name, check by IP address.
+ * Clear the port and the ifname to make sure
+ * there are no distractions for ifa_ifwithaddr.
+ */
+ struct ifaddr *ifa;
+
+ bzero(sin->sin_zero, sizeof(sin->sin_zero));
+ sin->sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *) sin);
+ if (ifa == NULL) {
+ error = EADDRNOTAVAIL;
+ goto cantsend;
+ }
+ m->m_pkthdr.rcvif = ifa->ifa_ifp;
+ }
+ /* Send packet to input processing */
+ ip_input((struct mbuf *)&divert_tag);
+ }
+
+ return error;
+
+cantsend:
+ m_freem(m);
+ return error;
+}
+
+static int
+div_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int error, s;
+
+ inp = sotoinpcb(so);
+ if (inp)
+ panic("div_attach");
+ if (td && (error = suser(td)) != 0)
+ return error;
+
+ error = soreserve(so, div_sendspace, div_recvspace);
+ if (error)
+ return error;
+ s = splnet();
+ error = in_pcballoc(so, &divcbinfo, td);
+ splx(s);
+ if (error)
+ return error;
+ inp = (struct inpcb *)so->so_pcb;
+ inp->inp_ip_p = proto;
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_flags |= INP_HDRINCL;
+ /* The socket is always "connected" because
+ we always know "where" to send the packet */
+ so->so_state |= SS_ISCONNECTED;
+ return 0;
+}
+
+static int
+div_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ if (inp == 0)
+ panic("div_detach");
+ in_pcbdetach(inp);
+ return 0;
+}
+
+static int
+div_abort(struct socket *so)
+{
+ soisdisconnected(so);
+ return div_detach(so);
+}
+
+static int
+div_disconnect(struct socket *so)
+{
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return ENOTCONN;
+ return div_abort(so);
+}
+
+static int
+div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int s;
+ int error;
+
+ s = splnet();
+ inp = sotoinpcb(so);
+	/* in_pcbbind assumes that nam is a sockaddr_in
+	 * and requires a valid address.  Since divert sockets
+	 * don't carry one, we need to make sure the address is
+	 * filled in properly.
+	 * XXX -- divert should not be abusing in_pcbbind
+ * and should probably have its own family.
+ */
+ if (nam->sa_family != AF_INET)
+ error = EAFNOSUPPORT;
+ else {
+ ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
+ error = in_pcbbind(inp, nam, td);
+ }
+ splx(s);
+ return error;
+}
+
+static int
+div_shutdown(struct socket *so)
+{
+ socantsendmore(so);
+ return 0;
+}
+
+static int
+div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ /* Packet must have a header (but that's about it) */
+ if (m->m_len < sizeof (struct ip) &&
+ (m = m_pullup(m, sizeof (struct ip))) == 0) {
+ ipstat.ips_toosmall++;
+ m_freem(m);
+ return EINVAL;
+ }
+
+ /* Send packet */
+ return div_output(so, m, (struct sockaddr_in *)nam, control);
+}
+
+static int
+div_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n, s;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = divcbinfo.ipi_count;
+ req->oldidx = 2 * (sizeof xig)
+ + (n + n/8) * sizeof(struct xinpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ s = splnet();
+ gencnt = divcbinfo.ipi_gencnt;
+ n = divcbinfo.ipi_count;
+ splx(s);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ s = splnet();
+ for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->td, inp))
+ inp_list[i++] = inp;
+ }
+ splx(s);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ }
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ s = splnet();
+ xig.xig_gen = divcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = divcbinfo.ipi_count;
+ splx(s);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+/*
+ * This is the wrapper function for in_setsockaddr.  We just pass down
+ * the pcbinfo for in_setsockaddr to lock.
+ */
+static int
+div_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setsockaddr(so, nam, &divcbinfo));
+}
+
+/*
+ * This is the wrapper function for in_setpeeraddr. We just pass down
+ * the pcbinfo for in_setpeeraddr to lock.
+ */
+static int
+div_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setpeeraddr(so, nam, &divcbinfo));
+}
+
+
+SYSCTL_DECL(_net_inet_divert);
+SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
+ div_pcblist, "S,xinpcb", "List of active divert sockets");
+
+struct pr_usrreqs div_usrreqs = {
+ div_abort, pru_accept_notsupp, div_attach, div_bind,
+ pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach,
+ div_disconnect, pru_listen_notsupp, div_peeraddr, pru_rcvd_notsupp,
+ pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown,
+ div_sockaddr, sosend, soreceive, sopoll
+};
diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c
new file mode 100644
index 0000000..6006b65
--- /dev/null
+++ b/sys/netinet/ip_dummynet.c
@@ -0,0 +1,1952 @@
+/*
+ * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ * + variable declarations;
+ * + heap management functions;
+ * + scheduler and dummynet functions;
+ * + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by splimp()/splx()
+ * pairs.  One would think that splnet() is enough, as it is for most of
+ * the netinet code, but it is not, because when used with
+ * bridging, dummynet is invoked at splimp().
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513: initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/queue.h> /* XXX */
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>
+
+#include <netinet/if_ether.h> /* for struct arpcom */
+#include <net/bridge.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ; /* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static int searches, search_steps ;
+static int pipe_expire = 1 ; /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static int red_lookup_depth = 256; /* RED - default lookup table depth */
+static int red_avg_pkt_size = 512; /* RED - default medium packet size */
+static int red_max_pkt_size = 1500; /* RED - default max packet size */
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+static int heap_init(struct dn_heap *h, int size) ;
+static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void heap_extract(struct dn_heap *h, void *obj);
+
+static void transmit_event(struct dn_pipe *pipe);
+static void ready_event(struct dn_flow_queue *q);
+
+static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
+static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
+
+static struct callout_handle dn_timeout;
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
+ CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+ CTLFLAG_RD, &curr_time, 0, "Current tick");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+ CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+ CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
+ CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+ CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+ CTLFLAG_RW, &dn_max_ratio, 0,
+ "Max ratio between dynamic queues and buckets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+ CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+ CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+ CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+#endif
+
+static int config_pipe(struct dn_pipe *p);
+static int ip_dn_ctl(struct sockopt *sopt);
+
+static void rt_unref(struct rtentry *);
+static void dummynet(void *);
+static void dummynet_flush(void);
+void dummynet_drain(void);
+static ip_dn_io_t dummynet_io;
+static void dn_rule_delete(void *);
+
+int if_tx_rdy(struct ifnet *ifp);
+
+/*
+ * ip_fw_chain_head is used when deleting a pipe, because ipfw rules can
+ * hold references to the pipe.
+ */
+extern LIST_HEAD (ip_fw_head, ip_fw) ip_fw_chain_head;
+
+static void
+rt_unref(struct rtentry *rt)
+{
+ if (rt == NULL)
+ return ;
+ if (rt->rt_refcnt <= 0)
+ printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt);
+ RTFREE(rt);
+}
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, the first node is element 0.  Children of i are 2i+1 and 2i+2.
+ * Some macros help find the parent/children so they can be optimized.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
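+
+/*
+ * A minimal sketch (illustrative only, never called) of how the heap
+ * routines below are used: keys are event times, the smallest key sits
+ * at p[0], and extracting with a NULL object pops that top element.
+ */
+static __inline void
+heap_usage_example(struct dn_heap *h, dn_key when, void *obj)
+{
+    heap_insert(h, when, obj);		/* grows the backing array on demand */
+    if (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, when))
+	heap_extract(h, NULL);		/* remove the earliest pending event */
+}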
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) {
+ printf("heap_init, Bogus call, have %d want %d\n",
+ h->size, new_size);
+ return 0 ;
+ }
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+ p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
+ if (p == NULL) {
+ printf(" heap_init, resize %d failed\n", new_size );
+ return 1 ; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_DUMMYNET);
+ }
+ h->p = p ;
+ h->size = new_size ;
+ return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+ if (heap->offset > 0) \
+ *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+ if (heap->offset > 0) \
+ *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+ int son = h->elements ;
+
+ if (p == NULL) /* data already there, set starting point */
+ son = key1 ;
+ else { /* insert new element at the end, possibly resize */
+ son = h->elements ;
+ if (son == h->size) /* need resize... */
+ if (heap_init(h, h->elements+1) )
+ return 1 ; /* failure... */
+ h->p[son].object = p ;
+ h->p[son].key = key1 ;
+ h->elements++ ;
+ }
+ while (son > 0) { /* bubble up */
+ int father = HEAP_FATHER(son) ;
+ struct dn_heap_entry tmp ;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break ; /* found right position */
+ /* son smaller than father, swap and repeat */
+ HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+ SET_OFFSET(h, son);
+ son = father ;
+ }
+ SET_OFFSET(h, son);
+ return 0 ;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+ int child, father, max = h->elements - 1 ;
+
+ if (max < 0) {
+ printf("warning, extract from empty heap 0x%p\n", h);
+ return ;
+ }
+ father = 0 ; /* default: move up smallest child */
+ if (obj != NULL) { /* extract specific element, index is at offset */
+ if (h->offset <= 0)
+ panic("*** heap_extract from middle not supported on this heap!!!\n");
+ father = *((int *)((char *)obj + h->offset)) ;
+ if (father < 0 || father >= h->elements) {
+ printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+ father, h->elements);
+ panic("heap_extract");
+ }
+ }
+ RESET_OFFSET(h, father);
+ child = HEAP_LEFT(father) ; /* left child */
+ while (child <= max) { /* valid entry */
+ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child = child+1 ; /* take right child, otherwise left */
+ h->p[father] = h->p[child] ;
+ SET_OFFSET(h, father);
+ father = child ;
+ child = HEAP_LEFT(child) ; /* left child for next loop */
+ }
+ h->elements-- ;
+ if (father != max) {
+ /*
+ * Fill hole with last entry and bubble up, reusing the insert code
+ */
+ h->p[father] = h->p[max] ;
+ heap_insert(h, father, NULL); /* this one cannot fail */
+ }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+ int temp;
+ int i ;
+ int max = h->elements-1 ;
+ struct dn_heap_entry buf ;
+
+ if (h->offset <= 0)
+ panic("cannot move items on this heap");
+
+ i = *((int *)((char *)object + h->offset));
+ if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+ h->p[i].key = new_key ;
+ for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+ i = temp ) { /* bubble up */
+ HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+ SET_OFFSET(h, i);
+ }
+ } else { /* must move down */
+ h->p[i].key = new_key ;
+ while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+ if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+ temp++ ; /* select child with min key */
+ if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+ HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+ SET_OFFSET(h, i);
+ } else
+ break ;
+ i = temp ;
+ }
+ }
+ SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int i ;
+
+ for (i = 0 ; i < h->elements ; i++ )
+ heap_insert(h, i , NULL) ;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+ if (h->size >0 )
+ free(h->p, M_DUMMYNET);
+ bzero(h, sizeof(*h) );
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with fixed-rate queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * wfq_ready_event() does something similar with WF2Q queues, and the
+ * event handled is the start time of the head pkt.
+ *
+ * In all cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
+static void
+transmit_event(struct dn_pipe *pipe)
+{
+ struct dn_pkt *pkt ;
+
+ while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) {
+ /*
+ * first unlink, then call procedures, since ip_input() can invoke
+ * ip_output() and vice versa, thus causing nested calls
+ */
+ pipe->head = DN_NEXT(pkt) ;
+
+ /*
+ * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf
+ * (NOT A REAL one, just a small block of malloc'ed memory) with
+ * m_type = MT_TAG, m_flags = PACKET_TAG_DUMMYNET
+ * dn_m (m_next) = actual mbuf to be processed by ip_input/output
+ * and some other fields.
+ * The block IS FREED HERE because it contains parameters passed
+ * to the called routine.
+ */
+ switch (pkt->dn_dir) {
+ case DN_TO_IP_OUT:
+ (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL);
+ rt_unref (pkt->ro.ro_rt) ;
+ break ;
+
+ case DN_TO_IP_IN :
+ ip_input((struct mbuf *)pkt) ;
+ break ;
+
+ case DN_TO_BDG_FWD :
+ if (!BDG_LOADED) {
+ /* somebody unloaded the bridge module. Drop pkt */
+ printf("-- dropping bridged packet trapped in pipe--\n");
+ m_freem(pkt->dn_m);
+ break;
+ } /* fallthrough */
+ case DN_TO_ETH_DEMUX:
+ {
+ struct mbuf *m = (struct mbuf *)pkt ;
+ struct ether_header *eh;
+
+ if (pkt->dn_m->m_len < ETHER_HDR_LEN &&
+ (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) {
+ printf("dummynet/bridge: pullup fail, dropping pkt\n");
+ break;
+ }
+ /*
+ * same as ether_input, make eh be a pointer into the mbuf
+ */
+ eh = mtod(pkt->dn_m, struct ether_header *);
+ m_adj(pkt->dn_m, ETHER_HDR_LEN);
+ /*
+ * bdg_forward() wants a pointer to the pseudo-mbuf-header, but
+ * on return it will supply the pointer to the actual packet
+ * (originally pkt->dn_m, but could be something else now) if
+ * it has not consumed it.
+ */
+ if (pkt->dn_dir == DN_TO_BDG_FWD) {
+ m = bdg_forward_ptr(m, eh, pkt->ifp);
+ if (m)
+ m_freem(m);
+ } else
+ ether_demux(NULL, eh, m); /* which consumes the mbuf */
+ }
+ break ;
+ case DN_TO_ETH_OUT:
+ ether_output_frame(pkt->ifp, (struct mbuf *)pkt);
+ break;
+
+ default:
+ printf("dummynet: bad switch %d!\n", pkt->dn_dir);
+ m_freem(pkt->dn_m);
+ break ;
+ }
+ free(pkt, M_DUMMYNET);
+ }
+ /* if there are leftover packets, put into the heap for next event */
+ if ( (pkt = pipe->head) )
+ heap_insert(&extract_heap, pkt->output_time, pipe ) ;
+ /* XXX should check errors on heap_insert, by draining the
+ * whole pipe p and hoping in the future we are more successful
+ */
+}
+
+/*
+ * the following macro computes how many ticks we have to wait
+ * before being able to transmit a packet. The credit is taken from
+ * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
+ */
+#define SET_TICKS(pkt, q, p) \
+ (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \
+ p->bandwidth ;
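+
+/*
+ * Worked example (the numbers are assumptions, not defaults): with
+ * hz = 1000, a pipe configured at bandwidth = 1000000 bit/s and a
+ * 1500-byte packet, len*8*hz = 12000000. With no leftover credit
+ * (numbytes = 0) the macro yields ceil(12000000 / 1000000) = 12 ticks,
+ * i.e. the 12 ms needed to clock 12000 bits through a 1 Mbit/s link.
+ */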
+
+/*
+ * extract pkt from queue, compute output time (could be now)
+ * and put into the delay line of the pipe p
+ */
+static void
+move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q,
+ struct dn_pipe *p, int len)
+{
+ q->head = DN_NEXT(pkt) ;
+ q->len-- ;
+ q->len_bytes -= len ;
+
+ pkt->output_time = curr_time + p->delay ;
+
+ if (p->head == NULL)
+ p->head = pkt;
+ else
+ DN_NEXT(p->tail) = pkt;
+ p->tail = pkt;
+ DN_NEXT(p->tail) = NULL;
+}
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then,
+ * if there are leftover packets, reinsert the queue in the scheduler.
+ */
+static void
+ready_event(struct dn_flow_queue *q)
+{
+ struct dn_pkt *pkt;
+ struct dn_pipe *p = q->fs->pipe ;
+ int p_was_empty ;
+
+ if (p == NULL) {
+ printf("ready_event- pipe is gone\n");
+ return ;
+ }
+ p_was_empty = (p->head == NULL) ;
+
+ /*
+ * schedule fixed-rate queues linked to this pipe:
+ * Account for the bw accumulated since last scheduling, then
+ * drain as many pkts as allowed by q->numbytes and move to
+ * the delay line (in p) computing output time.
+ * bandwidth==0 (no limit) means we can drain the whole queue,
+ * setting len_scaled = 0 does the job.
+ */
+ q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
+ while ( (pkt = q->head) != NULL ) {
+ int len = pkt->dn_m->m_pkthdr.len;
+ int len_scaled = p->bandwidth ? len*8*hz : 0 ;
+ if (len_scaled > q->numbytes )
+ break ;
+ q->numbytes -= len_scaled ;
+ move_pkt(pkt, q, p, len);
+ }
+ /*
+ * If we have more packets queued, schedule next ready event
+ * (can only occur when bandwidth != 0, otherwise we would have
+ * flushed the whole queue in the previous loop).
+ * To this purpose we record the current time and compute how many
+ * ticks to go for the finish time of the packet.
+ */
+ if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
+ dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
+ q->sched_time = curr_time ;
+ heap_insert(&ready_heap, curr_time + t, (void *)q );
+ /* XXX should check errors on heap_insert, and drain the whole
+ * queue on error hoping next time we are luckier.
+ */
+ } else /* RED needs to know when the queue becomes empty */
+ q->q_time = curr_time;
+ /*
+ * If the delay line was empty call transmit_event(p) now.
+ * Otherwise, the scheduler will take care of it.
+ */
+ if (p_was_empty)
+ transmit_event(p);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes<0,
+ * there is an additional delay.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p)
+{
+ int p_was_empty = (p->head == NULL) ;
+ struct dn_heap *sch = &(p->scheduler_heap);
+ struct dn_heap *neh = &(p->not_eligible_heap) ;
+
+ if (p->if_name[0] == 0) /* tx clock is simulated */
+ p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
+ else { /* tx clock is for real, the ifq must be empty or this is a NOP */
+ if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+ return ;
+ else {
+ DEB(printf("pipe %d ready from %s --\n",
+ p->pipe_nr, p->if_name);)
+ }
+ }
+
+ /*
+ * While we have backlogged traffic AND credit, we need to do
+ * something on the queue.
+ */
+ while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
+ if (sch->elements > 0) { /* have some eligible pkts to send out */
+ struct dn_flow_queue *q = sch->p[0].object ;
+ struct dn_pkt *pkt = q->head;
+ struct dn_flow_set *fs = q->fs;
+ u_int64_t len = pkt->dn_m->m_pkthdr.len;
+ int len_scaled = p->bandwidth ? len*8*hz : 0 ;
+
+ heap_extract(sch, NULL); /* remove queue from heap */
+ p->numbytes -= len_scaled ;
+ move_pkt(pkt, q, p, len);
+
+ p->V += (len<<MY_M) / p->sum ; /* update V */
+ q->S = q->F ; /* update start time */
+ if (q->len == 0) { /* Flow not backlogged any more */
+ fs->backlogged-- ;
+ heap_insert(&(p->idle_heap), q->F, q);
+ } else { /* still backlogged */
+ /*
+ * update F and position in backlogged queue, then
+ * put flow in not_eligible_heap (we will fix this later).
+ */
+ len = (q->head)->dn_m->m_pkthdr.len;
+ q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
+ if (DN_KEY_LEQ(q->S, p->V))
+ heap_insert(neh, q->S, q);
+ else
+ heap_insert(sch, q->F, q);
+ }
+ }
+ /*
+ * now compute V = max(V, min(S_i)). Remember that all elements in sch
+ * have by definition S_i <= V so if sch is not empty, V is surely
+ * the max and we must not update it. Conversely, if sch is empty
+ * we only need to look at neh.
+ */
+ if (sch->elements == 0 && neh->elements > 0)
+ p->V = MAX64 ( p->V, neh->p[0].key );
+ /* move from neh to sch any packets that have become eligible */
+ while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
+ struct dn_flow_queue *q = neh->p[0].object ;
+ heap_extract(neh, NULL);
+ heap_insert(sch, q->F, q);
+ }
+
+ if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
+ p->numbytes = -1 ; /* mark not ready for I/O */
+ break ;
+ }
+ }
+ if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
+ && p->idle_heap.elements > 0) {
+ /*
+ * no traffic and no events scheduled. We can get rid of idle-heap.
+ */
+ int i ;
+
+ for (i = 0 ; i < p->idle_heap.elements ; i++) {
+ struct dn_flow_queue *q = p->idle_heap.p[i].object ;
+
+ q->F = 0 ;
+ q->S = q->F + 1 ;
+ }
+ p->sum = 0 ;
+ p->V = 0 ;
+ p->idle_heap.elements = 0 ;
+ }
+ /*
+ * If we are getting clocks from dummynet (not a real interface) and
+ * if we are under credit, schedule the next ready event.
+ * Also fix the delivery time of the last packet.
+ */
+ if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
+ dn_key t=0 ; /* number of ticks i have to wait */
+
+ if (p->bandwidth > 0)
+ t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
+ p->tail->output_time += t ;
+ p->sched_time = curr_time ;
+ heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+ /* XXX should check errors on heap_insert, and drain the whole
+ * queue on error hoping next time we are luckier.
+ */
+ }
+ /*
+ * If the delay line was empty call transmit_event(p) now.
+ * Otherwise, the scheduler will take care of it.
+ */
+ if (p_was_empty)
+ transmit_event(p);
+}
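+
+/*
+ * Worked example of the virtual-time bookkeeping above (the weights are
+ * assumptions made up for the example): with two backlogged flows of
+ * weights 3 and 1, p->sum = 4, so transmitting a 1500-byte packet
+ * advances V by (1500<<MY_M)/4 = 24576000. A flow is moved from the
+ * not_eligible_heap to the scheduler_heap as soon as its start time S
+ * drops to or below this V, which is what the inner while loop does.
+ */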
+
+/*
+ * This is called once per tick, or HZ times per second. It is used to
+ * increment the current tick counter and schedule expired events.
+ */
+static void
+dummynet(void * __unused unused)
+{
+ void *p ; /* generic parameter to handler */
+ struct dn_heap *h ;
+ int s ;
+ struct dn_heap *heaps[3];
+ int i;
+ struct dn_pipe *pe ;
+
+ heaps[0] = &ready_heap ; /* fixed-rate queues */
+ heaps[1] = &wfq_ready_heap ; /* wfq queues */
+ heaps[2] = &extract_heap ; /* delay line */
+ s = splimp(); /* see note on top, splnet() is not enough */
+ curr_time++ ;
+ for (i=0; i < 3 ; i++) {
+ h = heaps[i];
+ while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
+ DDB(if (h->p[0].key > curr_time)
+ printf("-- dummynet: warning, heap %d is %d ticks late\n",
+ i, (int)(curr_time - h->p[0].key));)
+ p = h->p[0].object ; /* store a copy before heap_extract */
+ heap_extract(h, NULL); /* need to extract before processing */
+ if (i == 0)
+ ready_event(p) ;
+ else if (i == 1) {
+ struct dn_pipe *pipe = p;
+ if (pipe->if_name[0] != '\0')
+ printf("*** bad ready_event_wfq for pipe %s\n",
+ pipe->if_name);
+ else
+ ready_event_wfq(p) ;
+ } else
+ transmit_event(p);
+ }
+ }
+ /* sweep pipes trying to expire idle flow_queues */
+ for (pe = all_pipes; pe ; pe = pe->next )
+ if (pe->idle_heap.elements > 0 &&
+ DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
+ struct dn_flow_queue *q = pe->idle_heap.p[0].object ;
+
+ heap_extract(&(pe->idle_heap), NULL);
+ q->S = q->F + 1 ; /* mark timestamp as invalid */
+ pe->sum -= q->fs->weight ;
+ }
+ splx(s);
+ dn_timeout = timeout(dummynet, NULL, 1);
+}
+
+/*
+ * called by an interface when tx_rdy occurs.
+ */
+int
+if_tx_rdy(struct ifnet *ifp)
+{
+ struct dn_pipe *p;
+
+ for (p = all_pipes; p ; p = p->next )
+ if (p->ifp == ifp)
+ break ;
+ if (p == NULL) {
+ char buf[32];
+ sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit);
+ for (p = all_pipes; p ; p = p->next )
+ if (!strcmp(p->if_name, buf) ) {
+ p->ifp = ifp ;
+ DEB(printf("++ tx rdy from %s (now found)\n", buf);)
+ break ;
+ }
+ }
+ if (p != NULL) {
+ DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
+ ifp->if_unit, ifp->if_snd.ifq_len);)
+ p->numbytes = 0 ; /* mark ready for I/O */
+ ready_event_wfq(p);
+ }
+ return 0;
+}
+
+/*
+ * Unconditionally expire empty queues in case of shortage.
+ * Returns the number of queues freed.
+ */
+static int
+expire_queues(struct dn_flow_set *fs)
+{
+ struct dn_flow_queue *q, *prev ;
+ int i, initial_elements = fs->rq_elements ;
+
+ if (fs->last_expired == time_second)
+ return 0 ;
+ fs->last_expired = time_second ;
+ for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
+ for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
+ if (q->head != NULL || q->S != q->F+1) {
+ prev = q ;
+ q = q->next ;
+ } else { /* entry is idle, expire it */
+ struct dn_flow_queue *old_q = q ;
+
+ if (prev != NULL)
+ prev->next = q = q->next ;
+ else
+ fs->rq[i] = q = q->next ;
+ fs->rq_elements-- ;
+ free(old_q, M_DUMMYNET);
+ }
+ return initial_elements - fs->rq_elements ;
+}
+
+/*
+ * If room, create a new queue and put at head of slot i;
+ * otherwise, create or use the default queue.
+ */
+static struct dn_flow_queue *
+create_queue(struct dn_flow_set *fs, int i)
+{
+ struct dn_flow_queue *q ;
+
+ if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
+ expire_queues(fs) == 0) {
+ /*
+ * No way to get room, use or create overflow queue.
+ */
+ i = fs->rq_size ;
+ if ( fs->rq[i] != NULL )
+ return fs->rq[i] ;
+ }
+ q = malloc(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
+ if (q == NULL) {
+ printf("sorry, cannot allocate queue for new flow\n");
+ return NULL ;
+ }
+ q->fs = fs ;
+ q->hash_slot = i ;
+ q->next = fs->rq[i] ;
+ q->S = q->F + 1; /* hack - mark timestamp as invalid */
+ fs->rq[i] = q ;
+ fs->rq_elements++ ;
+ return q ;
+}
+
+/*
+ * Given a flow_set and a flow id, find a matching queue
+ * after appropriate masking. The queue is moved to the front
+ * so that further searches take less time.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
+{
+ int i = 0 ; /* we need i and q for new allocations */
+ struct dn_flow_queue *q, *prev;
+
+ if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
+ q = fs->rq[0] ;
+ else {
+ /* first, do the masking */
+ id->dst_ip &= fs->flow_mask.dst_ip ;
+ id->src_ip &= fs->flow_mask.src_ip ;
+ id->dst_port &= fs->flow_mask.dst_port ;
+ id->src_port &= fs->flow_mask.src_port ;
+ id->proto &= fs->flow_mask.proto ;
+ id->flags = 0 ; /* we don't care about this one */
+ /* then, hash function */
+ i = ( (id->dst_ip) & 0xffff ) ^
+ ( (id->dst_ip >> 15) & 0xffff ) ^
+ ( (id->src_ip << 1) & 0xffff ) ^
+ ( (id->src_ip >> 16 ) & 0xffff ) ^
+ (id->dst_port << 1) ^ (id->src_port) ^
+ (id->proto );
+ i = i % fs->rq_size ;
+ /* finally, scan the current list for a match */
+ searches++ ;
+ for (prev=NULL, q = fs->rq[i] ; q ; ) {
+ search_steps++;
+ if (bcmp(id, &(q->id), sizeof(q->id) ) == 0)
+ break ; /* found */
+ else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
+ /* entry is idle and not in any heap, expire it */
+ struct dn_flow_queue *old_q = q ;
+
+ if (prev != NULL)
+ prev->next = q = q->next ;
+ else
+ fs->rq[i] = q = q->next ;
+ fs->rq_elements-- ;
+ free(old_q, M_DUMMYNET);
+ continue ;
+ }
+ prev = q ;
+ q = q->next ;
+ }
+ if (q && prev != NULL) { /* found and not in front */
+ prev->next = q->next ;
+ q->next = fs->rq[i] ;
+ fs->rq[i] = q ;
+ }
+ }
+ if (q == NULL) { /* no match, need to allocate a new entry */
+ q = create_queue(fs, i);
+ if (q != NULL)
+ q->id = *id ;
+ }
+ return q ;
+}
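+
+/*
+ * Masking sketch (illustrative only; the mask and address are made up
+ * for the example): a flow mask with dst_ip = 0xffffffff and all other
+ * fields zero collapses every packet to the same destination into one
+ * dn_flow_queue, regardless of source address, ports or protocol.
+ */
+#if 0
+static int
+find_queue_example_slot(struct dn_flow_set *fs)
+{
+    struct ipfw_flow_id id ;
+    int i ;
+
+    bzero(&id, sizeof(id)) ;	/* src/ports/proto masked to zero */
+    id.dst_ip = 0xc0a80001 ;	/* 192.168.0.1 after masking */
+    /* same hash as find_queue() above */
+    i = ( (id.dst_ip) & 0xffff ) ^
+	( (id.dst_ip >> 15) & 0xffff ) ^
+	( (id.src_ip << 1) & 0xffff ) ^
+	( (id.src_ip >> 16 ) & 0xffff ) ^
+	(id.dst_port << 1) ^ (id.src_port) ^
+	(id.proto );
+    return i % fs->rq_size ;
+}
+#endif /* find_queue_example_slot, illustrative only */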
+
+static int
+red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+{
+ /*
+ * RED algorithm
+ *
+ * RED calculates the average queue size (avg) using a low-pass filter
+ * with an exponential weighted (w_q) moving average:
+ * avg <- (1-w_q) * avg + w_q * q_size
+ * where q_size is the queue length (measured in bytes or packets).
+ *
+ * If q_size == 0, we compute the idle time for the link, and set
+ * avg = (1 - w_q)^(idle/s)
+ * where s is the time needed for transmitting a medium-sized packet.
+ *
+ * Now, if avg < min_th the packet is enqueued.
+ * If avg > max_th the packet is dropped. Otherwise, the packet is
+ * dropped with a probability P which is a function of avg.
+ *
+ */
+
+ int64_t p_b = 0;
+ /* queue in bytes or packets ? */
+ u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
+
+ DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);)
+
+ /* average queue size estimation */
+ if (q_size != 0) {
+ /*
+ * queue is not empty, avg <- avg + (q_size - avg) * w_q
+ */
+ int diff = SCALE(q_size) - q->avg;
+ int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
+
+ q->avg += (int) v;
+ } else {
+ /*
+ * queue is empty, find for how long the queue has been
+ * empty and use a lookup table for computing
+ * (1 - w_q)^(idle_time/s) where s is the time to send a
+ * (small) packet.
+ * XXX check wraps...
+ */
+ if (q->avg) {
+ u_int t = (curr_time - q->q_time) / fs->lookup_step;
+
+ q->avg = (t < fs->lookup_depth) ?
+ SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+ }
+ }
+ DEB(printf("avg: %u ", SCALE_VAL(q->avg));)
+
+ /* should i drop ? */
+
+ if (q->avg < fs->min_th) {
+ q->count = -1;
+ return 0; /* accept packet ; */
+ }
+ if (q->avg >= fs->max_th) { /* average queue >= max threshold */
+ if (fs->flags_fs & DN_IS_GENTLE_RED) {
+ /*
+ * According to Gentle-RED, if avg is greater than max_th the
+ * packet is dropped with a probability
+ * p_b = c_3 * avg - c_4
+ * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
+ */
+ p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
+ } else {
+ q->count = -1;
+ printf("- drop");
+ return 1 ;
+ }
+ } else if (q->avg > fs->min_th) {
+ /*
+ * we compute p_b using the linear dropping function p_b = c_1 *
+ * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
+ * max_p * min_th / (max_th - min_th)
+ */
+ p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
+ }
+ if (fs->flags_fs & DN_QSIZE_IS_BYTES)
+ p_b = (p_b * len) / fs->max_pkt_size;
+ if (++q->count == 0)
+ q->random = random() & 0xffff;
+ else {
+ /*
+ * q->count counts packets arrived since last drop, so a greater
+ * value of q->count means a greater packet drop probability.
+ */
+ if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
+ q->count = 0;
+ DEB(printf("- red drop");)
+ /* after a drop we calculate a new random value */
+ q->random = random() & 0xffff;
+ return 1; /* drop */
+ }
+ }
+ /* end of RED algorithm */
+ return 0 ; /* accept */
+}
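+
+/*
+ * Worked example of the scaled arithmetic above (the w_q value is an
+ * assumption, not a default): with SCALE_RED = 16, w_q = 0.002 is stored
+ * as 0.002 * 2^16 ~ 131. Starting from avg = 0 with q_size = 10 packets,
+ * diff = SCALE(10) = 655360 and v = (655360 * 131) >> 16 = 1310, so avg
+ * becomes roughly SCALE(0.02), matching avg <- (1-w_q)*0 + w_q*10 = 0.02.
+ */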
+
+static __inline
+struct dn_flow_set *
+locate_flowset(int pipe_nr, struct ip_fw *rule)
+{
+ struct dn_flow_set *fs = NULL ;
+
+ if ( (rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE )
+ for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
+ ;
+ else {
+ struct dn_pipe *p1;
+ for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
+ ;
+ if (p1 != NULL)
+ fs = &(p1->fs) ;
+ }
+ if (fs != NULL)
+ rule->pipe_ptr = fs ; /* record for the future */
+ return fs ;
+}
+
+/*
+ * dummynet hook for packets. Below 'pipe' is a pipe or a queue
+ * depending on whether WF2Q or fixed bw is used.
+ *
+ * pipe_nr pipe or queue the packet is destined for.
+ * dir where shall we send the packet after dummynet.
+ * m the mbuf with the packet
+ * ifp the 'ifp' parameter from the caller.
+ * NULL in ip_input, destination interface in ip_output,
+ * real_dst in bdg_forward
+ * ro route parameter (only used in ip_output, NULL otherwise)
+ * dst destination address, only used by ip_output
+ * rule matching rule, in case of multiple passes
+ * flags flags from the caller, only used in ip_output
+ *
+ */
+static int
+dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
+{
+ struct dn_pkt *pkt;
+ struct dn_flow_set *fs;
+ struct dn_pipe *pipe ;
+ u_int64_t len = m->m_pkthdr.len ;
+ struct dn_flow_queue *q = NULL ;
+ int s ;
+
+ s = splimp();
+
+ pipe_nr &= 0xffff ;
+
+ if ( (fs = fwa->rule->pipe_ptr) == NULL ) {
+ fs = locate_flowset(pipe_nr, fwa->rule);
+ if (fs == NULL)
+ goto dropit ; /* this queue/pipe does not exist! */
+ }
+ pipe = fs->pipe ;
+ if (pipe == NULL) { /* must be a queue, try find a matching pipe */
+ for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
+ pipe = pipe->next)
+ ;
+ if (pipe != NULL)
+ fs->pipe = pipe ;
+ else {
+ printf("No pipe %d for queue %d, drop pkt\n",
+ fs->parent_nr, fs->fs_nr);
+ goto dropit ;
+ }
+ }
+ q = find_queue(fs, &(fwa->f_id));
+ if ( q == NULL )
+ goto dropit ; /* cannot allocate queue */
+ /*
+ * update statistics, then check reasons to drop pkt
+ */
+ q->tot_bytes += len ;
+ q->tot_pkts++ ;
+ if ( fs->plr && random() < fs->plr )
+ goto dropit ; /* random pkt drop */
+ if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
+ if (q->len_bytes > fs->qsize)
+ goto dropit ; /* queue size overflow */
+ } else {
+ if (q->len >= fs->qsize)
+ goto dropit ; /* queue count overflow */
+ }
+ if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
+ goto dropit ;
+
+ /* XXX expensive to zero, see if we can remove it */
+ pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_DUMMYNET, M_NOWAIT|M_ZERO);
+ if ( pkt == NULL )
+ goto dropit ; /* cannot allocate packet header */
+ /* ok, i can handle the pkt now... */
+ /* build and enqueue packet + parameters */
+ pkt->hdr.mh_type = MT_TAG;
+ pkt->hdr.mh_flags = PACKET_TAG_DUMMYNET;
+ pkt->rule = fwa->rule ;
+ DN_NEXT(pkt) = NULL;
+ pkt->dn_m = m;
+ pkt->dn_dir = dir ;
+
+ pkt->ifp = fwa->oif;
+ if (dir == DN_TO_IP_OUT) {
+ /*
+ * We need to copy *ro because for ICMP pkts (and maybe others)
+ * the caller passed a pointer into the stack; dst might also be
+ * a pointer into *ro so it needs to be updated.
+ */
+ pkt->ro = *(fwa->ro);
+ if (fwa->ro->ro_rt)
+ fwa->ro->ro_rt->rt_refcnt++ ;
+ if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
+ fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
+
+ pkt->dn_dst = fwa->dst;
+ pkt->flags = fwa->flags;
+ }
+ if (q->head == NULL)
+ q->head = pkt;
+ else
+ DN_NEXT(q->tail) = pkt;
+ q->tail = pkt;
+ q->len++;
+ q->len_bytes += len ;
+
+ if ( q->head != pkt ) /* flow was not idle, we are done */
+ goto done;
+ /*
+ * If we reach this point the flow was previously idle, so we need
+ * to schedule it. This involves different actions for fixed-rate or
+ * WF2Q queues.
+ */
+ if ( (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) {
+ /*
+ * Fixed-rate queue: just insert into the ready_heap.
+ */
+ dn_key t = 0 ;
+ if (pipe->bandwidth)
+ t = SET_TICKS(pkt, q, pipe);
+ q->sched_time = curr_time ;
+ if (t == 0) /* must process it now */
+ ready_event( q );
+ else
+ heap_insert(&ready_heap, curr_time + t , q );
+ } else {
+ /*
+ * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
+ * set S to the virtual time V for the controlling pipe, and update
+ * the sum of weights for the pipe; otherwise, remove flow from
+ * idle_heap and set S to max(F,V).
+ * Second, compute finish time F = S + len/weight.
+ * Third, if pipe was idle, update V=max(S, V).
+ * Fourth, count one more backlogged flow.
+ */
+ if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
+ q->S = pipe->V ;
+ pipe->sum += fs->weight ; /* add weight of new queue */
+ } else {
+ heap_extract(&(pipe->idle_heap), q);
+ q->S = MAX64(q->F, pipe->V ) ;
+ }
+ q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
+
+ if (pipe->not_eligible_heap.elements == 0 &&
+ pipe->scheduler_heap.elements == 0)
+ pipe->V = MAX64 ( q->S, pipe->V );
+ fs->backlogged++ ;
+ /*
+ * Look at eligibility. A flow is not eligible if S>V (when
+ * this happens, it means that there is some other flow already
+ * scheduled for the same pipe, so the scheduler_heap cannot be
+ * empty). If the flow is not eligible we just store it in the
+ * not_eligible_heap. Otherwise, we store in the scheduler_heap
+ * and possibly invoke ready_event_wfq() right now if there is
+ * leftover credit.
+ * Note that for all flows in scheduler_heap (SCH), S_i <= V,
+ * and for all flows in not_eligible_heap (NEH), S_i > V .
+ * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
+ * we only need to look into NEH.
+ */
+ if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
+ if (pipe->scheduler_heap.elements == 0)
+ printf("++ ouch! not eligible but empty scheduler!\n");
+ heap_insert(&(pipe->not_eligible_heap), q->S, q);
+ } else {
+ heap_insert(&(pipe->scheduler_heap), q->F, q);
+ if (pipe->numbytes >= 0) { /* pipe is idle */
+ if (pipe->scheduler_heap.elements != 1)
+ printf("*** OUCH! pipe should have been idle!\n");
+ DEB(printf("Waking up pipe %d at %d\n",
+ pipe->pipe_nr, (int)(q->F >> MY_M)); )
+ pipe->sched_time = curr_time ;
+ ready_event_wfq(pipe);
+ }
+ }
+ }
+done:
+ splx(s);
+ return 0;
+
+dropit:
+ splx(s);
+ if (q)
+ q->drops++ ;
+ m_freem(m);
+ return ENOBUFS ;
+}
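+
+/*
+ * Worked example of the WF2Q+ timestamps computed above (weights and
+ * packet size are assumptions made up for the example): with MY_M = 16,
+ * a 1500-byte packet on a queue of weight 3 gets F = S + (1500<<16)/3 =
+ * S + 32768000, while the same packet on a queue of weight 1 gets
+ * F = S + 98304000. Since the scheduler_heap extracts the smallest F
+ * first, the weight-3 queue drains roughly three packets for every one
+ * of the weight-1 queue, i.e. a 3:1 share of the pipe bandwidth.
+ */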
+
+/*
+ * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT).
+ * Checking for that would probably save us the initial bzero of dn_pkt.
+ */
+#define DN_FREE_PKT(pkt) { \
+ struct dn_pkt *n = pkt ; \
+ rt_unref ( n->ro.ro_rt ) ; \
+ m_freem(n->dn_m); \
+ pkt = DN_NEXT(n) ; \
+ free(n, M_DUMMYNET) ; }
+
+/*
+ * Dispose all packets and flow_queues on a flow_set.
+ * If all=1, also remove the RED lookup table and other storage,
+ * including the descriptor itself.
+ * For the flow_set embedded in a dn_pipe, the caller MUST also clean
+ * up the ready_heap...
+ */
+static void
+purge_flow_set(struct dn_flow_set *fs, int all)
+{
+ struct dn_pkt *pkt ;
+ struct dn_flow_queue *q, *qn ;
+ int i ;
+
+ for (i = 0 ; i <= fs->rq_size ; i++ ) {
+ for (q = fs->rq[i] ; q ; q = qn ) {
+ for (pkt = q->head ; pkt ; )
+ DN_FREE_PKT(pkt) ;
+ qn = q->next ;
+ free(q, M_DUMMYNET);
+ }
+ fs->rq[i] = NULL ;
+ }
+ fs->rq_elements = 0 ;
+ if (all) {
+ /* RED - free lookup table */
+ if (fs->w_q_lookup)
+ free(fs->w_q_lookup, M_DUMMYNET);
+ if (fs->rq)
+ free(fs->rq, M_DUMMYNET);
+ /* if this fs is not part of a pipe, free it */
+ if (fs->pipe && fs != &(fs->pipe->fs) )
+ free(fs, M_DUMMYNET);
+ }
+}
+
+/*
+ * Dispose all packets queued on a pipe (not a flow_set).
+ * Also free all resources associated with a pipe, which is about
+ * to be deleted.
+ */
+static void
+purge_pipe(struct dn_pipe *pipe)
+{
+ struct dn_pkt *pkt ;
+
+ purge_flow_set( &(pipe->fs), 1 );
+
+ for (pkt = pipe->head ; pkt ; )
+ DN_FREE_PKT(pkt) ;
+
+ heap_free( &(pipe->scheduler_heap) );
+ heap_free( &(pipe->not_eligible_heap) );
+ heap_free( &(pipe->idle_heap) );
+}
+
+/*
+ * Delete all pipes and heaps, returning their memory. Must also
+ * remove references from all ipfw rules to all pipes.
+ */
+static void
+dummynet_flush()
+{
+ struct dn_pipe *curr_p, *p ;
+ struct ip_fw *rule ;
+ struct dn_flow_set *fs, *curr_fs;
+ int s ;
+
+ s = splimp() ;
+
+ /* remove all references to pipes ...*/
+ LIST_FOREACH(rule, &ip_fw_chain_head, next)
+ rule->pipe_ptr = NULL ;
+ /* prevent future matches... */
+ p = all_pipes ;
+ all_pipes = NULL ;
+ fs = all_flow_sets ;
+ all_flow_sets = NULL ;
+ /* and free heaps so we don't have unwanted events */
+ heap_free(&ready_heap);
+ heap_free(&wfq_ready_heap);
+ heap_free(&extract_heap);
+ splx(s) ;
+ /*
+ * Now purge all queued pkts and delete all pipes
+ */
+ /* scan and purge all flow_sets. */
+ for ( ; fs ; ) {
+ curr_fs = fs ;
+ fs = fs->next ;
+ purge_flow_set(curr_fs, 1);
+ }
+ for ( ; p ; ) {
+ purge_pipe(p);
+ curr_p = p ;
+ p = p->next ;
+ free(curr_p, M_DUMMYNET);
+ }
+}
+
+
+extern struct ip_fw *ip_fw_default_rule ;
+static void
+dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
+{
+ int i ;
+ struct dn_flow_queue *q ;
+ struct dn_pkt *pkt ;
+
+ for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
+ for (q = fs->rq[i] ; q ; q = q->next )
+ for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) )
+ if (pkt->rule == r)
+ pkt->rule = ip_fw_default_rule ;
+}
+/*
+ * When a firewall rule is deleted, scan all queues and reset the rule
+ * pointer of packets matching this rule to the default rule.
+ */
+void
+dn_rule_delete(void *r)
+{
+ struct dn_pipe *p ;
+ struct dn_pkt *pkt ;
+ struct dn_flow_set *fs ;
+
+ /*
+ * If the rule references a queue (dn_flow_set), then scan
+ * the flow set, otherwise scan pipes. It would suffice to do either,
+ * but doing both does no harm.
+ */
+ for ( fs = all_flow_sets ; fs ; fs = fs->next )
+ dn_rule_delete_fs(fs, r);
+ for ( p = all_pipes ; p ; p = p->next ) {
+ fs = &(p->fs) ;
+ dn_rule_delete_fs(fs, r);
+ for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) )
+ if (pkt->rule == r)
+ pkt->rule = ip_fw_default_rule ;
+ }
+}
+
+/*
+ * setup RED parameters
+ */
+static int
+config_red(struct dn_flow_set *p, struct dn_flow_set * x)
+{
+ int i;
+
+ x->w_q = p->w_q;
+ x->min_th = SCALE(p->min_th);
+ x->max_th = SCALE(p->max_th);
+ x->max_p = p->max_p;
+
+ x->c_1 = p->max_p / (p->max_th - p->min_th);
+ x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
+ if (x->flags_fs & DN_IS_GENTLE_RED) {
+ x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
+ x->c_4 = (SCALE(1) - 2 * p->max_p);
+ }
+
+ /* if the lookup table already exists, free and create it again */
+ if (x->w_q_lookup) {
+ free(x->w_q_lookup, M_DUMMYNET);
+ x->w_q_lookup = NULL ;
+ }
+ if (red_lookup_depth == 0) {
+ printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0");
+ free(x, M_DUMMYNET);
+ return EINVAL;
+ }
+ x->lookup_depth = red_lookup_depth;
+ x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int),
+ M_DUMMYNET, M_DONTWAIT);
+ if (x->w_q_lookup == NULL) {
+ printf("sorry, cannot allocate red lookup table\n");
+ free(x, M_DUMMYNET);
+ return ENOSPC;
+ }
+
+ /* fill the lookup table with (1 - w_q)^x */
+ x->lookup_step = p->lookup_step ;
+ x->lookup_weight = p->lookup_weight ;
+ x->w_q_lookup[0] = SCALE(1) - x->w_q;
+ for (i = 1; i < x->lookup_depth; i++)
+ x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+ if (red_avg_pkt_size < 1)
+ red_avg_pkt_size = 512 ;
+ x->avg_pkt_size = red_avg_pkt_size ;
+ if (red_max_pkt_size < 1)
+ red_max_pkt_size = 1500 ;
+ x->max_pkt_size = red_max_pkt_size ;
+ return 0 ;
+}
+
+static int
+alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
+{
+ if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
+ int l = pfs->rq_size;
+
+ if (l == 0)
+ l = dn_hash_size;
+ if (l < 4)
+ l = 4;
+ else if (l > 1024)
+ l = 1024;
+ x->rq_size = l;
+ } else /* one is enough for null mask */
+ x->rq_size = 1;
+ x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
+ M_DUMMYNET, M_DONTWAIT | M_ZERO);
+ if (x->rq == NULL) {
+ printf("sorry, cannot allocate queue\n");
+ return ENOSPC;
+ }
+ x->rq_elements = 0;
+ return 0 ;
+}
+
+static void
+set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
+{
+ x->flags_fs = src->flags_fs;
+ x->qsize = src->qsize;
+ x->plr = src->plr;
+ x->flow_mask = src->flow_mask;
+ if (x->flags_fs & DN_QSIZE_IS_BYTES) {
+ if (x->qsize > 1024*1024)
+ x->qsize = 1024*1024 ;
+ } else {
+ if (x->qsize == 0)
+ x->qsize = 50 ;
+ if (x->qsize > 100)
+ x->qsize = 50 ;
+ }
+ /* configuring RED */
+ if ( x->flags_fs & DN_IS_RED )
+ config_red(src, x) ; /* XXX should check errors */
+}
+
+/*
+ * setup pipe or queue parameters.
+ */
+
+static int
+config_pipe(struct dn_pipe *p)
+{
+ int s ;
+ struct dn_flow_set *pfs = &(p->fs);
+
+ /*
+ * The config program passes parameters as follows:
+ * bw = bits/second (0 means no limits),
+ * delay = ms, must be translated into ticks.
+ * qsize = slots/bytes
+ */
+ p->delay = ( p->delay * hz ) / 1000 ;
+ /* We need either a pipe number or a flow_set number */
+ if (p->pipe_nr == 0 && pfs->fs_nr == 0)
+ return EINVAL ;
+ if (p->pipe_nr != 0 && pfs->fs_nr != 0)
+ return EINVAL ;
+ if (p->pipe_nr != 0) { /* this is a pipe */
+ struct dn_pipe *x, *a, *b;
+ /* locate pipe */
+ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
+ a = b , b = b->next) ;
+
+ if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
+ x = malloc(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO);
+ if (x == NULL) {
+ printf("ip_dummynet.c: no memory for new pipe\n");
+ return ENOSPC;
+ }
+ x->pipe_nr = p->pipe_nr;
+ x->fs.pipe = x ;
+ /* idle_heap is the only one from which we extract from the middle.
+ */
+ x->idle_heap.size = x->idle_heap.elements = 0 ;
+ x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
+ } else
+ x = b;
+
+ x->bandwidth = p->bandwidth ;
+ x->numbytes = 0; /* just in case... */
+ bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
+ x->ifp = NULL ; /* reset interface ptr */
+ x->delay = p->delay ;
+ set_fs_parms(&(x->fs), pfs);
+
+
+ if ( x->fs.rq == NULL ) { /* a new pipe */
+ s = alloc_hash(&(x->fs), pfs) ;
+ if (s) {
+ free(x, M_DUMMYNET);
+ return s ;
+ }
+ s = splimp() ;
+ x->next = b ;
+ if (a == NULL)
+ all_pipes = x ;
+ else
+ a->next = x ;
+ splx(s);
+ }
+ } else { /* config queue */
+ struct dn_flow_set *x, *a, *b ;
+
+ /* locate flow_set */
+ for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
+ a = b , b = b->next) ;
+
+ if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
+ if (pfs->parent_nr == 0) /* need link to a pipe */
+ return EINVAL ;
+ x = malloc(sizeof(struct dn_flow_set),M_DUMMYNET,M_DONTWAIT|M_ZERO);
+ if (x == NULL) {
+ printf("ip_dummynet.c: no memory for new flow_set\n");
+ return ENOSPC;
+ }
+ x->fs_nr = pfs->fs_nr;
+ x->parent_nr = pfs->parent_nr;
+ x->weight = pfs->weight ;
+ if (x->weight == 0)
+ x->weight = 1 ;
+ else if (x->weight > 100)
+ x->weight = 100 ;
+ } else {
+ /* Change parent pipe not allowed; must delete and recreate */
+ if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr)
+ return EINVAL ;
+ x = b;
+ }
+ set_fs_parms(x, pfs);
+
+ if ( x->rq == NULL ) { /* a new flow_set */
+ s = alloc_hash(x, pfs) ;
+ if (s) {
+ free(x, M_DUMMYNET);
+ return s ;
+ }
+ s = splimp() ;
+ x->next = b;
+ if (a == NULL)
+ all_flow_sets = x;
+ else
+ a->next = x;
+ splx(s);
+ }
+ }
+ return 0 ;
+}
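+
+/*
+ * Example of the delay conversion performed above (the numbers are just
+ * examples): a configured delay of 50 ms becomes (50 * hz) / 1000 = 50
+ * ticks with hz = 1000, but only 5 ticks with hz = 100, so the
+ * granularity of the emulated delay is bounded by the timer resolution.
+ */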
+
+/*
+ * Helper function to remove from a heap queues which are linked to
+ * a flow_set about to be deleted.
+ */
+static void
+fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+{
+ int i = 0, found = 0 ;
+ for (; i < h->elements ;)
+ if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ if (found)
+ heapify(h);
+}
+
+/*
+ * helper function to remove a pipe from a heap (can be there at most once)
+ */
+static void
+pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+{
+ if (h->elements > 0) {
+ int i = 0 ;
+ for (i=0; i < h->elements ; i++ ) {
+ if (h->p[i].object == p) { /* found it */
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ heapify(h);
+ break ;
+ }
+ }
+ }
+}
+
+/*
+ * drain all queues. Called in case of severe mbuf shortage.
+ */
+void
+dummynet_drain()
+{
+ struct dn_flow_set *fs;
+ struct dn_pipe *p;
+ struct dn_pkt *pkt;
+
+ heap_free(&ready_heap);
+ heap_free(&wfq_ready_heap);
+ heap_free(&extract_heap);
+ /* remove all references to this pipe from flow_sets */
+ for (fs = all_flow_sets; fs; fs= fs->next )
+ purge_flow_set(fs, 0);
+
+ for (p = all_pipes; p; p= p->next ) {
+ purge_flow_set(&(p->fs), 0);
+ for (pkt = p->head ; pkt ; )
+ DN_FREE_PKT(pkt) ;
+ p->head = p->tail = NULL ;
+ }
+}
+
+/*
+ * Fully delete a pipe or a queue, cleaning up associated info.
+ */
+static int
+delete_pipe(struct dn_pipe *p)
+{
+ int s ;
+ struct ip_fw *rule ;
+
+ if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+ return EINVAL ;
+ if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+ return EINVAL ;
+ if (p->pipe_nr != 0) { /* this is an old-style pipe */
+ struct dn_pipe *a, *b;
+ struct dn_flow_set *fs;
+
+ /* locate pipe */
+ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
+ a = b , b = b->next) ;
+ if (b == NULL || (b->pipe_nr != p->pipe_nr) )
+ return EINVAL ; /* not found */
+
+ s = splimp() ;
+
+ /* unlink from list of pipes */
+ if (a == NULL)
+ all_pipes = b->next ;
+ else
+ a->next = b->next ;
+ /* remove references to this pipe from the ip_fw rules. */
+ LIST_FOREACH(rule, &ip_fw_chain_head, next)
+ if (rule->pipe_ptr == &(b->fs))
+ rule->pipe_ptr = NULL ;
+
+ /* remove all references to this pipe from flow_sets */
+ for (fs = all_flow_sets; fs; fs= fs->next )
+ if (fs->pipe == b) {
+ printf("++ ref to pipe %d from fs %d\n",
+ p->pipe_nr, fs->fs_nr);
+ fs->pipe = NULL ;
+ purge_flow_set(fs, 0);
+ }
+ fs_remove_from_heap(&ready_heap, &(b->fs));
+ purge_pipe(b); /* remove all data associated to this pipe */
+ /* remove reference to here from extract_heap and wfq_ready_heap */
+ pipe_remove_from_heap(&extract_heap, b);
+ pipe_remove_from_heap(&wfq_ready_heap, b);
+ splx(s);
+ free(b, M_DUMMYNET);
+ } else { /* this is a WF2Q queue (dn_flow_set) */
+ struct dn_flow_set *a, *b;
+
+ /* locate set */
+ for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
+ a = b , b = b->next) ;
+ if (b == NULL || (b->fs_nr != p->fs.fs_nr) )
+ return EINVAL ; /* not found */
+
+ s = splimp() ;
+ if (a == NULL)
+ all_flow_sets = b->next ;
+ else
+ a->next = b->next ;
+ /* remove references to this flow_set from the ip_fw rules. */
+ LIST_FOREACH(rule, &ip_fw_chain_head, next)
+ if (rule->pipe_ptr == b)
+ rule->pipe_ptr = NULL ;
+
+ if (b->pipe != NULL) {
+ /* Update total weight on parent pipe and cleanup parent heaps */
+ b->pipe->sum -= b->weight * b->backlogged ;
+ fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
+ fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
+#if 1 /* XXX should i remove from idle_heap as well ? */
+ fs_remove_from_heap(&(b->pipe->idle_heap), b);
+#endif
+ }
+ purge_flow_set(b, 1);
+ splx(s);
+ }
+ return 0 ;
+}
+
+/*
+ * helper function used to copy data from kernel in DUMMYNET_GET
+ */
+static char *
+dn_copy_set(struct dn_flow_set *set, char *bp)
+{
+ int i, copied = 0 ;
+ struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
+
+ for (i = 0 ; i <= set->rq_size ; i++)
+ for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
+ if (q->hash_slot != i)
+ printf("++ at %d: wrong slot (have %d, "
+ "should be %d)\n", copied, q->hash_slot, i);
+ if (q->fs != set)
+ printf("++ at %d: wrong fs ptr (have %p, should be %p)\n",
+ i, q->fs, set);
+ copied++ ;
+ bcopy(q, qp, sizeof( *q ) );
+ /* cleanup pointers */
+ qp->next = NULL ;
+ qp->head = qp->tail = NULL ;
+ qp->fs = NULL ;
+ }
+ if (copied != set->rq_elements)
+ printf("++ wrong count, have %d should be %d\n",
+ copied, set->rq_elements);
+ return (char *)qp ;
+}
+
+static int
+dummynet_get(struct sockopt *sopt)
+{
+ char *buf, *bp ; /* bp is the "copy-pointer" */
+ size_t size ;
+ struct dn_flow_set *set ;
+ struct dn_pipe *p ;
+ int s, error=0 ;
+
+ s = splimp();
+ /*
+ * compute size of data structures: list of pipes and flow_sets.
+ */
+ for (p = all_pipes, size = 0 ; p ; p = p->next )
+ size += sizeof( *p ) +
+ p->fs.rq_elements * sizeof(struct dn_flow_queue);
+ for (set = all_flow_sets ; set ; set = set->next )
+ size += sizeof ( *set ) +
+ set->rq_elements * sizeof(struct dn_flow_queue);
+ buf = malloc(size, M_TEMP, M_DONTWAIT);
+ if (buf == 0) {
+ splx(s);
+ return ENOBUFS ;
+ }
+ for (p = all_pipes, bp = buf ; p ; p = p->next ) {
+ struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
+
+ /*
+ * copy pipe descriptor into *bp, convert delay back to ms,
+ * then copy the flow_set descriptor(s) one at a time.
+ * After each flow_set, copy the queue descriptor it owns.
+ */
+ bcopy(p, bp, sizeof( *p ) );
+ pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
+ /*
+ * XXX the following is a hack based on ->next being the
+ * first field in dn_pipe and dn_flow_set. The correct
+ * solution would be to move the dn_flow_set to the beginning
+ * of struct dn_pipe.
+ */
+ pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
+ /* clean pointers */
+ pipe_bp->head = pipe_bp->tail = NULL ;
+ pipe_bp->fs.next = NULL ;
+ pipe_bp->fs.pipe = NULL ;
+ pipe_bp->fs.rq = NULL ;
+
+ bp += sizeof( *p ) ;
+ bp = dn_copy_set( &(p->fs), bp );
+ }
+ for (set = all_flow_sets ; set ; set = set->next ) {
+ struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
+ bcopy(set, bp, sizeof( *set ) );
+ /* XXX same hack as above */
+ fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
+ fs_bp->pipe = NULL ;
+ fs_bp->rq = NULL ;
+ bp += sizeof( *set ) ;
+ bp = dn_copy_set( set, bp );
+ }
+ splx(s);
+ error = sooptcopyout(sopt, buf, size);
+ free(buf, M_TEMP);
+ return error ;
+}
+
+/*
+ * Handler for the various dummynet socket options (get, flush, config, del)
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+ int error = 0 ;
+ struct dn_pipe *p, tmp_pipe;
+
+ /* Disallow sets in really-really secure mode. */
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ switch (sopt->sopt_name) {
+ default :
+ printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name);
+ return EINVAL ;
+
+ case IP_DUMMYNET_GET :
+ error = dummynet_get(sopt);
+ break ;
+
+ case IP_DUMMYNET_FLUSH :
+ dummynet_flush() ;
+ break ;
+
+ case IP_DUMMYNET_CONFIGURE :
+ p = &tmp_pipe ;
+ error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
+ if (error)
+ break ;
+ error = config_pipe(p);
+ break ;
+
+ case IP_DUMMYNET_DEL : /* remove a pipe or queue */
+ p = &tmp_pipe ;
+ error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
+ if (error)
+ break ;
+
+ error = delete_pipe(p);
+ break ;
+ }
+ return error ;
+}
+
+static void
+ip_dn_init(void)
+{
+ printf("DUMMYNET initialized (011031)\n");
+ all_pipes = NULL ;
+ all_flow_sets = NULL ;
+ ready_heap.size = ready_heap.elements = 0 ;
+ ready_heap.offset = 0 ;
+
+ wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
+ wfq_ready_heap.offset = 0 ;
+
+ extract_heap.size = extract_heap.elements = 0 ;
+ extract_heap.offset = 0 ;
+ ip_dn_ctl_ptr = ip_dn_ctl;
+ ip_dn_io_ptr = dummynet_io;
+ ip_dn_ruledel_ptr = dn_rule_delete;
+ bzero(&dn_timeout, sizeof(struct callout_handle));
+ dn_timeout = timeout(dummynet, NULL, 1);
+}
+
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+ int s;
+ switch (type) {
+ case MOD_LOAD:
+ s = splimp();
+ if (DUMMYNET_LOADED) {
+ splx(s);
+ printf("DUMMYNET already loaded\n");
+ return EEXIST ;
+ }
+ ip_dn_init();
+ splx(s);
+ break;
+
+ case MOD_UNLOAD:
+#if !defined(KLD_MODULE)
+ printf("dummynet statically compiled, cannot unload\n");
+ return EINVAL ;
+#else
+ s = splimp();
+ untimeout(dummynet, NULL, dn_timeout);
+ dummynet_flush();
+ ip_dn_ctl_ptr = NULL;
+ ip_dn_io_ptr = NULL;
+ ip_dn_ruledel_ptr = NULL;
+ splx(s);
+#endif
+ break ;
+ default:
+ break ;
+ }
+ return 0 ;
+}
+
+static moduledata_t dummynet_mod = {
+ "dummynet",
+ dummynet_modevent,
+ NULL
+};
+DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_DEPEND(dummynet, ipfw, 1, 1, 1);
+MODULE_VERSION(dummynet, 1);
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
new file mode 100644
index 0000000..30c5f6f
--- /dev/null
+++ b/sys/netinet/ip_dummynet.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DUMMYNET_H
+#define _IP_DUMMYNET_H
+
+/*
+ * Definition of dummynet data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The types of lists and queues we
+ * use here are pretty simple anyway.
+ */
+
+/*
+ * We start with a heap, which is used in the scheduler to decide when
+ * to transmit packets etc.
+ *
+ * The key for the heap is used for two different values:
+ *
+ * 1. timer ticks - max 10K/second, so 32 bits are enough;
+ *
+ * 2. virtual times. These increase in steps of len/x, where len is the
+ * packet length, and x is either the weight of the flow, or the
+ * sum of all weights.
+ * If we limit to max 1000 flows and a max weight of 100, then
+ * x needs 17 bits. The packet size is 16 bits, so we can easily
+ * overflow if we do not allow errors.
+ * So we use a key "dn_key" which is 64 bits. Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+typedef u_int64_t dn_key ; /* sorting key */
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0)
+#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#define MY_M 16 /* number of left shifts to obtain a larger precision */
+
+/*
+ * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
+ * virtual time wraps every 15 days.
+ */
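+
+/*
+ * Worked example of the wraparound-safe comparisons above: if a key just
+ * before the wrap is a = 0xfffffffffffffffa and a key just after it is
+ * b = 5, then (int64_t)(a - b) = -11 < 0, so DN_KEY_LT(a, b) correctly
+ * reports a < b even though a > b as unsigned values. This works as long
+ * as the two keys are less than 2^63 apart.
+ */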
+
+/*
+ * The OFFSET_OF macro is used to return the offset of a field within
+ * a structure. It is used by the heap management routines.
+ */
+#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
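+
+/*
+ * For example, config_pipe() in ip_dummynet.c sets
+ * idle_heap.offset = OFFSET_OF(struct dn_flow_queue, heap_pos)
+ * so that entries can be extracted from the middle of the idle_heap.
+ * OFFSET_OF is essentially the standard offsetof().
+ */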
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+ dn_key key ; /* sorting key. Topmost element is smallest one */
+ void *object ; /* object pointer */
+} ;
+
+struct dn_heap {
+ int size ;
+ int elements ;
+ int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
+ struct dn_heap_entry *p ; /* really an array of "size" entries */
+} ;
+
+/*
+ * struct dn_pkt identifies a packet in the dummynet queue, but
+ * is also used to tag packets passed back to the various destinations
+ * (ip_input(), ip_output(), bdg_forward() and so on).
+ * As such the first part of the structure must be a struct m_hdr,
+ * followed by dummynet-specific parameters. The m_hdr must be
+ * initialized with
+ * mh_type = MT_TAG;
+ * mh_flags = PACKET_TAG_DUMMYNET;
+ * mh_next = <pointer to the actual mbuf>
+ *
+ * mh_nextpkt, mh_data are free for dummynet use (mh_nextpkt is used to
+ * build a linked list of packets in a dummynet queue).
+ */
+struct dn_pkt {
+ struct m_hdr hdr ;
+#define DN_NEXT(x) (struct dn_pkt *)(x)->hdr.mh_nextpkt
+#define dn_m hdr.mh_next /* packet to be forwarded */
+
+ struct ip_fw *rule; /* matching rule */
+ int dn_dir; /* action when packet comes out. */
+#define DN_TO_IP_OUT 1
+#define DN_TO_IP_IN 2
+#define DN_TO_BDG_FWD 3
+#define DN_TO_ETH_DEMUX 4
+#define DN_TO_ETH_OUT 5
+
+ dn_key output_time; /* when the pkt is due for delivery */
+ struct ifnet *ifp; /* interface, for ip_output */
+ struct sockaddr_in *dn_dst ;
+ struct route ro; /* route, for ip_output. MUST COPY */
+ int flags ; /* flags, for ip_output (IPv6 ?) */
+};
+
+/*
+ * Overall structure of dummynet (with WF2Q+):
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE.
+
+A QUEUE is just a queue with configurable size and queue management
+policy. It is also associated with a mask (to discriminate among
+different flows), a weight (used to give different shares of the
+bandwidth to different flows) and a "pipe", which essentially
+supplies the transmit clock for all queues associated with that
+pipe.
+
+A PIPE emulates a fixed-bandwidth link, whose bandwidth is
+configurable. The "clock" for a pipe can come from either an
+internal timer, or from the transmit interrupt of an interface.
+A pipe is also associated with one (or more, if masks are used)
+queue, where all packets for that pipe are stored.
+
+The bandwidth available on the pipe is shared by the queues
+associated with that pipe (only one in case the packet is sent
+to a PIPE) according to the WF2Q+ scheduling algorithm and the
+configured weights.
+
+In general, incoming packets are stored in the appropriate queue,
+which is then placed into one of a few heaps managed by a scheduler
+to decide when the packet should be extracted.
+The scheduler (a function called dummynet()) is run at every timer
+tick, and grabs queues from the head of the heaps when they are
+ready for processing.
+
+There are three data structures defining a pipe and associated queues:
+
+ + dn_pipe, which contains the main configuration parameters related
+ to delay and bandwidth;
+ + dn_flow_set, which contains WF2Q+ configuration, flow
+ masks, plr and RED configuration;
+ + dn_flow_queue, which is the per-flow queue (containing the packets)
+
+Multiple dn_flow_set can be linked to the same pipe, and multiple
+dn_flow_queue can be linked to the same dn_flow_set.
+All data structures are linked in a linear list which is used for
+housekeeping purposes.
+
+During configuration, we create and initialize the dn_flow_set
+and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
+
+At runtime: packets are sent to the appropriate dn_flow_set (either
+WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
+which in turn dispatches them to the appropriate dn_flow_queue
+(created dynamically according to the masks).
+
+The transmit clock for fixed rate flows (ready_event()) selects the
+dn_flow_queue to be used to transmit the next packet. For WF2Q,
+wfq_ready_event() extracts a pipe which in turn selects the right
+flow using a number of heaps defined inside the pipe itself.
+
+ *
+ */
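+
+/*
+ * A typical configuration, roughly (see ipfw(8) and dummynet(4) for the
+ * exact syntax; the numbers below are arbitrary examples):
+ *
+ *	ipfw add pipe 1 ip from any to any out
+ *	ipfw pipe 1 config bw 1Mbit/s delay 50ms
+ *
+ * creates a fixed-rate 1 Mbit/s, 50 ms pipe, while
+ *
+ *	ipfw add queue 5 ip from any to any
+ *	ipfw queue 5 config pipe 1 weight 10 mask dst-ip 0xffffffff
+ *
+ * creates a WF2Q+ queue of weight 10 attached to pipe 1, with one
+ * dn_flow_queue per destination address.
+ */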
+
+/*
+ * per flow queue. This contains the flow identifier, the queue
+ * of packets, counters, and parameters used to support both RED and
+ * WF2Q+.
+ *
+ * A dn_flow_queue is created and initialized whenever a packet for
+ * a new flow arrives.
+ */
+struct dn_flow_queue {
+ struct dn_flow_queue *next ;
+ struct ipfw_flow_id id ;
+
+ struct dn_pkt *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+ long numbytes ; /* credit for transmission (dynamic queues) */
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ u_int32_t q_time ; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ dn_key sched_time ; /* current time when queue enters ready_heap */
+
+ dn_key S,F ; /* start time, finish time */
+ /*
+ * Setting F < S means the timestamp is invalid. We only need
+ * to test this when the queue is empty.
+ */
+} ;
+
+/*
+ * flow_set descriptor. Contains the "template" parameters for the
+ * queue configuration, and pointers to the hash table of dn_flow_queue's.
+ *
+ * The hash table is an array of lists -- we identify the slot by
+ * hashing the flow-id, then scan the list looking for a match.
+ * The size of the hash table (buckets) is configurable on a per-queue
+ * basis.
+ *
+ * A dn_flow_set is created whenever a new queue or pipe is created (in the
+ * latter case, the structure is located inside the struct dn_pipe).
+ */
+struct dn_flow_set {
+ struct dn_flow_set *next; /* next flow set in all_flow_sets list */
+
+ u_short fs_nr ; /* flow_set number */
+ u_short flags_fs;
+#define DN_HAVE_FLOW_MASK 0x0001
+#define DN_IS_RED 0x0002
+#define DN_IS_GENTLE_RED 0x0004
+#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
+#define DN_IS_PIPE 0x4000
+#define DN_IS_QUEUE 0x8000
+
+ struct dn_pipe *pipe ; /* pointer to parent pipe */
+ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
+
+ int weight ; /* WFQ queue weight */
+ int qsize ; /* queue size in slots or bytes */
+ int plr ; /* pkt loss rate (2^31-1 means 100%) */
+
+ struct ipfw_flow_id flow_mask ;
+
+ /* hash table of queues onto this flow_set */
+ int rq_size ; /* number of slots */
+ int rq_elements ; /* active elements */
+ struct dn_flow_queue **rq; /* array of rq_size entries */
+
+ u_int32_t last_expired ; /* do not expire too frequently */
+ int backlogged ; /* #active queues for this flowset */
+
+ /* RED parameters */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+} ;
+
+/*
+ * Pipe descriptor. Contains global parameters, delay-line queue,
+ * and the flow_set used for fixed-rate queues.
+ *
+ * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
+ * not_eligible_heap, for queues whose start time is higher
+ * than the virtual time. Sorted by start time.
+ * scheduler_heap, for queues eligible for scheduling. Sorted by
+ * finish time.
+ * idle_heap, all flows that are idle and can be removed. We
+ * do that on each tick so we do not slow down forwarding
+ * operations too much.
+ *
+ */
+struct dn_pipe { /* a pipe */
+ struct dn_pipe *next ;
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* bit/s; 0 means no limit */
+ int delay ; /* really, ticks */
+
+ struct dn_pkt *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
+
+ dn_key V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+ int numbytes; /* bits I can transmit (more or less). */
+
+ dn_key sched_time ; /* time pipe was scheduled in ready_heap */
+
+ /*
+     * When the tx clock comes from an interface (if_name[0] != '\0'), its name
+ * is stored below, whereas the ifp is filled when the rule is configured.
+ */
+ char if_name[16];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+
+#ifdef _KERNEL
+typedef int ip_dn_ctl_t(struct sockopt *); /* raw_ip.c */
+typedef void ip_dn_ruledel_t(void *); /* ip_fw.c */
+typedef int ip_dn_io_t(struct mbuf *m, int pipe_nr, int dir,
+ struct ip_fw_args *fwa);
+extern ip_dn_ctl_t *ip_dn_ctl_ptr;
+extern ip_dn_ruledel_t *ip_dn_ruledel_ptr;
+extern ip_dn_io_t *ip_dn_io_ptr;
+#define DUMMYNET_LOADED (ip_dn_io_ptr != NULL)
+#endif
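+
+/*
+ * Illustrative call pattern (a sketch, not part of this commit): callers
+ * such as ip_input()/ip_output() are expected to test DUMMYNET_LOADED
+ * before dispatching a packet through the hook, e.g.
+ *
+ *	if (DUMMYNET_LOADED)
+ *		(void)(*ip_dn_io_ptr)(m, pipe_nr, dir, fwa);
+ *	else
+ *		m_freem(m);
+ *
+ * so that the dummynet module can be loaded and unloaded at run time
+ * without leaving dangling references.
+ */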
+
+#endif /* _IP_DUMMYNET_H */
diff --git a/sys/netinet/ip_ecn.c b/sys/netinet/ip_ecn.c
new file mode 100644
index 0000000..de3d38e
--- /dev/null
+++ b/sys/netinet/ip_ecn.c
@@ -0,0 +1,142 @@
+/* $FreeBSD$ */
+/* $KAME: ip_ecn.c,v 1.11 2001/05/03 16:09:29 itojun Exp $ */
+
+/*
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * ECN consideration on tunnel ingress/egress operation.
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/errno.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#include <netinet/ip_ecn.h>
+#ifdef INET6
+#include <netinet6/ip6_ecn.h>
+#endif
+
+/*
+ * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation).
+ */
+void
+ip_ecn_ingress(mode, outer, inner)
+ int mode;
+ u_int8_t *outer;
+ const u_int8_t *inner;
+{
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip_ecn_ingress");
+
+ *outer = *inner;
+ switch (mode) {
+ case ECN_ALLOWED: /* ECN allowed */
+ *outer &= ~IPTOS_CE;
+ break;
+ case ECN_FORBIDDEN: /* ECN forbidden */
+ *outer &= ~(IPTOS_ECT | IPTOS_CE);
+ break;
+ case ECN_NOCARE: /* no consideration to ECN */
+ break;
+ }
+}
+
+/*
+ * modify inner ECN (TOS) field on egress operation (tunnel decapsulation).
+ */
+void
+ip_ecn_egress(mode, outer, inner)
+ int mode;
+ const u_int8_t *outer;
+ u_int8_t *inner;
+{
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip_ecn_egress");
+
+ switch (mode) {
+ case ECN_ALLOWED:
+ if (*outer & IPTOS_CE)
+ *inner |= IPTOS_CE;
+ break;
+ case ECN_FORBIDDEN: /* ECN forbidden */
+ case ECN_NOCARE: /* no consideration to ECN */
+ break;
+ }
+}
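+
+/*
+ * Usage sketch (illustrative only, not part of this file): a tunnel
+ * encapsulator builds the outer TOS from the inner one on the way in and
+ * folds congestion marks back on the way out, roughly
+ *
+ *	u_int8_t otos;
+ *
+ *	ip_ecn_ingress(ECN_ALLOWED, &otos, &inner_ip->ip_tos);
+ *	outer_ip->ip_tos = otos;
+ *	...
+ *	ip_ecn_egress(ECN_ALLOWED, &outer_ip->ip_tos, &inner_ip->ip_tos);
+ *
+ * where inner_ip and outer_ip are hypothetical pointers to the inner and
+ * outer IPv4 headers kept by the tunnel code.
+ */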
+
+#ifdef INET6
+void
+ip6_ecn_ingress(mode, outer, inner)
+ int mode;
+ u_int32_t *outer;
+ const u_int32_t *inner;
+{
+ u_int8_t outer8, inner8;
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip6_ecn_ingress");
+
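+	/*
+	 * The IPv6 traffic class occupies bits 20-27 of the first 32-bit
+	 * word (version:4, class:8, flow label:20), hence the shift by 20
+	 * below to extract and re-insert it.
+	 */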
+ outer8 = (ntohl(*outer) >> 20) & 0xff;
+ inner8 = (ntohl(*inner) >> 20) & 0xff;
+ ip_ecn_ingress(mode, &outer8, &inner8);
+ *outer &= ~htonl(0xff << 20);
+ *outer |= htonl((u_int32_t)outer8 << 20);
+}
+
+void
+ip6_ecn_egress(mode, outer, inner)
+ int mode;
+ const u_int32_t *outer;
+ u_int32_t *inner;
+{
+ u_int8_t outer8, inner8;
+
+ if (!outer || !inner)
+ panic("NULL pointer passed to ip6_ecn_egress");
+
+ outer8 = (ntohl(*outer) >> 20) & 0xff;
+ inner8 = (ntohl(*inner) >> 20) & 0xff;
+ ip_ecn_egress(mode, &outer8, &inner8);
+ *inner &= ~htonl(0xff << 20);
+ *inner |= htonl((u_int32_t)inner8 << 20);
+}
+#endif
diff --git a/sys/netinet/ip_ecn.h b/sys/netinet/ip_ecn.h
new file mode 100644
index 0000000..1a38a48
--- /dev/null
+++ b/sys/netinet/ip_ecn.h
@@ -0,0 +1,49 @@
+/* $FreeBSD$ */
+/* $KAME: ip_ecn.h,v 1.6 2001/05/03 14:51:48 itojun Exp $ */
+
+/*
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * ECN consideration on tunnel ingress/egress operation.
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt
+ */
+
+#if defined(_KERNEL) && !defined(_LKM)
+#include "opt_inet.h"
+#endif
+
+#define ECN_ALLOWED 1 /* ECN allowed */
+#define ECN_FORBIDDEN 0 /* ECN forbidden */
+#define ECN_NOCARE (-1) /* no consideration to ECN */
+
+#ifdef _KERNEL
+extern void ip_ecn_ingress(int, u_int8_t *, const u_int8_t *);
+extern void ip_ecn_egress(int, const u_int8_t *, u_int8_t *);
+#endif
diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c
new file mode 100644
index 0000000..e12f50a
--- /dev/null
+++ b/sys/netinet/ip_encap.c
@@ -0,0 +1,522 @@
+/* $FreeBSD$ */
+/* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * My grandfather said that there's a devil inside tunnelling technology...
+ *
+ * We have surprisingly many protocols that want packets with IP protocol
+ * #4 or #41. Here's a list of protocols that want protocol #41:
+ * RFC1933 configured tunnel
+ * RFC1933 automatic tunnel
+ * RFC2401 IPsec tunnel
+ * RFC2473 IPv6 generic packet tunnelling
+ * RFC2529 6over4 tunnel
+ * mobile-ip6 (uses RFC2473)
+ * RFC3056 6to4 tunnel
+ * isatap tunnel
+ * Here's a list of protocols that want protocol #4:
+ * RFC1853 IPv4-in-IPv4 tunnelling
+ * RFC2003 IPv4 encapsulation within IPv4
+ * RFC2344 reverse tunnelling for mobile-ip4
+ * RFC2401 IPsec tunnel
+ * Well, what can I say. They each impose a different en/decapsulation
+ * mechanism, so they need separate protocol handlers. The only one
+ * we can easily determine by protocol # is IPsec, which always has
+ * AH/ESP/IPComp header right after outer IP header.
+ *
+ * So, clearly, good old protosw does not work for protocols #4 and #41.
+ * The code here lets you match a protocol via the src/dst address pair.
+ */
+/* XXX is M_NETADDR correct? */
+
+#include "opt_mrouting.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/errno.h>
+#include <sys/protosw.h>
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_encap.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/ip6protosw.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#include <net/net_osdep.h>
+
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+
+static void encap_add(struct encaptab *);
+static int mask_match(const struct encaptab *, const struct sockaddr *,
+ const struct sockaddr *);
+static void encap_fillarg(struct mbuf *, const struct encaptab *);
+
+#ifndef LIST_HEAD_INITIALIZER
+/* rely upon BSS initialization */
+LIST_HEAD(, encaptab) encaptab;
+#else
+LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(&encaptab);
+#endif
+
+void
+encap_init()
+{
+ static int initialized = 0;
+
+ if (initialized)
+ return;
+ initialized++;
+#if 0
+ /*
+ * we cannot use LIST_INIT() here, since drivers may want to call
+ * encap_attach(), on driver attach. encap_init() will be called
+	 * encap_attach() at driver attach time. encap_init() will be called
+	 * on AF_INET{,6} initialization, which happens after driver
+	 * initialization - using LIST_INIT() here could wipe out the
+	 * entries that drivers have already registered via encap_attach().
+ LIST_INIT(&encaptab);
+#endif
+}
+
+#ifdef INET
+void
+encap4_input(m, off)
+ struct mbuf *m;
+ int off;
+{
+ struct ip *ip;
+ int proto;
+ struct sockaddr_in s, d;
+ const struct protosw *psw;
+ struct encaptab *ep, *match;
+ int prio, matchprio;
+
+ ip = mtod(m, struct ip *);
+ proto = ip->ip_p;
+
+ bzero(&s, sizeof(s));
+ s.sin_family = AF_INET;
+ s.sin_len = sizeof(struct sockaddr_in);
+ s.sin_addr = ip->ip_src;
+ bzero(&d, sizeof(d));
+ d.sin_family = AF_INET;
+ d.sin_len = sizeof(struct sockaddr_in);
+ d.sin_addr = ip->ip_dst;
+
+ match = NULL;
+ matchprio = 0;
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != AF_INET)
+ continue;
+ if (ep->proto >= 0 && ep->proto != proto)
+ continue;
+ if (ep->func)
+ prio = (*ep->func)(m, off, proto, ep->arg);
+ else {
+ /*
+			 * this is inbound traffic, so we need to match in
+			 * reverse order
+ */
+ prio = mask_match(ep, (struct sockaddr *)&d,
+ (struct sockaddr *)&s);
+ }
+
+ /*
+		 * We prioritize the matches by using the bit length of the
+		 * match. mask_match() and the user-supplied matching function
+		 * should return the bit length of the match (for example,
+		 * if both src and dst are matched for IPv4, 64 should be
+		 * returned). A zero or negative return value means "it did
+		 * not match".
+		 *
+		 * The question is: since we have two "mask" portions, we
+		 * cannot really define a total order between entries.
+		 * For example, mask_match() returns 48 (32 + 16) for both of
+		 * the following, so which should be preferred?
+		 *	src=3ffe::/16, dst=3ffe:501::/32
+		 *	src=3ffe:501::/32, dst=3ffe::/16
+ *
+ * We need to loop through all the possible candidates
+ * to get the best match - the search takes O(n) for
+ * n attachments (i.e. interfaces).
+ */
+ if (prio <= 0)
+ continue;
+ if (prio > matchprio) {
+ matchprio = prio;
+ match = ep;
+ }
+ }
+
+ if (match) {
+ /* found a match, "match" has the best one */
+ psw = match->psw;
+ if (psw && psw->pr_input) {
+ encap_fillarg(m, match);
+ (*psw->pr_input)(m, off);
+ } else
+ m_freem(m);
+ return;
+ }
+
+ /* last resort: inject to raw socket */
+ rip_input(m, off);
+}
+#endif
+
+#ifdef INET6
+int
+encap6_input(mp, offp, proto)
+ struct mbuf **mp;
+ int *offp;
+ int proto;
+{
+ struct mbuf *m = *mp;
+ struct ip6_hdr *ip6;
+ struct sockaddr_in6 s, d;
+ const struct ip6protosw *psw;
+ struct encaptab *ep, *match;
+ int prio, matchprio;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+
+ bzero(&s, sizeof(s));
+ s.sin6_family = AF_INET6;
+ s.sin6_len = sizeof(struct sockaddr_in6);
+ s.sin6_addr = ip6->ip6_src;
+ bzero(&d, sizeof(d));
+ d.sin6_family = AF_INET6;
+ d.sin6_len = sizeof(struct sockaddr_in6);
+ d.sin6_addr = ip6->ip6_dst;
+
+ match = NULL;
+ matchprio = 0;
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != AF_INET6)
+ continue;
+ if (ep->proto >= 0 && ep->proto != proto)
+ continue;
+ if (ep->func)
+ prio = (*ep->func)(m, *offp, proto, ep->arg);
+ else {
+ /*
+			 * this is inbound traffic, so we need to match in
+			 * reverse order
+ */
+ prio = mask_match(ep, (struct sockaddr *)&d,
+ (struct sockaddr *)&s);
+ }
+
+ /* see encap4_input() for issues here */
+ if (prio <= 0)
+ continue;
+ if (prio > matchprio) {
+ matchprio = prio;
+ match = ep;
+ }
+ }
+
+ if (match) {
+ /* found a match */
+ psw = (const struct ip6protosw *)match->psw;
+ if (psw && psw->pr_input) {
+ encap_fillarg(m, match);
+ return (*psw->pr_input)(mp, offp, proto);
+ } else {
+ m_freem(m);
+ return IPPROTO_DONE;
+ }
+ }
+
+ /* last resort: inject to raw socket */
+ return rip6_input(mp, offp, proto);
+}
+#endif
+
+static void
+encap_add(ep)
+ struct encaptab *ep;
+{
+
+ LIST_INSERT_HEAD(&encaptab, ep, chain);
+}
+
+/*
+ * sp (src ptr) is always my side, and dp (dst ptr) is always remote side.
+ * length of mask (sm and dm) is assumed to be same as sp/dp.
+ * Return value will be necessary as input (cookie) for encap_detach().
+ */
+const struct encaptab *
+encap_attach(af, proto, sp, sm, dp, dm, psw, arg)
+ int af;
+ int proto;
+ const struct sockaddr *sp, *sm;
+ const struct sockaddr *dp, *dm;
+ const struct protosw *psw;
+ void *arg;
+{
+ struct encaptab *ep;
+ int error;
+ int s;
+
+ s = splnet();
+ /* sanity check on args */
+ if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) {
+ error = EINVAL;
+ goto fail;
+ }
+ if (sp->sa_len != dp->sa_len) {
+ error = EINVAL;
+ goto fail;
+ }
+ if (af != sp->sa_family || af != dp->sa_family) {
+ error = EINVAL;
+ goto fail;
+ }
+
+	/* check if anyone has already attached with exactly the same config */
+ LIST_FOREACH(ep, &encaptab, chain) {
+ if (ep->af != af)
+ continue;
+ if (ep->proto != proto)
+ continue;
+ if (ep->src.ss_len != sp->sa_len ||
+ bcmp(&ep->src, sp, sp->sa_len) != 0 ||
+ bcmp(&ep->srcmask, sm, sp->sa_len) != 0)
+ continue;
+ if (ep->dst.ss_len != dp->sa_len ||
+ bcmp(&ep->dst, dp, dp->sa_len) != 0 ||
+ bcmp(&ep->dstmask, dm, dp->sa_len) != 0)
+ continue;
+
+ error = EEXIST;
+ goto fail;
+ }
+
+ ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/
+ if (ep == NULL) {
+ error = ENOBUFS;
+ goto fail;
+ }
+ bzero(ep, sizeof(*ep));
+
+ ep->af = af;
+ ep->proto = proto;
+ bcopy(sp, &ep->src, sp->sa_len);
+ bcopy(sm, &ep->srcmask, sp->sa_len);
+ bcopy(dp, &ep->dst, dp->sa_len);
+ bcopy(dm, &ep->dstmask, dp->sa_len);
+ ep->psw = psw;
+ ep->arg = arg;
+
+ encap_add(ep);
+
+ error = 0;
+ splx(s);
+ return ep;
+
+fail:
+ splx(s);
+ return NULL;
+}
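+
+/*
+ * Illustrative caller sketch (not part of this commit): a tunnel driver
+ * would typically register itself with something along the lines of
+ *
+ *	const struct encaptab *tag;
+ *
+ *	tag = encap_attach(AF_INET, IPPROTO_IPV4,
+ *	    (struct sockaddr *)&src, (struct sockaddr *)&srcmask,
+ *	    (struct sockaddr *)&dst, (struct sockaddr *)&dstmask,
+ *	    &my_protosw, softc);
+ *	if (tag == NULL)
+ *		return (EINVAL);
+ *	...
+ *	encap_detach(tag);
+ *
+ * where src/dst/srcmask/dstmask, my_protosw and softc are placeholders
+ * for whatever the driver has on hand; the returned cookie is the only
+ * handle needed to undo the registration.
+ */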
+
+const struct encaptab *
+encap_attach_func(af, proto, func, psw, arg)
+ int af;
+ int proto;
+ int (*func)(const struct mbuf *, int, int, void *);
+ const struct protosw *psw;
+ void *arg;
+{
+ struct encaptab *ep;
+ int error;
+ int s;
+
+ s = splnet();
+ /* sanity check on args */
+ if (!func) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/
+ if (ep == NULL) {
+ error = ENOBUFS;
+ goto fail;
+ }
+ bzero(ep, sizeof(*ep));
+
+ ep->af = af;
+ ep->proto = proto;
+ ep->func = func;
+ ep->psw = psw;
+ ep->arg = arg;
+
+ encap_add(ep);
+
+ error = 0;
+ splx(s);
+ return ep;
+
+fail:
+ splx(s);
+ return NULL;
+}
+
+int
+encap_detach(cookie)
+ const struct encaptab *cookie;
+{
+ const struct encaptab *ep = cookie;
+ struct encaptab *p;
+
+ LIST_FOREACH(p, &encaptab, chain) {
+ if (p == ep) {
+ LIST_REMOVE(p, chain);
+ free(p, M_NETADDR); /*XXX*/
+ return 0;
+ }
+ }
+
+ return EINVAL;
+}
+
+static int
+mask_match(ep, sp, dp)
+ const struct encaptab *ep;
+ const struct sockaddr *sp;
+ const struct sockaddr *dp;
+{
+ struct sockaddr_storage s;
+ struct sockaddr_storage d;
+ int i;
+ const u_int8_t *p, *q;
+ u_int8_t *r;
+ int matchlen;
+
+ if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d))
+ return 0;
+ if (sp->sa_family != ep->af || dp->sa_family != ep->af)
+ return 0;
+ if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len)
+ return 0;
+
+ matchlen = 0;
+
+ p = (const u_int8_t *)sp;
+ q = (const u_int8_t *)&ep->srcmask;
+ r = (u_int8_t *)&s;
+ for (i = 0 ; i < sp->sa_len; i++) {
+ r[i] = p[i] & q[i];
+ /* XXX estimate */
+ matchlen += (q[i] ? 8 : 0);
+ }
+
+ p = (const u_int8_t *)dp;
+ q = (const u_int8_t *)&ep->dstmask;
+ r = (u_int8_t *)&d;
+ for (i = 0 ; i < dp->sa_len; i++) {
+ r[i] = p[i] & q[i];
+ /* XXX rough estimate */
+ matchlen += (q[i] ? 8 : 0);
+ }
+
+ /* need to overwrite len/family portion as we don't compare them */
+ s.ss_len = sp->sa_len;
+ s.ss_family = sp->sa_family;
+ d.ss_len = dp->sa_len;
+ d.ss_family = dp->sa_family;
+
+ if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 &&
+ bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) {
+ return matchlen;
+ } else
+ return 0;
+}
+
+static void
+encap_fillarg(m, ep)
+ struct mbuf *m;
+ const struct encaptab *ep;
+{
+#if 0
+ m->m_pkthdr.aux = ep->arg;
+#else
+ struct mbuf *n;
+
+ n = m_aux_add(m, AF_INET, IPPROTO_IPV4);
+ if (n) {
+ *mtod(n, void **) = ep->arg;
+ n->m_len = sizeof(void *);
+ }
+#endif
+}
+
+void *
+encap_getarg(m)
+ struct mbuf *m;
+{
+ void *p;
+#if 0
+ p = m->m_pkthdr.aux;
+ m->m_pkthdr.aux = NULL;
+ return p;
+#else
+ struct mbuf *n;
+
+ p = NULL;
+ n = m_aux_find(m, AF_INET, IPPROTO_IPV4);
+ if (n) {
+ if (n->m_len == sizeof(void *))
+ p = *mtod(n, void **);
+ m_aux_delete(m, n);
+ }
+ return p;
+#endif
+}
diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h
new file mode 100644
index 0000000..518f502
--- /dev/null
+++ b/sys/netinet/ip_encap.h
@@ -0,0 +1,64 @@
+/* $FreeBSD$ */
+/* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */
+
+/*
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETINET_IP_ENCAP_H_
+#define _NETINET_IP_ENCAP_H_
+
+#ifdef _KERNEL
+
+struct encaptab {
+ LIST_ENTRY(encaptab) chain;
+ int af;
+ int proto; /* -1: don't care, I'll check myself */
+ struct sockaddr_storage src; /* my addr */
+ struct sockaddr_storage srcmask;
+ struct sockaddr_storage dst; /* remote addr */
+ struct sockaddr_storage dstmask;
+ int (*func)(const struct mbuf *, int, int, void *);
+ const struct protosw *psw; /* only pr_input will be used */
+ void *arg; /* passed via m->m_pkthdr.aux */
+};
+
+void encap_init(void);
+void encap4_input(struct mbuf *, int);
+int encap6_input(struct mbuf **, int *, int);
+const struct encaptab *encap_attach(int, int, const struct sockaddr *,
+ const struct sockaddr *, const struct sockaddr *,
+ const struct sockaddr *, const struct protosw *, void *);
+const struct encaptab *encap_attach_func(int, int,
+ int (*)(const struct mbuf *, int, int, void *),
+ const struct protosw *, void *);
+int encap_detach(const struct encaptab *);
+void *encap_getarg(struct mbuf *);
+#endif
+
+#endif /*_NETINET_IP_ENCAP_H_*/
diff --git a/sys/netinet/ip_flow.c b/sys/netinet/ip_flow.c
new file mode 100644
index 0000000..a23f2bb
--- /dev/null
+++ b/sys/netinet/ip_flow.c
@@ -0,0 +1,333 @@
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_flow.h>
+
+#define IPFLOW_TIMER (5 * PR_SLOWHZ)
+#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */
+#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS)
+static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE];
+static int ipflow_inuse;
+#define IPFLOW_MAX 256
+
+static int ipflow_active = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW,
+ &ipflow_active, 0, "Enable flow-based IP forwarding");
+
+static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow");
+
+static unsigned
+ipflow_hash(
+ struct in_addr dst,
+ struct in_addr src,
+ unsigned tos)
+{
+ unsigned hash = tos;
+ int idx;
+ for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS)
+ hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx);
+ return hash & (IPFLOW_HASHSIZE-1);
+}
+
+static struct ipflow *
+ipflow_lookup(
+ const struct ip *ip)
+{
+ unsigned hash;
+ struct ipflow *ipf;
+
+ hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
+
+ ipf = LIST_FIRST(&ipflows[hash]);
+ while (ipf != NULL) {
+ if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr
+ && ip->ip_src.s_addr == ipf->ipf_src.s_addr
+ && ip->ip_tos == ipf->ipf_tos)
+ break;
+ ipf = LIST_NEXT(ipf, ipf_next);
+ }
+ return ipf;
+}
+
+int
+ipflow_fastforward(
+ struct mbuf *m)
+{
+ struct ip *ip;
+ struct ipflow *ipf;
+ struct rtentry *rt;
+ struct sockaddr *dst;
+ int error;
+
+ /*
+ * Are we forwarding packets? Big enough for an IP packet?
+ */
+ if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip))
+ return 0;
+ /*
+	 * IP header with no options and a valid version and length
+ */
+ ip = mtod(m, struct ip *);
+ if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2)
+ || ntohs(ip->ip_len) > m->m_pkthdr.len)
+ return 0;
+ /*
+ * Find a flow.
+ */
+ if ((ipf = ipflow_lookup(ip)) == NULL)
+ return 0;
+
+ /*
+ * Route and interface still up?
+ */
+ rt = ipf->ipf_ro.ro_rt;
+ if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0)
+ return 0;
+
+ /*
+ * Packet size OK? TTL?
+ */
+ if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
+ return 0;
+
+ /*
+ * Everything checks out and so we can forward this packet.
+ * Modify the TTL and incrementally change the checksum.
+ */
+ ip->ip_ttl -= IPTTLDEC;
+ if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) {
+ ip->ip_sum += htons(IPTTLDEC << 8) + 1;
+ } else {
+ ip->ip_sum += htons(IPTTLDEC << 8);
+ }
+
+ /*
+ * Send the packet on its way. All we can get back is ENOBUFS
+ */
+ ipf->ipf_uses++;
+ ipf->ipf_timer = IPFLOW_TIMER;
+
+ if (rt->rt_flags & RTF_GATEWAY)
+ dst = rt->rt_gateway;
+ else
+ dst = &ipf->ipf_ro.ro_dst;
+ if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) {
+ if (error == ENOBUFS)
+ ipf->ipf_dropped++;
+ else
+ ipf->ipf_errors++;
+ }
+ return 1;
+}
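+
+/*
+ * Note on the checksum update above (explanatory, not in the original):
+ * decrementing the TTL by IPTTLDEC lowers the high-order byte of the
+ * 16-bit TTL/protocol word, so the one's-complement checksum is patched
+ * incrementally by adding htons(IPTTLDEC << 8), with an extra +1 for the
+ * end-around carry when the addition wraps (the `>=' test on ip_sum).
+ * A full recomputation such as
+ *
+ *	ip->ip_sum = 0;
+ *	ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
+ *
+ * would give the same result but touches the entire header (cf. RFC 1624
+ * on incremental checksum updates).
+ */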
+
+static void
+ipflow_addstats(
+ struct ipflow *ipf)
+{
+ ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
+ ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped;
+ ipstat.ips_forward += ipf->ipf_uses;
+ ipstat.ips_fastforward += ipf->ipf_uses;
+}
+
+static void
+ipflow_free(
+ struct ipflow *ipf)
+{
+ int s;
+ /*
+ * Remove the flow from the hash table (at elevated IPL).
+ * Once it's off the list, we can deal with it at normal
+ * network IPL.
+ */
+ s = splimp();
+ LIST_REMOVE(ipf, ipf_next);
+ splx(s);
+ ipflow_addstats(ipf);
+ RTFREE(ipf->ipf_ro.ro_rt);
+ ipflow_inuse--;
+ free(ipf, M_IPFLOW);
+}
+
+static struct ipflow *
+ipflow_reap(
+ void)
+{
+ struct ipflow *ipf, *maybe_ipf = NULL;
+ int idx;
+ int s;
+
+ for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) {
+ ipf = LIST_FIRST(&ipflows[idx]);
+ while (ipf != NULL) {
+ /*
+ * If this no longer points to a valid route
+ * reclaim it.
+ */
+ if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0)
+ goto done;
+ /*
+ * choose the one that's been least recently used
+ * or has had the least uses in the last 1.5
+ * intervals.
+ */
+ if (maybe_ipf == NULL
+ || ipf->ipf_timer < maybe_ipf->ipf_timer
+ || (ipf->ipf_timer == maybe_ipf->ipf_timer
+ && ipf->ipf_last_uses + ipf->ipf_uses <
+ maybe_ipf->ipf_last_uses +
+ maybe_ipf->ipf_uses))
+ maybe_ipf = ipf;
+ ipf = LIST_NEXT(ipf, ipf_next);
+ }
+ }
+ ipf = maybe_ipf;
+ done:
+ /*
+ * Remove the entry from the flow table.
+ */
+ s = splimp();
+ LIST_REMOVE(ipf, ipf_next);
+ splx(s);
+ ipflow_addstats(ipf);
+ RTFREE(ipf->ipf_ro.ro_rt);
+ return ipf;
+}
+
+void
+ipflow_slowtimo(
+ void)
+{
+ struct ipflow *ipf;
+ int idx;
+
+ for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) {
+ ipf = LIST_FIRST(&ipflows[idx]);
+ while (ipf != NULL) {
+ struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next);
+ if (--ipf->ipf_timer == 0) {
+ ipflow_free(ipf);
+ } else {
+ ipf->ipf_last_uses = ipf->ipf_uses;
+ ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses;
+ ipstat.ips_forward += ipf->ipf_uses;
+ ipstat.ips_fastforward += ipf->ipf_uses;
+ ipf->ipf_uses = 0;
+ }
+ ipf = next_ipf;
+ }
+ }
+}
+
+void
+ipflow_create(
+ const struct route *ro,
+ struct mbuf *m)
+{
+ const struct ip *const ip = mtod(m, struct ip *);
+ struct ipflow *ipf;
+ unsigned hash;
+ int s;
+
+ /*
+ * Don't create cache entries for ICMP messages.
+ */
+ if (!ipflow_active || ip->ip_p == IPPROTO_ICMP)
+ return;
+ /*
+	 * See if an existing flow struct exists. If so, remove it from its
+ * list and free the old route. If not, try to malloc a new one
+ * (if we aren't at our limit).
+ */
+ ipf = ipflow_lookup(ip);
+ if (ipf == NULL) {
+ if (ipflow_inuse == IPFLOW_MAX) {
+ ipf = ipflow_reap();
+ } else {
+ ipf = (struct ipflow *) malloc(sizeof(*ipf), M_IPFLOW,
+ M_NOWAIT);
+ if (ipf == NULL)
+ return;
+ ipflow_inuse++;
+ }
+ bzero((caddr_t) ipf, sizeof(*ipf));
+ } else {
+ s = splimp();
+ LIST_REMOVE(ipf, ipf_next);
+ splx(s);
+ ipflow_addstats(ipf);
+ RTFREE(ipf->ipf_ro.ro_rt);
+ ipf->ipf_uses = ipf->ipf_last_uses = 0;
+ ipf->ipf_errors = ipf->ipf_dropped = 0;
+ }
+
+ /*
+ * Fill in the updated information.
+ */
+ ipf->ipf_ro = *ro;
+ ro->ro_rt->rt_refcnt++;
+ ipf->ipf_dst = ip->ip_dst;
+ ipf->ipf_src = ip->ip_src;
+ ipf->ipf_tos = ip->ip_tos;
+ ipf->ipf_timer = IPFLOW_TIMER;
+ /*
+	 * Insert into the appropriate bucket of the flow table.
+ */
+ hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos);
+ s = splimp();
+ LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next);
+ splx(s);
+}
diff --git a/sys/netinet/ip_flow.h b/sys/netinet/ip_flow.h
new file mode 100644
index 0000000..4675996
--- /dev/null
+++ b/sys/netinet/ip_flow.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_FLOW_H
+#define _NETINET_IP_FLOW_H
+
+struct ipflow {
+ LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */
+ struct in_addr ipf_dst; /* destination address */
+ struct in_addr ipf_src; /* source address */
+
+ u_int8_t ipf_tos; /* type-of-service */
+ struct route ipf_ro; /* associated route entry */
+ u_long ipf_uses; /* number of uses in this period */
+
+ int ipf_timer; /* remaining lifetime of this entry */
+ u_long ipf_dropped; /* ENOBUFS returned by if_output */
+ u_long ipf_errors; /* other errors returned by if_output */
+ u_long ipf_last_uses; /* number of uses in last period */
+};
+
+#endif
diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c
new file mode 100644
index 0000000..d7ccad7
--- /dev/null
+++ b/sys/netinet/ip_fw.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ * Copyright (c) 1996 Alex Nash
+ * Copyright (c) 2000-2002 Luigi Rizzo
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * $FreeBSD$
+ */
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * Implement IP packet firewall
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_ipdivert.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#include <netinet/if_ether.h> /* XXX ethertype_ip */
+
+static int fw_debug = 1;
+#ifdef IPFIREWALL_VERBOSE
+static int fw_verbose = 1;
+#else
+static int fw_verbose = 0;
+#endif
+int fw_one_pass = 1 ;
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#else
+static int fw_verbose_limit = 0;
+#endif
+static int fw_permanent_rules = 0;
+
+/*
+ * Right now, two fields in the IP header are changed to host format
+ * by the IP layer before calling the firewall. Ideally, we would like
+ * to have them in network format so that the packet can be
+ * used as it comes from the device driver (and is thus readonly).
+ */
+
+static u_int64_t counter; /* counter for ipfw_report(NULL...) */
+
+#define IPFW_DEFAULT_RULE ((u_int)(u_short)~0)
+
+LIST_HEAD (ip_fw_head, ip_fw) ip_fw_chain_head;
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
+
+#ifdef SYSCTL_NODE
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW,
+ &fw_enable, 0, "Enable ipfw");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
+ &fw_one_pass, 0,
+ "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
+ &fw_debug, 0, "Enable printing of debug ip_fw statements");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
+ &fw_verbose, 0, "Log matches to ipfw rules");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
+ &fw_verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, permanent_rules, CTLFLAG_RW,
+ &fw_permanent_rules, 0, "Set rule number, below which rules are permanent");
+
+/*
+ * Extension for stateful ipfw.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets; the new size
+ * takes effect when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, it is first hashed, then matched
+ * against the entries in the corresponding list.
+ * Matching occurs according to the rule type. The default is to
+ * match the four fields and the protocol, and rules are bidirectional.
+ *
+ * For a busy proxy/web server we will have lots of connections to
+ * the server. We could decide for a rule type where we ignore
+ * ports (different hashing) and avoid special SYN/RST/FIN handling.
+ *
+ * XXX when we decide to support more than one rule type, we should
+ * repeat the hashing multiple times using only the useful fields.
+ * Or, we could run the various tests in parallel, because the
+ * 'move to front' technique should shorten the average search.
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules, we stop creating new ones. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted.
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall.
+ * XXX check the latter!!!
+ */
+static struct ipfw_dyn_rule **ipfw_dyn_v = NULL ;
+static u_int32_t dyn_buckets = 256 ; /* must be power of 2 */
+static u_int32_t curr_dyn_buckets = 256 ; /* must be power of 2 */
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+static u_int32_t dyn_ack_lifetime = 300 ;
+static u_int32_t dyn_syn_lifetime = 20 ;
+static u_int32_t dyn_fin_lifetime = 1 ;
+static u_int32_t dyn_rst_lifetime = 1 ;
+static u_int32_t dyn_udp_lifetime = 10 ;
+static u_int32_t dyn_short_lifetime = 5 ;
+
+/*
+ * after reaching 0, dynamic rules are considered still valid for
+ * an additional grace time, unless there is a shortage of resources.
+ */
+static u_int32_t dyn_grace_time = 10 ;
+
+static u_int32_t static_count = 0 ; /* # of static rules */
+static u_int32_t dyn_count = 0 ; /* # of dynamic rules */
+static u_int32_t dyn_max = 1000 ; /* max # of dynamic rules */
+
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
+ &dyn_buckets, 0, "Number of dyn. buckets");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
+ &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
+ &dyn_count, 0, "Number of dyn. rules");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
+ &dyn_max, 0, "Max number of dyn. rules");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
+ &static_count, 0, "Number of static rules");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
+ &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
+ &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
+ &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
+ &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
+ &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
+ &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_grace_time, CTLFLAG_RD,
+ &dyn_grace_time, 0, "Grace time for dyn. rules");
+
+#endif /* SYSCTL_NODE */
+
+#define dprintf(a) do { \
+ if (fw_debug) \
+ printf a; \
+ } while (0)
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+
+static int add_entry (struct ip_fw_head *chainptr, struct ip_fw *frwl);
+static int del_entry (struct ip_fw_head *chainptr, u_short number);
+static int zero_entry (struct ip_fw *, int);
+static int check_ipfw_struct (struct ip_fw *m);
+static int iface_match (struct ifnet *ifp, union ip_fw_if *ifu,
+ int byname);
+static int ipopts_match (struct ip *ip, struct ip_fw *f);
+static int iptos_match (struct ip *ip, struct ip_fw *f);
+static __inline int
+ port_match (u_short *portptr, int nports, u_short port,
+ int range_flag, int mask);
+static int tcpflg_match (struct tcphdr *tcp, struct ip_fw *f);
+static int icmptype_match (struct icmp * icmp, struct ip_fw * f);
+static void ipfw_report (struct ip_fw *f, struct ip *ip, int ip_off,
+ int ip_len, struct ifnet *rif,
+ struct ifnet *oif);
+
+static void flush_rule_ptrs(void);
+
+static ip_fw_chk_t ip_fw_chk;
+static int ip_fw_ctl (struct sockopt *sopt);
+
+ip_dn_ruledel_t *ip_dn_ruledel_ptr = NULL;
+
+static char err_prefix[] = "ip_fw_ctl:";
+
+/*
+ * Returns 1 if the port is matched by the vector, 0 otherwise
+ */
+static __inline int
+port_match(u_short *portptr, int nports, u_short port, int range_flag, int mask)
+{
+ if (!nports)
+ return 1;
+ if (mask) {
+ if ( 0 == ((portptr[0] ^ port) & portptr[1]) )
+ return 1;
+ nports -= 2;
+ portptr += 2;
+ }
+ if (range_flag) {
+ if (portptr[0] <= port && port <= portptr[1])
+ return 1;
+ nports -= 2;
+ portptr += 2;
+ }
+ while (nports-- > 0)
+ if (*portptr++ == port)
+ return 1;
+ return 0;
+}
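+
+/*
+ * Illustrative reading of the port vector (not in the original): with
+ * both mask and range_flag set and nports == 6, the vector is laid out
+ * as { value, mask, low, high, p1, p2 }, so
+ *
+ *	port_match(ports, 6, 80, 1, 1)
+ *
+ * succeeds if ((80 ^ ports[0]) & ports[1]) == 0, or if
+ * ports[2] <= 80 <= ports[3], or if 80 equals ports[4] or ports[5].
+ */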
+
+static int
+tcpflg_match(struct tcphdr *tcp, struct ip_fw *f)
+{
+ u_char flg_set, flg_clr;
+
+ /*
+ * If an established connection is required, reject packets that
+	 * have only SYN set out of RST|ACK|SYN. Otherwise, fall through to
+ * other flag requirements.
+ */
+ if ((f->fw_ipflg & IP_FW_IF_TCPEST) &&
+ ((tcp->th_flags & (TH_RST | TH_ACK | TH_SYN)) == TH_SYN))
+ return 0;
+
+ flg_set = tcp->th_flags & f->fw_tcpf;
+ flg_clr = tcp->th_flags & f->fw_tcpnf;
+
+ if (flg_set != f->fw_tcpf)
+ return 0;
+ if (flg_clr)
+ return 0;
+
+ return 1;
+}
+
+static int
+icmptype_match(struct icmp *icmp, struct ip_fw *f)
+{
+ int type;
+
+ if (!(f->fw_flg & IP_FW_F_ICMPBIT))
+ return(1);
+
+ type = icmp->icmp_type;
+
+ /* check for matching type in the bitmap */
+ if (type < IP_FW_ICMPTYPES_MAX &&
+ (f->fw_uar.fw_icmptypes[type / (sizeof(unsigned) * NBBY)] &
+ (1U << (type % (sizeof(unsigned) * NBBY)))))
+ return(1);
+
+ return(0); /* no match */
+}
+
+static int
+is_icmp_query(struct ip *ip)
+{
+ const struct icmp *icmp;
+ int icmp_type;
+
+ icmp = (struct icmp *)((u_int32_t *)ip + ip->ip_hl);
+ icmp_type = icmp->icmp_type;
+
+ if (icmp_type == ICMP_ECHO || icmp_type == ICMP_ROUTERSOLICIT ||
+ icmp_type == ICMP_TSTAMP || icmp_type == ICMP_IREQ ||
+ icmp_type == ICMP_MASKREQ)
+ return(1);
+
+ return(0);
+}
+
+static int
+ipopts_match(struct ip *ip, struct ip_fw *f)
+{
+ register u_char *cp;
+ int opt, optlen, cnt;
+ u_char opts, nopts, nopts_sve;
+
+ cp = (u_char *)(ip + 1);
+ cnt = (ip->ip_hl << 2) - sizeof (struct ip);
+ opts = f->fw_ipopt;
+ nopts = nopts_sve = f->fw_ipnopt;
+
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[IPOPT_OLEN];
+ if (optlen <= 0 || optlen > cnt) {
+ return 0; /*XXX*/
+ }
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ case IPOPT_LSRR:
+ opts &= ~IP_FW_IPOPT_LSRR;
+ nopts &= ~IP_FW_IPOPT_LSRR;
+ break;
+
+ case IPOPT_SSRR:
+ opts &= ~IP_FW_IPOPT_SSRR;
+ nopts &= ~IP_FW_IPOPT_SSRR;
+ break;
+
+ case IPOPT_RR:
+ opts &= ~IP_FW_IPOPT_RR;
+ nopts &= ~IP_FW_IPOPT_RR;
+ break;
+ case IPOPT_TS:
+ opts &= ~IP_FW_IPOPT_TS;
+ nopts &= ~IP_FW_IPOPT_TS;
+ break;
+ }
+ if (opts == nopts)
+ break;
+ }
+ if (opts == 0 && nopts == nopts_sve)
+ return 1;
+ else
+ return 0;
+}
+
+static int
+iptos_match(struct ip *ip, struct ip_fw *f)
+{
+
+ u_int flags = (ip->ip_tos & 0x1f);
+ u_char opts, nopts, nopts_sve;
+
+ opts = (f->fw_iptos & 0x1f);
+ nopts = nopts_sve = f->fw_ipntos;
+
+ while (flags != 0) {
+ u_int flag;
+
+ flag = 1 << (ffs(flags) -1);
+ opts &= ~flag;
+ nopts &= ~flag;
+ flags &= ~flag;
+ }
+
+ if (opts == 0 && nopts == nopts_sve)
+ return 1;
+ else
+ return 0;
+
+}
+
+
+static int
+tcpopts_match(struct tcphdr *tcp, struct ip_fw *f)
+{
+ register u_char *cp;
+ int opt, optlen, cnt;
+ u_char opts, nopts, nopts_sve;
+
+ cp = (u_char *)(tcp + 1);
+ cnt = (tcp->th_off << 2) - sizeof (struct tcphdr);
+ opts = f->fw_tcpopt;
+ nopts = nopts_sve = f->fw_tcpnopt;
+
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[1];
+ if (optlen <= 0)
+ break;
+ }
+
+
+ switch (opt) {
+
+ default:
+ break;
+
+ case TCPOPT_MAXSEG:
+ opts &= ~IP_FW_TCPOPT_MSS;
+ nopts &= ~IP_FW_TCPOPT_MSS;
+ break;
+
+ case TCPOPT_WINDOW:
+ opts &= ~IP_FW_TCPOPT_WINDOW;
+ nopts &= ~IP_FW_TCPOPT_WINDOW;
+ break;
+
+ case TCPOPT_SACK_PERMITTED:
+ case TCPOPT_SACK:
+ opts &= ~IP_FW_TCPOPT_SACK;
+ nopts &= ~IP_FW_TCPOPT_SACK;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ opts &= ~IP_FW_TCPOPT_TS;
+ nopts &= ~IP_FW_TCPOPT_TS;
+ break;
+
+ case TCPOPT_CC:
+ case TCPOPT_CCNEW:
+ case TCPOPT_CCECHO:
+ opts &= ~IP_FW_TCPOPT_CC;
+ nopts &= ~IP_FW_TCPOPT_CC;
+ break;
+ }
+ if (opts == nopts)
+ break;
+ }
+ if (opts == 0 && nopts == nopts_sve)
+ return 1;
+ else
+ return 0;
+}
+
+static int
+iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname)
+{
+ /* Check by name or by IP address */
+ if (byname) {
+ /* Check unit number (-1 is wildcard) */
+ if (ifu->fu_via_if.unit != -1
+ && ifp->if_unit != ifu->fu_via_if.unit)
+ return(0);
+ /* Check name */
+ if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN))
+ return(0);
+ return(1);
+ } else if (ifu->fu_via_ip.s_addr != 0) { /* Zero == wildcard */
+ struct ifaddr *ia;
+
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+ if (ia->ifa_addr == NULL)
+ continue;
+ if (ia->ifa_addr->sa_family != AF_INET)
+ continue;
+ if (ifu->fu_via_ip.s_addr != ((struct sockaddr_in *)
+ (ia->ifa_addr))->sin_addr.s_addr)
+ continue;
+ return(1);
+ }
+ return(0);
+ }
+ return(1);
+}
+
+static void
+ipfw_report(struct ip_fw *f, struct ip *ip, int ip_off, int ip_len,
+ struct ifnet *rif, struct ifnet *oif)
+{
+ struct tcphdr *const tcp = (struct tcphdr *) ((u_int32_t *) ip+ ip->ip_hl);
+ struct udphdr *const udp = (struct udphdr *) ((u_int32_t *) ip+ ip->ip_hl);
+ struct icmp *const icmp = (struct icmp *) ((u_int32_t *) ip + ip->ip_hl);
+ u_int64_t count;
+ char *action;
+ char action2[32], proto[47], name[18], fragment[27];
+ int len;
+ int offset = ip_off & IP_OFFMASK;
+
+ count = f ? f->fw_pcnt : ++counter;
+ if ((f == NULL && fw_verbose_limit != 0 && count > fw_verbose_limit) ||
+ (f && f->fw_logamount != 0 && count > f->fw_loghighest))
+ return;
+
+ /* Print command name */
+ snprintf(SNPARGS(name, 0), "ipfw: %d", f ? f->fw_number : -1);
+
+ action = action2;
+ if (!f)
+ action = "Refuse";
+ else {
+ switch (f->fw_flg & IP_FW_F_COMMAND) {
+ case IP_FW_F_DENY:
+ action = "Deny";
+ break;
+ case IP_FW_F_REJECT:
+ if (f->fw_reject_code == IP_FW_REJECT_RST)
+ action = "Reset";
+ else
+ action = "Unreach";
+ break;
+ case IP_FW_F_ACCEPT:
+ action = "Accept";
+ break;
+ case IP_FW_F_COUNT:
+ action = "Count";
+ break;
+#ifdef IPDIVERT
+ case IP_FW_F_DIVERT:
+ snprintf(SNPARGS(action2, 0), "Divert %d",
+ f->fw_divert_port);
+ break;
+ case IP_FW_F_TEE:
+ snprintf(SNPARGS(action2, 0), "Tee %d",
+ f->fw_divert_port);
+ break;
+#endif
+ case IP_FW_F_SKIPTO:
+ snprintf(SNPARGS(action2, 0), "SkipTo %d",
+ f->fw_skipto_rule);
+ break;
+ case IP_FW_F_PIPE:
+ snprintf(SNPARGS(action2, 0), "Pipe %d",
+ f->fw_skipto_rule);
+ break;
+ case IP_FW_F_QUEUE:
+ snprintf(SNPARGS(action2, 0), "Queue %d",
+ f->fw_skipto_rule);
+ break;
+
+ case IP_FW_F_FWD:
+ if (f->fw_fwd_ip.sin_port)
+ snprintf(SNPARGS(action2, 0),
+ "Forward to %s:%d",
+ inet_ntoa(f->fw_fwd_ip.sin_addr),
+ f->fw_fwd_ip.sin_port);
+ else
+ snprintf(SNPARGS(action2, 0), "Forward to %s",
+ inet_ntoa(f->fw_fwd_ip.sin_addr));
+ break;
+
+ default:
+ action = "UNKNOWN";
+ break;
+ }
+ }
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ len = snprintf(SNPARGS(proto, 0), "TCP %s",
+ inet_ntoa(ip->ip_src));
+ if (offset == 0)
+ len += snprintf(SNPARGS(proto, len), ":%d ",
+ ntohs(tcp->th_sport));
+ else
+ len += snprintf(SNPARGS(proto, len), " ");
+ len += snprintf(SNPARGS(proto, len), "%s",
+ inet_ntoa(ip->ip_dst));
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d",
+ ntohs(tcp->th_dport));
+ break;
+ case IPPROTO_UDP:
+ len = snprintf(SNPARGS(proto, 0), "UDP %s",
+ inet_ntoa(ip->ip_src));
+ if (offset == 0)
+ len += snprintf(SNPARGS(proto, len), ":%d ",
+ ntohs(udp->uh_sport));
+ else
+ len += snprintf(SNPARGS(proto, len), " ");
+ len += snprintf(SNPARGS(proto, len), "%s",
+ inet_ntoa(ip->ip_dst));
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d",
+ ntohs(udp->uh_dport));
+ break;
+ case IPPROTO_ICMP:
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ",
+ icmp->icmp_type, icmp->icmp_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMP ");
+ len += snprintf(SNPARGS(proto, len), "%s",
+ inet_ntoa(ip->ip_src));
+ snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst));
+ break;
+ default:
+ len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
+ inet_ntoa(ip->ip_src));
+ snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst));
+ break;
+ }
+
+ if (ip_off & (IP_MF | IP_OFFMASK))
+ snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
+ ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
+ offset << 3,
+ (ip_off & IP_MF) ? "+" : "");
+ else
+ fragment[0] = '\0';
+ if (oif)
+ log(LOG_SECURITY | LOG_INFO, "%s %s %s out via %s%d%s\n",
+ name, action, proto, oif->if_name, oif->if_unit, fragment);
+ else if (rif)
+ log(LOG_SECURITY | LOG_INFO, "%s %s %s in via %s%d%s\n", name,
+ action, proto, rif->if_name, rif->if_unit, fragment);
+ else
+ log(LOG_SECURITY | LOG_INFO, "%s %s %s%s\n", name, action,
+ proto, fragment);
+ if ((f ? f->fw_logamount != 0 : 1) &&
+ count == (f ? f->fw_loghighest : fw_verbose_limit))
+ log(LOG_SECURITY | LOG_NOTICE,
+ "ipfw: limit %d reached on entry %d\n",
+ f ? f->fw_logamount : fw_verbose_limit,
+ f ? f->fw_number : -1);
+}
+
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+ u_int32_t i ;
+
+ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+ i &= (curr_dyn_buckets - 1) ;
+ return i ;
+}
+
+/**
+ * unlink a dynamic rule from a chain. prev is a pointer to
+ * the previous one, q is a pointer to the rule to delete,
+ * head is a pointer to the head of the queue.
+ * Modifies q and potentially also head.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) { \
+ struct ipfw_dyn_rule *old_q = q; \
+ \
+ /* remove a refcount to the parent */ \
+ if (q->dyn_type == DYN_LIMIT) \
+ q->parent->count--; \
+ DEB(printf("-- unlink entry 0x%08x %d -> 0x%08x %d, %d left\n", \
+ (q->id.src_ip), (q->id.src_port), \
+ (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) \
+ if (prev != NULL) \
+ prev->next = q = q->next ; \
+ else \
+ ipfw_dyn_v[i] = q = q->next ; \
+ dyn_count-- ; \
+ free(old_q, M_IPFW); }
+
+#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
+/**
+ * Remove all dynamic rules pointing to a given rule, or all
+ * rules if rule == NULL. Second parameter is 1 if we want to
+ * delete unconditionally, otherwise only expired rules are removed.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, int force)
+{
+ struct ipfw_dyn_rule *prev, *q;
+ int i, pass, max_pass ;
+ static u_int32_t last_remove = 0 ;
+
+ if (ipfw_dyn_v == NULL || dyn_count == 0)
+ return ;
+ /* do not expire more than once per second, it is useless */
+ if (force == 0 && last_remove == time_second)
+ return ;
+ last_remove = time_second ;
+
+ /*
+	 * because DYN_LIMIT rules refer to parent rules, during the first
+	 * pass we only remove child rules and mark any pending
+	 * DYN_LIMIT_PARENT entries; those are removed in a second pass.
+ */
+ for (pass = max_pass = 0; pass <= max_pass ; pass++ ) {
+ for (i = 0 ; i < curr_dyn_buckets ; i++) {
+ for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) {
+ /*
+ * logic can become complex here, so we split tests.
+				 * The logic can become complex here, so we split the tests:
+				 * first check whether the entry matches the given rule (or any
+				 * rule when rule == NULL), then make sure it has expired or
+				 * that we want to kill it unconditionally; more tests may be
+				 * added in the future.
+ int zap = ( rule == NULL || rule == q->rule);
+ if (zap)
+ zap = force || TIME_LEQ( q->expire , time_second );
+				/* do not zap the parent in the first pass; record that we need a second pass */
+ if (q->dyn_type == DYN_LIMIT_PARENT) {
+ max_pass = 1; /* we need a second pass */
+ if (zap == 1 && (pass == 0 || q->count != 0) ) {
+ zap = 0 ;
+ if (pass == 1) /* should not happen */
+ printf("OUCH! cannot remove rule, count %d\n",
+ q->count);
+ }
+ }
+ if (zap) {
+ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
+ } else {
+ prev = q ;
+ q = q->next ;
+ }
+ }
+ }
+ }
+}
+
+#define EXPIRE_DYN_CHAIN(rule) remove_dyn_rule(rule, 0 /* expired ones */)
+#define EXPIRE_DYN_CHAINS() remove_dyn_rule(NULL, 0 /* expired ones */)
+#define DELETE_DYN_CHAIN(rule) remove_dyn_rule(rule, 1 /* force removal */)
+#define DELETE_DYN_CHAINS() remove_dyn_rule(NULL, 1 /* force removal */)
+
+/**
+ * lookup a dynamic rule.
+ */
+static struct ipfw_dyn_rule *
+lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction)
+{
+ /*
+ * stateful ipfw extensions.
+ * Lookup into dynamic session queue
+ */
+ struct ipfw_dyn_rule *prev, *q ;
+ int i, dir = 0;
+#define MATCH_FORWARD 1
+
+ if (ipfw_dyn_v == NULL)
+ return NULL ;
+ i = hash_packet( pkt );
+ for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) {
+ if (q->dyn_type == DYN_LIMIT_PARENT)
+ goto next;
+ if (TIME_LEQ( q->expire , time_second ) ) { /* expire entry */
+ UNLINK_DYN_RULE(prev, ipfw_dyn_v[i], q);
+ continue;
+ }
+ if ( pkt->proto == q->id.proto) {
+ if (pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port ) {
+ dir = MATCH_FORWARD ;
+ goto found ;
+ }
+ if (pkt->src_ip == q->id.dst_ip &&
+ pkt->dst_ip == q->id.src_ip &&
+ pkt->src_port == q->id.dst_port &&
+ pkt->dst_port == q->id.src_port ) {
+ dir = 0 ; /* reverse match */
+ goto found ;
+ }
+ }
+next:
+ prev = q ;
+ q = q->next ;
+ }
+ return NULL ; /* clearly not found */
+found:
+ if ( prev != NULL) { /* found and not in front */
+ prev->next = q->next ;
+ q->next = ipfw_dyn_v[i] ;
+ ipfw_dyn_v[i] = q ;
+ }
+ if (pkt->proto == IPPROTO_TCP) {
+ /* update state according to flags */
+ u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
+ q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
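+		/*
+		 * The low 8 bits of q->state accumulate flags seen in the
+		 * forward direction, the next 8 bits those seen in the
+		 * reverse direction, so the combined value tracks how far
+		 * the handshake/teardown has progressed.
+		 */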
+ switch (q->state) {
+ case TH_SYN :
+ /* opening */
+ q->expire = time_second + dyn_syn_lifetime ;
+ break ;
+ case TH_SYN | (TH_SYN << 8) :
+ /* move to established */
+ q->expire = time_second + dyn_ack_lifetime ;
+ break ;
+ case TH_SYN | (TH_SYN << 8) | TH_FIN :
+ case TH_SYN | (TH_SYN << 8) | (TH_FIN << 8) :
+ /* one side tries to close */
+ q->expire = time_second + dyn_ack_lifetime ;
+ break ;
+ case TH_SYN | (TH_SYN << 8) | TH_FIN | (TH_FIN << 8) :
+ /* both sides closed */
+ q->expire = time_second + dyn_fin_lifetime ;
+ break ;
+ default:
+#if 0
+ /*
+ * reset or some invalid combination, but can also
+ * occur if we use keep-state the wrong way.
+ */
+ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+ printf("invalid state: 0x%x\n", q->state);
+#endif
+ q->expire = time_second + dyn_rst_lifetime ;
+ break ;
+ }
+ } else if (pkt->proto == IPPROTO_UDP) {
+ q->expire = time_second + dyn_udp_lifetime ;
+ } else {
+ /* other protocols */
+ q->expire = time_second + dyn_short_lifetime ;
+ }
+ if (match_direction)
+ *match_direction = dir ;
+ return q ;
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains the following types of rules:
+ * - regular rules (DYN_KEEP_STATE)
+ * - rules for sessions with a limited number of sessions per user
+ *   (DYN_LIMIT). When one is created, the parent's count is
+ *   increased by 1, and decreased on delete. In this case,
+ * the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (DYN_LIMIT_PARENT).
+ */
+
+static struct ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+ struct ipfw_dyn_rule *r ;
+
+ int i ;
+ if (ipfw_dyn_v == NULL ||
+ (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) {
+ /* try reallocation, make sure we have a power of 2 */
+ u_int32_t i = dyn_buckets ;
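+		/* shift out trailing zero bits: a power of 2 reduces to exactly 1 */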
+ while ( i > 0 && (i & 1) == 0 )
+ i >>= 1 ;
+ if (i != 1) /* not a power of 2 */
+ dyn_buckets = curr_dyn_buckets ; /* reset */
+ else {
+ curr_dyn_buckets = dyn_buckets ;
+ if (ipfw_dyn_v != NULL)
+ free(ipfw_dyn_v, M_IPFW);
+ ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof r,
+ M_IPFW, M_DONTWAIT | M_ZERO);
+ if (ipfw_dyn_v == NULL)
+ return NULL; /* failed ! */
+ }
+ }
+ i = hash_packet(id);
+
+ r = malloc(sizeof *r, M_IPFW, M_DONTWAIT | M_ZERO);
+ if (r == NULL) {
+ printf ("sorry cannot allocate state\n");
+ return NULL ;
+ }
+
+ /* increase refcount on parent, and set pointer */
+ if (dyn_type == DYN_LIMIT) {
+ struct ipfw_dyn_rule *parent = (struct ipfw_dyn_rule *)rule;
+ if ( parent->dyn_type != DYN_LIMIT_PARENT)
+ panic("invalid parent");
+ parent->count++ ;
+ r->parent = parent ;
+ rule = parent->rule;
+ }
+
+ r->id = *id ;
+ r->expire = time_second + dyn_syn_lifetime ;
+ r->rule = rule ;
+ r->dyn_type = dyn_type ;
+ r->pcnt = r->bcnt = 0 ;
+ r->count = 0 ;
+
+ r->bucket = i ;
+ r->next = ipfw_dyn_v[i] ;
+ ipfw_dyn_v[i] = r ;
+ dyn_count++ ;
+ DEB(printf("-- add entry 0x%08x %d -> 0x%08x %d, total %d\n",
+ (r->id.src_ip), (r->id.src_port),
+ (r->id.dst_ip), (r->id.dst_port),
+ dyn_count ); )
+ return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ */
+static struct ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+ struct ipfw_dyn_rule *q;
+ int i;
+
+ if (ipfw_dyn_v) {
+ i = hash_packet( pkt );
+ for (q = ipfw_dyn_v[i] ; q != NULL ; q=q->next)
+ if (q->dyn_type == DYN_LIMIT_PARENT && rule == q->rule &&
+ pkt->proto == q->id.proto &&
+ pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port) {
+ q->expire = time_second + dyn_short_lifetime ;
+ DEB(printf("lookup_dyn_parent found 0x%p\n", q);)
+ return q;
+ }
+ }
+ return add_dyn_rule(pkt, DYN_LIMIT_PARENT, rule);
+}
+
+/*
+ * Install dynamic state.
+ * There are different types of dynamic rules which can be installed.
+ * The type is in rule->dyn_type.
+ * Type 0 (default) is a bidirectional rule.
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+static int
+install_state(struct ip_fw *rule, struct ip_fw_args *args)
+{
+ struct ipfw_dyn_rule *q ;
+ static int last_log ;
+
+ u_int8_t type = rule->dyn_type ;
+
+ DEB(printf("-- install state type %d 0x%08x %u -> 0x%08x %u\n",
+ type,
+ (args->f_id.src_ip), (args->f_id.src_port),
+ (args->f_id.dst_ip), (args->f_id.dst_port) );)
+
+ q = lookup_dyn_rule(&args->f_id, NULL) ;
+ if (q != NULL) { /* should never occur */
+ if (last_log != time_second) {
+ last_log = time_second ;
+ printf(" entry already present, done\n");
+ }
+ return 0 ;
+ }
+ if (dyn_count >= dyn_max) /* try remove old ones... */
+ EXPIRE_DYN_CHAINS();
+ if (dyn_count >= dyn_max) {
+ if (last_log != time_second) {
+ last_log = time_second ;
+ printf(" Too many dynamic rules, sorry\n");
+ }
+ return 1; /* cannot install, notify caller */
+ }
+
+ switch (type) {
+ case DYN_KEEP_STATE: /* bidir rule */
+ add_dyn_rule(&args->f_id, DYN_KEEP_STATE, rule);
+ break ;
+ case DYN_LIMIT: /* limit number of sessions */
+ {
+ u_int16_t limit_mask = rule->limit_mask ;
+ u_int16_t conn_limit = rule->conn_limit ;
+ struct ipfw_flow_id id;
+ struct ipfw_dyn_rule *parent;
+
+ DEB(printf("installing dyn-limit rule %d\n", conn_limit);)
+
+ id.dst_ip = id.src_ip = 0;
+ id.dst_port = id.src_port = 0 ;
+ id.proto = args->f_id.proto ;
+
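+		/*
+		 * Copy only the fields selected by limit_mask; the rest stay
+		 * zero, so all sessions sharing the selected fields map to the
+		 * same DYN_LIMIT_PARENT entry, whose count enforces the limit.
+		 */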
+ if (limit_mask & DYN_SRC_ADDR)
+ id.src_ip = args->f_id.src_ip;
+ if (limit_mask & DYN_DST_ADDR)
+ id.dst_ip = args->f_id.dst_ip;
+ if (limit_mask & DYN_SRC_PORT)
+ id.src_port = args->f_id.src_port;
+ if (limit_mask & DYN_DST_PORT)
+ id.dst_port = args->f_id.dst_port;
+ parent = lookup_dyn_parent(&id, rule);
+ if (parent == NULL) {
+ printf("add parent failed\n");
+ return 1;
+ }
+ if (parent->count >= conn_limit) {
+ EXPIRE_DYN_CHAIN(rule); /* try to expire some */
+ if (parent->count >= conn_limit) {
+ printf("drop session, too many entries\n");
+ return 1;
+ }
+ }
+ add_dyn_rule(&args->f_id, DYN_LIMIT, (struct ip_fw *)parent);
+ }
+ break ;
+ default:
+ printf("unknown dynamic rule type %u\n", type);
+ return 1 ;
+ }
+ lookup_dyn_rule(&args->f_id, NULL) ; /* XXX just set the lifetime */
+ return 0;
+}
+
+/*
+ * given an ip_fw *, lookup_next_rule will return a pointer
+ * of the same type to the next one. This can be either the jump
+ * target (for skipto instructions) or the next one in the list (in
+ * all other cases including a missing jump target).
+ * Backward jumps are not allowed, so start looking from the next
+ * rule...
+ */
+static struct ip_fw * lookup_next_rule(struct ip_fw *me);
+
+static struct ip_fw *
+lookup_next_rule(struct ip_fw *me)
+{
+ struct ip_fw *rule ;
+ int rulenum = me->fw_skipto_rule ; /* guess... */
+
+ if ( (me->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_SKIPTO )
+ for (rule = LIST_NEXT(me,next); rule ; rule = LIST_NEXT(rule,next))
+ if (rule->fw_number >= rulenum)
+ return rule ;
+ return LIST_NEXT(me,next) ; /* failure or not a skipto */
+}
+
+/*
+ * Parameters:
+ *
+ * *m The packet; we set to NULL when/if we nuke it.
+ * oif Outgoing interface, or NULL if packet is incoming
+ * *cookie Skip up to the first rule past this rule number;
+ * upon return, non-zero port number for divert or tee.
+ * Special case: cookie == NULL on input for bridging.
+ * *flow_id pointer to the last matching rule (in/out)
+ * *next_hop socket we are forwarding to (in/out).
+ * For bridged packets, this is a pointer to the MAC header.
+ *
+ * Return value:
+ *
+ * IP_FW_PORT_DENY_FLAG the packet must be dropped.
+ * 0 The packet is to be accepted and routed normally OR
+ * the packet was denied/rejected and has been dropped;
+ * in the latter case, *m is equal to NULL upon return.
+ * port Divert the packet to port, with these caveats:
+ *
+ * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead
+ * of diverting it (ie, 'ipfw tee').
+ *
+ * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower
+ *	16 bits as a dummynet pipe number instead of diverting it.
+ */
+
+static int
+ip_fw_chk(struct ip_fw_args *args)
+#if 0 /* the old interface was this: */
+ struct mbuf **m, struct ifnet *oif, u_int16_t *cookie,
+ struct ip_fw **flow_id, struct sockaddr_in **next_hop)
+#endif
+{
+ /*
+ * grab things into variables to minimize diffs.
+ * XXX this has to be cleaned up later.
+ */
+ struct mbuf **m = &(args->m);
+ struct ifnet *oif = args->oif;
+ u_int16_t *cookie = &(args->divert_rule);
+ struct ip_fw **flow_id = &(args->rule);
+ struct sockaddr_in **next_hop = &(args->next_hop);
+
+ struct ip_fw *f = NULL; /* matching rule */
+ struct ip *ip = mtod(*m, struct ip *);
+ struct ifnet *const rif = (*m)->m_pkthdr.rcvif;
+ struct ifnet *tif;
+ u_int hlen = ip->ip_hl << 2;
+ struct ether_header * eh = NULL;
+
+ u_short ip_off=0, offset = 0 ;
+ /* local copy of addresses for faster matching */
+ u_short src_port = 0, dst_port = 0;
+ struct in_addr src_ip, dst_ip;
+ u_int8_t proto= 0, flags = 0;
+
+ u_int16_t skipto;
+ u_int16_t ip_len=0;
+
+ int dyn_checked = 0 ; /* set after dyn.rules have been checked. */
+ int direction = MATCH_FORWARD ; /* dirty trick... */
+ struct ipfw_dyn_rule *q = NULL ;
+
+#define BRIDGED (args->eh != NULL)
+ if (BRIDGED) { /* this is a bridged packet */
+ eh = args->eh;
+ if ( (*m)->m_pkthdr.len >= sizeof(struct ip) &&
+ ntohs(eh->ether_type) == ETHERTYPE_IP)
+ hlen = ip->ip_hl << 2;
+ } else
+ hlen = ip->ip_hl << 2;
+
+ /* Grab and reset cookie */
+ skipto = *cookie;
+ *cookie = 0;
+
+ /*
+ * Collect parameters into local variables for faster matching.
+ */
+ if (hlen > 0) { /* this is an IP packet */
+ proto = ip->ip_p;
+ src_ip = ip->ip_src;
+ dst_ip = ip->ip_dst;
+ if (BRIDGED) { /* bridged packets are as on the wire */
+ ip_off = ntohs(ip->ip_off);
+ ip_len = ntohs(ip->ip_len);
+ } else {
+ ip_off = ip->ip_off;
+ ip_len = ip->ip_len;
+ }
+ offset = ip_off & IP_OFFMASK;
+ if (offset == 0) {
+
+#define PULLUP_TO(len) \
+ do { \
+ if ((*m)->m_len < (len)) { \
+ *m = m_pullup(*m, (len)); \
+ if (*m == 0) \
+ goto bogusfrag; \
+ ip = mtod(*m, struct ip *); \
+ } \
+ } while (0)
+
+ switch (proto) {
+ case IPPROTO_TCP : {
+ struct tcphdr *tcp;
+
+ PULLUP_TO(hlen + sizeof(struct tcphdr));
+ tcp =(struct tcphdr *)((u_int32_t *)ip + ip->ip_hl);
+ dst_port = tcp->th_dport ;
+ src_port = tcp->th_sport ;
+ flags = tcp->th_flags ;
+ }
+ break ;
+
+ case IPPROTO_UDP : {
+ struct udphdr *udp;
+
+ PULLUP_TO(hlen + sizeof(struct udphdr));
+ udp =(struct udphdr *)((u_int32_t *)ip + ip->ip_hl);
+ dst_port = udp->uh_dport ;
+ src_port = udp->uh_sport ;
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ PULLUP_TO(hlen + 4); /* type, code and checksum. */
+ flags = ((struct icmp *)
+ ((u_int32_t *)ip + ip->ip_hl))->icmp_type ;
+ break ;
+
+ default :
+ break;
+ }
+#undef PULLUP_TO
+ }
+ }
+ args->f_id.src_ip = ntohl(src_ip.s_addr);
+ args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+ args->f_id.proto = proto;
+ args->f_id.src_port = ntohs(src_port);
+ args->f_id.dst_port = ntohs(dst_port);
+ args->f_id.flags = flags;
+
+ if (*flow_id) {
+ /*
+ * Packet has already been tagged. Look for the next rule
+ * to restart processing.
+ */
+ if (fw_one_pass) /* just accept if fw_one_pass is set */
+ return 0;
+
+ f = (*flow_id)->next_rule_ptr ;
+ if (f == NULL)
+ f = (*flow_id)->next_rule_ptr = lookup_next_rule(*flow_id);
+ if (f == NULL)
+ goto dropit;
+ } else {
+ /*
+		 * Go down the list, looking for enlightenment.
+ * If we've been asked to start at a given rule, do so.
+ */
+ f = LIST_FIRST(&ip_fw_chain_head);
+ if (skipto != 0) {
+ if (skipto >= IPFW_DEFAULT_RULE)
+ goto dropit;
+ while (f && f->fw_number <= skipto)
+ f = LIST_NEXT(f, next);
+ if (f == NULL)
+ goto dropit;
+ }
+ }
+
+ for (; f; f = LIST_NEXT(f, next)) {
+again:
+ if (f->fw_number == IPFW_DEFAULT_RULE)
+ goto got_match ;
+
+ /* Check if rule only valid for bridged packets */
+ if ((f->fw_flg & IP_FW_BRIDGED) != 0 && !(BRIDGED))
+ continue;
+#undef BRIDGED
+
+ if (oif) {
+ /* Check direction outbound */
+ if (!(f->fw_flg & IP_FW_F_OUT))
+ continue;
+ } else {
+ /* Check direction inbound */
+ if (!(f->fw_flg & IP_FW_F_IN))
+ continue;
+ }
+
+ if (f->fw_flg & IP_FW_F_MAC) {
+ u_int32_t *want, *mask, *hdr;
+
+ if (eh == NULL) /* header not available */
+ continue;
+
+ want = (void *)&(f->fw_mac_hdr);
+ mask = (void *)&(f->fw_mac_mask);
+ hdr = (void *)eh;
+
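+			/*
+			 * Compare the 12 bytes of destination + source MAC
+			 * address as three 32-bit words, masking the header
+			 * before the comparison.
+			 */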
+ if ( want[0] != (hdr[0] & mask[0]) )
+ continue;
+ if ( want[1] != (hdr[1] & mask[1]) )
+ continue;
+ if ( want[2] != (hdr[2] & mask[2]) )
+ continue;
+ if (f->fw_flg & IP_FW_F_SRNG) {
+ u_int16_t type = ntohs(eh->ether_type);
+ if (type < (u_int16_t)(f->fw_mac_type) ||
+ type > (u_int16_t)(f->fw_mac_mask_type) )
+ continue;
+ } else {
+ if ((u_int16_t)(f->fw_mac_type) != (eh->ether_type &
+ (u_int16_t)(f->fw_mac_mask_type)) )
+ continue;
+ }
+ }
+
+ /* Interface check */
+ if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) {
+ struct ifnet *const iface = oif ? oif : rif;
+
+ /* Backwards compatibility hack for "via" */
+ if (!iface || !iface_match(iface,
+ &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME))
+ continue;
+ } else {
+ /* Check receive interface */
+ if ((f->fw_flg & IP_FW_F_IIFACE)
+ && (!rif || !iface_match(rif,
+ &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME)))
+ continue;
+ /* Check outgoing interface */
+ if ((f->fw_flg & IP_FW_F_OIFACE)
+ && (!oif || !iface_match(oif,
+ &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME)))
+ continue;
+ }
+
+ /*
+		 * For packets which matched the MAC check, we do not need
+		 * to continue; this is a valid match.
+		 * For non-IP packets, the rule does not apply.
+ */
+ if (f->fw_flg & IP_FW_F_MAC)
+ goto rnd_then_got_match;
+
+ if (hlen == 0)
+ continue;
+
+ /*
+ * dynamic rules are checked at the first keep-state or
+ * check-state occurrence.
+ */
+ if (f->fw_flg & (IP_FW_F_KEEP_S|IP_FW_F_CHECK_S) &&
+ dyn_checked == 0 ) {
+ dyn_checked = 1 ;
+ q = lookup_dyn_rule(&args->f_id, &direction);
+ if (q != NULL) {
+ DEB(printf("-- dynamic match 0x%08x %d %s 0x%08x %d\n",
+ (q->id.src_ip), (q->id.src_port),
+ (direction == MATCH_FORWARD ? "-->" : "<--"),
+ (q->id.dst_ip), (q->id.dst_port) ); )
+ f = q->rule ;
+ q->pcnt++ ;
+ q->bcnt += ip_len;
+ goto got_match ; /* random not allowed here */
+ }
+ /* if this was a check-only rule, continue with next */
+ if (f->fw_flg & IP_FW_F_CHECK_S)
+ continue ;
+ }
+
+ /* Fragments */
+ if ((f->fw_flg & IP_FW_F_FRAG) && offset == 0 )
+ continue;
+
+ /*
+ * For matching addresses, tif != NULL means we matched
+ * the address we requested (either "me" or addr/mask).
+ * Then the check for "xxx" or "not xxx" can be done
+ * with an XOR.
+ */
+
+ /* source address -- mandatory */
+ if (f->fw_flg & IP_FW_F_SME) {
+ INADDR_TO_IFP(src_ip, tif);
+ } else
+ (int)tif = f->fw_src.s_addr ==
+ (src_ip.s_addr & f->fw_smsk.s_addr);
+ if ( ((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ (tif == NULL) )
+ continue;
+
+ /* dst address -- mandatory */
+ if (f->fw_flg & IP_FW_F_DME) {
+ INADDR_TO_IFP(dst_ip, tif);
+ } else
+ (int)tif = f->fw_dst.s_addr ==
+ (dst_ip.s_addr & f->fw_dmsk.s_addr);
+ if ( ((f->fw_flg & IP_FW_F_INVDST) != 0) ^ (tif == NULL) )
+ continue;
+
+ /* Check IP header values */
+ if (f->fw_ipflg & IP_FW_IF_IPOPT && !ipopts_match(ip, f))
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPLEN && f->fw_iplen != ip_len)
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPID && f->fw_ipid != ntohs(ip->ip_id))
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPPRE &&
+ (f->fw_iptos & 0xe0) != (ip->ip_tos & 0xe0))
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPTOS && !iptos_match(ip, f))
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPTTL && f->fw_ipttl != ip->ip_ttl)
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_IPVER && f->fw_ipver != ip->ip_v)
+ continue;
+
+ /* Check protocol; if wildcard, and no [ug]id, match */
+ if (f->fw_prot == IPPROTO_IP) {
+ if (!(f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)))
+ goto rnd_then_got_match;
+ } else
+ /* If different, don't match */
+ if (proto != f->fw_prot)
+ continue;
+
+ /* Protocol specific checks for uid only */
+ if (f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)) {
+ switch (proto) {
+ case IPPROTO_TCP:
+ {
+ struct inpcb *P;
+
+ if (offset == 1) /* cf. RFC 1858 */
+ goto bogusfrag;
+ if (offset != 0)
+ continue;
+
+ if (oif)
+ P = in_pcblookup_hash(&tcbinfo, dst_ip,
+ dst_port, src_ip, src_port, 0,
+ oif);
+ else
+ P = in_pcblookup_hash(&tcbinfo, src_ip,
+ src_port, dst_ip, dst_port, 0,
+ NULL);
+
+ if (P && P->inp_socket) {
+ if (f->fw_flg & IP_FW_F_UID) {
+ if (socheckuid(P->inp_socket, f->fw_uid))
+ continue;
+ } else if (!groupmember(f->fw_gid,
+ P->inp_socket->so_cred))
+ continue;
+ } else
+ continue;
+ break;
+ }
+
+ case IPPROTO_UDP:
+ {
+ struct inpcb *P;
+
+ if (offset != 0)
+ continue;
+
+ if (oif)
+ P = in_pcblookup_hash(&udbinfo, dst_ip,
+ dst_port, src_ip, src_port, 1,
+ oif);
+ else
+ P = in_pcblookup_hash(&udbinfo, src_ip,
+ src_port, dst_ip, dst_port, 1,
+ NULL);
+
+ if (P && P->inp_socket) {
+ if (f->fw_flg & IP_FW_F_UID) {
+ if (socheckuid(P->inp_socket, f->fw_uid))
+ continue;
+ } else if (!groupmember(f->fw_gid,
+ P->inp_socket->so_cred))
+ continue;
+ } else
+ continue;
+ break;
+ }
+
+ default:
+ continue;
+ }
+ }
+
+ /* Protocol specific checks */
+ switch (proto) {
+ case IPPROTO_TCP:
+ {
+ struct tcphdr *tcp;
+
+ if (offset == 1) /* cf. RFC 1858 */
+ goto bogusfrag;
+ if (offset != 0) {
+ /*
+ * TCP flags and ports aren't available in this
+ * packet -- if this rule specified either one,
+ * we consider the rule a non-match.
+ */
+ if (IP_FW_HAVEPORTS(f) != 0 ||
+ f->fw_ipflg & IP_FW_IF_TCPMSK)
+ continue;
+
+ break;
+ }
+ tcp = (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl);
+
+ if (f->fw_ipflg & IP_FW_IF_TCPOPT && !tcpopts_match(tcp, f))
+ continue;
+ if (((f->fw_ipflg & IP_FW_IF_TCPFLG) ||
+ (f->fw_ipflg & IP_FW_IF_TCPEST)) &&
+ !tcpflg_match(tcp, f))
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_TCPSEQ && tcp->th_seq != f->fw_tcpseq)
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_TCPACK && tcp->th_ack != f->fw_tcpack)
+ continue;
+ if (f->fw_ipflg & IP_FW_IF_TCPWIN && tcp->th_win != f->fw_tcpwin)
+ continue;
+ goto check_ports;
+ }
+
+ case IPPROTO_UDP:
+ if (offset != 0) {
+ /*
+ * Port specification is unavailable -- if this
+ * rule specifies a port, we consider the rule
+ * a non-match.
+ */
+ if (IP_FW_HAVEPORTS(f) )
+ continue;
+
+ break;
+ }
+check_ports:
+ if (!port_match(&f->fw_uar.fw_pts[0],
+ IP_FW_GETNSRCP(f), ntohs(src_port),
+ f->fw_flg & IP_FW_F_SRNG,
+ f->fw_flg & IP_FW_F_SMSK))
+ continue;
+ if (!port_match(&f->fw_uar.fw_pts[IP_FW_GETNSRCP(f)],
+ IP_FW_GETNDSTP(f), ntohs(dst_port),
+ f->fw_flg & IP_FW_F_DRNG,
+ f->fw_flg & IP_FW_F_DMSK))
+ continue;
+ break;
+
+ case IPPROTO_ICMP:
+ {
+ struct icmp *icmp;
+
+ if (offset != 0) /* Type isn't valid */
+ break;
+ icmp = (struct icmp *) ((u_int32_t *)ip + ip->ip_hl);
+ if (!icmptype_match(icmp, f))
+ continue;
+ break;
+ }
+
+ default:
+ break;
+
+bogusfrag:
+ if (fw_verbose) {
+ if (*m != NULL)
+ ipfw_report(NULL, ip, ip_off, ip_len, rif, oif);
+ else
+ printf("pullup failed\n");
+ }
+ goto dropit;
+
+ }
+
+rnd_then_got_match:
+ if ( f->dont_match_prob && random() < f->dont_match_prob )
+ continue ;
+got_match:
+ /*
+ * If not a dynamic match (q == NULL) and keep-state, install
+ * a new dynamic entry.
+ */
+ if (q == NULL && f->fw_flg & IP_FW_F_KEEP_S) {
+ if (install_state(f, args)) /* error or limit violation */
+ goto dropit;
+ }
+ /* Update statistics */
+ f->fw_pcnt += 1;
+ f->fw_bcnt += ip_len;
+ f->timestamp = time_second;
+
+ /* Log to console if desired */
+ if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose && hlen >0)
+ ipfw_report(f, ip, ip_off, ip_len, rif, oif);
+
+ /* Take appropriate action */
+ switch (f->fw_flg & IP_FW_F_COMMAND) {
+ case IP_FW_F_ACCEPT:
+ return(0);
+ case IP_FW_F_COUNT:
+ continue;
+#ifdef IPDIVERT
+ case IP_FW_F_DIVERT:
+ *cookie = f->fw_number;
+ return(f->fw_divert_port);
+ case IP_FW_F_TEE:
+ *cookie = f->fw_number;
+ return(f->fw_divert_port | IP_FW_PORT_TEE_FLAG);
+#endif
+ case IP_FW_F_SKIPTO: /* XXX check */
+ if (f->next_rule_ptr == NULL)
+ f->next_rule_ptr = lookup_next_rule(f) ;
+ f = f->next_rule_ptr;
+ if (!f)
+ goto dropit;
+ goto again ;
+
+ case IP_FW_F_PIPE:
+ case IP_FW_F_QUEUE:
+ *flow_id = f; /* XXX set flow id */
+ return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG);
+
+ case IP_FW_F_FWD:
+ /* Change the next-hop address for this packet.
+			 * Initially we only worry about directly
+			 * reachable next hops; ultimately, for next hops
+			 * that are not direct, we will have to work out
+			 * the route we would take to reach them. We
+			 * [cs]ould leave that latter problem to
+			 * ip_output.c. We hope to high [name the abode of
+			 * your favourite deity] that ip_output doesn't modify
+			 * the new value of next_hop (which is dst there).
+ * XXX warning-- there is a dangerous reference here
+ * from next_hop to a field within the rule. If the
+ * rule is deleted, weird things might occur.
+ */
+ if (next_hop != NULL /* Make sure, first... */
+ && (q == NULL || direction == MATCH_FORWARD) )
+ *next_hop = &(f->fw_fwd_ip);
+ return(0); /* Allow the packet */
+
+ }
+
+ /* Deny/reject this packet using this rule */
+ break;
+ }
+
+ /* Rule IPFW_DEFAULT_RULE should always be there and match */
+ KASSERT(f != NULL, ("ip_fw: no chain"));
+
+ /*
+ * At this point, we're going to drop the packet.
+ * Send a reject notice if all of the following are true:
+ *
+ * - The packet matched a reject rule
+ * - The packet is not an ICMP packet, or is an ICMP query packet
+ * - The packet is not a multicast or broadcast packet
+ */
+ if ((f->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT
+ && (proto != IPPROTO_ICMP || is_icmp_query(ip))
+ && !((*m)->m_flags & (M_BCAST|M_MCAST))
+ && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ switch (f->fw_reject_code) {
+ case IP_FW_REJECT_RST:
+ {
+ /* XXX warning, this code writes into the mbuf */
+ struct tcphdr *const tcp =
+ (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl);
+ struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip;
+
+ if (offset != 0 || (tcp->th_flags & TH_RST))
+ break;
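+			/*
+			 * Build a tcpiphdr (IP overlay + TCP header) in place
+			 * over the packet so tcp_respond() can use it as the
+			 * template for the RST.
+			 */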
+ ti.ti_i = *((struct ipovly *) ip);
+ ti.ti_t = *tcp;
+ bcopy(&ti, ip, sizeof(ti));
+ tip->ti_seq = ntohl(tip->ti_seq);
+ tip->ti_ack = ntohl(tip->ti_ack);
+ tip->ti_len = ip_len - hlen - (tip->ti_off << 2);
+ if (tcp->th_flags & TH_ACK) {
+ tcp_respond(NULL, (void *)ip, tcp, *m,
+ (tcp_seq)0, tcp->th_ack, TH_RST);
+ } else {
+ if (tcp->th_flags & TH_SYN)
+ tip->ti_len++;
+ tcp_respond(NULL, (void *)ip, tcp, *m,
+ tip->ti_seq + tip->ti_len,
+ (tcp_seq)0, TH_RST|TH_ACK);
+ }
+ *m = NULL;
+ break;
+ }
+ default: /* Send an ICMP unreachable using code */
+ icmp_error(*m, ICMP_UNREACH,
+ f->fw_reject_code, 0L, 0);
+ *m = NULL;
+ break;
+ }
+ }
+
+dropit:
+ /*
+ * Finally, drop the packet.
+ */
+ return(IP_FW_PORT_DENY_FLAG);
+}
+
+/*
+ * when a rule is added/deleted, zero the direct pointers within
+ * all firewall rules. These will be reconstructed on the fly
+ * as packets are matched.
+ * Must be called at splimp().
+ */
+static void
+flush_rule_ptrs()
+{
+ struct ip_fw *fcp ;
+
+ LIST_FOREACH(fcp, &ip_fw_chain_head, next) {
+ fcp->next_rule_ptr = NULL ;
+ }
+}
+
+static int
+add_entry(struct ip_fw_head *head, struct ip_fw *rule)
+{
+ struct ip_fw *ftmp, *fcp, *fcpl;
+ u_short nbr = 0;
+ int s;
+
+ ftmp = malloc(sizeof *ftmp, M_IPFW, M_DONTWAIT | M_ZERO);
+ if (!ftmp)
+ return (ENOSPC);
+ bcopy(rule, ftmp, sizeof(*ftmp));
+
+ ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0';
+ ftmp->fw_pcnt = 0L;
+ ftmp->fw_bcnt = 0L;
+ ftmp->next_rule_ptr = NULL ;
+ ftmp->pipe_ptr = NULL ;
+
+ s = splimp();
+
+ if (LIST_FIRST(head) == 0) {
+ LIST_INSERT_HEAD(head, ftmp, next);
+ goto done;
+ }
+
+ /* If entry number is 0, find highest numbered rule and add 100 */
+ if (ftmp->fw_number == 0) {
+ LIST_FOREACH(fcp, head, next) {
+ if (fcp->fw_number != IPFW_DEFAULT_RULE)
+ nbr = fcp->fw_number;
+ else
+ break;
+ }
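+		/* nbr now holds the highest non-default rule number, or 0 */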
+ if (nbr < IPFW_DEFAULT_RULE - 100)
+ nbr += 100;
+ ftmp->fw_number = rule->fw_number = nbr;
+ }
+
+ /* Got a valid number; now insert it, keeping the list ordered */
+ fcpl = NULL ;
+ LIST_FOREACH(fcp, head, next) {
+ if (fcp->fw_number > ftmp->fw_number) {
+ if (fcpl) {
+ LIST_INSERT_AFTER(fcpl, ftmp, next);
+ } else {
+ LIST_INSERT_HEAD(head, ftmp, next);
+ }
+ break;
+ } else {
+ fcpl = fcp;
+ }
+ }
+ flush_rule_ptrs();
+done:
+ static_count++;
+ splx(s);
+ DEB(printf("++ installed rule %d, static count now %d\n",
+ ftmp->fw_number, static_count);)
+ return (0);
+}
+
+/**
+ * free storage associated with a static rule entry (including
+ * dependent dynamic rules), and zero rule pointers to avoid
+ * dangling pointer dereferences.
+ * @return a pointer to the next entry.
+ * Must be called at splimp() and with a non-null argument.
+ */
+static struct ip_fw *
+free_chain(struct ip_fw *fcp)
+{
+ struct ip_fw *n;
+
+ n = LIST_NEXT(fcp, next);
+ DELETE_DYN_CHAIN(fcp);
+ LIST_REMOVE(fcp, next);
+ static_count--;
+ if (DUMMYNET_LOADED)
+ ip_dn_ruledel_ptr(fcp) ;
+ flush_rule_ptrs(); /* more efficient to do outside the loop */
+ free(fcp, M_IPFW);
+ return n;
+}
+
+/**
+ * remove all rules with given number.
+ */
+static int
+del_entry(struct ip_fw_head *chainptr, u_short number)
+{
+ struct ip_fw *rule;
+
+ if (number != IPFW_DEFAULT_RULE) {
+ LIST_FOREACH(rule, chainptr, next) {
+ if (rule->fw_number == number) {
+ int s ;
+
+ s = splimp(); /* prevent access to rules while removing */
+ while (rule && rule->fw_number == number)
+ rule = free_chain(rule);
+ /* XXX could move flush_rule_ptrs() here */
+ splx(s);
+ return 0 ;
+ }
+ }
+ }
+ return (EINVAL);
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * @arg frwl is null to clear all entries, or contains a specific
+ * rule number.
+ * @arg log_only is 1 if we only want to reset logs, zero otherwise.
+ */
+
+static int
+zero_entry(struct ip_fw *frwl, int log_only)
+{
+ struct ip_fw *rule;
+ int s;
+ u_short number = 0 ;
+ char *msg ;
+
+ if (frwl == 0) {
+ s = splimp();
+ LIST_FOREACH(rule, &ip_fw_chain_head, next) {
+ if (log_only == 0) {
+ rule->fw_bcnt = rule->fw_pcnt = 0;
+ rule->timestamp = 0;
+ }
+ rule->fw_loghighest = rule->fw_pcnt+rule->fw_logamount;
+ }
+ splx(s);
+ msg = log_only ? "ipfw: All logging counts cleared.\n" :
+ "ipfw: Accounting cleared.\n";
+ } else {
+ int cleared = 0;
+ number = frwl->fw_number ;
+ /*
+ * It is possible to insert multiple chain entries with the
+ * same number, so we don't stop after finding the first
+ * match if zeroing a specific entry.
+ */
+ LIST_FOREACH(rule, &ip_fw_chain_head, next)
+ if (number == rule->fw_number) {
+ s = splimp();
+ while (rule && number == rule->fw_number) {
+ if (log_only == 0) {
+ rule->fw_bcnt = rule->fw_pcnt = 0;
+ rule->timestamp = 0;
+ }
+ rule->fw_loghighest = rule->fw_pcnt+ rule->fw_logamount;
+ rule = LIST_NEXT(rule, next);
+ }
+ splx(s);
+ cleared = 1;
+ break;
+ }
+ if (!cleared) /* we did not find any matching rules */
+ return (EINVAL);
+ msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
+ "ipfw: Entry %d cleared.\n";
+ }
+ if (fw_verbose)
+ log(LOG_SECURITY | LOG_NOTICE, msg, number);
+ return (0);
+}
+
+static int
+check_ipfw_struct(struct ip_fw *frwl)
+{
+ /* Check for invalid flag bits */
+ if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) {
+ dprintf(("%s undefined flag bits set (flags=%x)\n",
+ err_prefix, frwl->fw_flg));
+ return (EINVAL);
+ }
+ if ( (frwl->fw_flg & IP_FW_F_MAC) ) { /* match MAC address */
+ return 0;
+ }
+ if (frwl->fw_flg == IP_FW_F_CHECK_S) {
+ /* check-state */
+ return 0 ;
+ }
+ /* Must apply to incoming or outgoing (or both) */
+ if (!(frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT))) {
+ dprintf(("%s neither in nor out\n", err_prefix));
+ return (EINVAL);
+ }
+ /* Empty interface name is no good */
+ if (((frwl->fw_flg & IP_FW_F_IIFNAME)
+ && !*frwl->fw_in_if.fu_via_if.name)
+ || ((frwl->fw_flg & IP_FW_F_OIFNAME)
+ && !*frwl->fw_out_if.fu_via_if.name)) {
+ dprintf(("%s empty interface name\n", err_prefix));
+ return (EINVAL);
+ }
+ /* Sanity check interface matching */
+ if ((frwl->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) {
+ ; /* allow "via" backwards compatibility */
+ } else if ((frwl->fw_flg & IP_FW_F_IN)
+ && (frwl->fw_flg & IP_FW_F_OIFACE)) {
+ dprintf(("%s outgoing interface check on incoming\n",
+ err_prefix));
+ return (EINVAL);
+ }
+ /* Sanity check port ranges */
+ if ((frwl->fw_flg & IP_FW_F_SRNG) && IP_FW_GETNSRCP(frwl) < 2) {
+ dprintf(("%s src range set but n_src_p=%d\n",
+ err_prefix, IP_FW_GETNSRCP(frwl)));
+ return (EINVAL);
+ }
+ if ((frwl->fw_flg & IP_FW_F_DRNG) && IP_FW_GETNDSTP(frwl) < 2) {
+ dprintf(("%s dst range set but n_dst_p=%d\n",
+ err_prefix, IP_FW_GETNDSTP(frwl)));
+ return (EINVAL);
+ }
+ if (IP_FW_GETNSRCP(frwl) + IP_FW_GETNDSTP(frwl) > IP_FW_MAX_PORTS) {
+ dprintf(("%s too many ports (%d+%d)\n",
+ err_prefix, IP_FW_GETNSRCP(frwl), IP_FW_GETNDSTP(frwl)));
+ return (EINVAL);
+ }
+ /*
+ * Protocols other than TCP/UDP don't use port range
+ */
+ if ((frwl->fw_prot != IPPROTO_TCP) &&
+ (frwl->fw_prot != IPPROTO_UDP) &&
+ (IP_FW_GETNSRCP(frwl) || IP_FW_GETNDSTP(frwl))) {
+ dprintf(("%s port(s) specified for non TCP/UDP rule\n",
+ err_prefix));
+ return (EINVAL);
+ }
+
+ /*
+ * Rather than modify the entry to make such entries work,
+ * we reject this rule and require user level utilities
+ * to enforce whatever policy they deem appropriate.
+ */
+ if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) ||
+ (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) {
+ dprintf(("%s rule never matches\n", err_prefix));
+ return (EINVAL);
+ }
+
+ if ((frwl->fw_flg & IP_FW_F_FRAG) &&
+ (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) {
+ if (IP_FW_HAVEPORTS(frwl)) {
+ dprintf(("%s cannot mix 'frag' and ports\n", err_prefix));
+ return (EINVAL);
+ }
+ if (frwl->fw_prot == IPPROTO_TCP &&
+ frwl->fw_tcpf != frwl->fw_tcpnf) {
+ dprintf(("%s cannot mix 'frag' and TCP flags\n", err_prefix));
+ return (EINVAL);
+ }
+ }
+
+ if (frwl->fw_flg & (IP_FW_F_UID | IP_FW_F_GID)) {
+ if ((frwl->fw_prot != IPPROTO_TCP) &&
+ (frwl->fw_prot != IPPROTO_UDP) &&
+ (frwl->fw_prot != IPPROTO_IP)) {
+ dprintf(("%s cannot use uid/gid logic on non-TCP/UDP\n", err_prefix));
+ return (EINVAL);
+ }
+ }
+
+ /* Check command specific stuff */
+ switch (frwl->fw_flg & IP_FW_F_COMMAND) {
+ case IP_FW_F_REJECT:
+ if (frwl->fw_reject_code >= 0x100
+ && !(frwl->fw_prot == IPPROTO_TCP
+ && frwl->fw_reject_code == IP_FW_REJECT_RST)) {
+ dprintf(("%s unknown reject code\n", err_prefix));
+ return (EINVAL);
+ }
+ break;
+#ifdef IPDIVERT
+ case IP_FW_F_DIVERT: /* Diverting to port zero is invalid */
+ case IP_FW_F_TEE:
+#endif
+ case IP_FW_F_PIPE: /* pipe 0 is invalid */
+ case IP_FW_F_QUEUE: /* queue 0 is invalid */
+ if (frwl->fw_divert_port == 0) {
+ dprintf(("%s 0 is an invalid argument\n", err_prefix));
+ return (EINVAL);
+ }
+ break;
+ case IP_FW_F_DENY:
+ case IP_FW_F_ACCEPT:
+ case IP_FW_F_COUNT:
+ case IP_FW_F_SKIPTO:
+ case IP_FW_F_FWD:
+ break;
+ default:
+ dprintf(("%s invalid command\n", err_prefix));
+ return (EINVAL);
+ }
+
+ return 0;
+}
+
+static int
+ip_fw_ctl(struct sockopt *sopt)
+{
+ int error, s;
+ size_t size;
+ struct ip_fw *fcp;
+ struct ip_fw frwl, *bp , *buf;
+
+ /*
+ * Disallow modifications in really-really secure mode, but still allow
+ * the logging counters to be reset.
+ */
+ if (sopt->sopt_name == IP_FW_ADD ||
+ (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ error = 0;
+
+ switch (sopt->sopt_name) {
+ case IP_FW_GET:
+ /*
+ * pass up a copy of the current rules. Static rules
+ * come first (the last of which has number 65535),
+		 * followed by a possibly empty list of dynamic rules.
+ * The last dynamic rule has NULL in the "next" field.
+ */
+ s = splimp();
+ /* size of static rules */
+ size = static_count * sizeof(struct ip_fw) ;
+ if (ipfw_dyn_v) /* add size of dyn.rules */
+ size += (dyn_count * sizeof(struct ipfw_dyn_rule));
+
+ /*
+		 * XXX todo: if the user passes a short length to know how
+		 * much room is needed, do not bother filling up the buffer,
+		 * just jump to the sooptcopyout.
+ */
+ buf = malloc(size, M_TEMP, M_WAITOK);
+ if (buf == 0) {
+ splx(s);
+ error = ENOBUFS;
+ break;
+ }
+
+ bp = buf ;
+ LIST_FOREACH(fcp, &ip_fw_chain_head, next) {
+ bcopy(fcp, bp, sizeof *fcp);
+ bp++;
+ }
+ if (ipfw_dyn_v) {
+ int i ;
+ struct ipfw_dyn_rule *p, *dst, *last = NULL ;
+
+ dst = (struct ipfw_dyn_rule *)bp ;
+ for (i = 0 ; i < curr_dyn_buckets ; i++ )
+ for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) {
+ bcopy(p, dst, sizeof *p);
+ (int)dst->rule = p->rule->fw_number ;
+ /*
+ * store a non-null value in "next". The userland
+ * code will interpret a NULL here as a marker
+ * for the last dynamic rule.
+ */
+ dst->next = dst ;
+ last = dst ;
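+				/* export expire as seconds remaining, not absolute time */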
+ if (TIME_LEQ(dst->expire, time_second) )
+ dst->expire = 0 ;
+ else
+ dst->expire -= time_second ;
+ }
+ if (last != NULL)
+ last->next = NULL ; /* mark last dynamic rule */
+ }
+ splx(s);
+
+ error = sooptcopyout(sopt, buf, size);
+ free(buf, M_TEMP);
+ break;
+
+ case IP_FW_FLUSH:
+ /*
+ * Normally we cannot release the lock on each iteration.
+		 * We could do it here only because we start from the head
+		 * every time, so there is no risk of missing some entries.
+		 * On the other hand, the risk is that we end up with
+		 * a very inconsistent ruleset, so it is better to keep the
+		 * lock around the whole cycle.
+ *
+ * XXX this code can be improved by resetting the head of
+ * the list to point to the default rule, and then freeing
+ * the old list without the need for a lock.
+ */
+
+ s = splimp();
+ while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) &&
+ fcp->fw_number != IPFW_DEFAULT_RULE )
+ free_chain(fcp);
+ splx(s);
+ break;
+
+ case IP_FW_ADD:
+ error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
+ if (error || (error = check_ipfw_struct(&frwl)))
+ break;
+
+ if (frwl.fw_number == IPFW_DEFAULT_RULE) {
+ dprintf(("%s can't add rule %u\n", err_prefix,
+ (unsigned)IPFW_DEFAULT_RULE));
+ error = EINVAL;
+ } else {
+ error = add_entry(&ip_fw_chain_head, &frwl);
+ if (!error && sopt->sopt_dir == SOPT_GET)
+ error = sooptcopyout(sopt, &frwl, sizeof frwl);
+ }
+ break;
+
+ case IP_FW_DEL:
+ error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
+ if (error)
+ break;
+
+ if (frwl.fw_number == IPFW_DEFAULT_RULE) {
+ dprintf(("%s can't delete rule %u\n", err_prefix,
+ (unsigned)IPFW_DEFAULT_RULE));
+ error = EINVAL;
+ } else {
+ error = del_entry(&ip_fw_chain_head, frwl.fw_number);
+ }
+ break;
+
+ case IP_FW_ZERO:
+ case IP_FW_RESETLOG:
+ {
+ int cmd = (sopt->sopt_name == IP_FW_RESETLOG );
+ void *arg = NULL ;
+
+ if (sopt->sopt_val != 0) {
+ error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl);
+ if (error)
+ break;
+ arg = &frwl ;
+ }
+ error = zero_entry(arg, cmd);
+ }
+ break;
+
+ default:
+ printf("ip_fw_ctl invalid option %d\n", sopt->sopt_name);
+ error = EINVAL ;
+ }
+
+ return (error);
+}
+
+/**
+ * dummynet needs a reference to the default rule, because rules can
+ * be deleted while packets hold a reference to them (e.g. to resume
+ * processing at the next rule). When this happens, dummynet changes
+ * the reference to the default rule (it could probably just as well be a
+ * NULL pointer, but this way we do not need to check for the special
+ * case, plus we have info on the default behaviour).
+ */
+struct ip_fw *ip_fw_default_rule ;
+
+void
+ip_fw_init(void)
+{
+ struct ip_fw default_rule;
+
+ ip_fw_chk_ptr = ip_fw_chk;
+ ip_fw_ctl_ptr = ip_fw_ctl;
+ LIST_INIT(&ip_fw_chain_head);
+
+ bzero(&default_rule, sizeof default_rule);
+ default_rule.fw_prot = IPPROTO_IP;
+ default_rule.fw_number = IPFW_DEFAULT_RULE;
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+ default_rule.fw_flg |= IP_FW_F_ACCEPT;
+#else
+ default_rule.fw_flg |= IP_FW_F_DENY;
+#endif
+ default_rule.fw_flg |= IP_FW_F_IN | IP_FW_F_OUT;
+ if (check_ipfw_struct(&default_rule) != 0 ||
+ add_entry(&ip_fw_chain_head, &default_rule))
+ panic("ip_fw_init");
+
+ ip_fw_default_rule = LIST_FIRST(&ip_fw_chain_head) ;
+ printf("IP packet filtering initialized, "
+#ifdef IPDIVERT
+ "divert enabled, "
+#else
+ "divert disabled, "
+#endif
+ "rule-based forwarding enabled, "
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+ "default to accept, ");
+#else
+ "default to deny, " );
+#endif
+#ifndef IPFIREWALL_VERBOSE
+ printf("logging disabled\n");
+#else
+ if (fw_verbose_limit == 0)
+ printf("unlimited logging\n");
+ else
+ printf("logging limited to %d packets/entry by default\n",
+ fw_verbose_limit);
+#endif
+}
+
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+ int s;
+ int err = 0 ;
+#if defined(KLD_MODULE)
+ struct ip_fw *fcp;
+#endif
+
+ switch (type) {
+ case MOD_LOAD:
+ s = splimp();
+ if (IPFW_LOADED) {
+ splx(s);
+ printf("IP firewall already loaded\n");
+ err = EEXIST ;
+ } else {
+ ip_fw_init();
+ splx(s);
+ }
+ break ;
+ case MOD_UNLOAD:
+#if !defined(KLD_MODULE)
+ printf("ipfw statically compiled, cannot unload\n");
+ err = EBUSY;
+#else
+ s = splimp();
+ ip_fw_chk_ptr = NULL ;
+ ip_fw_ctl_ptr = NULL ;
+ while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) != NULL)
+ free_chain(fcp);
+ splx(s);
+ printf("IP firewall unloaded\n");
+#endif
+ break;
+ default:
+ break;
+ }
+ return err;
+}
+
+static moduledata_t ipfwmod = {
+ "ipfw",
+ ipfw_modevent,
+ 0
+};
+DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(ipfw, 1);
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
new file mode 100644
index 0000000..dcb3bcf
--- /dev/null
+++ b/sys/netinet/ip_fw.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_FW_H
+#define _IP_FW_H
+
+#include <sys/queue.h>
+
+/*
+ * This union structure identifies an interface, either explicitly
+ * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME
+ * and IP_FW_F_OIFNAME say how to interpret this structure. An
+ * interface unit number of -1 matches any unit number, while an
+ * IP address of 0.0.0.0 indicates matches any interface.
+ *
+ * The receive and transmit interfaces are only compared against the
+ * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE)
+ * is set. Note some packets lack a receive or transmit interface
+ * (in which case the missing "interface" never matches).
+ */
+
+union ip_fw_if {
+ struct in_addr fu_via_ip; /* Specified by IP address */
+ struct { /* Specified by interface name */
+#define FW_IFNLEN 10 /* need room ! was IFNAMSIZ */
+ char name[FW_IFNLEN];
+ short unit; /* -1 means match any unit */
+ } fu_via_if;
+};
+
+/*
+ * Format of an IP firewall descriptor
+ *
+ * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order.
+ * fw_flg and fw_n*p are stored in host byte order (of course).
+ * Port numbers are stored in HOST byte order.
+ */
+
+/*
+ * To match MAC headers:
+ * 12 bytes at fw_mac_hdr contain the dst-src MAC address after masking.
+ * 12 bytes at fw_mac_mask contain the mask to apply to dst-src
+ * 2 bytes at fw_mac_type contain the mac type after mask (in net format)
+ * 2 bytes at fw_mac_type_mask contain the mac type mask
+ * If IP_FW_F_SRNG is set, the two hold the low and high ends of a range of types.
+ * IP_FW_F_DRNG is used to indicate we want to match a vlan.
+ */
+#define fw_mac_hdr fw_src
+#define fw_mac_mask fw_uar
+#define fw_mac_type fw_iplen
+#define fw_mac_mask_type fw_ipid
+
+struct ip_fw {
+ LIST_ENTRY(ip_fw) next; /* bidirectional list of rules */
+ u_int fw_flg; /* Operational Flags word */
+ u_int64_t fw_pcnt; /* Packet counters */
+ u_int64_t fw_bcnt; /* Byte counters */
+
+ struct in_addr fw_src; /* Source IP address */
+ struct in_addr fw_dst; /* Destination IP address */
+ struct in_addr fw_smsk; /* Mask for source IP address */
+ struct in_addr fw_dmsk; /* Mask for destination address */
+ u_short fw_number; /* Rule number */
+ u_char fw_prot; /* IP protocol */
+#if 1
+ u_char fw_nports; /* # of src/dst port in array */
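+	/* low nibble: # of source ports; high nibble: # of destination ports */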
+#define IP_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f)
+#define IP_FW_SETNSRCP(rule, n) do { \
+ (rule)->fw_nports &= ~0x0f; \
+ (rule)->fw_nports |= (n); \
+ } while (0)
+#define IP_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4)
+#define IP_FW_SETNDSTP(rule, n) do { \
+ (rule)->fw_nports &= ~0xf0; \
+ (rule)->fw_nports |= (n) << 4;\
+ } while (0)
+#define IP_FW_HAVEPORTS(rule) ((rule)->fw_nports != 0)
+#else
+ u_char __pad[1];
+ u_int _nsrcp;
+ u_int _ndstp;
+#define IP_FW_GETNSRCP(rule) (rule)->_nsrcp
+#define IP_FW_SETNSRCP(rule,n) (rule)->_nsrcp = n
+#define IP_FW_GETNDSTP(rule) (rule)->_ndstp
+#define IP_FW_SETNDSTP(rule,n) (rule)->_ndstp = n
+#define IP_FW_HAVEPORTS(rule) ((rule)->_ndstp + (rule)->_nsrcp != 0)
+#endif
+#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */
+ union {
+ u_short fw_pts[IP_FW_MAX_PORTS]; /* port numbers to match */
+#define IP_FW_ICMPTYPES_MAX 128
+#define IP_FW_ICMPTYPES_DIM (IP_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8))
+ unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /*ICMP types bitmap*/
+ } fw_uar;
+
+ u_int fw_ipflg; /* IP flags word */
+ u_short fw_iplen; /* IP length */
+ u_short fw_ipid; /* Identification */
+ u_char fw_ipopt; /* IP options set */
+ u_char fw_ipnopt; /* IP options unset */
+ u_char fw_iptos; /* IP type of service set */
+ u_char fw_ipntos; /* IP type of service unset */
+ u_char fw_ipttl; /* IP time to live */
+ u_int fw_ipver:4; /* IP version */
+ u_char fw_tcpopt; /* TCP options set */
+ u_char fw_tcpnopt; /* TCP options unset */
+ u_char fw_tcpf; /* TCP flags set */
+ u_char fw_tcpnf; /* TCP flags unset */
+ u_short fw_tcpwin; /* TCP window size */
+ u_int32_t fw_tcpseq; /* TCP sequence */
+ u_int32_t fw_tcpack; /* TCP acknowledgement */
+ long timestamp; /* timestamp (tv_sec) of last match */
+ union ip_fw_if fw_in_if; /* Incoming interfaces */
+ union ip_fw_if fw_out_if; /* Outgoing interfaces */
+ union {
+ u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */
+ u_short fu_pipe_nr; /* queue number (option DUMMYNET) */
+ u_short fu_skipto_rule; /* SKIPTO command rule number */
+ u_short fu_reject_code; /* REJECT response code */
+ struct sockaddr_in fu_fwd_ip;
+ } fw_un;
+ void *pipe_ptr; /* flow_set ptr for dummynet pipe */
+ void *next_rule_ptr; /* next rule in case of match */
+ uid_t fw_uid; /* uid to match */
+ gid_t fw_gid; /* gid to match */
+ int fw_logamount; /* amount to log */
+ u_int64_t fw_loghighest; /* highest number packet to log */
+
+ long dont_match_prob; /* 0x7fffffff means 1.0, always fail */
+ u_char dyn_type; /* type for dynamic rule */
+
+#define DYN_KEEP_STATE 0 /* type for keep-state rules */
+#define DYN_LIMIT 1 /* type for limit connection rules */
+#define DYN_LIMIT_PARENT 2 /* parent entry for limit connection rules */
+
+	/* following two fields are used to limit the number of connections
+	 * based on either src, srcport, dst, dstport.
+ */
+ u_char limit_mask; /* mask type for limit rule, can
+ * have many.
+ */
+#define DYN_SRC_ADDR 0x1
+#define DYN_SRC_PORT 0x2
+#define DYN_DST_ADDR 0x4
+#define DYN_DST_PORT 0x8
+
+ u_short conn_limit; /* # of connections for limit rule */
+};
+
+#define fw_divert_port fw_un.fu_divert_port
+#define fw_skipto_rule fw_un.fu_skipto_rule
+#define fw_reject_code fw_un.fu_reject_code
+#define fw_pipe_nr fw_un.fu_pipe_nr
+#define fw_fwd_ip fw_un.fu_fwd_ip
+
+/*
+ *
+ * rule_ptr -------------+
+ * V
+ * [ next.le_next ]---->[ next.le_next ]---- [ next.le_next ]--->
+ * [ next.le_prev ]<----[ next.le_prev ]<----[ next.le_prev ]<---
+ * [ <ip_fw> body ] [ <ip_fw> body ] [ <ip_fw> body ]
+ *
+ */
+
+/*
+ * Flow mask/flow id for each queue.
+ */
+struct ipfw_flow_id {
+ u_int32_t dst_ip;
+ u_int32_t src_ip;
+ u_int16_t dst_port;
+ u_int16_t src_port;
+ u_int8_t proto;
+ u_int8_t flags; /* protocol-specific flags */
+};
+
+/*
+ * dynamic ipfw rule
+ */
+struct ipfw_dyn_rule {
+ struct ipfw_dyn_rule *next;
+ struct ipfw_flow_id id; /* (masked) flow id */
+ struct ip_fw *rule; /* pointer to rule */
+ struct ipfw_dyn_rule *parent; /* pointer to parent rule */
+ u_int32_t expire; /* expire time */
+ u_int64_t pcnt; /* packet match counters */
+ u_int64_t bcnt; /* byte match counters */
+ u_int32_t bucket; /* which bucket in hash table */
+ u_int32_t state; /* state of this rule (typically a
+ * combination of TCP flags)
+ */
+ u_int16_t dyn_type; /* rule type */
+ u_int16_t count; /* refcount */
+};
+
+/*
+ * Values for the "flags" field.
+ */
+#define IP_FW_F_COMMAND 0x000000ff /* Mask for type of chain entry: */
+#define IP_FW_F_DENY 0x00000000 /* This is a deny rule */
+#define IP_FW_F_REJECT 0x00000001 /* Deny and send a response packet */
+#define IP_FW_F_ACCEPT 0x00000002 /* This is an accept rule */
+#define IP_FW_F_COUNT 0x00000003 /* This is a count rule */
+#define IP_FW_F_DIVERT 0x00000004 /* This is a divert rule */
+#define IP_FW_F_TEE 0x00000005 /* This is a tee rule */
+#define IP_FW_F_SKIPTO 0x00000006 /* This is a skipto rule */
+#define IP_FW_F_FWD 0x00000007 /* This is a "change forwarding
+ * address" rule
+ */
+#define IP_FW_F_PIPE 0x00000008 /* This is a dummynet rule */
+#define IP_FW_F_QUEUE 0x00000009 /* This is a dummynet queue */
+
+#define IP_FW_F_IN 0x00000100 /* Check inbound packets */
+#define IP_FW_F_OUT 0x00000200 /* Check outbound packets */
+#define IP_FW_F_IIFACE 0x00000400 /* Apply inbound interface test */
+#define IP_FW_F_OIFACE 0x00000800 /* Apply outbound interface test */
+#define IP_FW_F_PRN 0x00001000 /* Print if this rule matches */
+#define IP_FW_F_SRNG 0x00002000 /* The first two src ports are a min
+ * and max range (stored in host byte
+ * order).
+ */
+#define IP_FW_F_DRNG 0x00004000 /* The first two dst ports are a min
+ * and max range (stored in host byte
+ * order).
+ */
+#define IP_FW_F_FRAG 0x00008000 /* Fragment */
+#define IP_FW_F_IIFNAME 0x00010000 /* In interface by name/unit (not IP) */
+#define IP_FW_F_OIFNAME 0x00020000 /* Out interface by name/unit (not IP)*/
+#define IP_FW_F_INVSRC 0x00040000 /* Invert sense of src check */
+#define IP_FW_F_INVDST 0x00080000 /* Invert sense of dst check */
+#define IP_FW_F_ICMPBIT 0x00100000 /* ICMP type bitmap is valid */
+#define IP_FW_F_UID 0x00200000 /* filter by uid */
+#define IP_FW_F_GID 0x00400000 /* filter by gid */
+#define IP_FW_F_RND_MATCH 0x00800000 /* probabilistic rule match */
+#define IP_FW_F_SMSK 0x01000000 /* src-port + mask */
+#define IP_FW_F_DMSK 0x02000000 /* dst-port + mask */
+#define IP_FW_BRIDGED 0x04000000 /* only match bridged packets */
+#define IP_FW_F_KEEP_S 0x08000000 /* keep state */
+#define IP_FW_F_CHECK_S 0x10000000 /* check state */
+#define IP_FW_F_SME 0x20000000 /* source = me */
+#define IP_FW_F_DME 0x40000000 /* destination = me */
+#define IP_FW_F_MAC 0x80000000 /* match MAC header */
+
+#define IP_FW_F_MASK 0xFFFFFFFF /* All possible flag bits mask */
+
+/*
+ * Flags for the 'fw_ipflg' field, for comparing values
+ * of ip and its protocols.
+ */
+#define IP_FW_IF_TCPOPT 0x00000001 /* tcp options */
+#define IP_FW_IF_TCPFLG 0x00000002 /* tcp flags */
+#define IP_FW_IF_TCPSEQ 0x00000004 /* tcp sequence number */
+#define IP_FW_IF_TCPACK 0x00000008 /* tcp acknowledgement number */
+#define IP_FW_IF_TCPWIN 0x00000010 /* tcp window size */
+#define IP_FW_IF_TCPEST 0x00000020 /* established TCP connection */
+#define IP_FW_IF_TCPMSK 0x0000003f /* mask of all tcp values */
+#define IP_FW_IF_IPOPT 0x00000100 /* ip options */
+#define IP_FW_IF_IPLEN 0x00000200 /* ip length */
+#define IP_FW_IF_IPID 0x00000400 /* ip identification */
+#define IP_FW_IF_IPTOS 0x00000800 /* ip type of service */
+#define IP_FW_IF_IPTTL 0x00001000 /* ip time to live */
+#define IP_FW_IF_IPVER 0x00002000 /* ip version */
+#define IP_FW_IF_IPPRE 0x00004000 /* ip precedence */
+#define IP_FW_IF_IPMSK 0x00007f00 /* mask of all ip values */
+#define IP_FW_IF_MSK 0x0000ffff /* All possible bits mask */
+
+/*
+ * For backwards compatibility with rules specifying "via iface" but
+ * not restricted to only "in" or "out" packets, we define this combination
+ * of bits to represent this configuration.
+ */
+
+#define IF_FW_F_VIAHACK (IP_FW_F_IN|IP_FW_F_OUT|IP_FW_F_IIFACE|IP_FW_F_OIFACE)
+
+/*
+ * Definitions for REJECT response codes.
+ * Values less than 256 correspond to ICMP unreachable codes.
+ */
+#define IP_FW_REJECT_RST 0x0100 /* TCP packets: send RST */
+
+/*
+ * Definitions for IP option names.
+ */
+#define IP_FW_IPOPT_LSRR 0x01
+#define IP_FW_IPOPT_SSRR 0x02
+#define IP_FW_IPOPT_RR 0x04
+#define IP_FW_IPOPT_TS 0x08
+
+/*
+ * Definitions for TCP option names.
+ */
+#define IP_FW_TCPOPT_MSS 0x01
+#define IP_FW_TCPOPT_WINDOW 0x02
+#define IP_FW_TCPOPT_SACK 0x04
+#define IP_FW_TCPOPT_TS 0x08
+#define IP_FW_TCPOPT_CC 0x10
+
+/*
+ * Main firewall chains definitions and global var's definitions.
+ */
+#ifdef _KERNEL
+
+#define IP_FW_PORT_DYNT_FLAG 0x10000
+#define IP_FW_PORT_TEE_FLAG 0x20000
+#define IP_FW_PORT_DENY_FLAG 0x40000
+
+/*
+ * arguments for calling ip_fw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+ struct mbuf *m; /* the mbuf chain */
+ struct ifnet *oif; /* output interface */
+ struct sockaddr_in *next_hop; /* forward address */
+ struct ip_fw *rule; /* matching rule */
+ struct ether_header *eh; /* for bridged packets */
+
+ struct route *ro; /* for dummynet */
+ struct sockaddr_in *dst; /* for dummynet */
+ int flags; /* for dummynet */
+
+ struct ipfw_flow_id f_id; /* grabbed from IP header */
+ u_int16_t divert_rule; /* divert cookie */
+ u_int32_t retval;
+};
+
+/*
+ * Function definitions.
+ */
+void ip_fw_init(void);
+
+/* Firewall hooks */
+struct ip;
+struct sockopt;
+typedef int ip_fw_chk_t (struct ip_fw_args *args);
+typedef int ip_fw_ctl_t (struct sockopt *);
+extern ip_fw_chk_t *ip_fw_chk_ptr;
+extern ip_fw_ctl_t *ip_fw_ctl_ptr;
+extern int fw_one_pass;
+extern int fw_enable;
+#define IPFW_LOADED (ip_fw_chk_ptr != NULL)
+#endif /* _KERNEL */
+
+#endif /* _IP_FW_H */
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
new file mode 100644
index 0000000..7042dd8
--- /dev/null
+++ b/sys/netinet/ip_icmp.c
@@ -0,0 +1,871 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/route.h>
+
+#define _IP_VHL
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/icmp_var.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netkey/key.h>
+#endif
+
+#include <machine/in_cksum.h>
+
+/*
+ * ICMP routines: error generation, receive packet processing,
+ * routines to turn packets around back to the originator, and
+ * host table maintenance routines.
+ */
+
+static struct icmpstat icmpstat;
+SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW,
+ &icmpstat, icmpstat, "");
+
+static int icmpmaskrepl = 0;
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW,
+ &icmpmaskrepl, 0, "");
+
+static int drop_redirect = 0;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW,
+ &drop_redirect, 0, "");
+
+static int log_redirect = 0;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
+ &log_redirect, 0, "");
+
+static int icmplim = 200;
+SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
+ &icmplim, 0, "");
+
+static int icmplim_output = 1;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
+ &icmplim_output, 0, "");
+
+/*
+ * ICMP broadcast echo sysctl
+ */
+
+static int icmpbmcastecho = 0;
+SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW,
+ &icmpbmcastecho, 0, "");
+
+
+#ifdef ICMPPRINTFS
+int icmpprintfs = 0;
+#endif
+
+static void icmp_reflect(struct mbuf *);
+static void icmp_send(struct mbuf *, struct mbuf *, struct route *);
+static int ip_next_mtu(int, int);
+
+extern struct protosw inetsw[];
+
+/*
+ * Generate an error packet of type error
+ * in response to bad packet ip.
+ */
+void
+icmp_error(n, type, code, dest, destifp)
+ struct mbuf *n;
+ int type, code;
+ n_long dest;
+ struct ifnet *destifp;
+{
+ register struct ip *oip = mtod(n, struct ip *), *nip;
+ register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2;
+ register struct icmp *icp;
+ register struct mbuf *m;
+ unsigned icmplen;
+
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_error(%p, %x, %d)\n", oip, type, code);
+#endif
+ if (type != ICMP_REDIRECT)
+ icmpstat.icps_error++;
+ /*
+	 * Don't send an error if this is not the first fragment of the message.
+	 * Don't send an error if the old packet's protocol was ICMP and it
+	 * carried anything other than a known informational type.
+ */
+ if (oip->ip_off &~ (IP_MF|IP_DF))
+ goto freeit;
+ if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
+ n->m_len >= oiplen + ICMP_MINLEN &&
+ !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) {
+ icmpstat.icps_oldicmp++;
+ goto freeit;
+ }
+ /* Don't send error in response to a multicast or broadcast packet */
+ if (n->m_flags & (M_BCAST|M_MCAST))
+ goto freeit;
+ /*
+ * First, formulate icmp message
+ */
+ m = m_gethdr(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ goto freeit;
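+	/*
+	 * Quote the offending IP header plus the first 8 bytes of its
+	 * payload (cf. RFC 792).
+	 */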
+ icmplen = min(oiplen + 8, oip->ip_len);
+ if (icmplen < sizeof(struct ip))
+ panic("icmp_error: bad length");
+ m->m_len = icmplen + ICMP_MINLEN;
+ MH_ALIGN(m, m->m_len);
+ icp = mtod(m, struct icmp *);
+ if ((u_int)type > ICMP_MAXTYPE)
+ panic("icmp_error");
+ icmpstat.icps_outhist[type]++;
+ icp->icmp_type = type;
+ if (type == ICMP_REDIRECT)
+ icp->icmp_gwaddr.s_addr = dest;
+ else {
+ icp->icmp_void = 0;
+ /*
+ * The following assignments assume an overlay with the
+ * zeroed icmp_void field.
+ */
+ if (type == ICMP_PARAMPROB) {
+ icp->icmp_pptr = code;
+ code = 0;
+ } else if (type == ICMP_UNREACH &&
+ code == ICMP_UNREACH_NEEDFRAG && destifp) {
+ icp->icmp_nextmtu = htons(destifp->if_mtu);
+ }
+ }
+
+ icp->icmp_code = code;
+ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
+ nip = &icp->icmp_ip;
+
+ /*
+ * Convert fields to network representation.
+ */
+ nip->ip_len = htons(nip->ip_len);
+ nip->ip_off = htons(nip->ip_off);
+
+ /*
+ * Now, copy old ip header (without options)
+ * in front of icmp message.
+ */
+ if (m->m_data - sizeof(struct ip) < m->m_pktdat)
+ panic("icmp len");
+ m->m_data -= sizeof(struct ip);
+ m->m_len += sizeof(struct ip);
+ m->m_pkthdr.len = m->m_len;
+ m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
+ nip = mtod(m, struct ip *);
+ bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
+ nip->ip_len = m->m_len;
+ nip->ip_vhl = IP_VHL_BORING;
+ nip->ip_p = IPPROTO_ICMP;
+ nip->ip_tos = 0;
+ icmp_reflect(m);
+
+freeit:
+ m_freem(n);
+}
+
+static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET };
+static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET };
+static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET };
+
+/*
+ * Process a received ICMP message.
+ */
+void
+icmp_input(m, off)
+ register struct mbuf *m;
+ int off;
+{
+ int hlen = off;
+ register struct icmp *icp;
+ register struct ip *ip = mtod(m, struct ip *);
+ int icmplen = ip->ip_len;
+ register int i;
+ struct in_ifaddr *ia;
+ void (*ctlfunc)(int, struct sockaddr *, void *);
+ int code;
+
+ /*
+	 * Locate the icmp structure in the mbuf, and check
+	 * that it is not corrupted and is of at least minimum length.
+ */
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(ip->ip_src));
+ printf("icmp_input from %s to %s, len %d\n",
+ buf, inet_ntoa(ip->ip_dst), icmplen);
+ }
+#endif
+ if (icmplen < ICMP_MINLEN) {
+ icmpstat.icps_tooshort++;
+ goto freeit;
+ }
+ i = hlen + min(icmplen, ICMP_ADVLENMIN);
+ if (m->m_len < i && (m = m_pullup(m, i)) == 0) {
+ icmpstat.icps_tooshort++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ m->m_len -= hlen;
+ m->m_data += hlen;
+ icp = mtod(m, struct icmp *);
+ if (in_cksum(m, icmplen)) {
+ icmpstat.icps_checksum++;
+ goto freeit;
+ }
+ m->m_len += hlen;
+ m->m_data -= hlen;
+
+ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
+ /*
+ * Deliver very specific ICMP type only.
+ */
+ switch (icp->icmp_type) {
+ case ICMP_UNREACH:
+ case ICMP_TIMXCEED:
+ break;
+ default:
+ goto freeit;
+ }
+ }
+
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_input, type %d code %d\n", icp->icmp_type,
+ icp->icmp_code);
+#endif
+
+ /*
+ * Message type specific processing.
+ */
+ if (icp->icmp_type > ICMP_MAXTYPE)
+ goto raw;
+ icmpstat.icps_inhist[icp->icmp_type]++;
+ code = icp->icmp_code;
+ switch (icp->icmp_type) {
+
+ case ICMP_UNREACH:
+ switch (code) {
+ case ICMP_UNREACH_NET:
+ case ICMP_UNREACH_HOST:
+ case ICMP_UNREACH_SRCFAIL:
+ case ICMP_UNREACH_NET_UNKNOWN:
+ case ICMP_UNREACH_HOST_UNKNOWN:
+ case ICMP_UNREACH_ISOLATED:
+ case ICMP_UNREACH_TOSNET:
+ case ICMP_UNREACH_TOSHOST:
+ case ICMP_UNREACH_HOST_PRECEDENCE:
+ case ICMP_UNREACH_PRECEDENCE_CUTOFF:
+ code = PRC_UNREACH_NET;
+ break;
+
+ case ICMP_UNREACH_NEEDFRAG:
+ code = PRC_MSGSIZE;
+ break;
+
+ /*
+ * RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
+ * Treat subcodes 2,3 as immediate RST
+ */
+ case ICMP_UNREACH_PROTOCOL:
+ case ICMP_UNREACH_PORT:
+ code = PRC_UNREACH_PORT;
+ break;
+
+ case ICMP_UNREACH_NET_PROHIB:
+ case ICMP_UNREACH_HOST_PROHIB:
+ case ICMP_UNREACH_FILTER_PROHIB:
+ code = PRC_UNREACH_ADMIN_PROHIB;
+ break;
+
+ default:
+ goto badcode;
+ }
+ goto deliver;
+
+ case ICMP_TIMXCEED:
+ if (code > 1)
+ goto badcode;
+ code += PRC_TIMXCEED_INTRANS;
+ goto deliver;
+
+ case ICMP_PARAMPROB:
+ if (code > 1)
+ goto badcode;
+ code = PRC_PARAMPROB;
+ goto deliver;
+
+ case ICMP_SOURCEQUENCH:
+ if (code)
+ goto badcode;
+ code = PRC_QUENCH;
+ deliver:
+ /*
+ * Problem with datagram; advise higher level routines.
+ */
+ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
+ IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) {
+ icmpstat.icps_badlen++;
+ goto freeit;
+ }
+ icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len);
+ /* Discard ICMP's in response to multicast packets */
+ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
+ goto badcode;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
+#endif
+ icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
+#if 1
+ /*
+ * MTU discovery:
+ * If we got a needfrag and there is a host route to the
+ * original destination, and the MTU is not locked, then
+ * set the MTU in the route to the suggested new value
+ * (if given) and then notify as usual. The ULPs will
+ * notice that the MTU has changed and adapt accordingly.
+ * If no new MTU was suggested, then we guess a new one
+ * less than the current value. If the new MTU is
+ * unreasonably small (arbitrarily set at 296), then
+ * we reset the MTU to the interface value and enable the
+ * lock bit, indicating that we are no longer doing MTU
+ * discovery.
+ */
+ if (code == PRC_MSGSIZE) {
+ struct rtentry *rt;
+ int mtu;
+
+ rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
+ RTF_CLONING | RTF_PRCLONING);
+ if (rt && (rt->rt_flags & RTF_HOST)
+ && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
+ mtu = ntohs(icp->icmp_nextmtu);
+ if (!mtu)
+ mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu,
+ 1);
+#ifdef DEBUG_MTUDISC
+ printf("MTU for %s reduced to %d\n",
+ inet_ntoa(icmpsrc.sin_addr), mtu);
+#endif
+ if (mtu < 296) {
+ /* rt->rt_rmx.rmx_mtu =
+ rt->rt_ifp->if_mtu; */
+ rt->rt_rmx.rmx_locks |= RTV_MTU;
+ } else if (rt->rt_rmx.rmx_mtu > mtu) {
+ rt->rt_rmx.rmx_mtu = mtu;
+ }
+ }
+ if (rt)
+ RTFREE(rt);
+ }
+
+#endif
+ /*
+ * XXX if the packet contains [IPv4 AH TCP], we can't make a
+ * notification to TCP layer.
+ */
+ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
+ if (ctlfunc)
+ (*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
+ (void *)&icp->icmp_ip);
+ break;
+
+ badcode:
+ icmpstat.icps_badcode++;
+ break;
+
+ case ICMP_ECHO:
+ if (!icmpbmcastecho
+ && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
+ icmpstat.icps_bmcastecho++;
+ break;
+ }
+ icp->icmp_type = ICMP_ECHOREPLY;
+ if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
+ goto freeit;
+ else
+ goto reflect;
+
+ case ICMP_TSTAMP:
+ if (!icmpbmcastecho
+ && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
+ icmpstat.icps_bmcasttstamp++;
+ break;
+ }
+ if (icmplen < ICMP_TSLEN) {
+ icmpstat.icps_badlen++;
+ break;
+ }
+ icp->icmp_type = ICMP_TSTAMPREPLY;
+ icp->icmp_rtime = iptime();
+ icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */
+ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
+ goto freeit;
+ else
+ goto reflect;
+
+ case ICMP_MASKREQ:
+ if (icmpmaskrepl == 0)
+ break;
+ /*
+		 * We are not able to respond with an all-ones broadcast
+		 * unless we receive it over a point-to-point interface.
+ */
+ if (icmplen < ICMP_MASKLEN)
+ break;
+ switch (ip->ip_dst.s_addr) {
+
+ case INADDR_BROADCAST:
+ case INADDR_ANY:
+ icmpdst.sin_addr = ip->ip_src;
+ break;
+
+ default:
+ icmpdst.sin_addr = ip->ip_dst;
+ }
+ ia = (struct in_ifaddr *)ifaof_ifpforaddr(
+ (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
+ if (ia == 0)
+ break;
+ if (ia->ia_ifp == 0)
+ break;
+ icp->icmp_type = ICMP_MASKREPLY;
+ icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
+ if (ip->ip_src.s_addr == 0) {
+ if (ia->ia_ifp->if_flags & IFF_BROADCAST)
+ ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
+ else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
+ ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
+ }
+reflect:
+ ip->ip_len += hlen; /* since ip_input deducts this */
+ icmpstat.icps_reflect++;
+ icmpstat.icps_outhist[icp->icmp_type]++;
+ icmp_reflect(m);
+ return;
+
+ case ICMP_REDIRECT:
+ if (log_redirect) {
+ u_long src, dst, gw;
+
+ src = ntohl(ip->ip_src.s_addr);
+ dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
+ gw = ntohl(icp->icmp_gwaddr.s_addr);
+ printf("icmp redirect from %d.%d.%d.%d: "
+ "%d.%d.%d.%d => %d.%d.%d.%d\n",
+ (int)(src >> 24), (int)((src >> 16) & 0xff),
+ (int)((src >> 8) & 0xff), (int)(src & 0xff),
+ (int)(dst >> 24), (int)((dst >> 16) & 0xff),
+ (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
+ (int)(gw >> 24), (int)((gw >> 16) & 0xff),
+ (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
+ }
+ if (drop_redirect)
+ break;
+ if (code > 3)
+ goto badcode;
+ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
+ IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) {
+ icmpstat.icps_badlen++;
+ break;
+ }
+ /*
+ * Short circuit routing redirects to force
+ * immediate change in the kernel's routing
+ * tables. The message is also handed to anyone
+ * listening on a raw socket (e.g. the routing
+ * daemon for use in updating its tables).
+ */
+ icmpgw.sin_addr = ip->ip_src;
+ icmpdst.sin_addr = icp->icmp_gwaddr;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst));
+
+ printf("redirect dst %s to %s\n",
+ buf, inet_ntoa(icp->icmp_gwaddr));
+ }
+#endif
+ icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
+ rtredirect((struct sockaddr *)&icmpsrc,
+ (struct sockaddr *)&icmpdst,
+ (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
+ (struct sockaddr *)&icmpgw, (struct rtentry **)0);
+ pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
+#ifdef IPSEC
+ key_sa_routechange((struct sockaddr *)&icmpsrc);
+#endif
+ break;
+
+ /*
+ * No kernel processing for the following;
+ * just fall through to send to raw listener.
+ */
+ case ICMP_ECHOREPLY:
+ case ICMP_ROUTERADVERT:
+ case ICMP_ROUTERSOLICIT:
+ case ICMP_TSTAMPREPLY:
+ case ICMP_IREQREPLY:
+ case ICMP_MASKREPLY:
+ default:
+ break;
+ }
+
+raw:
+ rip_input(m, off);
+ return;
+
+freeit:
+ m_freem(m);
+}
+
+/*
+ * Reflect the ip packet back to the source
+ */
+static void
+icmp_reflect(m)
+ struct mbuf *m;
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct ifaddr *ifa;
+ struct in_ifaddr *ia;
+ struct in_addr t;
+ struct mbuf *opts = 0;
+ int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
+ struct route *ro = NULL, rt;
+
+ if (!in_canforward(ip->ip_src) &&
+ ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) !=
+ (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
+ m_freem(m); /* Bad return address */
+ icmpstat.icps_badaddr++;
+ goto done; /* Ip_output() will check for broadcast */
+ }
+ t = ip->ip_dst;
+ ip->ip_dst = ip->ip_src;
+ ro = &rt;
+ bzero(ro, sizeof(*ro));
+ /*
+ * If the incoming packet was addressed directly to us,
+ * use dst as the src for the reply. Otherwise (broadcast
+ * or anonymous), use the address which corresponds
+ * to the incoming interface.
+ */
+ LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash)
+ if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr)
+ goto match;
+ if (m->m_pkthdr.rcvif != NULL &&
+ m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
+ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
+ t.s_addr)
+ goto match;
+ }
+ }
+ ia = ip_rtaddr(ip->ip_dst, ro);
+ /* We need a route to do anything useful. */
+ if (ia == NULL) {
+ m_freem(m);
+ icmpstat.icps_noroute++;
+ goto done;
+ }
+match:
+ t = IA_SIN(ia)->sin_addr;
+ ip->ip_src = t;
+ ip->ip_ttl = ip_defttl;
+
+ if (optlen > 0) {
+ register u_char *cp;
+ int opt, cnt;
+ u_int len;
+
+ /*
+ * Retrieve any source routing from the incoming packet;
+ * add on any record-route or timestamp options.
+ */
+ cp = (u_char *) (ip + 1);
+ if ((opts = ip_srcroute()) == 0 &&
+ (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) {
+ opts->m_len = sizeof(struct in_addr);
+ mtod(opts, struct in_addr *)->s_addr = 0;
+ }
+ if (opts) {
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("icmp_reflect optlen %d rt %d => ",
+ optlen, opts->m_len);
+#endif
+ for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ len = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp))
+ break;
+ len = cp[IPOPT_OLEN];
+ if (len < IPOPT_OLEN + sizeof(*cp) ||
+ len > cnt)
+ break;
+ }
+ /*
+ * Should check for overflow, but it "can't happen"
+ */
+ if (opt == IPOPT_RR || opt == IPOPT_TS ||
+ opt == IPOPT_SECURITY) {
+ bcopy((caddr_t)cp,
+ mtod(opts, caddr_t) + opts->m_len, len);
+ opts->m_len += len;
+ }
+ }
+ /* Terminate & pad, if necessary */
+ cnt = opts->m_len % 4;
+ if (cnt) {
+ for (; cnt < 4; cnt++) {
+ *(mtod(opts, caddr_t) + opts->m_len) =
+ IPOPT_EOL;
+ opts->m_len++;
+ }
+ }
+#ifdef ICMPPRINTFS
+ if (icmpprintfs)
+ printf("%d\n", opts->m_len);
+#endif
+ }
+ /*
+ * Now strip out original options by copying rest of first
+ * mbuf's data back, and adjust the IP length.
+ */
+ ip->ip_len -= optlen;
+ ip->ip_vhl = IP_VHL_BORING;
+ m->m_len -= optlen;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= optlen;
+ optlen += sizeof(struct ip);
+ bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1),
+ (unsigned)(m->m_len - sizeof(struct ip)));
+ }
+ m->m_flags &= ~(M_BCAST|M_MCAST);
+ icmp_send(m, opts, ro);
+done:
+ if (opts)
+ (void)m_free(opts);
+ if (ro && ro->ro_rt)
+ RTFREE(ro->ro_rt);
+}
+
+/*
+ * Send an icmp packet back to the ip level,
+ * after supplying a checksum.
+ */
+static void
+icmp_send(m, opts, rt)
+ register struct mbuf *m;
+ struct mbuf *opts;
+ struct route *rt;
+{
+ register struct ip *ip = mtod(m, struct ip *);
+ register int hlen;
+ register struct icmp *icp;
+
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ m->m_data += hlen;
+ m->m_len -= hlen;
+ icp = mtod(m, struct icmp *);
+ icp->icmp_cksum = 0;
+ icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen);
+ m->m_data -= hlen;
+ m->m_len += hlen;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef ICMPPRINTFS
+ if (icmpprintfs) {
+ char buf[4 * sizeof "123"];
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ printf("icmp_send dst %s src %s\n",
+ buf, inet_ntoa(ip->ip_src));
+ }
+#endif
+ (void) ip_output(m, opts, rt, 0, NULL);
+}
+
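+/*
+ * Return the time of day in milliseconds since midnight (UT), in
+ * network byte order, for use in the ICMP timestamp fields above.
+ */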
+n_time
+iptime()
+{
+ struct timeval atv;
+ u_long t;
+
+ getmicrotime(&atv);
+ t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
+ return (htonl(t));
+}
+
+#if 1
+/*
+ * Return the next larger or smaller MTU plateau (table from RFC 1191)
+ * given current value MTU. If DIR is less than zero, a larger plateau
+ * is returned; otherwise, a smaller value is returned.
+ */
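+/*
+ * For example, ip_next_mtu(1500, 1) finds the first plateau not larger
+ * than 1500 (1492) and, since 1500 > 1492, returns 1492, while
+ * ip_next_mtu(1492, 1) steps down to the next plateau, 1006.
+ */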
+static int
+ip_next_mtu(mtu, dir)
+ int mtu;
+ int dir;
+{
+ static int mtutab[] = {
+ 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296,
+ 68, 0
+ };
+ int i;
+
+ for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) {
+ if (mtu >= mtutab[i])
+ break;
+ }
+
+ if (dir < 0) {
+ if (i == 0) {
+ return 0;
+ } else {
+ return mtutab[i - 1];
+ }
+ } else {
+ if (mtutab[i] == 0) {
+ return 0;
+ } else if(mtu > mtutab[i]) {
+ return mtutab[i];
+ } else {
+ return mtutab[i + 1];
+ }
+ }
+}
+#endif
+
+
+/*
+ * badport_bandlim() - check for ICMP bandwidth limit
+ *
+ * Return 0 if it is ok to send an ICMP error response, -1 if we have
+ * hit our bandwidth limit and it is not ok.
+ *
+ * If icmplim is <= 0, the feature is disabled and 0 is returned.
+ *
+ * For now we separate the TCP and UDP subsystems w/ different 'which'
+ * values. We may eventually remove this separation (and simplify the
+ * code further).
+ *
+ * Note that the printing of the error message is delayed so we can
+ * properly report the icmp error rate that the system was attempting
+ * (e.g. 22000/100 pps). This can cause long delays before the
+ * 'final' error is printed, but it doesn't make sense to solve the
+ * printing delay with more complex code.
+ */
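+/*
+ * Callers test the return value before reflecting, e.g. icmp_input()
+ * above does
+ *
+ *	if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
+ *		goto freeit;
+ *
+ * before reflecting an echo request as an echo reply.
+ */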
+
+int
+badport_bandlim(int which)
+{
+ static int lticks[BANDLIM_MAX + 1];
+ static int lpackets[BANDLIM_MAX + 1];
+ int dticks;
+ const char *bandlimittype[] = {
+ "Limiting icmp unreach response",
+ "Limiting icmp ping response",
+ "Limiting icmp tstamp response",
+ "Limiting closed port RST response",
+ "Limiting open port RST response"
+ };
+
+ /*
+	 * Return ok status if the feature is disabled or the argument
+	 * is out of range.
+ */
+
+ if (icmplim <= 0 || which > BANDLIM_MAX || which < 0)
+ return(0);
+ dticks = ticks - lticks[which];
+
+ /*
+ * reset stats when cumulative dt exceeds one second.
+ */
+
+ if ((unsigned int)dticks > hz) {
+ if (lpackets[which] > icmplim && icmplim_output) {
+ printf("%s from %d to %d packets per second\n",
+ bandlimittype[which],
+ lpackets[which],
+ icmplim
+ );
+ }
+ lticks[which] = ticks;
+ lpackets[which] = 0;
+ }
+
+ /*
+ * bump packet count
+ */
+
+ if (++lpackets[which] > icmplim) {
+ return(-1);
+ }
+ return(0);
+}
+
diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h
new file mode 100644
index 0000000..927efd9
--- /dev/null
+++ b/sys/netinet/ip_icmp.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_ICMP_H_
+#define _NETINET_IP_ICMP_H_
+
+/*
+ * Interface Control Message Protocol Definitions.
+ * Per RFC 792, September 1981.
+ */
+
+/*
+ * Internal structure of an ICMP Router Advertisement
+ */
+struct icmp_ra_addr {
+ u_int32_t ira_addr;
+ u_int32_t ira_preference;
+};
+
+/*
+ * Structure of an icmp header.
+ */
+struct icmp {
+ u_char icmp_type; /* type of message, see below */
+ u_char icmp_code; /* type sub code */
+ u_short icmp_cksum; /* ones complement cksum of struct */
+ union {
+ u_char ih_pptr; /* ICMP_PARAMPROB */
+ struct in_addr ih_gwaddr; /* ICMP_REDIRECT */
+ struct ih_idseq {
+ n_short icd_id;
+ n_short icd_seq;
+ } ih_idseq;
+ int ih_void;
+
+ /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */
+ struct ih_pmtu {
+ n_short ipm_void;
+ n_short ipm_nextmtu;
+ } ih_pmtu;
+
+ struct ih_rtradv {
+ u_char irt_num_addrs;
+ u_char irt_wpa;
+ u_int16_t irt_lifetime;
+ } ih_rtradv;
+ } icmp_hun;
+#define icmp_pptr icmp_hun.ih_pptr
+#define icmp_gwaddr icmp_hun.ih_gwaddr
+#define icmp_id icmp_hun.ih_idseq.icd_id
+#define icmp_seq icmp_hun.ih_idseq.icd_seq
+#define icmp_void icmp_hun.ih_void
+#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void
+#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu
+#define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs
+#define icmp_wpa icmp_hun.ih_rtradv.irt_wpa
+#define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime
+ union {
+ struct id_ts {
+ n_time its_otime;
+ n_time its_rtime;
+ n_time its_ttime;
+ } id_ts;
+ struct id_ip {
+ struct ip idi_ip;
+ /* options and then 64 bits of data */
+ } id_ip;
+ struct icmp_ra_addr id_radv;
+ u_int32_t id_mask;
+ char id_data[1];
+ } icmp_dun;
+#define icmp_otime icmp_dun.id_ts.its_otime
+#define icmp_rtime icmp_dun.id_ts.its_rtime
+#define icmp_ttime icmp_dun.id_ts.its_ttime
+#define icmp_ip icmp_dun.id_ip.idi_ip
+#define icmp_radv icmp_dun.id_radv
+#define icmp_mask icmp_dun.id_mask
+#define icmp_data icmp_dun.id_data
+};
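+/*
+ * The accessor macros above simply spell out the union members, e.g.
+ * icp->icmp_nextmtu expands to icp->icmp_hun.ih_pmtu.ipm_nextmtu and
+ * icp->icmp_ip expands to icp->icmp_dun.id_ip.idi_ip.
+ */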
+
+/*
+ * Lower bounds on packet lengths for various types.
+ * For error advice packets we must first ensure that the
+ * packet is large enough to contain the returned ip header.
+ * Only then can we check whether 64 bits of packet data
+ * have been returned, since we need the returned ip header
+ * length for that check.
+ */
+#define ICMP_MINLEN 8 /* abs minimum */
+#define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */
+#define ICMP_MASKLEN 12 /* address mask */
+#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */
+#ifndef _IP_VHL
+#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8)
+ /* N.B.: must separately check that ip_hl >= 5 */
+#else
+#define ICMP_ADVLEN(p) (8 + (IP_VHL_HL((p)->icmp_ip.ip_vhl) << 2) + 8)
+ /* N.B.: must separately check that header length >= 5 */
+#endif
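+/*
+ * For an offending datagram carrying no IP options (a 5-word header),
+ * ICMP_ADVLEN(p) = 8 + 20 + 8 = 36, which equals ICMP_ADVLENMIN;
+ * ICMP_TSLEN works out to 8 + 3 * 4 = 20 bytes.
+ */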
+
+/*
+ * Definition of type and code field values.
+ */
+#define ICMP_ECHOREPLY 0 /* echo reply */
+#define ICMP_UNREACH 3 /* dest unreachable, codes: */
+#define ICMP_UNREACH_NET 0 /* bad net */
+#define ICMP_UNREACH_HOST 1 /* bad host */
+#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */
+#define ICMP_UNREACH_PORT 3 /* bad port */
+#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */
+#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */
+#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */
+#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */
+#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */
+#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */
+#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */
+#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */
+#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */
+#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */
+#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */
+#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */
+#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */
+#define ICMP_REDIRECT 5 /* shorter route, codes: */
+#define ICMP_REDIRECT_NET 0 /* for network */
+#define ICMP_REDIRECT_HOST 1 /* for host */
+#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */
+#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */
+#define ICMP_ECHO 8 /* echo service */
+#define ICMP_ROUTERADVERT 9 /* router advertisement */
+#define ICMP_ROUTERSOLICIT 10 /* router solicitation */
+#define ICMP_TIMXCEED 11 /* time exceeded, code: */
+#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */
+#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */
+#define ICMP_PARAMPROB 12 /* ip header bad */
+#define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */
+#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. absent */
+#define ICMP_PARAMPROB_LENGTH 2 /* bad length */
+#define ICMP_TSTAMP 13 /* timestamp request */
+#define ICMP_TSTAMPREPLY 14 /* timestamp reply */
+#define ICMP_IREQ 15 /* information request */
+#define ICMP_IREQREPLY 16 /* information reply */
+#define ICMP_MASKREQ 17 /* address mask request */
+#define ICMP_MASKREPLY 18 /* address mask reply */
+
+#define ICMP_MAXTYPE 18
+
+#define ICMP_INFOTYPE(type) \
+ ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \
+ (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \
+ (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \
+ (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \
+ (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY)
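+/*
+ * ICMP_INFOTYPE() is how icmp_error() decides whether a packet that was
+ * itself ICMP may still elicit an error: informational types may, error
+ * types must not.
+ */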
+
+#ifdef _KERNEL
+void icmp_error(struct mbuf *, int, int, n_long, struct ifnet *);
+void icmp_input(struct mbuf *, int);
+#endif
+
+#endif
diff --git a/sys/netinet/ip_id.c b/sys/netinet/ip_id.c
new file mode 100644
index 0000000..664b4d1
--- /dev/null
+++ b/sys/netinet/ip_id.c
@@ -0,0 +1,210 @@
+/* $OpenBSD: ip_id.c,v 1.2 1999/08/26 13:37:01 provos Exp $ */
+
+/*
+ * Copyright 1998 Niels Provos <provos@citi.umich.edu>
+ * All rights reserved.
+ *
+ * Theo de Raadt <deraadt@openbsd.org> came up with the idea of using
+ * such a mathematical system to generate more random (yet non-repeating)
+ * ids to solve the resolver/named problem. But Niels designed the
+ * actual system based on the constraints.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Niels Provos.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * seed = random 15bit
+ * n = prime, g0 = generator to n,
+ * j = random so that gcd(j,n-1) == 1
+ * g = g0^j mod n will be a generator again.
+ *
+ * X[0] = random seed.
+ * X[n] = a*X[n-1]+b mod m is a Linear Congruential Generator
+ * with a = 7^(even random) mod m,
+ * b = random with gcd(b,m) == 1
+ * m = 31104 and a maximal period of m-1.
+ *
+ * The transaction id is determined by:
+ * id[n] = seed xor (g^X[n] mod n)
+ *
+ * Effectively the id is restricted to the lower 15 bits, thus
+ * yielding two different cycles by toggling the msb on and off.
+ * This avoids reuse issues caused by reseeding.
+ */
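+/*
+ * As a toy illustration only (not the kernel's parameters): with
+ * m = 8, a = 5, b = 3 the recurrence X[n] = (5*X[n-1] + 3) mod 8,
+ * started at X[0] = 1, visits 1, 0, 3, 2, 5, 4, 7, 6 before repeating,
+ * so each X value, and hence each id, is seen once per cycle.
+ */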
+
+#include "opt_random_ip_id.h"
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/random.h>
+
+#ifdef RANDOM_IP_ID
+#define RU_OUT 180 /* Time after which we will be reseeded */
+#define RU_MAX 30000 /* Unique cycle, avoid blackjack prediction */
+#define RU_GEN 2 /* Starting generator */
+#define RU_N 32749 /* RU_N-1 = 2*2*3*2729 */
+#define RU_AGEN 7 /* determine ru_a as RU_AGEN^(2*rand) */
+#define RU_M 31104 /* RU_M = 2^7*3^5 - don't change */
+
+#define PFAC_N 3
+const static u_int16_t pfacts[PFAC_N] = {
+ 2,
+ 3,
+ 2729
+};
+
+static u_int16_t ru_x;
+static u_int16_t ru_seed, ru_seed2;
+static u_int16_t ru_a, ru_b;
+static u_int16_t ru_g;
+static u_int16_t ru_counter = 0;
+static u_int16_t ru_msb = 0;
+static long ru_reseed;
+static u_int32_t tmp; /* Storage for unused random */
+
+static u_int16_t pmod(u_int16_t, u_int16_t, u_int16_t);
+static void ip_initid(void);
+u_int16_t ip_randomid(void);
+
+/*
+ * Do a fast modular exponentiation; the returned value will be in the
+ * range 0 - (mod-1)
+ */
+
+#ifdef __STDC__
+static u_int16_t
+pmod(u_int16_t gen, u_int16_t exp, u_int16_t mod)
+#else
+static u_int16_t
+pmod(gen, exp, mod)
+ u_int16_t gen, exp, mod;
+#endif
+{
+ u_int16_t s, t, u;
+
+ s = 1;
+ t = gen;
+ u = exp;
+
+ while (u) {
+ if (u & 1)
+ s = (s*t) % mod;
+ u >>= 1;
+ t = (t*t) % mod;
+ }
+ return (s);
+}
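+/*
+ * Example: pmod(2, 5, 7) = 2^5 mod 7 = 32 mod 7 = 4; the loop squares
+ * the base and reduces modulo `mod' at every step (square-and-multiply).
+ */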
+
+/*
+ * Initializes the seed and chooses a suitable generator. Also toggles
+ * the msb flag. The msb flag is used to generate two distinct
+ * cycles of random numbers and thus avoids reuse of ids.
+ *
+ * This function is called from ip_randomid() when needed; an
+ * application does not have to worry about it.
+ */
+static void
+ip_initid(void)
+{
+ u_int16_t j, i;
+ int noprime = 1;
+ struct timeval time;
+
+ getmicrotime(&time);
+ read_random((void *) &tmp, sizeof(tmp));
+ ru_x = (tmp & 0xFFFF) % RU_M;
+
+ /* 15 bits of random seed */
+ ru_seed = (tmp >> 16) & 0x7FFF;
+ read_random((void *) &tmp, sizeof(tmp));
+ ru_seed2 = tmp & 0x7FFF;
+
+ read_random((void *) &tmp, sizeof(tmp));
+
+ /* Determine the LCG we use */
+ ru_b = (tmp & 0xfffe) | 1;
+ ru_a = pmod(RU_AGEN, (tmp >> 16) & 0xfffe, RU_M);
+ while (ru_b % 3 == 0)
+ ru_b += 2;
+
+ read_random((void *) &tmp, sizeof(tmp));
+ j = tmp % RU_N;
+ tmp = tmp >> 16;
+
+ /*
+ * Do a fast gcd(j,RU_N-1), so we can find a j with
+ * gcd(j, RU_N-1) == 1, giving a new generator for
+ * RU_GEN^j mod RU_N
+ */
+
+ while (noprime) {
+ for (i=0; i<PFAC_N; i++)
+ if (j%pfacts[i] == 0)
+ break;
+
+ if (i>=PFAC_N)
+ noprime = 0;
+ else
+ j = (j+1) % RU_N;
+ }
+
+ ru_g = pmod(RU_GEN,j,RU_N);
+ ru_counter = 0;
+
+ ru_reseed = time.tv_sec + RU_OUT;
+ ru_msb = ru_msb == 0x8000 ? 0 : 0x8000;
+}
+
+u_int16_t
+ip_randomid(void)
+{
+ int i, n;
+ struct timeval time;
+
+ getmicrotime(&time);
+ if (ru_counter >= RU_MAX || time.tv_sec > ru_reseed)
+ ip_initid();
+
+ if (!tmp)
+ read_random((void *) &tmp, sizeof(tmp));
+
+ /* Skip a random number of ids */
+ n = tmp & 0x3; tmp = tmp >> 2;
+ if (ru_counter + n >= RU_MAX)
+ ip_initid();
+
+ for (i = 0; i <= n; i++)
+ /* Linear Congruential Generator */
+ ru_x = (ru_a*ru_x + ru_b) % RU_M;
+
+ ru_counter += i;
+
+ return (ru_seed ^ pmod(ru_g,ru_seed2 ^ ru_x,RU_N)) | ru_msb;
+}
+
+#endif /* RANDOM_IP_ID */
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
new file mode 100644
index 0000000..bec09ea
--- /dev/null
+++ b/sys/netinet/ip_input.c
@@ -0,0 +1,1948 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
+ * $FreeBSD$
+ */
+
+#define _IP_VHL
+
+#include "opt_bootp.h"
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_ipdivert.h"
+#include "opt_ipfilter.h"
+#include "opt_ipstealth.h"
+#include "opt_ipsec.h"
+#include "opt_pfil_hooks.h"
+#include "opt_random_ip_id.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+
+#include <net/pfil.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/netisr.h>
+#include <net/intrq.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <machine/in_cksum.h>
+
+#include <sys/socketvar.h>
+
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netkey/key.h>
+#endif
+
+int rsvp_on = 0;
+
+int ipforwarding = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
+ &ipforwarding, 0, "Enable IP forwarding between interfaces");
+
+static int ipsendredirects = 1; /* XXX */
+SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
+ &ipsendredirects, 0, "Enable sending IP redirects");
+
+int ip_defttl = IPDEFTTL;
+SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
+ &ip_defttl, 0, "Maximum TTL on IP packets");
+
+static int ip_dosourceroute = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
+ &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
+
+static int ip_acceptsourceroute = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
+ CTLFLAG_RW, &ip_acceptsourceroute, 0,
+ "Enable accepting source routed IP packets");
+
+static int ip_keepfaith = 0;
+SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
+ &ip_keepfaith, 0,
+	"Enable packet capture for FAITH IPv4->IPv6 translator daemon");
+
+static int ip_nfragpackets = 0;
+static int ip_maxfragpackets; /* initialized in ip_init() */
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW,
+ &ip_maxfragpackets, 0,
+ "Maximum number of IPv4 fragment reassembly queue entries");
+
+/*
+ * XXX - Setting ip_checkinterface mostly implements the receive side of
+ * the Strong ES model described in RFC 1122, but since the routing table
+ * and transmit implementation do not implement the Strong ES model,
+ * setting this to 1 results in an odd hybrid.
+ *
+ * XXX - ip_checkinterface currently must be disabled if you use ipnat
+ * to translate the destination address to another local interface.
+ *
+ * XXX - ip_checkinterface must be disabled if you add IP aliases
+ * to the loopback interface instead of the interface where the
+ * packets for those addresses are received.
+ */
+static int ip_checkinterface = 1;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
+ &ip_checkinterface, 0, "Verify packet arrives on correct interface");
+
+#ifdef DIAGNOSTIC
+static int ipprintfs = 0;
+#endif
+
+static int ipqmaxlen = IFQ_MAXLEN;
+
+extern struct domain inetdomain;
+extern struct protosw inetsw[];
+u_char ip_protox[IPPROTO_MAX];
+struct in_ifaddrhead in_ifaddrhead; /* first inet address */
+struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */
+u_long in_ifaddrhmask; /* mask for hash table */
+
+SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW,
+ &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue");
+SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD,
+ &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue");
+
+struct ipstat ipstat;
+SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
+ &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
+
+/* Packet reassembly stuff */
+#define IPREASS_NHASH_LOG2 6
+#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
+#define IPREASS_HMASK (IPREASS_NHASH - 1)
+#define IPREASS_HASH(x,y) \
+ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
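+/*
+ * Example: for (x, y) = (0x1234, 0x0042) the macro folds the low
+ * nibbles of the two low bytes of x into 0x24, xors in y to get 0x66,
+ * and masks with IPREASS_HMASK (0x3f), selecting bucket 0x26.
+ */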
+
+static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH];
+static int nipq = 0; /* total # of reass queues */
+static int maxnipq;
+
+#ifdef IPCTL_DEFMTU
+SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
+ &ip_mtu, 0, "Default MTU");
+#endif
+
+#ifdef IPSTEALTH
+static int ipstealth = 0;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
+ &ipstealth, 0, "");
+#endif
+
+
+/* Firewall hooks */
+ip_fw_chk_t *ip_fw_chk_ptr;
+int fw_enable = 1 ;
+
+/* Dummynet hooks */
+ip_dn_io_t *ip_dn_io_ptr;
+
+
+/*
+ * XXX this is ugly -- the following two global variables are
+ * used to store packet state while it travels through the stack.
+ * Note that the code even makes assumptions on the size and
+ * alignment of fields inside struct ip_srcrt so e.g. adding some
+ * fields will break the code. This needs to be fixed.
+ *
+ * We need to save the IP options in case a protocol wants to respond
+ * to an incoming packet over the same route if the packet got here
+ * using IP source routing. This allows connection establishment and
+ * maintenance when the remote end is on a network that is not known
+ * to us.
+ */
+static int ip_nhops = 0;
+static struct ip_srcrt {
+ struct in_addr dst; /* final destination */
+ char nop; /* one NOP to align */
+ char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */
+ struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
+} ip_srcrt;
+
+static void save_rte(u_char *, struct in_addr);
+static int ip_dooptions(struct mbuf *m, int,
+ struct sockaddr_in *next_hop);
+static void ip_forward(struct mbuf *m, int srcrt,
+ struct sockaddr_in *next_hop);
+static void ip_freef(struct ipqhead *, struct ipq *);
+static struct mbuf *ip_reass(struct mbuf *, struct ipqhead *,
+ struct ipq *, u_int32_t *, u_int16_t *);
+static void ipintr(void);
+
+/*
+ * IP initialization: fill in IP protocol switch table.
+ * All protocols not implemented in kernel go to raw IP protocol handler.
+ */
+void
+ip_init()
+{
+ register struct protosw *pr;
+ register int i;
+
+ TAILQ_INIT(&in_ifaddrhead);
+ in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask);
+ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
+ if (pr == 0)
+ panic("ip_init");
+ for (i = 0; i < IPPROTO_MAX; i++)
+ ip_protox[i] = pr - inetsw;
+ for (pr = inetdomain.dom_protosw;
+ pr < inetdomain.dom_protoswNPROTOSW; pr++)
+ if (pr->pr_domain->dom_family == PF_INET &&
+ pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
+ ip_protox[pr->pr_protocol] = pr - inetsw;
+
+ for (i = 0; i < IPREASS_NHASH; i++)
+ TAILQ_INIT(&ipq[i]);
+
+ maxnipq = nmbclusters / 4;
+ ip_maxfragpackets = nmbclusters / 4;
+
+#ifndef RANDOM_IP_ID
+ ip_id = time_second & 0xffff;
+#endif
+ ipintrq.ifq_maxlen = ipqmaxlen;
+ mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF);
+ ipintrq_present = 1;
+
+ register_netisr(NETISR_IP, ipintr);
+}
+
+/*
+ * XXX watch out for this one. It is perhaps used as a cache for
+ * the most recently used route; it is cleared in in_addroute()
+ * when a new route is successfully created.
+ */
+struct route ipforward_rt;
+
+/*
+ * Ip input routine. Checksum and byte swap header. If fragmented
+ * try to reassemble. Process options. Pass to next level.
+ */
+void
+ip_input(struct mbuf *m)
+{
+ struct ip *ip;
+ struct ipq *fp;
+ struct in_ifaddr *ia = NULL;
+ struct ifaddr *ifa;
+ int i, hlen, checkif;
+ u_short sum;
+ struct in_addr pkt_dst;
+ u_int32_t divert_info = 0; /* packet divert/tee info */
+ struct ip_fw_args args;
+#ifdef PFIL_HOOKS
+ struct packet_filter_hook *pfh;
+ struct mbuf *m0;
+ int rv;
+#endif /* PFIL_HOOKS */
+
+ args.eh = NULL;
+ args.oif = NULL;
+ args.rule = NULL;
+ args.divert_rule = 0; /* divert cookie */
+ args.next_hop = NULL;
+
+ /* Grab info from MT_TAG mbufs prepended to the chain. */
+ for (; m && m->m_type == MT_TAG; m = m->m_next) {
+ switch(m->m_tag_id) {
+ default:
+ printf("ip_input: unrecognised MT_TAG tag %d\n",
+ m->m_tag_id);
+ break;
+
+ case PACKET_TAG_DUMMYNET:
+ args.rule = ((struct dn_pkt *)m)->rule;
+ break;
+
+ case PACKET_TAG_DIVERT:
+ args.divert_rule = (intptr_t)m->m_hdr.mh_data & 0xffff;
+ break;
+
+ case PACKET_TAG_IPFORWARD:
+ args.next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
+ break;
+ }
+ }
+
+ KASSERT(m != NULL && (m->m_flags & M_PKTHDR) != 0,
+ ("ip_input: no HDR"));
+
+ if (args.rule) { /* dummynet already filtered us */
+ ip = mtod(m, struct ip *);
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ goto iphack ;
+ }
+
+ ipstat.ips_total++;
+
+ if (m->m_pkthdr.len < sizeof(struct ip))
+ goto tooshort;
+
+ if (m->m_len < sizeof (struct ip) &&
+ (m = m_pullup(m, sizeof (struct ip))) == 0) {
+ ipstat.ips_toosmall++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+
+ if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
+ ipstat.ips_badvers++;
+ goto bad;
+ }
+
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ if (hlen < sizeof(struct ip)) { /* minimum header length */
+ ipstat.ips_badhlen++;
+ goto bad;
+ }
+ if (hlen > m->m_len) {
+ if ((m = m_pullup(m, hlen)) == 0) {
+ ipstat.ips_badhlen++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+
+ /* 127/8 must not appear on wire - RFC1122 */
+ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
+ ipstat.ips_badaddr++;
+ goto bad;
+ }
+ }
+
+ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
+ sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
+ } else {
+ if (hlen == sizeof(struct ip)) {
+ sum = in_cksum_hdr(ip);
+ } else {
+ sum = in_cksum(m, hlen);
+ }
+ }
+ if (sum) {
+ ipstat.ips_badsum++;
+ goto bad;
+ }
+
+ /*
+ * Convert fields to host representation.
+ */
+ ip->ip_len = ntohs(ip->ip_len);
+ if (ip->ip_len < hlen) {
+ ipstat.ips_badlen++;
+ goto bad;
+ }
+ ip->ip_off = ntohs(ip->ip_off);
+
+ /*
+ * Check that the amount of data in the buffers
+	 * is at least as much as the IP header would have us expect.
+ * Trim mbufs if longer than we expect.
+ * Drop packet if shorter than we expect.
+ */
+ if (m->m_pkthdr.len < ip->ip_len) {
+tooshort:
+ ipstat.ips_tooshort++;
+ goto bad;
+ }
+ if (m->m_pkthdr.len > ip->ip_len) {
+ if (m->m_len == m->m_pkthdr.len) {
+ m->m_len = ip->ip_len;
+ m->m_pkthdr.len = ip->ip_len;
+ } else
+ m_adj(m, ip->ip_len - m->m_pkthdr.len);
+ }
+
+#ifdef IPSEC
+ if (ipsec_gethist(m, NULL))
+ goto pass;
+#endif
+
+ /*
+ * IpHack's section.
+	 * Right now, when no processing has been done on the packet
+	 * and it is still fresh out of the network, we do our black
+	 * deals with it.
+ * - Firewall: deny/allow/divert
+ * - Xlate: translate packet's addr/port (NAT).
+ * - Pipe: pass pkt through dummynet.
+ * - Wrap: fake packet's addr/port <unimpl.>
+ * - Encapsulate: put it in another IP and send out. <unimp.>
+ */
+
+iphack:
+
+#ifdef PFIL_HOOKS
+ /*
+ * Run through list of hooks for input packets. If there are any
+ * filters which require that additional packets in the flow are
+ * not fast-forwarded, they must clear the M_CANFASTFWD flag.
+ * Note that filters must _never_ set this flag, as another filter
+ * in the list may have previously cleared it.
+ */
+ m0 = m;
+ pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh);
+ for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link))
+ if (pfh->pfil_func) {
+ rv = pfh->pfil_func(ip, hlen,
+ m->m_pkthdr.rcvif, 0, &m0);
+ if (rv)
+ return;
+ m = m0;
+ if (m == NULL)
+ return;
+ ip = mtod(m, struct ip *);
+ }
+#endif /* PFIL_HOOKS */
+
+ if (fw_enable && IPFW_LOADED) {
+ /*
+ * If we've been forwarded from the output side, then
+ * skip the firewall a second time
+ */
+ if (args.next_hop)
+ goto ours;
+
+ args.m = m;
+ i = ip_fw_chk_ptr(&args);
+ m = args.m;
+
+ if ( (i & IP_FW_PORT_DENY_FLAG) || m == NULL) { /* drop */
+ if (m)
+ m_freem(m);
+ return;
+ }
+ ip = mtod(m, struct ip *); /* just in case m changed */
+ if (i == 0 && args.next_hop == NULL) /* common case */
+ goto pass;
+ if (DUMMYNET_LOADED && (i & IP_FW_PORT_DYNT_FLAG) != 0) {
+ /* Send packet to the appropriate pipe */
+ ip_dn_io_ptr(m, i&0xffff, DN_TO_IP_IN, &args);
+ return;
+ }
+#ifdef IPDIVERT
+ if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) {
+ /* Divert or tee packet */
+ divert_info = i;
+ goto ours;
+ }
+#endif
+ if (i == 0 && args.next_hop != NULL)
+ goto pass;
+ /*
+ * if we get here, the packet must be dropped
+ */
+ m_freem(m);
+ return;
+ }
+pass:
+
+ /*
+ * Process options and, if not destined for us,
+ * ship it on. ip_dooptions returns 1 when an
+ * error was detected (causing an icmp message
+ * to be sent and the original packet to be freed).
+ */
+ ip_nhops = 0; /* for source routed packets */
+ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0, args.next_hop))
+ return;
+
+	/* Greedy RSVP: it snatches any PATH packet of the RSVP protocol, no
+	 * matter whether it is destined to another node or is a multicast
+	 * packet; RSVP wants it and prevents it from being forwarded
+	 * anywhere else. Also checks whether the rsvp daemon is running
+	 * before grabbing the packet.
+ */
+ if (rsvp_on && ip->ip_p==IPPROTO_RSVP)
+ goto ours;
+
+ /*
+ * Check our list of addresses, to see if the packet is for us.
+ * If we don't have any addresses, assume any unicast packet
+ * we receive might be for us (and let the upper layers deal
+ * with it).
+ */
+ if (TAILQ_EMPTY(&in_ifaddrhead) &&
+ (m->m_flags & (M_MCAST|M_BCAST)) == 0)
+ goto ours;
+
+ /*
+ * Cache the destination address of the packet; this may be
+ * changed by use of 'ipfw fwd'.
+ */
+ pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
+
+ /*
+ * Enable a consistency check between the destination address
+ * and the arrival interface for a unicast packet (the RFC 1122
+ * strong ES model) if IP forwarding is disabled and the packet
+ * is not locally generated and the packet is not subject to
+ * 'ipfw fwd'.
+ *
+ * XXX - Checking also should be disabled if the destination
+ * address is ipnat'ed to a different interface.
+ *
+ * XXX - Checking is incompatible with IP aliases added
+ * to the loopback interface instead of the interface where
+ * the packets are received.
+ */
+ checkif = ip_checkinterface && (ipforwarding == 0) &&
+ m->m_pkthdr.rcvif != NULL &&
+ ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) &&
+ (args.next_hop == NULL);
+
+ /*
+ * Check for exact addresses in the hash bucket.
+ */
+ LIST_FOREACH(ia, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
+ /*
+ * If the address matches, verify that the packet
+ * arrived via the correct interface if checking is
+ * enabled.
+ */
+ if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
+ (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif))
+ goto ours;
+ }
+ /*
+ * Check for broadcast addresses.
+ *
+ * Only accept broadcast packets that arrive via the matching
+ * interface. Reception of forwarded directed broadcasts would
+ * be handled via ip_forward() and ether_output() with the loopback
+ * into the stack for SIMPLEX interfaces handled by ether_output().
+ */
+ if (m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
+ TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ia = ifatoia(ifa);
+ if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
+ pkt_dst.s_addr)
+ goto ours;
+ if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr)
+ goto ours;
+#ifdef BOOTP_COMPAT
+ if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY)
+ goto ours;
+#endif
+ }
+ }
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
+ struct in_multi *inm;
+ if (ip_mrouter) {
+ /*
+ * If we are acting as a multicast router, all
+ * incoming multicast packets are passed to the
+ * kernel-level multicast forwarding function.
+ * The packet is returned (relatively) intact; if
+ * ip_mforward() returns a non-zero value, the packet
+ * must be discarded, else it may be accepted below.
+ */
+ if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) {
+ ipstat.ips_cantforward++;
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * The process-level routing daemon needs to receive
+ * all multicast IGMP packets, whether or not this
+ * host belongs to their destination groups.
+ */
+ if (ip->ip_p == IPPROTO_IGMP)
+ goto ours;
+ ipstat.ips_forward++;
+ }
+ /*
+ * See if we belong to the destination multicast group on the
+ * arrival interface.
+ */
+ IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
+ if (inm == NULL) {
+ ipstat.ips_notmember++;
+ m_freem(m);
+ return;
+ }
+ goto ours;
+ }
+ if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
+ goto ours;
+ if (ip->ip_dst.s_addr == INADDR_ANY)
+ goto ours;
+
+ /*
+	 * FAITH (Firewall Aided Internet Translator)
+ */
+ if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
+ if (ip_keepfaith) {
+ if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
+ goto ours;
+ }
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * Not for us; forward if possible and desirable.
+ */
+ if (ipforwarding == 0) {
+ ipstat.ips_cantforward++;
+ m_freem(m);
+ } else {
+#ifdef IPSEC
+ /*
+ * Enforce inbound IPsec SPD.
+ */
+ if (ipsec4_in_reject(m, NULL)) {
+ ipsecstat.in_polvio++;
+ goto bad;
+ }
+#endif /* IPSEC */
+ ip_forward(m, 0, args.next_hop);
+ }
+ return;
+
+ours:
+#ifdef IPSTEALTH
+ /*
+ * IPSTEALTH: Process non-routing options only
+ * if the packet is destined for us.
+ */
+ if (ipstealth && hlen > sizeof (struct ip) &&
+ ip_dooptions(m, 1, args.next_hop))
+ return;
+#endif /* IPSTEALTH */
+
+ /* Count the packet in the ip address stats */
+ if (ia != NULL) {
+ ia->ia_ifa.if_ipackets++;
+ ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
+ }
+
+ /*
+ * If offset or IP_MF are set, must reassemble.
+ * Otherwise, nothing need be done.
+ * (We could look in the reassembly queue to see
+ * if the packet was previously fragmented,
+ * but it's not worth the time; just let them time out.)
+ */
+ if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
+
+ sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
+ /*
+ * Look for queue of fragments
+ * of this datagram.
+ */
+ TAILQ_FOREACH(fp, &ipq[sum], ipq_list)
+ if (ip->ip_id == fp->ipq_id &&
+ ip->ip_src.s_addr == fp->ipq_src.s_addr &&
+ ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
+ ip->ip_p == fp->ipq_p)
+ goto found;
+
+ fp = 0;
+
+ /* check if there's a place for the new queue */
+ if (nipq > maxnipq) {
+ /*
+ * drop something from the tail of the current queue
+ * before proceeding further
+ */
+ struct ipq *q = TAILQ_LAST(&ipq[sum], ipqhead);
+ if (q == NULL) { /* gak */
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
+ if (r) {
+ ip_freef(&ipq[i], r);
+ break;
+ }
+ }
+ } else
+ ip_freef(&ipq[sum], q);
+ }
+found:
+ /*
+ * Adjust ip_len to not reflect header,
+ * convert offset of this to bytes.
+ */
+ ip->ip_len -= hlen;
+ if (ip->ip_off & IP_MF) {
+ /*
+ * Make sure that fragments have a data length
+ * that's a non-zero multiple of 8 bytes.
+ */
+ if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
+ ipstat.ips_toosmall++; /* XXX */
+ goto bad;
+ }
+ m->m_flags |= M_FRAG;
+ }
+ ip->ip_off <<= 3;
+
+ /*
+ * Attempt reassembly; if it succeeds, proceed.
+ * ip_reass() will return a different mbuf, and update
+ * the divert info in divert_info and args.divert_rule.
+ */
+ ipstat.ips_fragments++;
+ m->m_pkthdr.header = ip;
+ m = ip_reass(m,
+ &ipq[sum], fp, &divert_info, &args.divert_rule);
+ if (m == 0)
+ return;
+ ipstat.ips_reassembled++;
+ ip = mtod(m, struct ip *);
+ /* Get the header length of the reassembled packet */
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+#ifdef IPDIVERT
+ /* Restore original checksum before diverting packet */
+ if (divert_info != 0) {
+ ip->ip_len += hlen;
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(m, hlen);
+ ip->ip_off = ntohs(ip->ip_off);
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_len -= hlen;
+ }
+#endif
+ } else
+ ip->ip_len -= hlen;
+
+#ifdef IPDIVERT
+ /*
+ * Divert or tee packet to the divert protocol if required.
+ */
+ if (divert_info != 0) {
+ struct mbuf *clone = NULL;
+
+ /* Clone packet if we're doing a 'tee' */
+ if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0)
+ clone = m_dup(m, M_DONTWAIT);
+
+ /* Restore packet header fields to original values */
+ ip->ip_len += hlen;
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+
+ /* Deliver packet to divert input routine */
+ divert_packet(m, 1, divert_info & 0xffff, args.divert_rule);
+ ipstat.ips_delivered++;
+
+ /* If 'tee', continue with original packet */
+ if (clone == NULL)
+ return;
+ m = clone;
+ ip = mtod(m, struct ip *);
+ ip->ip_len += hlen;
+ /*
+ * Jump backwards to complete processing of the
+ * packet. But first clear divert_info to avoid
+ * entering this block again.
+ * We do not need to clear args.divert_rule
+ * or args.next_hop as they will not be used.
+ */
+ divert_info = 0;
+ goto pass;
+ }
+#endif
+
+#ifdef IPSEC
+ /*
+ * enforce IPsec policy checking if we are seeing last header.
+ * note that we do not visit this with protocols with pcb layer
+ * code - like udp/tcp/raw ip.
+ */
+ if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0 &&
+ ipsec4_in_reject(m, NULL)) {
+ ipsecstat.in_polvio++;
+ goto bad;
+ }
+#endif
+
+ /*
+ * Switch out to protocol's input routine.
+ */
+ ipstat.ips_delivered++;
+ if (args.next_hop && ip->ip_p == IPPROTO_TCP) {
+ /* TCP needs IPFORWARD info if available */
+ struct m_hdr tag;
+
+ tag.mh_type = MT_TAG;
+ tag.mh_flags = PACKET_TAG_IPFORWARD;
+ tag.mh_data = (caddr_t)args.next_hop;
+ tag.mh_next = m;
+
+ (*inetsw[ip_protox[ip->ip_p]].pr_input)(
+ (struct mbuf *)&tag, hlen);
+ } else
+ (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
+ return;
+bad:
+ m_freem(m);
+}
+
+/*
+ * IP software interrupt routine - to go away sometime soon
+ */
+static void
+ipintr(void)
+{
+ struct mbuf *m;
+
+ while (1) {
+ IF_DEQUEUE(&ipintrq, m);
+ if (m == 0)
+ return;
+ ip_input(m);
+ }
+}
+
+/*
+ * Take incoming datagram fragment and try to reassemble it into
+ * whole datagram. If a chain for reassembly of this datagram already
+ * exists, then it is given as fp; otherwise have to make a chain.
+ *
+ * When IPDIVERT enabled, keep additional state with each packet that
+ * tells us if we need to divert or tee the packet we're building.
+ * In particular, *divinfo includes the port and TEE flag,
+ * *divert_rule is the number of the matching rule.
+ */
+
+static struct mbuf *
+ip_reass(struct mbuf *m, struct ipqhead *head, struct ipq *fp,
+ u_int32_t *divinfo, u_int16_t *divert_rule)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ register struct mbuf *p, *q, *nq;
+ struct mbuf *t;
+ int hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ int i, next;
+
+ /*
+ * Presence of header sizes in mbufs
+ * would confuse code below.
+ */
+ m->m_data += hlen;
+ m->m_len -= hlen;
+
+ /*
+ * If first fragment to arrive, create a reassembly queue.
+ */
+ if (fp == 0) {
+ /*
+ * Enforce upper bound on number of fragmented packets
+ * for which we attempt reassembly;
+ * If maxfrag is 0, never accept fragments.
+ * If maxfrag is -1, accept all fragments without limitation.
+ */
+ if ((ip_maxfragpackets >= 0) && (ip_nfragpackets >= ip_maxfragpackets))
+ goto dropfrag;
+ ip_nfragpackets++;
+ if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL)
+ goto dropfrag;
+ fp = mtod(t, struct ipq *);
+ TAILQ_INSERT_HEAD(head, fp, ipq_list);
+ nipq++;
+ fp->ipq_ttl = IPFRAGTTL;
+ fp->ipq_p = ip->ip_p;
+ fp->ipq_id = ip->ip_id;
+ fp->ipq_src = ip->ip_src;
+ fp->ipq_dst = ip->ip_dst;
+ fp->ipq_frags = m;
+ m->m_nextpkt = NULL;
+#ifdef IPDIVERT
+ fp->ipq_div_info = 0;
+ fp->ipq_div_cookie = 0;
+#endif
+ goto inserted;
+ }
+
+#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header))
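+/*
+ * GETIP() recovers the IP header pointer that ip_input() stashed in
+ * m_pkthdr.header before calling ip_reass(); the mbuf data pointer
+ * itself has already been advanced past that header above.
+ */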
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
+ if (GETIP(q)->ip_off > ip->ip_off)
+ break;
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us, otherwise
+ * stick new segment in the proper place.
+ *
+	 * If some of the data is dropped from the preceding
+	 * segment, then its checksum is invalidated.
+ */
+ if (p) {
+ i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
+ if (i > 0) {
+ if (i >= ip->ip_len)
+ goto dropfrag;
+ m_adj(m, i);
+ m->m_pkthdr.csum_flags = 0;
+ ip->ip_off += i;
+ ip->ip_len -= i;
+ }
+ m->m_nextpkt = p->m_nextpkt;
+ p->m_nextpkt = m;
+ } else {
+ m->m_nextpkt = fp->ipq_frags;
+ fp->ipq_frags = m;
+ }
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
+ q = nq) {
+ i = (ip->ip_off + ip->ip_len) -
+ GETIP(q)->ip_off;
+ if (i < GETIP(q)->ip_len) {
+ GETIP(q)->ip_len -= i;
+ GETIP(q)->ip_off += i;
+ m_adj(q, i);
+ q->m_pkthdr.csum_flags = 0;
+ break;
+ }
+ nq = q->m_nextpkt;
+ m->m_nextpkt = nq;
+ m_freem(q);
+ }
+
+inserted:
+
+#ifdef IPDIVERT
+ /*
+ * Transfer firewall instructions to the fragment structure.
+ * Only trust info in the fragment at offset 0.
+ */
+ if (ip->ip_off == 0) {
+ fp->ipq_div_info = *divinfo;
+ fp->ipq_div_cookie = *divert_rule;
+ }
+ *divinfo = 0;
+ *divert_rule = 0;
+#endif
+
+ /*
+ * Check for complete reassembly.
+ */
+ next = 0;
+ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
+ if (GETIP(q)->ip_off != next)
+ return (0);
+ next += GETIP(q)->ip_len;
+ }
+ /* Make sure the last packet didn't have the IP_MF flag */
+ if (p->m_flags & M_FRAG)
+ return (0);
+
+ /*
+ * Reassembly is complete. Make sure the packet is a sane size.
+ */
+ q = fp->ipq_frags;
+ ip = GETIP(q);
+ if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
+ ipstat.ips_toolong++;
+ ip_freef(head, fp);
+ return (0);
+ }
+
+ /*
+ * Concatenate fragments.
+ */
+ m = q;
+ t = m->m_next;
+ m->m_next = 0;
+ m_cat(m, t);
+ nq = q->m_nextpkt;
+ q->m_nextpkt = 0;
+ for (q = nq; q != NULL; q = nq) {
+ nq = q->m_nextpkt;
+ q->m_nextpkt = NULL;
+ m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
+ m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
+ m_cat(m, q);
+ }
+
+#ifdef IPDIVERT
+ /*
+ * Extract firewall instructions from the fragment structure.
+ */
+ *divinfo = fp->ipq_div_info;
+ *divert_rule = fp->ipq_div_cookie;
+#endif
+
+ /*
+ * Create header for new ip packet by
+ * modifying header of first packet;
+ * dequeue and discard fragment reassembly header.
+ * Make header visible.
+ */
+ ip->ip_len = next;
+ ip->ip_src = fp->ipq_src;
+ ip->ip_dst = fp->ipq_dst;
+ TAILQ_REMOVE(head, fp, ipq_list);
+ nipq--;
+ (void) m_free(dtom(fp));
+ ip_nfragpackets--;
+ m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
+ m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
+ /* some debugging cruft by sklower, below, will go away soon */
+ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
+ register int plen = 0;
+ for (t = m; t; t = t->m_next)
+ plen += t->m_len;
+ m->m_pkthdr.len = plen;
+ }
+ return (m);
+
+dropfrag:
+#ifdef IPDIVERT
+ *divinfo = 0;
+ *divert_rule = 0;
+#endif
+ ipstat.ips_fragdropped++;
+ m_freem(m);
+ return (0);
+
+#undef GETIP
+}
+
+/*
+ * Free a fragment reassembly header and all
+ * associated datagrams.
+ */
+static void
+ip_freef(fhp, fp)
+ struct ipqhead *fhp;
+ struct ipq *fp;
+{
+ register struct mbuf *q;
+
+ while (fp->ipq_frags) {
+ q = fp->ipq_frags;
+ fp->ipq_frags = q->m_nextpkt;
+ m_freem(q);
+ }
+ TAILQ_REMOVE(fhp, fp, ipq_list);
+ (void) m_free(dtom(fp));
+ ip_nfragpackets--;
+ nipq--;
+}
+
+/*
+ * IP timer processing;
+ * if a timer expires on a reassembly
+ * queue, discard it.
+ */
+void
+ip_slowtimo()
+{
+ register struct ipq *fp;
+ int s = splnet();
+ int i;
+
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ for(fp = TAILQ_FIRST(&ipq[i]); fp;) {
+ struct ipq *fpp;
+
+ fpp = fp;
+ fp = TAILQ_NEXT(fp, ipq_list);
+ if(--fpp->ipq_ttl == 0) {
+ ipstat.ips_fragtimeout++;
+ ip_freef(&ipq[i], fpp);
+ }
+ }
+ }
+ /*
+ * If we are over the maximum number of fragments
+ * (due to the limit being lowered), drain off
+ * enough to get down to the new limit.
+ */
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ if (ip_maxfragpackets >= 0) {
+ while (ip_nfragpackets > ip_maxfragpackets &&
+ !TAILQ_EMPTY(&ipq[i])) {
+ ipstat.ips_fragdropped++;
+ ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
+ }
+ }
+ }
+ ipflow_slowtimo();
+ splx(s);
+}
+
+/*
+ * Drain off all datagram fragments.
+ */
+void
+ip_drain()
+{
+ int i;
+
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ while(!TAILQ_EMPTY(&ipq[i])) {
+ ipstat.ips_fragdropped++;
+ ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i]));
+ }
+ }
+ in_rtqdrain();
+}
+
+/*
+ * Do option processing on a datagram,
+ * possibly discarding it if bad options are encountered,
+ * or forwarding it if source-routed.
+ * The pass argument is used when operating in the IPSTEALTH
+ * mode to tell what options to process:
+ * [LS]SRR (pass 0) or the others (pass 1).
+ * The reason for as many as two passes is that when doing IPSTEALTH,
+ * non-routing options should be processed only if the packet is for us.
+ * Returns 1 if packet has been forwarded/freed,
+ * 0 if the packet should be processed further.
+ */
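+/*
+ * Option layout note: each non-trivial option is type, length, data
+ * (indices IPOPT_OPTVAL and IPOPT_OLEN); for LSRR/SSRR/RR/TS the byte
+ * at IPOPT_OFFSET is a 1-based pointer into the option, so
+ * IPOPT_MINOFF marks the first address slot.  The `code' computed on
+ * errors is the offset of the offending byte from the start of the
+ * IP header, as ICMP parameter-problem messages require.
+ */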
+static int
+ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ u_char *cp;
+ struct in_ifaddr *ia;
+ int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
+ struct in_addr *sin, dst;
+ n_time ntime;
+ struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
+
+ dst = ip->ip_dst;
+ cp = (u_char *)(ip + 1);
+ cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp)) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ optlen = cp[IPOPT_OLEN];
+ if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ /*
+ * Source routing with record.
+ * Find interface with current destination address.
+ * If none on this machine then drop if strictly routed,
+ * or do nothing if loosely routed.
+ * Record interface address and bring up next address
+ * component. If strictly routed make sure next
+ * address is on directly accessible net.
+ */
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+#ifdef IPSTEALTH
+ if (ipstealth && pass > 0)
+ break;
+#endif
+ if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ ipaddr.sin_addr = ip->ip_dst;
+ ia = (struct in_ifaddr *)
+ ifa_ifwithaddr((struct sockaddr *)&ipaddr);
+ if (ia == 0) {
+ if (opt == IPOPT_SSRR) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ }
+ if (!ip_dosourceroute)
+ goto nosourcerouting;
+ /*
+ * Loose routing, and not at next destination
+ * yet; nothing to do except forward.
+ */
+ break;
+ }
+ off--; /* 0 origin */
+ if (off > optlen - (int)sizeof(struct in_addr)) {
+ /*
+ * End of source route. Should be for us.
+ */
+ if (!ip_acceptsourceroute)
+ goto nosourcerouting;
+ save_rte(cp, ip->ip_src);
+ break;
+ }
+#ifdef IPSTEALTH
+ if (ipstealth)
+ goto dropit;
+#endif
+ if (!ip_dosourceroute) {
+ if (ipforwarding) {
+ char buf[16]; /* aaa.bbb.ccc.ddd\0 */
+ /*
+ * Acting as a router, so generate ICMP
+ */
+nosourcerouting:
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ log(LOG_WARNING,
+ "attempted source route from %s to %s\n",
+ inet_ntoa(ip->ip_src), buf);
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ } else {
+ /*
+ * Not acting as a router, so silently drop.
+ */
+#ifdef IPSTEALTH
+dropit:
+#endif
+ ipstat.ips_cantforward++;
+ m_freem(m);
+ return (1);
+ }
+ }
+
+ /*
+ * locate outgoing interface
+ */
+ (void)memcpy(&ipaddr.sin_addr, cp + off,
+ sizeof(ipaddr.sin_addr));
+
+ if (opt == IPOPT_SSRR) {
+#define INA struct in_ifaddr *
+#define SA struct sockaddr *
+ if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0)
+ ia = (INA)ifa_ifwithnet((SA)&ipaddr);
+ } else
+ ia = ip_rtaddr(ipaddr.sin_addr, &ipforward_rt);
+ if (ia == 0) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_SRCFAIL;
+ goto bad;
+ }
+ ip->ip_dst = ipaddr.sin_addr;
+ (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ /*
+ * Let ip_input's mcast routing check handle mcast pkts
+ */
+ forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
+ break;
+
+ case IPOPT_RR:
+#ifdef IPSTEALTH
+ if (ipstealth && pass == 0)
+ break;
+#endif
+ if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ /*
+ * If no space remains, ignore.
+ */
+ off--; /* 0 origin */
+ if (off > optlen - (int)sizeof(struct in_addr))
+ break;
+ (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
+ sizeof(ipaddr.sin_addr));
+ /*
+ * locate outgoing interface; if we're the destination,
+ * use the incoming interface (should be same).
+ */
+ if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 &&
+ (ia = ip_rtaddr(ipaddr.sin_addr,
+ &ipforward_rt)) == 0) {
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_HOST;
+ goto bad;
+ }
+ (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
+ sizeof(struct in_addr));
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ break;
+
+ case IPOPT_TS:
+#ifdef IPSTEALTH
+ if (ipstealth && pass == 0)
+ break;
+#endif
+ code = cp - (u_char *)ip;
+ if (optlen < 4 || optlen > 40) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if ((off = cp[IPOPT_OFFSET]) < 5) {
+ code = &cp[IPOPT_OLEN] - (u_char *)ip;
+ goto bad;
+ }
+ if (off > optlen - (int)sizeof(int32_t)) {
+ cp[IPOPT_OFFSET + 1] += (1 << 4);
+ if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ break;
+ }
+ off--; /* 0 origin */
+ sin = (struct in_addr *)(cp + off);
+ switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
+
+ case IPOPT_TS_TSONLY:
+ break;
+
+ case IPOPT_TS_TSANDADDR:
+ if (off + sizeof(n_time) +
+ sizeof(struct in_addr) > optlen) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ ipaddr.sin_addr = dst;
+ ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
+ m->m_pkthdr.rcvif);
+ if (ia == 0)
+ continue;
+ (void)memcpy(sin, &IA_SIN(ia)->sin_addr,
+ sizeof(struct in_addr));
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ break;
+
+ case IPOPT_TS_PRESPEC:
+ if (off + sizeof(n_time) +
+ sizeof(struct in_addr) > optlen) {
+ code = &cp[IPOPT_OFFSET] - (u_char *)ip;
+ goto bad;
+ }
+ (void)memcpy(&ipaddr.sin_addr, sin,
+ sizeof(struct in_addr));
+ if (ifa_ifwithaddr((SA)&ipaddr) == 0)
+ continue;
+ cp[IPOPT_OFFSET] += sizeof(struct in_addr);
+ break;
+
+ default:
+ code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
+ goto bad;
+ }
+ ntime = iptime();
+ (void)memcpy(cp + off, &ntime, sizeof(n_time));
+ cp[IPOPT_OFFSET] += sizeof(n_time);
+ }
+ }
+ if (forward && ipforwarding) {
+ ip_forward(m, 1, next_hop);
+ return (1);
+ }
+ return (0);
+bad:
+ icmp_error(m, type, code, 0, 0);
+ ipstat.ips_badoptions++;
+ return (1);
+}
+
+/*
+ * Given address of next destination (final or next hop),
+ * return internet address info of interface to be used to get there.
+ */
+struct in_ifaddr *
+ip_rtaddr(dst, rt)
+ struct in_addr dst;
+ struct route *rt;
+{
+ register struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&rt->ro_dst;
+
+ if (rt->ro_rt == 0 ||
+ !(rt->ro_rt->rt_flags & RTF_UP) ||
+ dst.s_addr != sin->sin_addr.s_addr) {
+ if (rt->ro_rt) {
+ RTFREE(rt->ro_rt);
+ rt->ro_rt = 0;
+ }
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = dst;
+
+ rtalloc_ign(rt, RTF_PRCLONING);
+ }
+ if (rt->ro_rt == 0)
+ return ((struct in_ifaddr *)0);
+ return (ifatoia(rt->ro_rt->rt_ifa));
+}
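+/*
+ * Note that the result is cached in the caller-supplied struct route
+ * (typically the global ipforward_rt) and is only re-resolved when
+ * the cached entry is missing, no longer up, or for a different
+ * destination.
+ */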
+
+/*
+ * Save incoming source route for use in replies,
+ * to be picked up later by ip_srcroute if the receiver is interested.
+ */
+void
+save_rte(option, dst)
+ u_char *option;
+ struct in_addr dst;
+{
+ unsigned olen;
+
+ olen = option[IPOPT_OLEN];
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf("save_rte: olen %d\n", olen);
+#endif
+ if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
+ return;
+ bcopy(option, ip_srcrt.srcopt, olen);
+ ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
+ ip_srcrt.dst = dst;
+}
+
+/*
+ * Retrieve incoming source route for use in replies,
+ * in the same form used by setsockopt.
+ * The first hop is placed before the options; it will be removed later.
+ */
+struct mbuf *
+ip_srcroute()
+{
+ register struct in_addr *p, *q;
+ register struct mbuf *m;
+
+ if (ip_nhops == 0)
+ return ((struct mbuf *)0);
+ m = m_get(M_DONTWAIT, MT_HEADER);
+ if (m == 0)
+ return ((struct mbuf *)0);
+
+#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
+
+ /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
+ m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
+ OPTSIZ;
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
+#endif
+
+ /*
+ * First save first hop for return route
+ */
+ p = &ip_srcrt.route[ip_nhops - 1];
+ *(mtod(m, struct in_addr *)) = *p--;
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr));
+#endif
+
+ /*
+ * Copy option fields and padding (nop) to mbuf.
+ */
+ ip_srcrt.nop = IPOPT_NOP;
+ ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
+ (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
+ &ip_srcrt.nop, OPTSIZ);
+ q = (struct in_addr *)(mtod(m, caddr_t) +
+ sizeof(struct in_addr) + OPTSIZ);
+#undef OPTSIZ
+ /*
+ * Record return path as an IP source route,
+ * reversing the path (pointers are now aligned).
+ */
+ while (p >= ip_srcrt.route) {
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf(" %lx", (u_long)ntohl(q->s_addr));
+#endif
+ *q++ = *p--;
+ }
+ /*
+ * Last hop goes to final destination.
+ */
+ *q = ip_srcrt.dst;
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf(" %lx\n", (u_long)ntohl(q->s_addr));
+#endif
+ return (m);
+}
+
+/*
+ * Strip out IP options, at higher
+ * level protocol in the kernel.
+ * Second argument is buffer to which options
+ * will be moved, and return value is their length.
+ * XXX should be deleted; last arg currently ignored.
+ */
+void
+ip_stripoptions(m, mopt)
+ register struct mbuf *m;
+ struct mbuf *mopt;
+{
+ register int i;
+ struct ip *ip = mtod(m, struct ip *);
+ register caddr_t opts;
+ int olen;
+
+ olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
+ opts = (caddr_t)(ip + 1);
+ i = m->m_len - (sizeof (struct ip) + olen);
+ bcopy(opts + olen, opts, (unsigned)i);
+ m->m_len -= olen;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len -= olen;
+ ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2);
+}
+
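+/*
+ * inetctlerrmap[] translates PRC_* control-input codes (as delivered
+ * to protocols' pr_ctlinput routines, e.g. from incoming ICMP errors)
+ * into the errno values reported to sockets; a zero entry means the
+ * event is not reported as an error.
+ */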
+u_char inetctlerrmap[PRC_NCMDS] = {
+ 0, 0, 0, 0,
+ 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
+ EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
+ EMSGSIZE, EHOSTUNREACH, 0, 0,
+ 0, 0, 0, 0,
+ ENOPROTOOPT, ECONNREFUSED
+};
+
+/*
+ * Forward a packet. If some error occurs return the sender
+ * an icmp packet. Note we can't always generate a meaningful
+ * icmp message because icmp doesn't have a large enough repertoire
+ * of codes and types.
+ *
+ * If not forwarding, just drop the packet. This could be confusing
+ * if ipforwarding was zero but some routing protocol was advancing
+ * us as a gateway to somewhere. However, we must let the routing
+ * protocol deal with that.
+ *
+ * The srcrt parameter indicates whether the packet is being forwarded
+ * via a source route.
+ */
+static void
+ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ struct rtentry *rt;
+ int error, type = 0, code = 0;
+ struct mbuf *mcopy;
+ n_long dest;
+ struct in_addr pkt_dst;
+ struct ifnet *destifp;
+#ifdef IPSEC
+ struct ifnet dummyifp;
+#endif
+
+ dest = 0;
+ /*
+ * Cache the destination address of the packet; this may be
+ * changed by use of 'ipfw fwd'.
+ */
+ pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst;
+
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf("forward: src %lx dst %lx ttl %x\n",
+ (u_long)ip->ip_src.s_addr, (u_long)pkt_dst.s_addr,
+ ip->ip_ttl);
+#endif
+
+
+ if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(pkt_dst) == 0) {
+ ipstat.ips_cantforward++;
+ m_freem(m);
+ return;
+ }
+#ifdef IPSTEALTH
+ if (!ipstealth) {
+#endif
+ if (ip->ip_ttl <= IPTTLDEC) {
+ icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+ dest, 0);
+ return;
+ }
+#ifdef IPSTEALTH
+ }
+#endif
+
+ if (ip_rtaddr(pkt_dst, &ipforward_rt) == 0) {
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
+ return;
+ } else
+ rt = ipforward_rt.ro_rt;
+
+ /*
+ * Save the IP header and at most 8 bytes of the payload,
+ * in case we need to generate an ICMP message to the src.
+ *
+ * XXX this can be optimized a lot by saving the data in a local
+ * buffer on the stack (72 bytes at most), and only allocating the
+ * mbuf if really necessary. The vast majority of the packets
+ * are forwarded without having to send an ICMP back (either
+ * because unnecessary, or because rate limited), so we are
+ * really wasting a lot of work here.
+ *
+ * We don't use m_copy() because it might return a reference
+ * to a shared cluster. Both this function and ip_output()
+ * assume exclusive access to the IP header in `m', so any
+ * data in a cluster may change before we reach icmp_error().
+ */
+ MGET(mcopy, M_DONTWAIT, m->m_type);
+ if (mcopy != NULL) {
+ M_COPY_PKTHDR(mcopy, m);
+ mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
+ (int)ip->ip_len);
+ m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
+ }
+
+#ifdef IPSTEALTH
+ if (!ipstealth) {
+#endif
+ ip->ip_ttl -= IPTTLDEC;
+#ifdef IPSTEALTH
+ }
+#endif
+
+ /*
+ * If forwarding packet using same interface that it came in on,
+ * perhaps should send a redirect to sender to shortcut a hop.
+ * Only send redirect if source is sending directly to us,
+ * and if packet was not source routed (or has any options).
+ * Also, don't send redirect if forwarding using a default route
+ * or a route modified by a redirect.
+ */
+ if (rt->rt_ifp == m->m_pkthdr.rcvif &&
+ (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
+ satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
+ ipsendredirects && !srcrt && !next_hop) {
+#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa))
+ u_long src = ntohl(ip->ip_src.s_addr);
+
+ if (RTA(rt) &&
+ (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
+ if (rt->rt_flags & RTF_GATEWAY)
+ dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
+ else
+ dest = pkt_dst.s_addr;
+ /* Router requirements say to send only host redirects */
+ type = ICMP_REDIRECT;
+ code = ICMP_REDIRECT_HOST;
+#ifdef DIAGNOSTIC
+ if (ipprintfs)
+ printf("redirect (%d) to %lx\n", code, (u_long)dest);
+#endif
+ }
+ }
+
+ error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
+ IP_FORWARDING, 0);
+ if (error)
+ ipstat.ips_cantforward++;
+ else {
+ ipstat.ips_forward++;
+ if (type)
+ ipstat.ips_redirectsent++;
+ else {
+ if (mcopy) {
+ ipflow_create(&ipforward_rt, mcopy);
+ m_freem(mcopy);
+ }
+ return;
+ }
+ }
+ if (mcopy == NULL)
+ return;
+ destifp = NULL;
+
+ switch (error) {
+
+ case 0: /* forwarded, but need redirect */
+ /* type, code set above */
+ break;
+
+ case ENETUNREACH: /* shouldn't happen, checked above */
+ case EHOSTUNREACH:
+ case ENETDOWN:
+ case EHOSTDOWN:
+ default:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_HOST;
+ break;
+
+ case EMSGSIZE:
+ type = ICMP_UNREACH;
+ code = ICMP_UNREACH_NEEDFRAG;
+#ifndef IPSEC
+ if (ipforward_rt.ro_rt)
+ destifp = ipforward_rt.ro_rt->rt_ifp;
+#else
+ /*
+ * If the packet is routed over IPsec tunnel, tell the
+ * originator the tunnel MTU.
+ * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
+ * XXX quickhack!!!
+ */
+ if (ipforward_rt.ro_rt) {
+ struct secpolicy *sp = NULL;
+ int ipsecerror;
+ int ipsechdr;
+ struct route *ro;
+
+ sp = ipsec4_getpolicybyaddr(mcopy,
+ IPSEC_DIR_OUTBOUND,
+ IP_FORWARDING,
+ &ipsecerror);
+
+ if (sp == NULL)
+ destifp = ipforward_rt.ro_rt->rt_ifp;
+ else {
+ /* count IPsec header size */
+ ipsechdr = ipsec4_hdrsiz(mcopy,
+ IPSEC_DIR_OUTBOUND,
+ NULL);
+
+ /*
+ * find the correct route for outer IPv4
+ * header, compute tunnel MTU.
+ *
+ * XXX BUG ALERT
+ * The "dummyifp" code relies upon the fact
+ * that icmp_error() touches only ifp->if_mtu.
+ */
+ /*XXX*/
+ destifp = NULL;
+ if (sp->req != NULL
+ && sp->req->sav != NULL
+ && sp->req->sav->sah != NULL) {
+ ro = &sp->req->sav->sah->sa_route;
+ if (ro->ro_rt && ro->ro_rt->rt_ifp) {
+ dummyifp.if_mtu =
+ ro->ro_rt->rt_ifp->if_mtu;
+ dummyifp.if_mtu -= ipsechdr;
+ destifp = &dummyifp;
+ }
+ }
+
+ key_freesp(sp);
+ }
+ }
+#endif /*IPSEC*/
+ ipstat.ips_cantfrag++;
+ break;
+
+ case ENOBUFS:
+ type = ICMP_SOURCEQUENCH;
+ code = 0;
+ break;
+
+ case EACCES: /* ipfw denied packet */
+ m_freem(mcopy);
+ return;
+ }
+ icmp_error(mcopy, type, code, dest, destifp);
+}
+
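+/*
+ * Collect per-packet metadata requested via socket options
+ * (SO_TIMESTAMP, IP_RECVDSTADDR, IP_RECVIF, ...) into control mbufs
+ * chained at *mp for delivery to the receiving socket.
+ */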
+void
+ip_savecontrol(inp, mp, ip, m)
+ register struct inpcb *inp;
+ register struct mbuf **mp;
+ register struct ip *ip;
+ register struct mbuf *m;
+{
+ if (inp->inp_socket->so_options & SO_TIMESTAMP) {
+ struct timeval tv;
+
+ microtime(&tv);
+ *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ if (inp->inp_flags & INP_RECVDSTADDR) {
+ *mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
+ sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+#ifdef notyet
+ /* XXX
+ * Moving these out of udp_input() made them even more broken
+ * than they already were.
+ */
+ /* options were tossed already */
+ if (inp->inp_flags & INP_RECVOPTS) {
+ *mp = sbcreatecontrol((caddr_t) opts_deleted_above,
+ sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+ /* ip_srcroute doesn't do what we want here, need to fix */
+ if (inp->inp_flags & INP_RECVRETOPTS) {
+ *mp = sbcreatecontrol((caddr_t) ip_srcroute(),
+ sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+#endif
+ if (inp->inp_flags & INP_RECVIF) {
+ struct ifnet *ifp;
+ struct sdlbuf {
+ struct sockaddr_dl sdl;
+ u_char pad[32];
+ } sdlbuf;
+ struct sockaddr_dl *sdp;
+ struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
+
+ if (((ifp = m->m_pkthdr.rcvif))
+ && ( ifp->if_index && (ifp->if_index <= if_index))) {
+ sdp = (struct sockaddr_dl *)
+ (ifaddr_byindex(ifp->if_index)->ifa_addr);
+ /*
+ * Change our mind and don't try to copy.
+ */
+ if ((sdp->sdl_family != AF_LINK)
+ || (sdp->sdl_len > sizeof(sdlbuf))) {
+ goto makedummy;
+ }
+ bcopy(sdp, sdl2, sdp->sdl_len);
+ } else {
+makedummy:
+ sdl2->sdl_len
+ = offsetof(struct sockaddr_dl, sdl_data[0]);
+ sdl2->sdl_family = AF_LINK;
+ sdl2->sdl_index = 0;
+ sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
+ }
+ *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
+ IP_RECVIF, IPPROTO_IP);
+ if (*mp)
+ mp = &(*mp)->m_next;
+ }
+}
+
+/*
+ * XXX these routines are called from the upper part of the kernel.
+ * They need to be locked when we remove Giant.
+ *
+ * They could also be moved to ip_mroute.c, since all the RSVP
+ * handling is done there already.
+ */
+static int ip_rsvp_on;
+struct socket *ip_rsvpd;
+int
+ip_rsvp_init(struct socket *so)
+{
+ if (so->so_type != SOCK_RAW ||
+ so->so_proto->pr_protocol != IPPROTO_RSVP)
+ return EOPNOTSUPP;
+
+ if (ip_rsvpd != NULL)
+ return EADDRINUSE;
+
+ ip_rsvpd = so;
+ /*
+ * This may seem silly, but we need to be sure we don't over-increment
+ * the RSVP counter, in case something slips up.
+ */
+ if (!ip_rsvp_on) {
+ ip_rsvp_on = 1;
+ rsvp_on++;
+ }
+
+ return 0;
+}
+
+int
+ip_rsvp_done(void)
+{
+ ip_rsvpd = NULL;
+ /*
+ * This may seem silly, but we need to be sure we don't over-decrement
+ * the RSVP counter, in case something slips up.
+ */
+ if (ip_rsvp_on) {
+ ip_rsvp_on = 0;
+ rsvp_on--;
+ }
+ return 0;
+}
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
new file mode 100644
index 0000000..668038e
--- /dev/null
+++ b/sys/netinet/ip_mroute.c
@@ -0,0 +1,2257 @@
+/*
+ * IP multicast forwarding procedures
+ *
+ * Written by David Waitzman, BBN Labs, August 1988.
+ * Modified by Steve Deering, Stanford, February 1989.
+ * Modified by Mark J. Steiglitz, Stanford, May, 1991
+ * Modified by Van Jacobson, LBL, January 1993
+ * Modified by Ajit Thyagarajan, PARC, August 1993
+ * Modified by Bill Fenner, PARC, April 1995
+ *
+ * MROUTING Revision: 3.5
+ * $FreeBSD$
+ */
+
+#include "opt_mrouting.h"
+#include "opt_random_ip_id.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/igmp.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_encap.h>
+#include <netinet/ip_mroute.h>
+#include <netinet/ip_var.h>
+#include <netinet/udp.h>
+#include <machine/in_cksum.h>
+
+#ifndef MROUTING
+extern u_long _ip_mcast_src(int vifi);
+extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
+ struct ip_moptions *imo);
+extern int _ip_mrouter_done(void);
+extern int _ip_mrouter_get(struct socket *so, struct sockopt *sopt);
+extern int _ip_mrouter_set(struct socket *so, struct sockopt *sopt);
+extern int _mrt_ioctl(int req, caddr_t data);
+
+/*
+ * Dummy routines and globals used when multicast routing is not compiled in.
+ */
+
+struct socket *ip_mrouter = NULL;
+u_int rsvpdebug = 0;
+
+int
+_ip_mrouter_set(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ return(EOPNOTSUPP);
+}
+
+int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set;
+
+
+int
+_ip_mrouter_get(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ return(EOPNOTSUPP);
+}
+
+int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get;
+
+int
+_ip_mrouter_done()
+{
+ return(0);
+}
+
+int (*ip_mrouter_done)(void) = _ip_mrouter_done;
+
+int
+_ip_mforward(ip, ifp, m, imo)
+ struct ip *ip;
+ struct ifnet *ifp;
+ struct mbuf *m;
+ struct ip_moptions *imo;
+{
+ return(0);
+}
+
+int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *) = _ip_mforward;
+
+int
+_mrt_ioctl(int req, caddr_t data)
+{
+ return EOPNOTSUPP;
+}
+
+int (*mrt_ioctl)(int, caddr_t) = _mrt_ioctl;
+
+void
+rsvp_input(m, off) /* XXX must fixup manually */
+ struct mbuf *m;
+ int off;
+{
+ /* Can still get packets with rsvp_on = 0 if there is a local member
+ * of the group to which the RSVP packet is addressed. But in this
+ * case we want to throw the packet away.
+ */
+ if (!rsvp_on) {
+ m_freem(m);
+ return;
+ }
+
+ if (ip_rsvpd != NULL) {
+ if (rsvpdebug)
+ printf("rsvp_input: Sending packet up old-style socket\n");
+ rip_input(m, off);
+ return;
+ }
+ /* Drop the packet */
+ m_freem(m);
+}
+
+int (*legal_vif_num)(int) = 0;
+
+/*
+ * This should never be called, since IP_MULTICAST_VIF should fail, but
+ * just in case it does get called, the code a little lower in ip_output
+ * will assign the packet a local address.
+ */
+u_long
+_ip_mcast_src(int vifi) { return INADDR_ANY; }
+u_long (*ip_mcast_src)(int) = _ip_mcast_src;
+
+int
+ip_rsvp_vif_init(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ return(EINVAL);
+}
+
+int
+ip_rsvp_vif_done(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ return(EINVAL);
+}
+
+void
+ip_rsvp_force_done(so)
+ struct socket *so;
+{
+ return;
+}
+
+#else /* MROUTING */
+
+#define M_HASCL(m) ((m)->m_flags & M_EXT)
+
+static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");
+
+#ifndef MROUTE_KLD
+/* The socket used to communicate with the multicast routing daemon. */
+struct socket *ip_mrouter = NULL;
+#endif
+
+#if defined(MROUTING) || defined(MROUTE_KLD)
+static struct mrtstat mrtstat;
+SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
+ &mrtstat, mrtstat, "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");
+#endif
+
+static struct mfc *mfctable[MFCTBLSIZ];
+static u_char nexpire[MFCTBLSIZ];
+static struct vif viftable[MAXVIFS];
+static u_int mrtdebug = 0; /* debug level */
+#define DEBUG_MFC 0x02
+#define DEBUG_FORWARD 0x04
+#define DEBUG_EXPIRE 0x08
+#define DEBUG_XMIT 0x10
+static u_int tbfdebug = 0; /* tbf debug level */
+static u_int rsvpdebug = 0; /* rsvp debug level */
+
+static struct callout_handle expire_upcalls_ch;
+
+#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
+#define UPCALL_EXPIRE 6 /* number of timeouts */
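+/*
+ * A cache entry created for a pending upcall is discarded after
+ * UPCALL_EXPIRE expirations of the EXPIRE_TIMEOUT timer (about 1.5
+ * seconds) if mrouted never installs a real route for it; see
+ * expire_upcalls() below.
+ */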
+
+/*
+ * Define the token bucket filter structures
+ * tbftable -> each vif has one of these for storing info
+ */
+
+static struct tbf tbftable[MAXVIFS];
+#define TBF_REPROCESS (hz / 100) /* 100x / second */
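+/*
+ * Each vif's tbf implements a token-bucket rate limiter for output:
+ * tokens accumulate according to v_rate_limit as time passes, and
+ * packets that cannot be sent immediately are queued and retried
+ * every TBF_REPROCESS ticks by tbf_reprocess_q() (see the tbf_*
+ * routines below).
+ */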
+
+/*
+ * 'Interfaces' associated with decapsulator (so we can tell
+ * packets that went through it from ones that get reflected
+ * by a broken gateway). These interfaces are never linked into
+ * the system ifnet list & no routes point to them. I.e., packets
+ * can't be sent this way. They only exist as a placeholder for
+ * multicast source verification.
+ */
+static struct ifnet multicast_decap_if[MAXVIFS];
+
+#define ENCAP_TTL 64
+#define ENCAP_PROTO IPPROTO_IPIP /* 4 */
+
+/* prototype IP hdr for encapsulated packets */
+static struct ip multicast_encap_iphdr = {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ sizeof(struct ip) >> 2, IPVERSION,
+#else
+ IPVERSION, sizeof(struct ip) >> 2,
+#endif
+ 0, /* tos */
+ sizeof(struct ip), /* total length */
+ 0, /* id */
+ 0, /* frag offset */
+ ENCAP_TTL, ENCAP_PROTO,
+ 0, /* checksum */
+};
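+/*
+ * encap_send() prepends a copy of this prototype header (protocol
+ * ENCAP_PROTO, TTL ENCAP_TTL) to multicast packets sent over a
+ * VIFF_TUNNEL vif; mroute_encapcheck() and mroute_encap_input()
+ * below undo that encapsulation on the receiving side.
+ */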
+
+/*
+ * Private variables.
+ */
+static vifi_t numvifs = 0;
+static const struct encaptab *encap_cookie = NULL;
+
+/*
+ * one-back cache used by mroute_encapcheck to locate a tunnel's vif
+ * given a datagram's src ip address.
+ */
+static u_long last_encap_src;
+static struct vif *last_encap_vif;
+
+static u_long X_ip_mcast_src(int vifi);
+static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo);
+static int X_ip_mrouter_done(void);
+static int X_ip_mrouter_get(struct socket *so, struct sockopt *m);
+static int X_ip_mrouter_set(struct socket *so, struct sockopt *m);
+static int X_legal_vif_num(int vif);
+static int X_mrt_ioctl(int cmd, caddr_t data);
+
+static int get_sg_cnt(struct sioc_sg_req *);
+static int get_vif_cnt(struct sioc_vif_req *);
+static int ip_mrouter_init(struct socket *, int);
+static int add_vif(struct vifctl *);
+static int del_vif(vifi_t);
+static int add_mfc(struct mfcctl *);
+static int del_mfc(struct mfcctl *);
+static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
+static int set_assert(int);
+static void expire_upcalls(void *);
+static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *,
+ vifi_t);
+static void phyint_send(struct ip *, struct vif *, struct mbuf *);
+static void encap_send(struct ip *, struct vif *, struct mbuf *);
+static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long);
+static void tbf_queue(struct vif *, struct mbuf *);
+static void tbf_process_q(struct vif *);
+static void tbf_reprocess_q(void *);
+static int tbf_dq_sel(struct vif *, struct ip *);
+static void tbf_send_packet(struct vif *, struct mbuf *);
+static void tbf_update_tokens(struct vif *);
+static int priority(struct vif *, struct ip *);
+
+/*
+ * whether or not special PIM assert processing is enabled.
+ */
+static int pim_assert;
+/*
+ * Rate limit for assert notification messages, in usec
+ */
+#define ASSERT_MSG_TIME 3000000
+
+/*
+ * Hash function for a source, group entry
+ */
+#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
+ ((g) >> 20) ^ ((g) >> 10) ^ (g))
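+/*
+ * The hash folds both the source and group addresses together;
+ * MFCHASHMOD (from ip_mroute.h) reduces the folded value to an index
+ * into the MFCTBLSIZ buckets of mfctable[].
+ */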
+
+/*
+ * Find a route for a given origin IP address and Multicast group address.
+ * Type of service parameter to be added in the future!!!
+ */
+
+#define MFCFIND(o, g, rt) { \
+ register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \
+ rt = NULL; \
+ ++mrtstat.mrts_mfc_lookups; \
+ while (_rt) { \
+ if ((_rt->mfc_origin.s_addr == o) && \
+ (_rt->mfc_mcastgrp.s_addr == g) && \
+ (_rt->mfc_stall == NULL)) { \
+ rt = _rt; \
+ break; \
+ } \
+ _rt = _rt->mfc_next; \
+ } \
+ if (rt == NULL) { \
+ ++mrtstat.mrts_mfc_misses; \
+ } \
+}
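+/*
+ * Entries whose mfc_stall list is non-NULL are placeholders for
+ * pending upcalls to mrouted and are deliberately skipped by
+ * MFCFIND(); only fully installed routes satisfy a lookup.
+ */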
+
+
+/*
+ * Macros to compute elapsed time efficiently
+ * Borrowed from Van Jacobson's scheduling code
+ */
+#define TV_DELTA(a, b, delta) { \
+ register int xxs; \
+ \
+ delta = (a).tv_usec - (b).tv_usec; \
+ if ((xxs = (a).tv_sec - (b).tv_sec)) { \
+ switch (xxs) { \
+ case 2: \
+ delta += 1000000; \
+ /* fall through */ \
+ case 1: \
+ delta += 1000000; \
+ break; \
+ default: \
+ delta += (1000000 * xxs); \
+ } \
+ } \
+}
+
+#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
+ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
+
+#ifdef UPCALL_TIMING
+u_long upcall_data[51];
+static void collate(struct timeval *);
+#endif /* UPCALL_TIMING */
+
+
+/*
+ * Handle MRT setsockopt commands to modify the multicast routing tables.
+ */
+static int
+X_ip_mrouter_set(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, optval;
+ vifi_t vifi;
+ struct vifctl vifc;
+ struct mfcctl mfc;
+
+ if (so != ip_mrouter && sopt->sopt_name != MRT_INIT)
+ return (EPERM);
+
+ error = 0;
+ switch (sopt->sopt_name) {
+ case MRT_INIT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+ error = ip_mrouter_init(so, optval);
+ break;
+
+ case MRT_DONE:
+ error = ip_mrouter_done();
+ break;
+
+ case MRT_ADD_VIF:
+ error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
+ if (error)
+ break;
+ error = add_vif(&vifc);
+ break;
+
+ case MRT_DEL_VIF:
+ error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
+ if (error)
+ break;
+ error = del_vif(vifi);
+ break;
+
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc);
+ if (error)
+ break;
+ if (sopt->sopt_name == MRT_ADD_MFC)
+ error = add_mfc(&mfc);
+ else
+ error = del_mfc(&mfc);
+ break;
+
+ case MRT_ASSERT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+ set_assert(optval);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+#ifndef MROUTE_KLD
+int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set;
+#endif
+
+/*
+ * Handle MRT getsockopt commands
+ */
+static int
+X_ip_mrouter_get(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error;
+ static int version = 0x0305; /* !!! why is this here? XXX */
+
+ switch (sopt->sopt_name) {
+ case MRT_VERSION:
+ error = sooptcopyout(sopt, &version, sizeof version);
+ break;
+
+ case MRT_ASSERT:
+ error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+#ifndef MROUTE_KLD
+int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get;
+#endif
+
+/*
+ * Handle ioctl commands to obtain information from the cache
+ */
+static int
+X_mrt_ioctl(cmd, data)
+ int cmd;
+ caddr_t data;
+{
+ int error = 0;
+
+ switch (cmd) {
+ case (SIOCGETVIFCNT):
+ return (get_vif_cnt((struct sioc_vif_req *)data));
+ break;
+ case (SIOCGETSGCNT):
+ return (get_sg_cnt((struct sioc_sg_req *)data));
+ break;
+ default:
+ return (EINVAL);
+ break;
+ }
+ return error;
+}
+
+#ifndef MROUTE_KLD
+int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl;
+#endif
+
+/*
+ * returns the packet, byte, rpf-failure count for the source group provided
+ */
+static int
+get_sg_cnt(req)
+ register struct sioc_sg_req *req;
+{
+ register struct mfc *rt;
+ int s;
+
+ s = splnet();
+ MFCFIND(req->src.s_addr, req->grp.s_addr, rt);
+ splx(s);
+ if (rt != NULL) {
+ req->pktcnt = rt->mfc_pkt_cnt;
+ req->bytecnt = rt->mfc_byte_cnt;
+ req->wrong_if = rt->mfc_wrong_if;
+ } else
+ req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
+
+ return 0;
+}
+
+/*
+ * returns the input and output packet and byte counts on the vif provided
+ */
+static int
+get_vif_cnt(req)
+ register struct sioc_vif_req *req;
+{
+ register vifi_t vifi = req->vifi;
+
+ if (vifi >= numvifs) return EINVAL;
+
+ req->icount = viftable[vifi].v_pkt_in;
+ req->ocount = viftable[vifi].v_pkt_out;
+ req->ibytes = viftable[vifi].v_bytes_in;
+ req->obytes = viftable[vifi].v_bytes_out;
+
+ return 0;
+}
+
+/*
+ * Enable multicast routing
+ */
+static int
+ip_mrouter_init(so, version)
+ struct socket *so;
+ int version;
+{
+ if (mrtdebug)
+ log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
+ so->so_type, so->so_proto->pr_protocol);
+
+ if (so->so_type != SOCK_RAW ||
+ so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP;
+
+ if (version != 1)
+ return ENOPROTOOPT;
+
+ if (ip_mrouter != NULL) return EADDRINUSE;
+
+ ip_mrouter = so;
+
+ bzero((caddr_t)mfctable, sizeof(mfctable));
+ bzero((caddr_t)nexpire, sizeof(nexpire));
+
+ pim_assert = 0;
+
+ expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT);
+
+ if (mrtdebug)
+ log(LOG_DEBUG, "ip_mrouter_init\n");
+
+ return 0;
+}
+
+/*
+ * Disable multicast routing
+ */
+static int
+X_ip_mrouter_done()
+{
+ vifi_t vifi;
+ int i;
+ struct ifnet *ifp;
+ struct ifreq ifr;
+ struct mfc *rt;
+ struct rtdetq *rte;
+ int s;
+
+ s = splnet();
+
+ /*
+ * For each phyint in use, disable promiscuous reception of all IP
+ * multicasts.
+ */
+ for (vifi = 0; vifi < numvifs; vifi++) {
+ if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
+ !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
+ ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+ ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr
+ = INADDR_ANY;
+ ifp = viftable[vifi].v_ifp;
+ if_allmulti(ifp, 0);
+ }
+ }
+ bzero((caddr_t)tbftable, sizeof(tbftable));
+ bzero((caddr_t)viftable, sizeof(viftable));
+ numvifs = 0;
+ pim_assert = 0;
+
+ untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch);
+
+ /*
+ * Free all multicast forwarding cache entries.
+ */
+ for (i = 0; i < MFCTBLSIZ; i++) {
+ for (rt = mfctable[i]; rt != NULL; ) {
+ struct mfc *nr = rt->mfc_next;
+
+ for (rte = rt->mfc_stall; rte != NULL; ) {
+ struct rtdetq *n = rte->next;
+
+ m_freem(rte->m);
+ free(rte, M_MRTABLE);
+ rte = n;
+ }
+ free(rt, M_MRTABLE);
+ rt = nr;
+ }
+ }
+
+ bzero((caddr_t)mfctable, sizeof(mfctable));
+
+ /*
+ * Reset de-encapsulation cache
+ */
+ last_encap_src = 0;
+ last_encap_vif = NULL;
+ if (encap_cookie) {
+ encap_detach(encap_cookie);
+ encap_cookie = NULL;
+ }
+
+ ip_mrouter = NULL;
+
+ splx(s);
+
+ if (mrtdebug)
+ log(LOG_DEBUG, "ip_mrouter_done\n");
+
+ return 0;
+}
+
+#ifndef MROUTE_KLD
+int (*ip_mrouter_done)(void) = X_ip_mrouter_done;
+#endif
+
+/*
+ * Set PIM assert processing global
+ */
+static int
+set_assert(i)
+ int i;
+{
+ if ((i != 1) && (i != 0))
+ return EINVAL;
+
+ pim_assert = i;
+
+ return 0;
+}
+
+/*
+ * Decide if a packet is from a tunnelled peer.
+ * Return 0 if not, 64 if so.
+ */
+static int
+mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ int hlen = ip->ip_hl << 2;
+ register struct vif *vifp;
+
+ /*
+ * don't claim the packet if it's not to a multicast destination or if
+ * we don't have an encapsulating tunnel with the source.
+ * Note: This code assumes that the remote site IP address
+ * uniquely identifies the tunnel (i.e., that this site has
+ * at most one tunnel with the remote site).
+ */
+ if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) {
+ return 0;
+ }
+ if (ip->ip_src.s_addr != last_encap_src) {
+ register struct vif *vife;
+
+ vifp = viftable;
+ vife = vifp + numvifs;
+ last_encap_src = ip->ip_src.s_addr;
+ last_encap_vif = 0;
+ for ( ; vifp < vife; ++vifp)
+ if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) {
+ if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT))
+ == VIFF_TUNNEL)
+ last_encap_vif = vifp;
+ break;
+ }
+ }
+ if ((vifp = last_encap_vif) == 0) {
+ last_encap_src = 0;
+ return 0;
+ }
+ return 64;
+}
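+/*
+ * The 64 returned above is the match priority reported to the generic
+ * ip_encap framework (this check is attached via encap_attach_func()
+ * in add_vif()); presumably a larger value outbids less specific
+ * claimants for the same packet.
+ */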
+
+/*
+ * De-encapsulate a packet and feed it back through ip input (this
+ * routine is called whenever IP gets a packet that mroute_encap_func()
+ * claimed).
+ */
+static void
+mroute_encap_input(struct mbuf *m, int off)
+{
+ struct ip *ip = mtod(m, struct ip *);
+ int hlen = ip->ip_hl << 2;
+
+ if (hlen > sizeof(struct ip))
+ ip_stripoptions(m, (struct mbuf *) 0);
+ m->m_data += sizeof(struct ip);
+ m->m_len -= sizeof(struct ip);
+ m->m_pkthdr.len -= sizeof(struct ip);
+
+ m->m_pkthdr.rcvif = last_encap_vif->v_ifp;
+
+ (void) IF_HANDOFF(&ipintrq, m, NULL);
+ /*
+ * normally we would need a "schednetisr(NETISR_IP)"
+ * here but we were called by ip_input and it is going
+ * to loop back & try to dequeue the packet we just
+ * queued as soon as we return so we avoid the
+ * unnecessary software interrupt.
+ */
+}
+
+extern struct domain inetdomain;
+static struct protosw mroute_encap_protosw =
+{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR,
+ mroute_encap_input, 0, 0, rip_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ &rip_usrreqs
+};
+
+/*
+ * Add a vif to the vif table
+ */
+static int
+add_vif(vifcp)
+ register struct vifctl *vifcp;
+{
+ register struct vif *vifp = viftable + vifcp->vifc_vifi;
+ static struct sockaddr_in sin = {sizeof sin, AF_INET};
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+ int error, s;
+ struct tbf *v_tbf = tbftable + vifcp->vifc_vifi;
+
+ if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL;
+ if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE;
+
+ /* Find the interface with an address in AF_INET family */
+ sin.sin_addr = vifcp->vifc_lcl_addr;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
+ if (ifa == 0) return EADDRNOTAVAIL;
+ ifp = ifa->ifa_ifp;
+
+ if (vifcp->vifc_flags & VIFF_TUNNEL) {
+ if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) {
+ /*
+ * An encapsulating tunnel is wanted. Tell
+ * mroute_encap_input() to start paying attention
+ * to encapsulated packets.
+ */
+ if (encap_cookie == NULL) {
+ encap_cookie = encap_attach_func(AF_INET, -1,
+ mroute_encapcheck,
+ (struct protosw *)&mroute_encap_protosw, NULL);
+
+ if (encap_cookie == NULL) {
+ printf("ip_mroute: unable to attach encap\n");
+ return (EIO); /* XXX */
+ }
+ for (s = 0; s < MAXVIFS; ++s) {
+ multicast_decap_if[s].if_name = "mdecap";
+ multicast_decap_if[s].if_unit = s;
+ }
+ }
+ /*
+ * Set interface to fake encapsulator interface
+ */
+ ifp = &multicast_decap_if[vifcp->vifc_vifi];
+ /*
+ * Prepare cached route entry
+ */
+ bzero(&vifp->v_route, sizeof(vifp->v_route));
+ } else {
+ log(LOG_ERR, "source routed tunnels not supported\n");
+ return EOPNOTSUPP;
+ }
+ } else {
+ /* Make sure the interface supports multicast */
+ if ((ifp->if_flags & IFF_MULTICAST) == 0)
+ return EOPNOTSUPP;
+
+ /* Enable promiscuous reception of all IP multicasts from the if */
+ s = splnet();
+ error = if_allmulti(ifp, 1);
+ splx(s);
+ if (error)
+ return error;
+ }
+
+ s = splnet();
+ /* define parameters for the tbf structure */
+ vifp->v_tbf = v_tbf;
+ GET_TIME(vifp->v_tbf->tbf_last_pkt_t);
+ vifp->v_tbf->tbf_n_tok = 0;
+ vifp->v_tbf->tbf_q_len = 0;
+ vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
+ vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
+
+ vifp->v_flags = vifcp->vifc_flags;
+ vifp->v_threshold = vifcp->vifc_threshold;
+ vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
+ vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
+ vifp->v_ifp = ifp;
+ /* scaling up here allows division by 1024 in critical code */
+ vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000;
+ vifp->v_rsvp_on = 0;
+ vifp->v_rsvpd = NULL;
+ /* initialize per vif pkt counters */
+ vifp->v_pkt_in = 0;
+ vifp->v_pkt_out = 0;
+ vifp->v_bytes_in = 0;
+ vifp->v_bytes_out = 0;
+ splx(s);
+
+ /* Adjust numvifs up if the vifi is higher than numvifs */
+ if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
+
+ if (mrtdebug)
+ log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n",
+ vifcp->vifc_vifi,
+ (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
+ (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
+ (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
+ vifcp->vifc_threshold,
+ vifcp->vifc_rate_limit);
+
+ return 0;
+}
+
+/*
+ * Delete a vif from the vif table
+ */
+static int
+del_vif(vifi)
+ vifi_t vifi;
+{
+ register struct vif *vifp = &viftable[vifi];
+ register struct mbuf *m;
+ struct ifnet *ifp;
+ struct ifreq ifr;
+ int s;
+
+ if (vifi >= numvifs) return EINVAL;
+ if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL;
+
+ s = splnet();
+
+ if (!(vifp->v_flags & VIFF_TUNNEL)) {
+ ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+ ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY;
+ ifp = vifp->v_ifp;
+ if_allmulti(ifp, 0);
+ }
+
+ if (vifp == last_encap_vif) {
+ last_encap_vif = 0;
+ last_encap_src = 0;
+ }
+
+ /*
+ * Free packets queued at the interface
+ */
+ while (vifp->v_tbf->tbf_q) {
+ m = vifp->v_tbf->tbf_q;
+ vifp->v_tbf->tbf_q = m->m_act;
+ m_freem(m);
+ }
+
+ bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf)));
+ bzero((caddr_t)vifp, sizeof (*vifp));
+
+ if (mrtdebug)
+ log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);
+
+ /* Adjust numvifs down */
+ for (vifi = numvifs; vifi > 0; vifi--)
+ if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break;
+ numvifs = vifi;
+
+ splx(s);
+
+ return 0;
+}
+
+/*
+ * Add an mfc entry
+ */
+static int
+add_mfc(mfccp)
+ struct mfcctl *mfccp;
+{
+ struct mfc *rt;
+ u_long hash;
+ struct rtdetq *rte;
+ register u_short nstl;
+ int s;
+ int i;
+
+ MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt);
+
+ /* If an entry already exists, just update the fields */
+ if (rt) {
+ if (mrtdebug & DEBUG_MFC)
+ log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
+ (u_long)ntohl(mfccp->mfcc_origin.s_addr),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent);
+
+ s = splnet();
+ rt->mfc_parent = mfccp->mfcc_parent;
+ for (i = 0; i < numvifs; i++)
+ rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
+ splx(s);
+ return 0;
+ }
+
+ /*
+ * Find the entry for which the upcall was made and update
+ */
+ s = splnet();
+ hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
+ for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {
+
+ if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
+ (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
+ (rt->mfc_stall != NULL)) {
+
+ if (nstl++)
+ log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
+ "multiple kernel entries",
+ (u_long)ntohl(mfccp->mfcc_origin.s_addr),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent, (void *)rt->mfc_stall);
+
+ if (mrtdebug & DEBUG_MFC)
+ log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
+ (u_long)ntohl(mfccp->mfcc_origin.s_addr),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent, (void *)rt->mfc_stall);
+
+ rt->mfc_origin = mfccp->mfcc_origin;
+ rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
+ rt->mfc_parent = mfccp->mfcc_parent;
+ for (i = 0; i < numvifs; i++)
+ rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
+ /* initialize pkt counters per src-grp */
+ rt->mfc_pkt_cnt = 0;
+ rt->mfc_byte_cnt = 0;
+ rt->mfc_wrong_if = 0;
+ rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
+
+ rt->mfc_expire = 0; /* Don't clean this guy up */
+ nexpire[hash]--;
+
+ /* free packets Qed at the end of this entry */
+ for (rte = rt->mfc_stall; rte != NULL; ) {
+ struct rtdetq *n = rte->next;
+
+ ip_mdq(rte->m, rte->ifp, rt, -1);
+ m_freem(rte->m);
+#ifdef UPCALL_TIMING
+ collate(&(rte->t));
+#endif /* UPCALL_TIMING */
+ free(rte, M_MRTABLE);
+ rte = n;
+ }
+ rt->mfc_stall = NULL;
+ }
+ }
+
+ /*
+ * It is possible that an entry is being inserted without an upcall
+ */
+ if (nstl == 0) {
+ if (mrtdebug & DEBUG_MFC)
+ log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
+ hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
+ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
+ mfccp->mfcc_parent);
+
+ for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
+
+ if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
+ (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
+
+ rt->mfc_origin = mfccp->mfcc_origin;
+ rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
+ rt->mfc_parent = mfccp->mfcc_parent;
+ for (i = 0; i < numvifs; i++)
+ rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
+ /* initialize pkt counters per src-grp */
+ rt->mfc_pkt_cnt = 0;
+ rt->mfc_byte_cnt = 0;
+ rt->mfc_wrong_if = 0;
+ rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
+ if (rt->mfc_expire)
+ nexpire[hash]--;
+ rt->mfc_expire = 0;
+ }
+ }
+ if (rt == NULL) {
+ /* no upcall, so make a new entry */
+ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
+ if (rt == NULL) {
+ splx(s);
+ return ENOBUFS;
+ }
+
+ /* insert new entry at head of hash chain */
+ rt->mfc_origin = mfccp->mfcc_origin;
+ rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
+ rt->mfc_parent = mfccp->mfcc_parent;
+ for (i = 0; i < numvifs; i++)
+ rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
+ /* initialize pkt counters per src-grp */
+ rt->mfc_pkt_cnt = 0;
+ rt->mfc_byte_cnt = 0;
+ rt->mfc_wrong_if = 0;
+ rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
+ rt->mfc_expire = 0;
+ rt->mfc_stall = NULL;
+
+ /* link into table */
+ rt->mfc_next = mfctable[hash];
+ mfctable[hash] = rt;
+ }
+ }
+ splx(s);
+ return 0;
+}
+
+#ifdef UPCALL_TIMING
+/*
+ * collect delay statistics on the upcalls
+ */
+static void collate(t)
+register struct timeval *t;
+{
+ register u_long d;
+ register struct timeval tp;
+ register u_long delta;
+
+ GET_TIME(tp);
+
+ if (TV_LT(*t, tp))
+ {
+ TV_DELTA(tp, *t, delta);
+
+ d = delta >> 10;
+ if (d > 50)
+ d = 50;
+
+ ++upcall_data[d];
+ }
+}
+#endif /* UPCALL_TIMING */
+
+/*
+ * Delete an mfc entry
+ */
+static int
+del_mfc(mfccp)
+ struct mfcctl *mfccp;
+{
+ struct in_addr origin;
+ struct in_addr mcastgrp;
+ struct mfc *rt;
+ struct mfc **nptr;
+ u_long hash;
+ int s;
+
+ origin = mfccp->mfcc_origin;
+ mcastgrp = mfccp->mfcc_mcastgrp;
+ hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
+
+ if (mrtdebug & DEBUG_MFC)
+ log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
+ (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
+
+ s = splnet();
+
+ nptr = &mfctable[hash];
+ while ((rt = *nptr) != NULL) {
+ if (origin.s_addr == rt->mfc_origin.s_addr &&
+ mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
+ rt->mfc_stall == NULL)
+ break;
+
+ nptr = &rt->mfc_next;
+ }
+ if (rt == NULL) {
+ splx(s);
+ return EADDRNOTAVAIL;
+ }
+
+ *nptr = rt->mfc_next;
+ free(rt, M_MRTABLE);
+
+ splx(s);
+
+ return 0;
+}
+
+/*
+ * Send a message to mrouted on the multicast routing socket
+ */
+static int
+socket_send(s, mm, src)
+ struct socket *s;
+ struct mbuf *mm;
+ struct sockaddr_in *src;
+{
+ if (s) {
+ if (sbappendaddr(&s->so_rcv,
+ (struct sockaddr *)src,
+ mm, (struct mbuf *)0) != 0) {
+ sorwakeup(s);
+ return 0;
+ }
+ }
+ m_freem(mm);
+ return -1;
+}
+
+/*
+ * IP multicast forwarding function. This function assumes that the packet
+ * pointed to by "ip" has arrived on (or is about to be sent to) the interface
+ * pointed to by "ifp", and the packet is to be relayed to other networks
+ * that have members of the packet's destination IP multicast group.
+ *
+ * The packet is returned unscathed to the caller, unless it is
+ * erroneous, in which case a non-zero return value tells the caller to
+ * discard it.
+ */
+
+#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */
+
+static int
+X_ip_mforward(ip, ifp, m, imo)
+ register struct ip *ip;
+ struct ifnet *ifp;
+ struct mbuf *m;
+ struct ip_moptions *imo;
+{
+ register struct mfc *rt;
+ register u_char *ipoptions;
+ static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
+ static int srctun = 0;
+ register struct mbuf *mm;
+ int s;
+ vifi_t vifi;
+ struct vif *vifp;
+
+ if (mrtdebug & DEBUG_FORWARD)
+ log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
+ (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
+ (void *)ifp);
+
+ if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
+ (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
+ /*
+ * Packet arrived via a physical interface or
+ * an encapsulated tunnel.
+ */
+ } else {
+ /*
+ * Packet arrived through a source-route tunnel.
+ * Source-route tunnels are no longer supported.
+ */
+ if ((srctun++ % 1000) == 0)
+ log(LOG_ERR,
+ "ip_mforward: received source-routed packet from %lx\n",
+ (u_long)ntohl(ip->ip_src.s_addr));
+
+ return 1;
+ }
+
+ if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) {
+ if (ip->ip_ttl < 255)
+ ip->ip_ttl++; /* compensate for -1 in *_send routines */
+ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
+ vifp = viftable + vifi;
+ printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n",
+ (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
+ vifi,
+ (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
+ vifp->v_ifp->if_name, vifp->v_ifp->if_unit);
+ }
+ return (ip_mdq(m, ifp, NULL, vifi));
+ }
+ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
+ printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
+ (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
+ if(!imo)
+ printf("In fact, no options were specified at all\n");
+ }
+
+ /*
+ * Don't forward a packet with time-to-live of zero or one,
+ * or a packet destined to a local-only group.
+ */
+ if (ip->ip_ttl <= 1 ||
+ ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
+ return 0;
+
+ /*
+ * Determine forwarding vifs from the forwarding cache table
+ */
+ s = splnet();
+ MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt);
+
+ /* Entry exists, so forward if necessary */
+ if (rt != NULL) {
+ splx(s);
+ return (ip_mdq(m, ifp, rt, -1));
+ } else {
+ /*
+ * If we don't have a route for packet's origin,
+ * Make a copy of the packet &
+ * send message to routing daemon
+ */
+
+ register struct mbuf *mb0;
+ register struct rtdetq *rte;
+ register u_long hash;
+ int hlen = ip->ip_hl << 2;
+#ifdef UPCALL_TIMING
+ struct timeval tp;
+
+ GET_TIME(tp);
+#endif
+
+ mrtstat.mrts_no_route++;
+ if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
+ log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
+ (u_long)ntohl(ip->ip_src.s_addr),
+ (u_long)ntohl(ip->ip_dst.s_addr));
+
+ /*
+ * Allocate mbufs early so that we don't do extra work if we are
+ * just going to fail anyway. Make sure to pullup the header so
+ * that other people can't step on it.
+ */
+ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
+ if (rte == NULL) {
+ splx(s);
+ return ENOBUFS;
+ }
+ mb0 = m_copy(m, 0, M_COPYALL);
+ if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
+ mb0 = m_pullup(mb0, hlen);
+ if (mb0 == NULL) {
+ free(rte, M_MRTABLE);
+ splx(s);
+ return ENOBUFS;
+ }
+
+ /* is there an upcall waiting for this packet? */
+ hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
+ for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
+ if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
+ (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
+ (rt->mfc_stall != NULL))
+ break;
+ }
+
+ if (rt == NULL) {
+ int i;
+ struct igmpmsg *im;
+
+ /* no upcall, so make a new entry */
+ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
+ if (rt == NULL) {
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ splx(s);
+ return ENOBUFS;
+ }
+ /* Make a copy of the header to send to the user level process */
+ mm = m_copy(mb0, 0, hlen);
+ if (mm == NULL) {
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ free(rt, M_MRTABLE);
+ splx(s);
+ return ENOBUFS;
+ }
+
+ /*
+ * Send message to routing daemon to install
+ * a route into the kernel table
+ */
+ k_igmpsrc.sin_addr = ip->ip_src;
+
+ im = mtod(mm, struct igmpmsg *);
+ im->im_msgtype = IGMPMSG_NOCACHE;
+ im->im_mbz = 0;
+
+ mrtstat.mrts_upcalls++;
+
+ if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
+ log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
+ ++mrtstat.mrts_upq_sockfull;
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ free(rt, M_MRTABLE);
+ splx(s);
+ return ENOBUFS;
+ }
+
+ /* insert new entry at head of hash chain */
+ rt->mfc_origin.s_addr = ip->ip_src.s_addr;
+ rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
+ rt->mfc_expire = UPCALL_EXPIRE;
+ nexpire[hash]++;
+ for (i = 0; i < numvifs; i++)
+ rt->mfc_ttls[i] = 0;
+ rt->mfc_parent = -1;
+
+ /* link into table */
+ rt->mfc_next = mfctable[hash];
+ mfctable[hash] = rt;
+ rt->mfc_stall = rte;
+
+ } else {
+ /* determine if q has overflowed */
+ int npkts = 0;
+ struct rtdetq **p;
+
+ for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
+ npkts++;
+
+ if (npkts > MAX_UPQ) {
+ mrtstat.mrts_upq_ovflw++;
+ free(rte, M_MRTABLE);
+ m_freem(mb0);
+ splx(s);
+ return 0;
+ }
+
+ /* Add this entry to the end of the queue */
+ *p = rte;
+ }
+
+ rte->m = mb0;
+ rte->ifp = ifp;
+#ifdef UPCALL_TIMING
+ rte->t = tp;
+#endif
+ rte->next = NULL;
+
+ splx(s);
+
+ return 0;
+ }
+}
+
+#ifndef MROUTE_KLD
+int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *) = X_ip_mforward;
+#endif
+
+/*
+ * Clean up the cache entry if upcall is not serviced
+ */
+static void
+expire_upcalls(void *unused)
+{
+ struct rtdetq *rte;
+ struct mfc *mfc, **nptr;
+ int i;
+ int s;
+
+ s = splnet();
+ for (i = 0; i < MFCTBLSIZ; i++) {
+ if (nexpire[i] == 0)
+ continue;
+ nptr = &mfctable[i];
+ for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
+ /*
+			 * Skip real cache entries.
+			 * Make sure it wasn't marked to not expire (shouldn't happen).
+			 * If it expires now, clean it up.
+ */
+ if (mfc->mfc_stall != NULL &&
+ mfc->mfc_expire != 0 &&
+ --mfc->mfc_expire == 0) {
+ if (mrtdebug & DEBUG_EXPIRE)
+ log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
+ (u_long)ntohl(mfc->mfc_origin.s_addr),
+ (u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
+ /*
+				 * Drop all the stalled packets:
+				 * free each mbuf along with its interface and timing info.
+ */
+ for (rte = mfc->mfc_stall; rte; ) {
+ struct rtdetq *n = rte->next;
+
+ m_freem(rte->m);
+ free(rte, M_MRTABLE);
+ rte = n;
+ }
+ ++mrtstat.mrts_cache_cleanups;
+ nexpire[i]--;
+
+ *nptr = mfc->mfc_next;
+ free(mfc, M_MRTABLE);
+ } else {
+ nptr = &mfc->mfc_next;
+ }
+ }
+ }
+ splx(s);
+ expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT);
+}
+
+/*
+ * Packet forwarding routine once an entry in the cache is made
+ */
+static int
+ip_mdq(m, ifp, rt, xmt_vif)
+ register struct mbuf *m;
+ register struct ifnet *ifp;
+ register struct mfc *rt;
+ register vifi_t xmt_vif;
+{
+ register struct ip *ip = mtod(m, struct ip *);
+ register vifi_t vifi;
+ register struct vif *vifp;
+ register int plen = ip->ip_len;
+
+/*
+ * Macro to send packet on vif. Since RSVP packets don't get counted on
+ * input, they shouldn't get counted on output, so statistics keeping is
+ * separate.
+ */
+#define MC_SEND(ip,vifp,m) { \
+ if ((vifp)->v_flags & VIFF_TUNNEL) \
+ encap_send((ip), (vifp), (m)); \
+ else \
+ phyint_send((ip), (vifp), (m)); \
+}
+
+ /*
+ * If xmt_vif is not -1, send on only the requested vif.
+ *
+	 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which is > numvifs.)
+ */
+ if (xmt_vif < numvifs) {
+ MC_SEND(ip, viftable + xmt_vif, m);
+ return 1;
+ }
+
+ /*
+ * Don't forward if it didn't arrive from the parent vif for its origin.
+ */
+ vifi = rt->mfc_parent;
+ if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
+ /* came in the wrong interface */
+ if (mrtdebug & DEBUG_FORWARD)
+ log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
+ (void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
+ ++mrtstat.mrts_wrong_if;
+ ++rt->mfc_wrong_if;
+ /*
+ * If we are doing PIM assert processing, and we are forwarding
+ * packets on this interface, and it is a broadcast medium
+ * interface (and not a tunnel), send a message to the routing daemon.
+ */
+ if (pim_assert && rt->mfc_ttls[vifi] &&
+ (ifp->if_flags & IFF_BROADCAST) &&
+ !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
+ struct sockaddr_in k_igmpsrc;
+ struct mbuf *mm;
+ struct igmpmsg *im;
+ int hlen = ip->ip_hl << 2;
+ struct timeval now;
+ register u_long delta;
+
+ GET_TIME(now);
+
+ TV_DELTA(rt->mfc_last_assert, now, delta);
+
+ if (delta > ASSERT_MSG_TIME) {
+ mm = m_copy(m, 0, hlen);
+ if (mm && (M_HASCL(mm) || mm->m_len < hlen))
+ mm = m_pullup(mm, hlen);
+ if (mm == NULL) {
+ return ENOBUFS;
+ }
+
+ rt->mfc_last_assert = now;
+
+ im = mtod(mm, struct igmpmsg *);
+ im->im_msgtype = IGMPMSG_WRONGVIF;
+ im->im_mbz = 0;
+ im->im_vif = vifi;
+
+ k_igmpsrc.sin_addr = im->im_src;
+
+ socket_send(ip_mrouter, mm, &k_igmpsrc);
+ }
+ }
+ return 0;
+ }
+
+ /* If I sourced this packet, it counts as output, else it was input. */
+ if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
+ viftable[vifi].v_pkt_out++;
+ viftable[vifi].v_bytes_out += plen;
+ } else {
+ viftable[vifi].v_pkt_in++;
+ viftable[vifi].v_bytes_in += plen;
+ }
+ rt->mfc_pkt_cnt++;
+ rt->mfc_byte_cnt += plen;
+
+ /*
+ * For each vif, decide if a copy of the packet should be forwarded.
+ * Forward if:
+ * - the ttl exceeds the vif's threshold
+ * - there are group members downstream on interface
+ */
+ for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
+ if ((rt->mfc_ttls[vifi] > 0) &&
+ (ip->ip_ttl > rt->mfc_ttls[vifi])) {
+ vifp->v_pkt_out++;
+ vifp->v_bytes_out += plen;
+ MC_SEND(ip, vifp, m);
+ }
+
+ return 0;
+}
+
+/*
+ * Check if a vif number is legal/ok. This is used by ip_output to export
+ * numvifs there.
+ */
+static int
+X_legal_vif_num(vif)
+ int vif;
+{
+ if (vif >= 0 && vif < numvifs)
+ return(1);
+ else
+ return(0);
+}
+
+#ifndef MROUTE_KLD
+int (*legal_vif_num)(int) = X_legal_vif_num;
+#endif
+
+/*
+ * Return the local address used by this vif
+ */
+static u_long
+X_ip_mcast_src(vifi)
+ int vifi;
+{
+ if (vifi >= 0 && vifi < numvifs)
+ return viftable[vifi].v_lcl_addr.s_addr;
+ else
+ return INADDR_ANY;
+}
+
+#ifndef MROUTE_KLD
+u_long (*ip_mcast_src)(int) = X_ip_mcast_src;
+#endif
+
+static void
+phyint_send(ip, vifp, m)
+ struct ip *ip;
+ struct vif *vifp;
+ struct mbuf *m;
+{
+ register struct mbuf *mb_copy;
+ register int hlen = ip->ip_hl << 2;
+
+ /*
+ * Make a new reference to the packet; make sure that
+ * the IP header is actually copied, not just referenced,
+ * so that ip_output() only scribbles on the copy.
+ */
+ mb_copy = m_copy(m, 0, M_COPYALL);
+ if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
+ mb_copy = m_pullup(mb_copy, hlen);
+ if (mb_copy == NULL)
+ return;
+
+ if (vifp->v_rate_limit == 0)
+ tbf_send_packet(vifp, mb_copy);
+ else
+ tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
+}
+
+static void
+encap_send(ip, vifp, m)
+ register struct ip *ip;
+ register struct vif *vifp;
+ register struct mbuf *m;
+{
+ register struct mbuf *mb_copy;
+ register struct ip *ip_copy;
+ register int i, len = ip->ip_len;
+
+ /*
+	 * Copy the old packet and pull up its IP header into the
+	 * new mbuf so we can modify it. Try to fill the new
+	 * mbuf, since if we don't, the ethernet driver will.
+ */
+ MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER);
+ if (mb_copy == NULL)
+ return;
+ mb_copy->m_data += max_linkhdr;
+ mb_copy->m_len = sizeof(multicast_encap_iphdr);
+
+ if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
+ m_freem(mb_copy);
+ return;
+ }
+ i = MHLEN - M_LEADINGSPACE(mb_copy);
+ if (i > len)
+ i = len;
+ mb_copy = m_pullup(mb_copy, i);
+ if (mb_copy == NULL)
+ return;
+ mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr);
+
+ /*
+ * fill in the encapsulating IP header.
+ */
+ ip_copy = mtod(mb_copy, struct ip *);
+ *ip_copy = multicast_encap_iphdr;
+#ifdef RANDOM_IP_ID
+ ip_copy->ip_id = ip_randomid();
+#else
+ ip_copy->ip_id = htons(ip_id++);
+#endif
+ ip_copy->ip_len += len;
+ ip_copy->ip_src = vifp->v_lcl_addr;
+ ip_copy->ip_dst = vifp->v_rmt_addr;
+
+ /*
+ * turn the encapsulated IP header back into a valid one.
+ */
+ ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
+ --ip->ip_ttl;
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
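+	/*
+	 * in_cksum() starts summing at m_data, so temporarily advance it
+	 * past the outer header to checksum only the inner header, then
+	 * restore it afterwards.
+	 */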
+ mb_copy->m_data += sizeof(multicast_encap_iphdr);
+ ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
+ mb_copy->m_data -= sizeof(multicast_encap_iphdr);
+
+ if (vifp->v_rate_limit == 0)
+ tbf_send_packet(vifp, mb_copy);
+ else
+ tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
+}
+
+/*
+ * Token bucket filter module
+ */
+
+static void
+tbf_control(vifp, m, ip, p_len)
+ register struct vif *vifp;
+ register struct mbuf *m;
+ register struct ip *ip;
+ register u_long p_len;
+{
+ register struct tbf *t = vifp->v_tbf;
+
+ if (p_len > MAX_BKT_SIZE) {
+ /* drop if packet is too large */
+ mrtstat.mrts_pkt2large++;
+ m_freem(m);
+ return;
+ }
+
+ tbf_update_tokens(vifp);
+
+	/*
+	 * If there are enough tokens and the queue is empty,
+	 * send this packet out.
+	 */
+
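+	/*
+	 * Three cases follow: an empty queue sends immediately when the
+	 * tokens suffice (otherwise queue and schedule a reprocess); a
+	 * partially full queue simply enqueues and drains what it can;
+	 * a full queue first tries to evict a lower-priority packet via
+	 * tbf_dq_sel() before giving up and dropping this one.
+	 */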
+ if (t->tbf_q_len == 0) {
+ /* queue empty, send packet if enough tokens */
+ if (p_len <= t->tbf_n_tok) {
+ t->tbf_n_tok -= p_len;
+ tbf_send_packet(vifp, m);
+ } else {
+ /* queue packet and timeout till later */
+ tbf_queue(vifp, m);
+ timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
+ }
+ } else if (t->tbf_q_len < t->tbf_max_q_len) {
+ /* finite queue length, so queue pkts and process queue */
+ tbf_queue(vifp, m);
+ tbf_process_q(vifp);
+ } else {
+		/* queue is full; try to discard a lower-priority packet, then queue and process */
+ if (!tbf_dq_sel(vifp, ip)) {
+ mrtstat.mrts_q_overflow++;
+ m_freem(m);
+ return;
+ } else {
+ tbf_queue(vifp, m);
+ tbf_process_q(vifp);
+ }
+ }
+ return;
+}
+
+/*
+ * adds a packet to the queue at the interface
+ */
+static void
+tbf_queue(vifp, m)
+ register struct vif *vifp;
+ register struct mbuf *m;
+{
+ register int s = splnet();
+ register struct tbf *t = vifp->v_tbf;
+
+ if (t->tbf_t == NULL) {
+ /* Queue was empty */
+ t->tbf_q = m;
+ } else {
+ /* Insert at tail */
+ t->tbf_t->m_act = m;
+ }
+
+ /* Set new tail pointer */
+ t->tbf_t = m;
+
+#ifdef DIAGNOSTIC
+ /* Make sure we didn't get fed a bogus mbuf */
+ if (m->m_act)
+ panic("tbf_queue: m_act");
+#endif
+ m->m_act = NULL;
+
+ t->tbf_q_len++;
+
+ splx(s);
+}
+
+
+/*
+ * processes the queue at the interface
+ */
+static void
+tbf_process_q(vifp)
+ register struct vif *vifp;
+{
+ register struct mbuf *m;
+ register int len;
+ register int s = splnet();
+ register struct tbf *t = vifp->v_tbf;
+
+ /* loop through the queue at the interface and send as many packets
+ * as possible
+ */
+ while (t->tbf_q_len > 0) {
+ m = t->tbf_q;
+
+ len = mtod(m, struct ip *)->ip_len;
+
+ /* determine if the packet can be sent */
+ if (len <= t->tbf_n_tok) {
+ /* if so,
+ * reduce no of tokens, dequeue the packet,
+ * send the packet.
+ */
+ t->tbf_n_tok -= len;
+
+ t->tbf_q = m->m_act;
+ if (--t->tbf_q_len == 0)
+ t->tbf_t = NULL;
+
+ m->m_act = NULL;
+ tbf_send_packet(vifp, m);
+
+ } else break;
+ }
+ splx(s);
+}
+
+static void
+tbf_reprocess_q(xvifp)
+ void *xvifp;
+{
+ register struct vif *vifp = xvifp;
+ if (ip_mrouter == NULL)
+ return;
+
+ tbf_update_tokens(vifp);
+
+ tbf_process_q(vifp);
+
+ if (vifp->v_tbf->tbf_q_len)
+ timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
+}
+
+/* Selectively discard a member of the queue,
+ * based on the precedence value and the priority.
+ */
+static int
+tbf_dq_sel(vifp, ip)
+ register struct vif *vifp;
+ register struct ip *ip;
+{
+ register int s = splnet();
+ register u_int p;
+ register struct mbuf *m, *last;
+ register struct mbuf **np;
+ register struct tbf *t = vifp->v_tbf;
+
+ p = priority(vifp, ip);
+
+ np = &t->tbf_q;
+ last = NULL;
+ while ((m = *np) != NULL) {
+ if (p > priority(vifp, mtod(m, struct ip *))) {
+ *np = m->m_act;
+ /* If we're removing the last packet, fix the tail pointer */
+ if (m == t->tbf_t)
+ t->tbf_t = last;
+ m_freem(m);
+ /* it's impossible for the queue to be empty, but
+ * we check anyway. */
+ if (--t->tbf_q_len == 0)
+ t->tbf_t = NULL;
+ splx(s);
+ mrtstat.mrts_drop_sel++;
+ return(1);
+ }
+ np = &m->m_act;
+ last = m;
+ }
+ splx(s);
+ return(0);
+}
+
+static void
+tbf_send_packet(vifp, m)
+ register struct vif *vifp;
+ register struct mbuf *m;
+{
+ struct ip_moptions imo;
+ int error;
+ static struct route ro;
+ int s = splnet();
+
+ if (vifp->v_flags & VIFF_TUNNEL) {
+		/* Tunnel vif: send along the cached tunnel route. */
+ ip_output(m, (struct mbuf *)0, &vifp->v_route,
+ IP_FORWARDING, (struct ip_moptions *)0);
+ } else {
+ imo.imo_multicast_ifp = vifp->v_ifp;
+ imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
+ imo.imo_multicast_loop = 1;
+ imo.imo_multicast_vif = -1;
+
+ /*
+		 * Re-entrancy should not be a problem here, because
+		 * the packets that we send out and are looped back at us
+		 * should get rejected, since they appear to come from
+		 * the loopback interface, thus preventing looping.
+ */
+ error = ip_output(m, (struct mbuf *)0, &ro,
+ IP_FORWARDING, &imo);
+
+ if (mrtdebug & DEBUG_XMIT)
+ log(LOG_DEBUG, "phyint_send on vif %d err %d\n",
+ vifp - viftable, error);
+ }
+ splx(s);
+}
+
+/*
+ * Determine the current time, compute the elapsed time since the
+ * last packet (in microseconds), and update the number of tokens
+ * in the bucket.
+ */
+static void
+tbf_update_tokens(vifp)
+ register struct vif *vifp;
+{
+ struct timeval tp;
+ register u_long tm;
+ register int s = splnet();
+ register struct tbf *t = vifp->v_tbf;
+
+ GET_TIME(tp);
+
+ TV_DELTA(tp, t->tbf_last_pkt_t, tm);
+
+ /*
+ * This formula is actually
+ * "time in seconds" * "bytes/second".
+ *
+ * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
+ *
+ * The (1000/1024) was introduced in add_vif to optimize
+ * this divide into a shift.
+ */
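+	/*
+	 * For illustration (assuming v_rate_limit is expressed in
+	 * kbits/sec, as configured via vifc_rate_limit): with
+	 * v_rate_limit = 500 and tm = 100000 usec (0.1 sec),
+	 *
+	 *	100000 * 500 / 1024 / 8 == 6103 bytes of credit,
+	 *
+	 * versus the exact 6250 bytes; the 1000/1024 shortcut
+	 * undershoots by roughly 2%.
+	 */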
+ t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8;
+ t->tbf_last_pkt_t = tp;
+
+ if (t->tbf_n_tok > MAX_BKT_SIZE)
+ t->tbf_n_tok = MAX_BKT_SIZE;
+
+ splx(s);
+}
+
+static int
+priority(vifp, ip)
+ register struct vif *vifp;
+ register struct ip *ip;
+{
+ register int prio;
+
+ /* temporary hack; may add general packet classifier some day */
+
+ /*
+ * The UDP port space is divided up into four priority ranges:
+ * [0, 16384) : unclassified - lowest priority
+ * [16384, 32768) : audio - highest priority
+ * [32768, 49152) : whiteboard - medium priority
+ * [49152, 65536) : video - low priority
+ */
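+	/*
+	 * Masking the port with 0xc000 keeps only its two high-order
+	 * bits, so each case below selects one 16K range:
+	 * 0x4000 -> [16384, 32768), 0x8000 -> [32768, 49152),
+	 * 0xc000 -> [49152, 65536), and the default covers [0, 16384).
+	 */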
+ if (ip->ip_p == IPPROTO_UDP) {
+ struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
+ switch (ntohs(udp->uh_dport) & 0xc000) {
+ case 0x4000:
+ prio = 70;
+ break;
+ case 0x8000:
+ prio = 60;
+ break;
+ case 0xc000:
+ prio = 55;
+ break;
+ default:
+ prio = 50;
+ break;
+ }
+ if (tbfdebug > 1)
+ log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio);
+ } else {
+ prio = 50;
+ }
+ return prio;
+}
+
+/*
+ * End of token bucket filter modifications
+ */
+
+int
+ip_rsvp_vif_init(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, i, s;
+
+ if (rsvpdebug)
+ printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
+ so->so_type, so->so_proto->pr_protocol);
+
+ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
+ return EOPNOTSUPP;
+
+	/* Copy in the vif number. */
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (error)
+ return (error);
+
+ if (rsvpdebug)
+ printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on);
+
+ s = splnet();
+
+ /* Check vif. */
+ if (!legal_vif_num(i)) {
+ splx(s);
+ return EADDRNOTAVAIL;
+ }
+
+ /* Check if socket is available. */
+ if (viftable[i].v_rsvpd != NULL) {
+ splx(s);
+ return EADDRINUSE;
+ }
+
+ viftable[i].v_rsvpd = so;
+ /* This may seem silly, but we need to be sure we don't over-increment
+ * the RSVP counter, in case something slips up.
+ */
+ if (!viftable[i].v_rsvp_on) {
+ viftable[i].v_rsvp_on = 1;
+ rsvp_on++;
+ }
+
+ splx(s);
+ return 0;
+}
+
+int
+ip_rsvp_vif_done(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, i, s;
+
+ if (rsvpdebug)
+ printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
+ so->so_type, so->so_proto->pr_protocol);
+
+ if (so->so_type != SOCK_RAW ||
+ so->so_proto->pr_protocol != IPPROTO_RSVP)
+ return EOPNOTSUPP;
+
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (error)
+ return (error);
+
+ s = splnet();
+
+ /* Check vif. */
+ if (!legal_vif_num(i)) {
+ splx(s);
+ return EADDRNOTAVAIL;
+ }
+
+ if (rsvpdebug)
+ printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n",
+ viftable[i].v_rsvpd, so);
+
+ /*
+ * XXX as an additional consistency check, one could make sure
+ * that viftable[i].v_rsvpd == so, otherwise passing so as
+ * first parameter is pretty useless.
+ */
+ viftable[i].v_rsvpd = NULL;
+ /*
+ * This may seem silly, but we need to be sure we don't over-decrement
+ * the RSVP counter, in case something slips up.
+ */
+ if (viftable[i].v_rsvp_on) {
+ viftable[i].v_rsvp_on = 0;
+ rsvp_on--;
+ }
+
+ splx(s);
+ return 0;
+}
+
+void
+ip_rsvp_force_done(so)
+ struct socket *so;
+{
+ int vifi;
+ register int s;
+
+ /* Don't bother if it is not the right type of socket. */
+ if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
+ return;
+
+ s = splnet();
+
+ /* The socket may be attached to more than one vif...this
+ * is perfectly legal.
+ */
+ for (vifi = 0; vifi < numvifs; vifi++) {
+ if (viftable[vifi].v_rsvpd == so) {
+ viftable[vifi].v_rsvpd = NULL;
+ /* This may seem silly, but we need to be sure we don't
+ * over-decrement the RSVP counter, in case something slips up.
+ */
+ if (viftable[vifi].v_rsvp_on) {
+ viftable[vifi].v_rsvp_on = 0;
+ rsvp_on--;
+ }
+ }
+ }
+
+ splx(s);
+ return;
+}
+
+void
+rsvp_input(m, off)
+ struct mbuf *m;
+ int off;
+{
+ int vifi;
+ register struct ip *ip = mtod(m, struct ip *);
+ static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
+ register int s;
+ struct ifnet *ifp;
+
+ if (rsvpdebug)
+ printf("rsvp_input: rsvp_on %d\n",rsvp_on);
+
+ /* Can still get packets with rsvp_on = 0 if there is a local member
+ * of the group to which the RSVP packet is addressed. But in this
+ * case we want to throw the packet away.
+ */
+ if (!rsvp_on) {
+ m_freem(m);
+ return;
+ }
+
+ s = splnet();
+
+ if (rsvpdebug)
+ printf("rsvp_input: check vifs\n");
+
+#ifdef DIAGNOSTIC
+ if (!(m->m_flags & M_PKTHDR))
+ panic("rsvp_input no hdr");
+#endif
+
+ ifp = m->m_pkthdr.rcvif;
+ /* Find which vif the packet arrived on. */
+ for (vifi = 0; vifi < numvifs; vifi++)
+ if (viftable[vifi].v_ifp == ifp)
+ break;
+
+ if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
+ /*
+ * If the old-style non-vif-associated socket is set,
+ * then use it. Otherwise, drop packet since there
+ * is no specific socket for this vif.
+ */
+ if (ip_rsvpd != NULL) {
+ if (rsvpdebug)
+ printf("rsvp_input: Sending packet up old-style socket\n");
+ rip_input(m, off); /* xxx */
+ } else {
+ if (rsvpdebug && vifi == numvifs)
+ printf("rsvp_input: Can't find vif for packet.\n");
+ else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
+ printf("rsvp_input: No socket defined for vif %d\n",vifi);
+ m_freem(m);
+ }
+ splx(s);
+ return;
+ }
+ rsvp_src.sin_addr = ip->ip_src;
+
+ if (rsvpdebug && m)
+ printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
+ m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));
+
+ if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
+ if (rsvpdebug)
+ printf("rsvp_input: Failed to append to socket\n");
+ } else {
+ if (rsvpdebug)
+ printf("rsvp_input: send packet up\n");
+ }
+
+ splx(s);
+}
+
+#ifdef MROUTE_KLD
+
+static int
+ip_mroute_modevent(module_t mod, int type, void *unused)
+{
+ int s;
+
+ switch (type) {
+ static u_long (*old_ip_mcast_src)(int);
+ static int (*old_ip_mrouter_set)(struct socket *,
+ struct sockopt *);
+ static int (*old_ip_mrouter_get)(struct socket *,
+ struct sockopt *);
+ static int (*old_ip_mrouter_done)(void);
+ static int (*old_ip_mforward)(struct ip *, struct ifnet *,
+ struct mbuf *, struct ip_moptions *);
+ static int (*old_mrt_ioctl)(int, caddr_t);
+ static int (*old_legal_vif_num)(int);
+
+ case MOD_LOAD:
+ s = splnet();
+ /* XXX Protect against multiple loading */
+ old_ip_mcast_src = ip_mcast_src;
+ ip_mcast_src = X_ip_mcast_src;
+ old_ip_mrouter_get = ip_mrouter_get;
+ ip_mrouter_get = X_ip_mrouter_get;
+ old_ip_mrouter_set = ip_mrouter_set;
+ ip_mrouter_set = X_ip_mrouter_set;
+ old_ip_mrouter_done = ip_mrouter_done;
+ ip_mrouter_done = X_ip_mrouter_done;
+ old_ip_mforward = ip_mforward;
+ ip_mforward = X_ip_mforward;
+ old_mrt_ioctl = mrt_ioctl;
+ mrt_ioctl = X_mrt_ioctl;
+ old_legal_vif_num = legal_vif_num;
+ legal_vif_num = X_legal_vif_num;
+
+ splx(s);
+ return 0;
+
+ case MOD_UNLOAD:
+ if (ip_mrouter)
+ return EINVAL;
+
+ s = splnet();
+ ip_mrouter_get = old_ip_mrouter_get;
+ ip_mrouter_set = old_ip_mrouter_set;
+ ip_mrouter_done = old_ip_mrouter_done;
+ ip_mforward = old_ip_mforward;
+ mrt_ioctl = old_mrt_ioctl;
+ legal_vif_num = old_legal_vif_num;
+ splx(s);
+ return 0;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+static moduledata_t ip_mroutemod = {
+ "ip_mroute",
+ ip_mroute_modevent,
+ 0
+};
+DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+
+#endif /* MROUTE_KLD */
+#endif /* MROUTING */
diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h
new file mode 100644
index 0000000..0e61652
--- /dev/null
+++ b/sys/netinet/ip_mroute.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 1989 Stephen Deering.
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_MROUTE_H_
+#define _NETINET_IP_MROUTE_H_
+
+/*
+ * Definitions for IP multicast forwarding.
+ *
+ * Written by David Waitzman, BBN Labs, August 1988.
+ * Modified by Steve Deering, Stanford, February 1989.
+ * Modified by Ajit Thyagarajan, PARC, August 1993.
+ * Modified by Ajit Thyagarajan, PARC, August 1994.
+ *
+ * MROUTING Revision: 3.3.1.3
+ */
+
+
+/*
+ * Multicast Routing set/getsockopt commands.
+ */
+#define MRT_INIT 100 /* initialize forwarder */
+#define MRT_DONE 101 /* shut down forwarder */
+#define MRT_ADD_VIF 102 /* create virtual interface */
+#define MRT_DEL_VIF 103 /* delete virtual interface */
+#define MRT_ADD_MFC 104 /* insert forwarding cache entry */
+#define MRT_DEL_MFC 105 /* delete forwarding cache entry */
+#define MRT_VERSION 106 /* get kernel version number */
+#define MRT_ASSERT 107 /* enable PIM assert processing */
+
+
+#define GET_TIME(t) microtime(&t)
+
+/*
+ * Types and macros for handling bitmaps with one bit per virtual interface.
+ */
+#define MAXVIFS 32
+typedef u_long vifbitmap_t;
+typedef u_short vifi_t; /* type of a vif index */
+#define ALL_VIFS (vifi_t)-1
+
+#define VIFM_SET(n, m) ((m) |= (1 << (n)))
+#define VIFM_CLR(n, m) ((m) &= ~(1 << (n)))
+#define VIFM_ISSET(n, m) ((m) & (1 << (n)))
+#define VIFM_CLRALL(m) ((m) = 0x00000000)
+#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom))
+#define VIFM_SAME(m1, m2) ((m1) == (m2))
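+
+/*
+ * Usage sketch: VIFM_SET(3, map) turns on bit 3 (map |= 0x08),
+ * VIFM_ISSET(3, map) then evaluates non-zero, and VIFM_CLR(3, map)
+ * turns the bit back off.
+ */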
+
+
+/*
+ * Argument structure for MRT_ADD_VIF.
+ * (MRT_DEL_VIF takes a single vifi_t argument.)
+ */
+struct vifctl {
+ vifi_t vifc_vifi; /* the index of the vif to be added */
+ u_char vifc_flags; /* VIFF_ flags defined below */
+ u_char vifc_threshold; /* min ttl required to forward on vif */
+ u_int vifc_rate_limit; /* max rate */
+ struct in_addr vifc_lcl_addr; /* local interface address */
+ struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */
+};
+
+#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */
+#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */
+
+/*
+ * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC
+ * (mfcc_tos to be added at a future point)
+ */
+struct mfcctl {
+ struct in_addr mfcc_origin; /* ip origin of mcasts */
+ struct in_addr mfcc_mcastgrp; /* multicast group associated*/
+ vifi_t mfcc_parent; /* incoming vif */
+ u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
+};
+
+/*
+ * The kernel's multicast routing statistics.
+ */
+struct mrtstat {
+ u_long mrts_mfc_lookups; /* # forw. cache hash table hits */
+ u_long mrts_mfc_misses; /* # forw. cache hash table misses */
+ u_long mrts_upcalls; /* # calls to mrouted */
+ u_long mrts_no_route; /* no route for packet's origin */
+ u_long mrts_bad_tunnel; /* malformed tunnel options */
+ u_long mrts_cant_tunnel; /* no room for tunnel options */
+ u_long mrts_wrong_if; /* arrived on wrong interface */
+ u_long mrts_upq_ovflw; /* upcall Q overflow */
+ u_long mrts_cache_cleanups; /* # entries with no upcalls */
+ u_long mrts_drop_sel; /* pkts dropped selectively */
+ u_long mrts_q_overflow; /* pkts dropped - Q overflow */
+ u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */
+ u_long mrts_upq_sockfull; /* upcalls dropped - socket full */
+};
+
+/*
+ * Argument structure used by mrouted to get src-grp pkt counts
+ */
+struct sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ u_long pktcnt;
+ u_long bytecnt;
+ u_long wrong_if;
+};
+
+/*
+ * Argument structure used by mrouted to get vif pkt counts
+ */
+struct sioc_vif_req {
+ vifi_t vifi; /* vif number */
+ u_long icount; /* Input packet count on vif */
+ u_long ocount; /* Output packet count on vif */
+ u_long ibytes; /* Input byte count on vif */
+ u_long obytes; /* Output byte count on vif */
+};
+
+
+/*
+ * The kernel's virtual-interface structure.
+ */
+struct vif {
+ u_char v_flags; /* VIFF_ flags defined above */
+ u_char v_threshold; /* min ttl required to forward on vif*/
+ u_int v_rate_limit; /* max rate */
+ struct tbf *v_tbf; /* token bucket structure at intf. */
+ struct in_addr v_lcl_addr; /* local interface address */
+ struct in_addr v_rmt_addr; /* remote address (tunnels only) */
+ struct ifnet *v_ifp; /* pointer to interface */
+ u_long v_pkt_in; /* # pkts in on interface */
+ u_long v_pkt_out; /* # pkts out on interface */
+ u_long v_bytes_in; /* # bytes in on interface */
+ u_long v_bytes_out; /* # bytes out on interface */
+ struct route v_route; /* cached route if this is a tunnel */
+ u_int v_rsvp_on; /* RSVP listening on this vif */
+ struct socket *v_rsvpd; /* RSVP daemon socket */
+};
+
+/*
+ * The kernel's multicast forwarding cache entry structure
+ * (A field for the type of service (mfc_tos) is to be added
+ * at a future point)
+ */
+struct mfc {
+ struct in_addr mfc_origin; /* IP origin of mcasts */
+ struct in_addr mfc_mcastgrp; /* multicast group associated*/
+ vifi_t mfc_parent; /* incoming vif */
+ u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */
+ u_long mfc_pkt_cnt; /* pkt count for src-grp */
+ u_long mfc_byte_cnt; /* byte count for src-grp */
+ u_long mfc_wrong_if; /* wrong if for src-grp */
+ int mfc_expire; /* time to clean entry up */
+ struct timeval mfc_last_assert; /* last time I sent an assert*/
+ struct rtdetq *mfc_stall; /* q of packets awaiting mfc */
+ struct mfc *mfc_next; /* next mfc entry */
+};
+
+/*
+ * Struct used to communicate from kernel to multicast router;
+ * note the convenient similarity to an IP packet.
+ */
+struct igmpmsg {
+ u_long unused1;
+ u_long unused2;
+ u_char im_msgtype; /* what type of message */
+#define IGMPMSG_NOCACHE 1
+#define IGMPMSG_WRONGVIF 2
+ u_char im_mbz; /* must be zero */
+ u_char im_vif; /* vif rec'd on */
+ u_char unused3;
+ struct in_addr im_src, im_dst;
+};
+
+/*
+ * Argument structure used for packet info while the upcall is made
+ */
+struct rtdetq {
+ struct mbuf *m; /* A copy of the packet */
+ struct ifnet *ifp; /* Interface pkt came in on */
+ vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */
+#ifdef UPCALL_TIMING
+ struct timeval t; /* Timestamp */
+#endif /* UPCALL_TIMING */
+ struct rtdetq *next; /* Next in list of packets */
+};
+
+#define MFCTBLSIZ 256
+#if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */
+#define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1))
+#else
+#define MFCHASHMOD(h) ((h) % MFCTBLSIZ)
+#endif
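+
+/*
+ * With the default MFCTBLSIZ of 256 (a power of two), MFCHASHMOD(h)
+ * reduces to (h) & 0xff and avoids the modulo division.
+ */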
+
+#define MAX_UPQ 4 /* max. no of pkts in upcall Q */
+
+/*
+ * Token Bucket filter code
+ */
+#define MAX_BKT_SIZE 10000 /* 10K bytes size */
+#define MAXQSIZE 10 /* max # of pkts in queue */
+
+/*
+ * the token bucket filter at each vif
+ */
+struct tbf
+{
+ struct timeval tbf_last_pkt_t; /* arr. time of last pkt */
+ u_long tbf_n_tok; /* no of tokens in bucket */
+ u_long tbf_q_len; /* length of queue at this vif */
+ u_long tbf_max_q_len; /* max. queue length */
+ struct mbuf *tbf_q; /* Packet queue */
+ struct mbuf *tbf_t; /* tail-insertion pointer */
+};
+
+#ifdef _KERNEL
+
+struct sockopt;
+
+extern int (*ip_mrouter_set)(struct socket *, struct sockopt *);
+extern int (*ip_mrouter_get)(struct socket *, struct sockopt *);
+extern int (*ip_mrouter_done)(void);
+extern int (*mrt_ioctl)(int, caddr_t);
+
+#endif /* _KERNEL */
+
+#endif /* _NETINET_IP_MROUTE_H_ */
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
new file mode 100644
index 0000000..3402a28
--- /dev/null
+++ b/sys/netinet/ip_output.c
@@ -0,0 +1,2050 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#define _IP_VHL
+
+#include "opt_ipfw.h"
+#include "opt_ipdn.h"
+#include "opt_ipdivert.h"
+#include "opt_ipfilter.h"
+#include "opt_ipsec.h"
+#include "opt_pfil_hooks.h"
+#include "opt_random_ip_id.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+
+#include <machine/in_cksum.h>
+
+static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options");
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#include <netkey/key.h>
+#ifdef IPSEC_DEBUG
+#include <netkey/key_debug.h>
+#else
+#define KEYDEBUG(lev,arg)
+#endif
+#endif /*IPSEC*/
+
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
+ x, (ntohl(a.s_addr)>>24)&0xFF,\
+ (ntohl(a.s_addr)>>16)&0xFF,\
+ (ntohl(a.s_addr)>>8)&0xFF,\
+ (ntohl(a.s_addr))&0xFF, y);
+
+u_short ip_id;
+
+static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
+static struct ifnet *ip_multicast_if(struct in_addr *, int *);
+static void ip_mloopback
+ (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+static int ip_getmoptions
+ (struct sockopt *, struct ip_moptions *);
+static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
+static int ip_setmoptions
+ (struct sockopt *, struct ip_moptions **);
+
+int ip_optcopy(struct ip *, struct ip *);
+
+
+extern struct protosw inetsw[];
+
+/*
+ * IP output. The packet in mbuf chain m contains a skeletal IP
+ * header (with len, off, ttl, proto, tos, src, dst).
+ * The mbuf chain containing the packet will be freed.
+ * The mbuf opt, if present, will not be freed.
+ */
+int
+ip_output(m0, opt, ro, flags, imo)
+ struct mbuf *m0;
+ struct mbuf *opt;
+ struct route *ro;
+ int flags;
+ struct ip_moptions *imo;
+{
+ struct ip *ip, *mhip;
+ struct ifnet *ifp = NULL; /* keep compiler happy */
+ struct mbuf *m;
+ int hlen = sizeof (struct ip);
+ int len, off, error = 0;
+ struct sockaddr_in *dst = NULL; /* keep compiler happy */
+ struct in_ifaddr *ia;
+ int isbroadcast, sw_csum;
+ struct in_addr pkt_dst;
+#ifdef IPSEC
+ struct route iproute;
+ struct socket *so = NULL;
+ struct secpolicy *sp = NULL;
+#endif
+ struct ip_fw_args args;
+ int src_was_INADDR_ANY = 0; /* as the name says... */
+#ifdef PFIL_HOOKS
+ struct packet_filter_hook *pfh;
+ struct mbuf *m1;
+ int rv;
+#endif /* PFIL_HOOKS */
+
+ args.eh = NULL;
+ args.rule = NULL;
+ args.next_hop = NULL;
+ args.divert_rule = 0; /* divert cookie */
+
+ /* Grab info from MT_TAG mbufs prepended to the chain. */
+ for (; m0 && m0->m_type == MT_TAG; m0 = m0->m_next) {
+ switch(m0->m_tag_id) {
+ default:
+ printf("ip_output: unrecognised MT_TAG tag %d\n",
+ m0->m_tag_id);
+ break;
+
+ case PACKET_TAG_DUMMYNET:
+ /*
+			 * The packet was already tagged, so part of the
+			 * processing was already done and we only need to
+			 * finish the output path. Get parameters from the header.
+ */
+ args.rule = ((struct dn_pkt *)m0)->rule;
+			opt = NULL;
+			ro = &((struct dn_pkt *)m0)->ro;
+			imo = NULL;
+			dst = ((struct dn_pkt *)m0)->dn_dst;
+			ifp = ((struct dn_pkt *)m0)->ifp;
+			flags = ((struct dn_pkt *)m0)->flags;
+ break;
+
+ case PACKET_TAG_DIVERT:
+ args.divert_rule = (intptr_t)m0->m_data & 0xffff;
+ break;
+
+ case PACKET_TAG_IPFORWARD:
+ args.next_hop = (struct sockaddr_in *)m0->m_data;
+ break;
+ }
+ }
+ m = m0;
+
+ KASSERT(!m || (m->m_flags & M_PKTHDR) != 0, ("ip_output: no HDR"));
+
+ KASSERT(ro != NULL, ("ip_output: no route, proto %d",
+ mtod(m, struct ip *)->ip_p));
+
+#ifdef IPSEC
+ so = ipsec_getsocket(m);
+ (void)ipsec_setsocket(m, NULL);
+#endif
+ if (args.rule != NULL) { /* dummynet already saw us */
+ ip = mtod(m, struct ip *);
+		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ ia = ifatoia(ro->ro_rt->rt_ifa);
+ goto sendit;
+ }
+
+ if (opt) {
+ m = ip_insertoptions(m, opt, &len);
+ hlen = len;
+ }
+ ip = mtod(m, struct ip *);
+ pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst;
+
+ /*
+ * Fill in IP header.
+ */
+ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
+ ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
+ ip->ip_off &= IP_DF;
+#ifdef RANDOM_IP_ID
+ ip->ip_id = ip_randomid();
+#else
+ ip->ip_id = htons(ip_id++);
+#endif
+ ipstat.ips_localout++;
+ } else {
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+ }
+
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ /*
+ * If there is a cached route,
+ * check that it is to the same destination
+ * and is still up. If not, free it and try again.
+ * The address family should also be checked in case of sharing the
+ * cache with IPv6.
+ */
+ if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
+ dst->sin_family != AF_INET ||
+ dst->sin_addr.s_addr != pkt_dst.s_addr)) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = (struct rtentry *)0;
+ }
+ if (ro->ro_rt == 0) {
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = pkt_dst;
+ }
+ /*
+ * If routing to interface only,
+ * short circuit routing lookup.
+ */
+ if (flags & IP_ROUTETOIF) {
+ if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
+ ipstat.ips_noroute++;
+ error = ENETUNREACH;
+ goto bad;
+ }
+ ifp = ia->ia_ifp;
+ ip->ip_ttl = 1;
+ isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
+ imo != NULL && imo->imo_multicast_ifp != NULL) {
+ /*
+ * Bypass the normal routing lookup for multicast
+ * packets if the interface is specified.
+ */
+ ifp = imo->imo_multicast_ifp;
+ IFP_TO_IA(ifp, ia);
+ isbroadcast = 0; /* fool gcc */
+ } else {
+ /*
+ * If this is the case, we probably don't want to allocate
+ * a protocol-cloned route since we didn't get one from the
+ * ULP. This lets TCP do its thing, while not burdening
+ * forwarding or ICMP with the overhead of cloning a route.
+ * Of course, we still want to do any cloning requested by
+ * the link layer, as this is probably required in all cases
+ * for correct operation (as it is for ARP).
+ */
+ if (ro->ro_rt == 0)
+ rtalloc_ign(ro, RTF_PRCLONING);
+ if (ro->ro_rt == 0) {
+ ipstat.ips_noroute++;
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+ ia = ifatoia(ro->ro_rt->rt_ifa);
+ ifp = ro->ro_rt->rt_ifp;
+ ro->ro_rt->rt_use++;
+ if (ro->ro_rt->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
+ if (ro->ro_rt->rt_flags & RTF_HOST)
+ isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
+ else
+ isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ }
+ if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
+ struct in_multi *inm;
+
+ m->m_flags |= M_MCAST;
+ /*
+ * IP destination address is multicast. Make sure "dst"
+ * still points to the address in "ro". (It may have been
+ * changed to point to a gateway address, above.)
+ */
+ dst = (struct sockaddr_in *)&ro->ro_dst;
+ /*
+ * See if the caller provided any multicast options
+ */
+ if (imo != NULL) {
+ ip->ip_ttl = imo->imo_multicast_ttl;
+ if (imo->imo_multicast_vif != -1)
+ ip->ip_src.s_addr =
+ ip_mcast_src(imo->imo_multicast_vif);
+ } else
+ ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
+ /*
+ * Confirm that the outgoing interface supports multicast.
+ */
+ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
+ if ((ifp->if_flags & IFF_MULTICAST) == 0) {
+ ipstat.ips_noroute++;
+ error = ENETUNREACH;
+ goto bad;
+ }
+ }
+ /*
+ * If source address not specified yet, use address
+ * of outgoing interface.
+ */
+ if (ip->ip_src.s_addr == INADDR_ANY) {
+ /* Interface may have no addresses. */
+ if (ia != NULL)
+ ip->ip_src = IA_SIN(ia)->sin_addr;
+ }
+
+ if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
+ /*
+ * XXX
+ * delayed checksums are not currently
+ * compatible with IP multicast routing
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &=
+ ~CSUM_DELAY_DATA;
+ }
+ }
+ IN_LOOKUP_MULTI(pkt_dst, ifp, inm);
+ if (inm != NULL &&
+ (imo == NULL || imo->imo_multicast_loop)) {
+ /*
+ * If we belong to the destination multicast group
+ * on the outgoing interface, and the caller did not
+ * forbid loopback, loop back a copy.
+ */
+ ip_mloopback(ifp, m, dst, hlen);
+ }
+ else {
+ /*
+ * If we are acting as a multicast router, perform
+ * multicast forwarding as if the packet had just
+ * arrived on the interface to which we are about
+ * to send. The multicast forwarding function
+ * recursively calls this function, using the
+ * IP_FORWARDING flag to prevent infinite recursion.
+ *
+ * Multicasts that are looped back by ip_mloopback(),
+ * above, will be forwarded by the ip_input() routine,
+ * if necessary.
+ */
+ if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
+ /*
+ * Check if rsvp daemon is running. If not, don't
+ * set ip_moptions. This ensures that the packet
+ * is multicast and not just sent down one link
+ * as prescribed by rsvpd.
+ */
+ if (!rsvp_on)
+ imo = NULL;
+ if (ip_mforward(ip, ifp, m, imo) != 0) {
+ m_freem(m);
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * Multicasts with a time-to-live of zero may be looped-
+ * back, above, but must not be transmitted on a network.
+ * Also, multicasts addressed to the loopback interface
+ * are not sent -- the above call to ip_mloopback() will
+ * loop back a copy if this host actually belongs to the
+ * destination group on the loopback interface.
+ */
+ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
+ m_freem(m);
+ goto done;
+ }
+
+ goto sendit;
+ }
+#ifndef notdef
+ /*
+	 * If the source address is not specified yet, use the address
+	 * of the outgoing interface. In that case, note that we did so,
+	 * so that if the firewall changes the next-hop and causes the
+	 * output interface to change, we can fix it up.
+ */
+ if (ip->ip_src.s_addr == INADDR_ANY) {
+ /* Interface may have no addresses. */
+ if (ia != NULL) {
+ ip->ip_src = IA_SIN(ia)->sin_addr;
+ src_was_INADDR_ANY = 1;
+ }
+ }
+#endif /* notdef */
+ /*
+ * Verify that we have any chance at all of being able to queue
+ * the packet or packet fragments
+ */
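+	/*
+	 * The estimate below counts one queue slot per eventual fragment:
+	 * e.g. a 4000-byte datagram over a 1500-byte MTU is charged
+	 * 4000 / 1500 + 1 == 3 slots.
+	 */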
+ if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
+ ifp->if_snd.ifq_maxlen) {
+ error = ENOBUFS;
+ ipstat.ips_odropped++;
+ goto bad;
+ }
+
+ /*
+ * Look for broadcast address and
+ * verify user is allowed to send
+ * such a packet.
+ */
+ if (isbroadcast) {
+ if ((ifp->if_flags & IFF_BROADCAST) == 0) {
+ error = EADDRNOTAVAIL;
+ goto bad;
+ }
+ if ((flags & IP_ALLOWBROADCAST) == 0) {
+ error = EACCES;
+ goto bad;
+ }
+ /* don't allow broadcast messages to be fragmented */
+ if ((u_short)ip->ip_len > ifp->if_mtu) {
+ error = EMSGSIZE;
+ goto bad;
+ }
+ m->m_flags |= M_BCAST;
+ } else {
+ m->m_flags &= ~M_BCAST;
+ }
+
+sendit:
+#ifdef IPSEC
+ /* get SP for this packet */
+ if (so == NULL)
+ sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
+ else
+ sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
+
+ if (sp == NULL) {
+ ipsecstat.out_inval++;
+ goto bad;
+ }
+
+ error = 0;
+
+ /* check policy */
+ switch (sp->policy) {
+ case IPSEC_POLICY_DISCARD:
+ /*
+ * This packet is just discarded.
+ */
+ ipsecstat.out_polvio++;
+ goto bad;
+
+ case IPSEC_POLICY_BYPASS:
+ case IPSEC_POLICY_NONE:
+ /* no need to do IPsec. */
+ goto skip_ipsec;
+
+ case IPSEC_POLICY_IPSEC:
+ if (sp->req == NULL) {
+ /* acquire a policy */
+ error = key_spdacquire(sp);
+ goto bad;
+ }
+ break;
+
+ case IPSEC_POLICY_ENTRUST:
+ default:
+ printf("ip_output: Invalid policy found. %d\n", sp->policy);
+ }
+ {
+ struct ipsec_output_state state;
+ bzero(&state, sizeof(state));
+ state.m = m;
+ if (flags & IP_ROUTETOIF) {
+ state.ro = &iproute;
+ bzero(&iproute, sizeof(iproute));
+ } else
+ state.ro = ro;
+ state.dst = (struct sockaddr *)dst;
+
+ ip->ip_sum = 0;
+
+ /*
+ * XXX
+ * delayed checksums are not currently compatible with IPsec
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+
+ error = ipsec4_output(&state, sp, flags);
+
+ m = state.m;
+ if (flags & IP_ROUTETOIF) {
+ /*
+ * if we have tunnel mode SA, we may need to ignore
+ * IP_ROUTETOIF.
+ */
+ if (state.ro != &iproute || state.ro->ro_rt != NULL) {
+ flags &= ~IP_ROUTETOIF;
+ ro = state.ro;
+ }
+ } else
+ ro = state.ro;
+ dst = (struct sockaddr_in *)state.dst;
+ if (error) {
+ /* mbuf is already reclaimed in ipsec4_output. */
+ m0 = NULL;
+ switch (error) {
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ case EMSGSIZE:
+ case ENOBUFS:
+ case ENOMEM:
+ break;
+ default:
+ printf("ip4_output (ipsec): error code %d\n", error);
+ /*fall through*/
+ case ENOENT:
+ /* don't show these error codes to the user */
+ error = 0;
+ break;
+ }
+ goto bad;
+ }
+ }
+
+ /* be sure to update variables that are affected by ipsec4_output() */
+ ip = mtod(m, struct ip *);
+#ifdef _IP_VHL
+ hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+#else
+ hlen = ip->ip_hl << 2;
+#endif
+ if (ro->ro_rt == NULL) {
+ if ((flags & IP_ROUTETOIF) == 0) {
+ printf("ip_output: "
+ "can't update route after IPsec processing\n");
+ error = EHOSTUNREACH; /*XXX*/
+ goto bad;
+ }
+ } else {
+ ia = ifatoia(ro->ro_rt->rt_ifa);
+ ifp = ro->ro_rt->rt_ifp;
+ }
+
+ /* make it flipped, again. */
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+skip_ipsec:
+#endif /*IPSEC*/
+
+ /*
+ * IpHack's section.
+ * - Xlate: translate packet's addr/port (NAT).
+ * - Firewall: deny/allow/etc.
+ * - Wrap: fake packet's addr/port <unimpl.>
+	 * - Encapsulate: put it in another IP and send out. <unimpl.>
+ */
+#ifdef PFIL_HOOKS
+ /*
+ * Run through list of hooks for output packets.
+ */
+ m1 = m;
+ pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh);
+ for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link))
+ if (pfh->pfil_func) {
+ rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1);
+ if (rv) {
+ error = EHOSTUNREACH;
+ goto done;
+ }
+ m = m1;
+ if (m == NULL)
+ goto done;
+ ip = mtod(m, struct ip *);
+ }
+#endif /* PFIL_HOOKS */
+
+ /*
+ * Check with the firewall...
+ * but not if we are already being fwd'd from a firewall.
+ */
+ if (fw_enable && IPFW_LOADED && !args.next_hop) {
+ struct sockaddr_in *old = dst;
+
+ args.m = m;
+ args.next_hop = dst;
+ args.oif = ifp;
+ off = ip_fw_chk_ptr(&args);
+ m = args.m;
+ dst = args.next_hop;
+
+ /*
+ * On return we must do the following:
+ * m == NULL -> drop the pkt (old interface, deprecated)
+ * (off & IP_FW_PORT_DENY_FLAG) -> drop the pkt (new interface)
+		 * 1 <= off <= 0xffff -> DIVERT
+ * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
+ * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
+ * dst != old -> IPFIREWALL_FORWARD
+ * off==0, dst==old -> accept
+ * If some of the above modules are not compiled in, then
+		 * we shouldn't have to check the corresponding condition
+		 * (because the ipfw control socket should not accept
+		 * unsupported rules), but it is better to play safe and drop
+ * packets in case of doubt.
+ */
+ if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
+ if (m)
+ m_freem(m);
+ error = EACCES;
+ goto done;
+ }
+ ip = mtod(m, struct ip *);
+ if (off == 0 && dst == old) /* common case */
+ goto pass;
+ if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
+ /*
+ * pass the pkt to dummynet. Need to include
+ * pipe number, m, ifp, ro, dst because these are
+ * not recomputed in the next pass.
+ * All other parameters have been already used and
+ * so they are not needed anymore.
+ * XXX note: if the ifp or ro entry are deleted
+ * while a pkt is in dummynet, we are in trouble!
+ */
+ args.ro = ro;
+ args.dst = dst;
+ args.flags = flags;
+
+ error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
+ &args);
+ goto done;
+ }
+#ifdef IPDIVERT
+ if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
+ struct mbuf *clone = NULL;
+
+ /* Clone packet if we're doing a 'tee' */
+ if ((off & IP_FW_PORT_TEE_FLAG) != 0)
+ clone = m_dup(m, M_DONTWAIT);
+
+ /*
+ * XXX
+ * delayed checksums are not currently compatible
+ * with divert sockets.
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+
+ /* Restore packet header fields to original values */
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+
+ /* Deliver packet to divert input routine */
+ divert_packet(m, 0, off & 0xffff, args.divert_rule);
+
+ /* If 'tee', continue with original packet */
+ if (clone != NULL) {
+ m = clone;
+ ip = mtod(m, struct ip *);
+ goto pass;
+ }
+ goto done;
+ }
+#endif
+
+ /* IPFIREWALL_FORWARD */
+ /*
+ * Check dst to make sure it is directly reachable on the
+ * interface we previously thought it was.
+		 * If it isn't (which may be likely in some situations), we have
+		 * to re-route it (i.e., find a route for the next-hop and the
+		 * associated interface) and set them here. This is nested
+		 * forwarding, which in most cases is undesirable except where
+		 * such control is otherwise nigh impossible, so we do it here.
+ */
+ if (off == 0 && old != dst) { /* FORWARD, dst has changed */
+#if 0
+ /*
+ * XXX To improve readability, this block should be
+ * changed into a function call as below:
+ */
+ error = ip_ipforward(&m, &dst, &ifp);
+ if (error)
+ goto bad;
+ if (m == NULL) /* ip_input consumed the mbuf */
+ goto done;
+#else
+ struct in_ifaddr *ia;
+
+ /*
+ * XXX sro_fwd below is static, and a pointer
+ * to it gets passed to routines downstream.
+ * This could have surprisingly bad results in
+ * practice, because its content is overwritten
+ * by subsequent packets.
+ */
+ /* There must be a better way to do this next line... */
+ static struct route sro_fwd;
+ struct route *ro_fwd = &sro_fwd;
+
+#if 0
+ print_ip("IPFIREWALL_FORWARD: New dst ip: ",
+ dst->sin_addr, "\n");
+#endif
+
+ /*
+ * We need to figure out if we have been forwarded
+ * to a local socket. If so, then we should somehow
+ * "loop back" to ip_input, and get directed to the
+ * PCB as if we had received this packet. This is
+ * because it may be dificult to identify the packets
+ * you want to forward until they are being output
+ * and have selected an interface. (e.g. locally
+ * initiated packets) If we used the loopback inteface,
+ * we would not be able to control what happens
+ * as the packet runs through ip_input() as
+ * it is done through a ISR.
+ */
+ LIST_FOREACH(ia,
+ INADDR_HASH(dst->sin_addr.s_addr), ia_hash) {
+ /*
+ * If the addr to forward to is one
+ * of ours, we pretend to
+ * be the destination for this packet.
+ */
+ if (IA_SIN(ia)->sin_addr.s_addr ==
+ dst->sin_addr.s_addr)
+ break;
+ }
+ if (ia) { /* tell ip_input "dont filter" */
+ struct m_hdr tag;
+
+ tag.mh_type = MT_TAG;
+ tag.mh_flags = PACKET_TAG_IPFORWARD;
+ tag.mh_data = (caddr_t)args.next_hop;
+ tag.mh_next = m;
+
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = ifunit("lo0");
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m0->m_pkthdr.csum_data = 0xffff;
+ }
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip_input((struct mbuf *)&tag);
+ goto done;
+ }
+ /* Some of the logic for this was
+ * nicked from above.
+ *
+ * This rewrites the cached route in a local PCB.
+ * Is this what we want to do?
+ */
+ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
+
+ ro_fwd->ro_rt = 0;
+ rtalloc_ign(ro_fwd, RTF_PRCLONING);
+
+ if (ro_fwd->ro_rt == 0) {
+ ipstat.ips_noroute++;
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+
+ ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
+ ifp = ro_fwd->ro_rt->rt_ifp;
+ ro_fwd->ro_rt->rt_use++;
+ if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
+ dst = (struct sockaddr_in *)
+ ro_fwd->ro_rt->rt_gateway;
+ if (ro_fwd->ro_rt->rt_flags & RTF_HOST)
+ isbroadcast =
+ (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
+ else
+ isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ if (ro->ro_rt)
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = ro_fwd->ro_rt;
+ dst = (struct sockaddr_in *)&ro_fwd->ro_dst;
+
+#endif /* ... block to be put into a function */
+ /*
+ * If we added a default src ip earlier,
+ * which would have been gotten from the-then
+ * interface, do it again, from the new one.
+ */
+ if (src_was_INADDR_ANY)
+ ip->ip_src = IA_SIN(ia)->sin_addr;
+ goto pass ;
+ }
+
+ /*
+ * if we get here, none of the above matches, and
+ * we have to drop the pkt
+ */
+ m_freem(m);
+ error = EACCES; /* not sure this is the right error msg */
+ goto done;
+ }
+
+pass:
+ /* 127/8 must not appear on wire - RFC1122. */
+ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+ if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
+ ipstat.ips_badaddr++;
+ error = EADDRNOTAVAIL;
+ goto bad;
+ }
+ }
+
+ m->m_pkthdr.csum_flags |= CSUM_IP;
+ sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
+ if (sw_csum & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m);
+ sw_csum &= ~CSUM_DELAY_DATA;
+ }
+ m->m_pkthdr.csum_flags &= ifp->if_hwassist;
+
+ /*
+ * If small enough for interface, or the interface will take
+ * care of the fragmentation for us, can just send directly.
+ */
+ if ((u_short)ip->ip_len <= ifp->if_mtu ||
+ ifp->if_hwassist & CSUM_FRAGMENT) {
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP) {
+ if (ip->ip_vhl == IP_VHL_BORING) {
+ ip->ip_sum = in_cksum_hdr(ip);
+ } else {
+ ip->ip_sum = in_cksum(m, hlen);
+ }
+ }
+
+ /* Record statistics for this interface address. */
+ if (!(flags & IP_FORWARDING) && ia) {
+ ia->ia_ifa.if_opackets++;
+ ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ }
+
+#ifdef IPSEC
+ /* clean ipsec history once it goes out of the node */
+ ipsec_delaux(m);
+#endif
+
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, ro->ro_rt);
+ goto done;
+ }
+ /*
+ * Too large for interface; fragment if possible.
+ * Must be able to put at least 8 bytes per fragment.
+ */
+ if (ip->ip_off & IP_DF) {
+ error = EMSGSIZE;
+ /*
+ * This case can happen if the user changed the MTU
+ * of an interface after enabling IP on it. Because
+ * most netifs don't keep track of routes pointing to
+ * them, there is no way for one to update all its
+ * routes when the MTU is changed.
+ */
+ if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
+ && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
+ && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
+ ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
+ }
+ ipstat.ips_cantfrag++;
+ goto bad;
+ }
+ len = (ifp->if_mtu - hlen) &~ 7;
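+	/*
+	 * e.g. a 1500-byte MTU with a 20-byte header yields len = 1480,
+	 * already a multiple of 8 as required for fragment offsets.
+	 */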
+ if (len < 8) {
+ error = EMSGSIZE;
+ goto bad;
+ }
+
+ /*
+ * if the interface will not calculate checksums on
+ * fragmented packets, then do it here.
+ */
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
+ (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
+ in_delayed_cksum(m);
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+
+ if (len > PAGE_SIZE) {
+ /*
+		 * Fragment large datagrams such that each segment
+ * contains a multiple of PAGE_SIZE amount of data,
+ * plus headers. This enables a receiver to perform
+ * page-flipping zero-copy optimizations.
+ */
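+		/*
+		 * Sketch of the effect, assuming PAGE_SIZE is 4096 and a
+		 * hypothetical 9000-byte jumbo MTU with a 20-byte header:
+		 * the default len of (9000 - 20) & ~7 == 8976 exceeds a
+		 * page, so newlen becomes 9000 & ~(PAGE_SIZE - 1) == 8192
+		 * and each later fragment carries 8192 page-aligned bytes
+		 * of data.
+		 */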
+
+ int newlen;
+ struct mbuf *mtmp;
+
+ for (mtmp = m, off = 0;
+ mtmp && ((off + mtmp->m_len) <= ifp->if_mtu);
+ mtmp = mtmp->m_next) {
+ off += mtmp->m_len;
+ }
+ /*
+ * firstlen (off - hlen) must be aligned on an
+ * 8-byte boundary
+ */
+ if (off < hlen)
+ goto smart_frag_failure;
+ off = ((off - hlen) & ~7) + hlen;
+ newlen = (~PAGE_MASK) & ifp->if_mtu;
+ if ((newlen + sizeof (struct ip)) > ifp->if_mtu) {
+			/* we failed, go back to the default */
+smart_frag_failure:
+ newlen = len;
+ off = hlen + len;
+ }
+
+/* printf("ipfrag: len = %d, hlen = %d, mhlen = %d, newlen = %d, off = %d\n",
+ len, hlen, sizeof (struct ip), newlen, off);*/
+
+ len = newlen;
+
+ } else {
+ off = hlen + len;
+ }
+
+
+
+ {
+ int mhlen, firstlen = off - hlen;
+ struct mbuf **mnext = &m->m_nextpkt;
+ int nfrags = 1;
+
+ /*
+ * Loop through length of segment after first fragment,
+ * make new header and copy data of each part and link onto chain.
+ */
+ m0 = m;
+ mhlen = sizeof (struct ip);
+ for (; off < (u_short)ip->ip_len; off += len) {
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == 0) {
+ error = ENOBUFS;
+ ipstat.ips_odropped++;
+ goto sendorfree;
+ }
+ m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
+ m->m_data += max_linkhdr;
+ mhip = mtod(m, struct ip *);
+ *mhip = *ip;
+ if (hlen > sizeof (struct ip)) {
+ mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
+ mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
+ }
+ m->m_len = mhlen;
+ mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
+ if (off + len >= (u_short)ip->ip_len)
+ len = (u_short)ip->ip_len - off;
+ else
+ mhip->ip_off |= IP_MF;
+ mhip->ip_len = htons((u_short)(len + mhlen));
+ m->m_next = m_copy(m0, off, len);
+ if (m->m_next == 0) {
+ (void) m_free(m);
+ error = ENOBUFS; /* ??? */
+ ipstat.ips_odropped++;
+ goto sendorfree;
+ }
+ m->m_pkthdr.len = mhlen + len;
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
+ mhip->ip_off = htons(mhip->ip_off);
+ mhip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP) {
+ if (mhip->ip_vhl == IP_VHL_BORING) {
+ mhip->ip_sum = in_cksum_hdr(mhip);
+ } else {
+ mhip->ip_sum = in_cksum(m, mhlen);
+ }
+ }
+ *mnext = m;
+ mnext = &m->m_nextpkt;
+ nfrags++;
+ }
+ ipstat.ips_ofragments += nfrags;
+
+ /* set first/last markers for fragment chain */
+ m->m_flags |= M_LASTFRAG;
+ m0->m_flags |= M_FIRSTFRAG | M_FRAG;
+ m0->m_pkthdr.csum_data = nfrags;
+
+ /*
+ * Update first fragment by trimming what's been copied out
+ * and updating header, then send each fragment (in order).
+ */
+ m = m0;
+ m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
+ m->m_pkthdr.len = hlen + firstlen;
+ ip->ip_len = htons((u_short)m->m_pkthdr.len);
+ ip->ip_off |= IP_MF;
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP) {
+ if (ip->ip_vhl == IP_VHL_BORING) {
+ ip->ip_sum = in_cksum_hdr(ip);
+ } else {
+ ip->ip_sum = in_cksum(m, hlen);
+ }
+ }
+sendorfree:
+ for (m = m0; m; m = m0) {
+ m0 = m->m_nextpkt;
+ m->m_nextpkt = 0;
+#ifdef IPSEC
+ /* clean ipsec history once it goes out of the node */
+ ipsec_delaux(m);
+#endif
+ if (error == 0) {
+ /* Record statistics for this interface address. */
+ if (ia != NULL) {
+ ia->ia_ifa.if_opackets++;
+ ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ }
+
+ error = (*ifp->if_output)(ifp, m,
+ (struct sockaddr *)dst, ro->ro_rt);
+ } else
+ m_freem(m);
+ }
+
+ if (error == 0)
+ ipstat.ips_fragmented++;
+ }
+done:
+#ifdef IPSEC
+ if (ro == &iproute && ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ if (sp != NULL) {
+ KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
+ printf("DP ip_output call free SP:%p\n", sp));
+ key_freesp(sp);
+ }
+#endif /* IPSEC */
+ return (error);
+bad:
+ m_freem(m);
+ goto done;
+}
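
An illustrative, standalone sketch of the fragment arithmetic used in the loop above (a hypothetical helper, not part of ip_output.c): payload per fragment is rounded down to a multiple of 8 bytes, offsets travel in 8-byte units, and "more fragments" is set on every piece but the last. It assumes a plain 20-byte header and ignores the PAGE_SIZE-aligned first-fragment optimization.

    #include <stdio.h>

    static void
    print_fragments(int total, int hlen, int mtu)
    {
    	int len = (mtu - hlen) & ~7;	/* payload bytes per fragment */
    	int off;

    	for (off = hlen; off < total; off += len) {
    		int flen = (off + len >= total) ? total - off : len;

    		printf("frag: ip_off=%d (bytes %d..%d), len=%d, MF=%d\n",
    		    (off - hlen) >> 3, off, off + flen - 1, flen,
    		    off + flen < total);
    	}
    }

    int
    main(void)
    {
    	/* a 4000-byte datagram leaving a 1500-byte MTU interface */
    	print_fragments(4000, 20, 1500);
    	return (0);
    }
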
+
+void
+in_delayed_cksum(struct mbuf *m)
+{
+ struct ip *ip;
+ u_short csum, offset;
+
+ ip = mtod(m, struct ip *);
+	offset = IP_VHL_HL(ip->ip_vhl) << 2;
+ csum = in_cksum_skip(m, ip->ip_len, offset);
+ if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
+ csum = 0xffff;
+ offset += m->m_pkthdr.csum_data; /* checksum offset */
+
+ if (offset + sizeof(u_short) > m->m_len) {
+ printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
+ m->m_len, offset, ip->ip_p);
+ /*
+ * XXX
+ * this shouldn't happen, but if it does, the
+ * correct behavior may be to insert the checksum
+ * in the existing chain instead of rearranging it.
+ */
+ m = m_pullup(m, offset + sizeof(u_short));
+ }
+ *(u_short *)(m->m_data + offset) = csum;
+}
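
in_delayed_cksum() sums the IP payload (skipping the header) and writes the result at the offset the protocol recorded in csum_data; the UDP special case exists because a transmitted checksum of zero means "no checksum" for UDP. Below is a minimal flat-buffer sketch of the one's-complement sum that in_cksum_skip() computes over an mbuf chain; it is illustrative only, not the kernel routine.

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t
    cksum_skip(const uint8_t *buf, size_t len, size_t skip)
    {
    	uint32_t sum = 0;
    	size_t i;

    	for (i = skip; i + 1 < len; i += 2)	/* sum 16-bit words */
    		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
    	if (i < len)				/* odd trailing byte, zero-padded */
    		sum += (uint32_t)buf[i] << 8;
    	while (sum >> 16)			/* fold carries back into 16 bits */
    		sum = (sum & 0xffff) + (sum >> 16);
    	return ((uint16_t)~sum);
    }
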
+
+/*
+ * Insert IP options into preformed packet.
+ * Adjust IP destination as required for IP source routing,
+ * as indicated by a non-zero in_addr at the start of the options.
+ *
+ * XXX This routine assumes that the packet has no options in place.
+ */
+static struct mbuf *
+ip_insertoptions(m, opt, phlen)
+ register struct mbuf *m;
+ struct mbuf *opt;
+ int *phlen;
+{
+ register struct ipoption *p = mtod(opt, struct ipoption *);
+ struct mbuf *n;
+ register struct ip *ip = mtod(m, struct ip *);
+ unsigned optlen;
+
+ optlen = opt->m_len - sizeof(p->ipopt_dst);
+ if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
+ return (m); /* XXX should fail */
+ if (p->ipopt_dst.s_addr)
+ ip->ip_dst = p->ipopt_dst;
+ if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
+ MGETHDR(n, M_DONTWAIT, MT_HEADER);
+ if (n == 0)
+ return (m);
+ n->m_pkthdr.rcvif = (struct ifnet *)0;
+ n->m_pkthdr.len = m->m_pkthdr.len + optlen;
+ m->m_len -= sizeof(struct ip);
+ m->m_data += sizeof(struct ip);
+ n->m_next = m;
+ m = n;
+ m->m_len = optlen + sizeof(struct ip);
+ m->m_data += max_linkhdr;
+ (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
+ } else {
+ m->m_data -= optlen;
+ m->m_len += optlen;
+ m->m_pkthdr.len += optlen;
+ ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
+ }
+ ip = mtod(m, struct ip *);
+ bcopy(p->ipopt_list, ip + 1, optlen);
+ *phlen = sizeof(struct ip) + optlen;
+ ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
+ ip->ip_len += optlen;
+ return (m);
+}
+
+/*
+ * Copy options from ip to jp,
+ * omitting those not copied during fragmentation.
+ */
+int
+ip_optcopy(ip, jp)
+ struct ip *ip, *jp;
+{
+ register u_char *cp, *dp;
+ int opt, optlen, cnt;
+
+ cp = (u_char *)(ip + 1);
+ dp = (u_char *)(jp + 1);
+ cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP) {
+ /* Preserve for IP mcast tunnel's LSRR alignment. */
+ *dp++ = IPOPT_NOP;
+ optlen = 1;
+ continue;
+ }
+
+ KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
+ ("ip_optcopy: malformed ipv4 option"));
+ optlen = cp[IPOPT_OLEN];
+ KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
+ ("ip_optcopy: malformed ipv4 option"));
+
+ /* bogus lengths should have been caught by ip_dooptions */
+ if (optlen > cnt)
+ optlen = cnt;
+ if (IPOPT_COPIED(opt)) {
+ bcopy(cp, dp, optlen);
+ dp += optlen;
+ }
+ }
+ for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
+ *dp++ = IPOPT_EOL;
+ return (optlen);
+}
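
ip_optcopy() keys on IPOPT_COPIED(), the high bit of the option type octet, to decide which options are replicated into fragments. A small sketch of that octet's RFC 791 layout, using ad-hoc macro names rather than the kernel's:

    #include <stdint.h>
    #include <stdio.h>

    #define OPT_COPIED(o)	(((o) & 0x80) != 0)	/* copy into fragments? */
    #define OPT_CLASS(o)	(((o) >> 5) & 0x3)	/* option class */
    #define OPT_NUMBER(o)	((o) & 0x1f)		/* option number */

    int
    main(void)
    {
    	const uint8_t opts[] = { 0x83 /* LSRR */, 0x07 /* RR */, 0x44 /* TS */ };
    	unsigned i;

    	for (i = 0; i < sizeof(opts); i++)
    		printf("type 0x%02x: copied=%d class=%d number=%d\n",
    		    opts[i], OPT_COPIED(opts[i]), OPT_CLASS(opts[i]),
    		    OPT_NUMBER(opts[i]));
    	return (0);
    }
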
+
+/*
+ * IP socket option processing.
+ */
+int
+ip_ctloutput(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ struct inpcb *inp = sotoinpcb(so);
+ int error, optval;
+
+ error = optval = 0;
+ if (sopt->sopt_level != IPPROTO_IP) {
+ return (EINVAL);
+ }
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case IP_OPTIONS:
+#ifdef notyet
+ case IP_RETOPTS:
+#endif
+ {
+ struct mbuf *m;
+ if (sopt->sopt_valsize > MLEN) {
+ error = EMSGSIZE;
+ break;
+ }
+ MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_HEADER);
+ if (m == 0) {
+ error = ENOBUFS;
+ break;
+ }
+ m->m_len = sopt->sopt_valsize;
+ error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
+ m->m_len);
+
+ return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
+ m));
+ }
+
+ case IP_TOS:
+ case IP_TTL:
+ case IP_RECVOPTS:
+ case IP_RECVRETOPTS:
+ case IP_RECVDSTADDR:
+ case IP_RECVIF:
+ case IP_FAITH:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ switch (sopt->sopt_name) {
+ case IP_TOS:
+ inp->inp_ip_tos = optval;
+ break;
+
+ case IP_TTL:
+ inp->inp_ip_ttl = optval;
+ break;
+#define OPTSET(bit) \
+ if (optval) \
+ inp->inp_flags |= bit; \
+ else \
+ inp->inp_flags &= ~bit;
+
+ case IP_RECVOPTS:
+ OPTSET(INP_RECVOPTS);
+ break;
+
+ case IP_RECVRETOPTS:
+ OPTSET(INP_RECVRETOPTS);
+ break;
+
+ case IP_RECVDSTADDR:
+ OPTSET(INP_RECVDSTADDR);
+ break;
+
+ case IP_RECVIF:
+ OPTSET(INP_RECVIF);
+ break;
+
+ case IP_FAITH:
+ OPTSET(INP_FAITH);
+ break;
+ }
+ break;
+#undef OPTSET
+
+ case IP_MULTICAST_IF:
+ case IP_MULTICAST_VIF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_LOOP:
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ error = ip_setmoptions(sopt, &inp->inp_moptions);
+ break;
+
+ case IP_PORTRANGE:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ switch (optval) {
+ case IP_PORTRANGE_DEFAULT:
+ inp->inp_flags &= ~(INP_LOWPORT);
+ inp->inp_flags &= ~(INP_HIGHPORT);
+ break;
+
+ case IP_PORTRANGE_HIGH:
+ inp->inp_flags &= ~(INP_LOWPORT);
+ inp->inp_flags |= INP_HIGHPORT;
+ break;
+
+ case IP_PORTRANGE_LOW:
+ inp->inp_flags &= ~(INP_HIGHPORT);
+ inp->inp_flags |= INP_LOWPORT;
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ break;
+
+#ifdef IPSEC
+ case IP_IPSEC_POLICY:
+ {
+ caddr_t req;
+ size_t len = 0;
+ int priv;
+ struct mbuf *m;
+ int optname;
+
+ if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
+ break;
+ if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
+ break;
+ priv = (sopt->sopt_td != NULL &&
+ suser(sopt->sopt_td) != 0) ? 0 : 1;
+ req = mtod(m, caddr_t);
+ len = m->m_len;
+ optname = sopt->sopt_name;
+ error = ipsec4_set_policy(inp, optname, req, len, priv);
+ m_freem(m);
+ break;
+ }
+#endif /*IPSEC*/
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case IP_OPTIONS:
+ case IP_RETOPTS:
+ if (inp->inp_options)
+ error = sooptcopyout(sopt,
+ mtod(inp->inp_options,
+ char *),
+ inp->inp_options->m_len);
+ else
+ sopt->sopt_valsize = 0;
+ break;
+
+ case IP_TOS:
+ case IP_TTL:
+ case IP_RECVOPTS:
+ case IP_RECVRETOPTS:
+ case IP_RECVDSTADDR:
+ case IP_RECVIF:
+ case IP_PORTRANGE:
+ case IP_FAITH:
+ switch (sopt->sopt_name) {
+
+ case IP_TOS:
+ optval = inp->inp_ip_tos;
+ break;
+
+ case IP_TTL:
+ optval = inp->inp_ip_ttl;
+ break;
+
+#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
+
+ case IP_RECVOPTS:
+ optval = OPTBIT(INP_RECVOPTS);
+ break;
+
+ case IP_RECVRETOPTS:
+ optval = OPTBIT(INP_RECVRETOPTS);
+ break;
+
+ case IP_RECVDSTADDR:
+ optval = OPTBIT(INP_RECVDSTADDR);
+ break;
+
+ case IP_RECVIF:
+ optval = OPTBIT(INP_RECVIF);
+ break;
+
+ case IP_PORTRANGE:
+ if (inp->inp_flags & INP_HIGHPORT)
+ optval = IP_PORTRANGE_HIGH;
+ else if (inp->inp_flags & INP_LOWPORT)
+ optval = IP_PORTRANGE_LOW;
+ else
+ optval = 0;
+ break;
+
+ case IP_FAITH:
+ optval = OPTBIT(INP_FAITH);
+ break;
+ }
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case IP_MULTICAST_IF:
+ case IP_MULTICAST_VIF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_LOOP:
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ error = ip_getmoptions(sopt, inp->inp_moptions);
+ break;
+
+#ifdef IPSEC
+ case IP_IPSEC_POLICY:
+ {
+ struct mbuf *m = NULL;
+ caddr_t req = NULL;
+ size_t len = 0;
+
+ if (m != 0) {
+ req = mtod(m, caddr_t);
+ len = m->m_len;
+ }
+ error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
+ if (error == 0)
+ error = soopt_mcopyout(sopt, m); /* XXX */
+ if (error == 0)
+ m_freem(m);
+ break;
+ }
+#endif /*IPSEC*/
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ }
+ return (error);
+}
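
From user space these options arrive through setsockopt()/getsockopt() at level IPPROTO_IP and fall into the cases above. A minimal sketch of the caller's side, with most error handling omitted:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
    	int s = socket(AF_INET, SOCK_DGRAM, 0);
    	int ttl = 64, tos = IPTOS_LOWDELAY;
    	socklen_t len = sizeof(ttl);

    	if (s < 0)
    		return (1);
    	setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
    	setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
    	ttl = 0;
    	getsockopt(s, IPPROTO_IP, IP_TTL, &ttl, &len);
    	printf("IP_TTL is now %d\n", ttl);
    	close(s);
    	return (0);
    }
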
+
+/*
+ * Set up IP options in pcb for insertion in output packets.
+ * Store in mbuf with pointer in pcbopt, adding pseudo-option
+ * with destination address if source routed.
+ */
+static int
+ip_pcbopts(optname, pcbopt, m)
+ int optname;
+ struct mbuf **pcbopt;
+ register struct mbuf *m;
+{
+ register int cnt, optlen;
+ register u_char *cp;
+ u_char opt;
+
+ /* turn off any old options */
+ if (*pcbopt)
+ (void)m_free(*pcbopt);
+ *pcbopt = 0;
+ if (m == (struct mbuf *)0 || m->m_len == 0) {
+ /*
+ * Only turning off any previous options.
+ */
+ if (m)
+ (void)m_free(m);
+ return (0);
+ }
+
+ if (m->m_len % sizeof(int32_t))
+ goto bad;
+ /*
+ * IP first-hop destination address will be stored before
+ * actual options; move other options back
+ * and clear it when none present.
+ */
+ if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
+ goto bad;
+ cnt = m->m_len;
+ m->m_len += sizeof(struct in_addr);
+ cp = mtod(m, u_char *) + sizeof(struct in_addr);
+ ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
+ bzero(mtod(m, caddr_t), sizeof(struct in_addr));
+
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[IPOPT_OPTVAL];
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < IPOPT_OLEN + sizeof(*cp))
+ goto bad;
+ optlen = cp[IPOPT_OLEN];
+ if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
+ goto bad;
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ /*
+ * user process specifies route as:
+ * ->A->B->C->D
+ * D must be our final destination (but we can't
+ * check that since we may not have connected yet).
+ * A is first hop destination, which doesn't appear in
+ * actual IP option, but is stored before the options.
+ */
+ if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
+ goto bad;
+ m->m_len -= sizeof(struct in_addr);
+ cnt -= sizeof(struct in_addr);
+ optlen -= sizeof(struct in_addr);
+ cp[IPOPT_OLEN] = optlen;
+ /*
+ * Move first hop before start of options.
+ */
+ bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
+ sizeof(struct in_addr));
+ /*
+ * Then copy rest of options back
+ * to close up the deleted entry.
+ */
+ ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
+ sizeof(struct in_addr)),
+ (caddr_t)&cp[IPOPT_OFFSET+1],
+ (unsigned)cnt + sizeof(struct in_addr));
+ break;
+ }
+ }
+ if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
+ goto bad;
+ *pcbopt = m;
+ return (0);
+
+bad:
+ (void)m_free(m);
+ return (EINVAL);
+}
+
+/*
+ * XXX
+ * The whole multicast option thing needs to be re-thought.
+ * Several of these options are equally applicable to non-multicast
+ * transmission, and one (IP_MULTICAST_TTL) totally duplicates a
+ * standard option (IP_TTL).
+ */
+
+/*
+ * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
+ */
+static struct ifnet *
+ip_multicast_if(a, ifindexp)
+ struct in_addr *a;
+ int *ifindexp;
+{
+ int ifindex;
+ struct ifnet *ifp;
+
+ if (ifindexp)
+ *ifindexp = 0;
+ if (ntohl(a->s_addr) >> 24 == 0) {
+ ifindex = ntohl(a->s_addr) & 0xffffff;
+ if (ifindex < 0 || if_index < ifindex)
+ return NULL;
+ ifp = ifnet_byindex(ifindex);
+ if (ifindexp)
+ *ifindexp = ifindex;
+ } else {
+ INADDR_TO_IFP(*a, ifp);
+ }
+ return ifp;
+}
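
Per the RFC 1724 convention handled above, an IP_MULTICAST_IF argument whose high byte is zero is taken as an interface index rather than a local address. A tiny sketch of encoding and decoding that form (the index value is an arbitrary example):

    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <stdio.h>

    int
    main(void)
    {
    	struct in_addr a;
    	unsigned idx = 3;			/* hypothetical interface index */

    	a.s_addr = htonl(idx & 0x00ffffff);	/* high byte 0 => index form */
    	if ((ntohl(a.s_addr) >> 24) == 0)
    		printf("%s encodes ifindex %u\n",
    		    inet_ntoa(a), ntohl(a.s_addr) & 0x00ffffff);
    	return (0);
    }
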
+
+/*
+ * Set the IP multicast options in response to user setsockopt().
+ */
+static int
+ip_setmoptions(sopt, imop)
+ struct sockopt *sopt;
+ struct ip_moptions **imop;
+{
+ int error = 0;
+ int i;
+ struct in_addr addr;
+ struct ip_mreq mreq;
+ struct ifnet *ifp;
+ struct ip_moptions *imo = *imop;
+ struct route ro;
+ struct sockaddr_in *dst;
+ int ifindex;
+ int s;
+
+ if (imo == NULL) {
+ /*
+ * No multicast option buffer attached to the pcb;
+ * allocate one and initialize to default values.
+ */
+ imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS,
+ M_WAITOK);
+
+ if (imo == NULL)
+ return (ENOBUFS);
+ *imop = imo;
+ imo->imo_multicast_ifp = NULL;
+ imo->imo_multicast_addr.s_addr = INADDR_ANY;
+ imo->imo_multicast_vif = -1;
+ imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+ imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+ imo->imo_num_memberships = 0;
+ }
+
+ switch (sopt->sopt_name) {
+	/* store the index number of the vif to use for outgoing multicasts */
+ case IP_MULTICAST_VIF:
+ if (legal_vif_num == 0) {
+ error = EOPNOTSUPP;
+ break;
+ }
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (error)
+ break;
+ if (!legal_vif_num(i) && (i != -1)) {
+ error = EINVAL;
+ break;
+ }
+ imo->imo_multicast_vif = i;
+ break;
+
+ case IP_MULTICAST_IF:
+ /*
+ * Select the interface for outgoing multicast packets.
+ */
+ error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr);
+ if (error)
+ break;
+ /*
+ * INADDR_ANY is used to remove a previous selection.
+ * When no interface is selected, a default one is
+ * chosen every time a multicast packet is sent.
+ */
+ if (addr.s_addr == INADDR_ANY) {
+ imo->imo_multicast_ifp = NULL;
+ break;
+ }
+ /*
+ * The selected interface is identified by its local
+ * IP address. Find the interface and confirm that
+ * it supports multicasting.
+ */
+ s = splimp();
+ ifp = ip_multicast_if(&addr, &ifindex);
+ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
+ splx(s);
+ error = EADDRNOTAVAIL;
+ break;
+ }
+ imo->imo_multicast_ifp = ifp;
+ if (ifindex)
+ imo->imo_multicast_addr = addr;
+ else
+ imo->imo_multicast_addr.s_addr = INADDR_ANY;
+ splx(s);
+ break;
+
+ case IP_MULTICAST_TTL:
+ /*
+ * Set the IP time-to-live for outgoing multicast packets.
+ * The original multicast API required a char argument,
+ * which is inconsistent with the rest of the socket API.
+ * We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == 1) {
+ u_char ttl;
+ error = sooptcopyin(sopt, &ttl, 1, 1);
+ if (error)
+ break;
+ imo->imo_multicast_ttl = ttl;
+ } else {
+ u_int ttl;
+ error = sooptcopyin(sopt, &ttl, sizeof ttl,
+ sizeof ttl);
+ if (error)
+ break;
+ if (ttl > 255)
+ error = EINVAL;
+ else
+ imo->imo_multicast_ttl = ttl;
+ }
+ break;
+
+ case IP_MULTICAST_LOOP:
+ /*
+ * Set the loopback flag for outgoing multicast packets.
+ * Must be zero or one. The original multicast API required a
+ * char argument, which is inconsistent with the rest
+ * of the socket API. We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == 1) {
+ u_char loop;
+ error = sooptcopyin(sopt, &loop, 1, 1);
+ if (error)
+ break;
+ imo->imo_multicast_loop = !!loop;
+ } else {
+ u_int loop;
+ error = sooptcopyin(sopt, &loop, sizeof loop,
+ sizeof loop);
+ if (error)
+ break;
+ imo->imo_multicast_loop = !!loop;
+ }
+ break;
+
+ case IP_ADD_MEMBERSHIP:
+ /*
+ * Add a multicast group membership.
+ * Group must be a valid IP multicast address.
+ */
+ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
+ if (error)
+ break;
+
+ if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
+ error = EINVAL;
+ break;
+ }
+ s = splimp();
+ /*
+ * If no interface address was provided, use the interface of
+ * the route to the given multicast address.
+ */
+ if (mreq.imr_interface.s_addr == INADDR_ANY) {
+ bzero((caddr_t)&ro, sizeof(ro));
+ dst = (struct sockaddr_in *)&ro.ro_dst;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_family = AF_INET;
+ dst->sin_addr = mreq.imr_multiaddr;
+ rtalloc(&ro);
+ if (ro.ro_rt == NULL) {
+ error = EADDRNOTAVAIL;
+ splx(s);
+ break;
+ }
+ ifp = ro.ro_rt->rt_ifp;
+ rtfree(ro.ro_rt);
+ }
+ else {
+ ifp = ip_multicast_if(&mreq.imr_interface, NULL);
+ }
+
+ /*
+ * See if we found an interface, and confirm that it
+ * supports multicast.
+ */
+ if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
+ error = EADDRNOTAVAIL;
+ splx(s);
+ break;
+ }
+ /*
+ * See if the membership already exists or if all the
+ * membership slots are full.
+ */
+ for (i = 0; i < imo->imo_num_memberships; ++i) {
+ if (imo->imo_membership[i]->inm_ifp == ifp &&
+ imo->imo_membership[i]->inm_addr.s_addr
+ == mreq.imr_multiaddr.s_addr)
+ break;
+ }
+ if (i < imo->imo_num_memberships) {
+ error = EADDRINUSE;
+ splx(s);
+ break;
+ }
+ if (i == IP_MAX_MEMBERSHIPS) {
+ error = ETOOMANYREFS;
+ splx(s);
+ break;
+ }
+ /*
+ * Everything looks good; add a new record to the multicast
+ * address list for the given interface.
+ */
+ if ((imo->imo_membership[i] =
+ in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) {
+ error = ENOBUFS;
+ splx(s);
+ break;
+ }
+ ++imo->imo_num_memberships;
+ splx(s);
+ break;
+
+ case IP_DROP_MEMBERSHIP:
+ /*
+ * Drop a multicast group membership.
+ * Group must be a valid IP multicast address.
+ */
+ error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq);
+ if (error)
+ break;
+
+ if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) {
+ error = EINVAL;
+ break;
+ }
+
+ s = splimp();
+ /*
+ * If an interface address was specified, get a pointer
+ * to its ifnet structure.
+ */
+ if (mreq.imr_interface.s_addr == INADDR_ANY)
+ ifp = NULL;
+ else {
+ ifp = ip_multicast_if(&mreq.imr_interface, NULL);
+ if (ifp == NULL) {
+ error = EADDRNOTAVAIL;
+ splx(s);
+ break;
+ }
+ }
+ /*
+ * Find the membership in the membership array.
+ */
+ for (i = 0; i < imo->imo_num_memberships; ++i) {
+ if ((ifp == NULL ||
+ imo->imo_membership[i]->inm_ifp == ifp) &&
+ imo->imo_membership[i]->inm_addr.s_addr ==
+ mreq.imr_multiaddr.s_addr)
+ break;
+ }
+ if (i == imo->imo_num_memberships) {
+ error = EADDRNOTAVAIL;
+ splx(s);
+ break;
+ }
+ /*
+ * Give up the multicast address record to which the
+ * membership points.
+ */
+ in_delmulti(imo->imo_membership[i]);
+ /*
+ * Remove the gap in the membership array.
+ */
+ for (++i; i < imo->imo_num_memberships; ++i)
+ imo->imo_membership[i-1] = imo->imo_membership[i];
+ --imo->imo_num_memberships;
+ splx(s);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ /*
+ * If all options have default values, no need to keep the mbuf.
+ */
+ if (imo->imo_multicast_ifp == NULL &&
+ imo->imo_multicast_vif == -1 &&
+ imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
+ imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
+ imo->imo_num_memberships == 0) {
+ free(*imop, M_IPMOPTS);
+ *imop = NULL;
+ }
+
+ return (error);
+}
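
The IP_ADD_MEMBERSHIP / IP_DROP_MEMBERSHIP cases above are driven from user space with a struct ip_mreq. A minimal sketch of a receiver joining and leaving a group (the group address is an arbitrary example):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
    	int s = socket(AF_INET, SOCK_DGRAM, 0);
    	struct ip_mreq mreq;

    	if (s < 0)
    		return (1);
    	memset(&mreq, 0, sizeof(mreq));
    	mreq.imr_multiaddr.s_addr = inet_addr("224.0.1.20");
    	mreq.imr_interface.s_addr = htonl(INADDR_ANY);	/* kernel picks the interface */
    	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
    	    &mreq, sizeof(mreq)) < 0) {
    		close(s);
    		return (1);
    	}
    	/* ... recvfrom() multicast datagrams here ... */
    	setsockopt(s, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
    	close(s);
    	return (0);
    }
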
+
+/*
+ * Return the IP multicast options in response to user getsockopt().
+ */
+static int
+ip_getmoptions(sopt, imo)
+ struct sockopt *sopt;
+ register struct ip_moptions *imo;
+{
+ struct in_addr addr;
+ struct in_ifaddr *ia;
+ int error, optval;
+ u_char coptval;
+
+ error = 0;
+ switch (sopt->sopt_name) {
+ case IP_MULTICAST_VIF:
+ if (imo != NULL)
+ optval = imo->imo_multicast_vif;
+ else
+ optval = -1;
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case IP_MULTICAST_IF:
+ if (imo == NULL || imo->imo_multicast_ifp == NULL)
+ addr.s_addr = INADDR_ANY;
+ else if (imo->imo_multicast_addr.s_addr) {
+ /* return the value user has set */
+ addr = imo->imo_multicast_addr;
+ } else {
+ IFP_TO_IA(imo->imo_multicast_ifp, ia);
+ addr.s_addr = (ia == NULL) ? INADDR_ANY
+ : IA_SIN(ia)->sin_addr.s_addr;
+ }
+ error = sooptcopyout(sopt, &addr, sizeof addr);
+ break;
+
+ case IP_MULTICAST_TTL:
+ if (imo == 0)
+ optval = coptval = IP_DEFAULT_MULTICAST_TTL;
+ else
+ optval = coptval = imo->imo_multicast_ttl;
+ if (sopt->sopt_valsize == 1)
+ error = sooptcopyout(sopt, &coptval, 1);
+ else
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case IP_MULTICAST_LOOP:
+ if (imo == 0)
+ optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
+ else
+ optval = coptval = imo->imo_multicast_loop;
+ if (sopt->sopt_valsize == 1)
+ error = sooptcopyout(sopt, &coptval, 1);
+ else
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Discard the IP multicast options.
+ */
+void
+ip_freemoptions(imo)
+ register struct ip_moptions *imo;
+{
+ register int i;
+
+ if (imo != NULL) {
+ for (i = 0; i < imo->imo_num_memberships; ++i)
+ in_delmulti(imo->imo_membership[i]);
+ free(imo, M_IPMOPTS);
+ }
+}
+
+/*
+ * Routine called from ip_output() to loop back a copy of an IP multicast
+ * packet to the input queue of a specified interface. Note that this
+ * calls the output routine of the loopback "driver", but with an interface
+ * pointer that might NOT be a loopback interface -- evil, but easier than
+ * replicating that code here.
+ */
+static void
+ip_mloopback(ifp, m, dst, hlen)
+ struct ifnet *ifp;
+ register struct mbuf *m;
+ register struct sockaddr_in *dst;
+ int hlen;
+{
+ register struct ip *ip;
+ struct mbuf *copym;
+
+ copym = m_copy(m, 0, M_COPYALL);
+ if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
+ copym = m_pullup(copym, hlen);
+ if (copym != NULL) {
+ /*
+ * We don't bother to fragment if the IP length is greater
+ * than the interface's MTU. Can this possibly matter?
+ */
+ ip = mtod(copym, struct ip *);
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (ip->ip_vhl == IP_VHL_BORING) {
+ ip->ip_sum = in_cksum_hdr(ip);
+ } else {
+ ip->ip_sum = in_cksum(copym, hlen);
+ }
+ /*
+ * NB:
+ * It's not clear whether there are any lingering
+ * reentrancy problems in other areas which might
+ * be exposed by using ip_input directly (in
+ * particular, everything which modifies the packet
+ * in-place). Yet another option is using the
+ * protosw directly to deliver the looped back
+ * packet. For the moment, we'll err on the side
+ * of safety by using if_simloop().
+ */
+#if 1 /* XXX */
+ if (dst->sin_family != AF_INET) {
+ printf("ip_mloopback: bad address family %d\n",
+ dst->sin_family);
+ dst->sin_family = AF_INET;
+ }
+#endif
+
+#ifdef notdef
+ copym->m_pkthdr.rcvif = ifp;
+ ip_input(copym);
+#else
+ /* if the checksum hasn't been computed, mark it as valid */
+ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ copym->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ copym->m_pkthdr.csum_data = 0xffff;
+ }
+ if_simloop(ifp, copym, dst->sin_family, 0);
+#endif
+ }
+}
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
new file mode 100644
index 0000000..e084d1d
--- /dev/null
+++ b/sys/netinet/ip_var.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_var.h 8.2 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IP_VAR_H_
+#define _NETINET_IP_VAR_H_
+
+#include <sys/queue.h>
+
+/*
+ * Overlay for ip header used by other protocols (tcp, udp).
+ */
+struct ipovly {
+ u_char ih_x1[9]; /* (unused) */
+ u_char ih_pr; /* protocol */
+ u_short ih_len; /* protocol length */
+ struct in_addr ih_src; /* source internet address */
+ struct in_addr ih_dst; /* destination internet address */
+};
+
+/*
+ * Ip reassembly queue structure. Each fragment
+ * being reassembled is attached to one of these structures.
+ * They are timed out after ipq_ttl drops to 0, and may also
+ * be reclaimed if memory becomes tight.
+ */
+struct ipq {
+ TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */
+ u_char ipq_ttl; /* time for reass q to live */
+ u_char ipq_p; /* protocol of this fragment */
+ u_short ipq_id; /* sequence id for reassembly */
+ struct mbuf *ipq_frags; /* to ip headers of fragments */
+ struct in_addr ipq_src,ipq_dst;
+#ifdef IPDIVERT
+ u_int32_t ipq_div_info; /* ipfw divert port & flags */
+ u_int16_t ipq_div_cookie; /* ipfw divert cookie */
+#endif
+};
+
+/*
+ * Structure stored in mbuf in inpcb.ip_options
+ * and passed to ip_output when ip options are in use.
+ * The actual length of the options (including ipopt_dst)
+ * is in m_len.
+ */
+#define MAX_IPOPTLEN 40
+
+struct ipoption {
+ struct in_addr ipopt_dst; /* first-hop dst if source routed */
+ char ipopt_list[MAX_IPOPTLEN]; /* options proper */
+};
+
+/*
+ * Structure attached to inpcb.ip_moptions and
+ * passed to ip_output when IP multicast options are in use.
+ */
+struct ip_moptions {
+ struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
+ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */
+ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */
+ u_char imo_multicast_loop; /* 1 => hear sends if a member */
+ u_short imo_num_memberships; /* no. memberships this socket */
+ struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS];
+ u_long imo_multicast_vif; /* vif num outgoing multicasts */
+};
+
+struct ipstat {
+ u_long ips_total; /* total packets received */
+ u_long ips_badsum; /* checksum bad */
+ u_long ips_tooshort; /* packet too short */
+ u_long ips_toosmall; /* not enough data */
+ u_long ips_badhlen; /* ip header length < data size */
+ u_long ips_badlen; /* ip length < ip header length */
+ u_long ips_fragments; /* fragments received */
+ u_long ips_fragdropped; /* frags dropped (dups, out of space) */
+ u_long ips_fragtimeout; /* fragments timed out */
+ u_long ips_forward; /* packets forwarded */
+ u_long ips_fastforward; /* packets fast forwarded */
+ u_long ips_cantforward; /* packets rcvd for unreachable dest */
+ u_long ips_redirectsent; /* packets forwarded on same net */
+ u_long ips_noproto; /* unknown or unsupported protocol */
+ u_long ips_delivered; /* datagrams delivered to upper level*/
+ u_long ips_localout; /* total ip packets generated here */
+ u_long ips_odropped; /* lost packets due to nobufs, etc. */
+ u_long ips_reassembled; /* total packets reassembled ok */
+ u_long ips_fragmented; /* datagrams successfully fragmented */
+ u_long ips_ofragments; /* output fragments created */
+ u_long ips_cantfrag; /* don't fragment flag was set, etc. */
+ u_long ips_badoptions; /* error in option processing */
+ u_long ips_noroute; /* packets discarded due to no route */
+ u_long ips_badvers; /* ip version != 4 */
+ u_long ips_rawout; /* total raw ip packets generated */
+ u_long ips_toolong; /* ip length > max ip packet size */
+ u_long ips_notmember; /* multicasts for unregistered grps */
+ u_long ips_nogif; /* no match gif found */
+ u_long ips_badaddr; /* invalid address on header */
+};
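
These counters are exported to user space; the sketch below assumes the whole structure is readable through the "net.inet.ip.stats" sysctl, as FreeBSD's netstat -s expects. Treat the MIB name and field choices as assumptions for illustration.

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <netinet/ip_var.h>
    #include <stdio.h>

    int
    main(void)
    {
    	struct ipstat ips;
    	size_t len = sizeof(ips);

    	/* assumed MIB name; returns a struct ipstat on FreeBSD */
    	if (sysctlbyname("net.inet.ip.stats", &ips, &len, NULL, 0) == -1)
    		return (1);
    	printf("total %lu  fragmented %lu  cantfrag %lu\n",
    	    ips.ips_total, ips.ips_fragmented, ips.ips_cantfrag);
    	return (0);
    }
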
+
+#ifdef _KERNEL
+
+/* flags passed to ip_output as last parameter */
+#define IP_FORWARDING 0x1 /* most of ip header exists */
+#define IP_RAWOUTPUT 0x2 /* raw ip header exists */
+#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */
+#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */
+
+struct ip;
+struct inpcb;
+struct route;
+struct sockopt;
+
+extern struct ipstat ipstat;
+#ifndef RANDOM_IP_ID
+extern u_short ip_id; /* ip packet ctr, for ids */
+#endif
+extern int ip_defttl; /* default IP ttl */
+extern int ipforwarding; /* ip forwarding */
+extern struct route ipforward_rt; /* ip forwarding cached route */
+extern u_char ip_protox[];
+extern struct socket *ip_rsvpd; /* reservation protocol daemon */
+extern struct socket *ip_mrouter; /* multicast routing daemon */
+extern int (*legal_vif_num)(int);
+extern u_long (*ip_mcast_src)(int);
+extern int rsvp_on;
+extern struct pr_usrreqs rip_usrreqs;
+
+int ip_ctloutput(struct socket *, struct sockopt *sopt);
+void ip_drain(void);
+void ip_freemoptions(struct ip_moptions *);
+void ip_init(void);
+extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
+ struct ip_moptions *);
+int ip_output(struct mbuf *,
+ struct mbuf *, struct route *, int, struct ip_moptions *);
+struct in_ifaddr *
+ ip_rtaddr(struct in_addr, struct route *);
+void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
+ struct mbuf *);
+void ip_slowtimo(void);
+struct mbuf *
+ ip_srcroute(void);
+void ip_stripoptions(struct mbuf *, struct mbuf *);
+#ifdef RANDOM_IP_ID
+u_int16_t
+ ip_randomid(void);
+#endif
+int rip_ctloutput(struct socket *, struct sockopt *);
+void rip_ctlinput(int, struct sockaddr *, void *);
+void rip_init(void);
+void rip_input(struct mbuf *, int);
+int rip_output(struct mbuf *, struct socket *, u_long);
+void ipip_input(struct mbuf *, int);
+void rsvp_input(struct mbuf *, int);
+int ip_rsvp_init(struct socket *);
+int ip_rsvp_done(void);
+int ip_rsvp_vif_init(struct socket *, struct sockopt *);
+int ip_rsvp_vif_done(struct socket *, struct sockopt *);
+void ip_rsvp_force_done(struct socket *);
+
+#ifdef IPDIVERT
+void div_init(void);
+void div_input(struct mbuf *, int);
+void divert_packet(struct mbuf *m, int incoming, int port, int rule);
+extern struct pr_usrreqs div_usrreqs;
+#endif
+
+void in_delayed_cksum(struct mbuf *m);
+
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_IP_VAR_H_ */
diff --git a/sys/netinet/ipprotosw.h b/sys/netinet/ipprotosw.h
new file mode 100644
index 0000000..bdc4c73
--- /dev/null
+++ b/sys/netinet/ipprotosw.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 1995, 1996, 1997, 1998, and 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)protosw.h 8.1 (Berkeley) 6/2/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_IPPROTOSW_H_
+#define _NETINET_IPPROTOSW_H_
+
+/*
+ * For pfil_head structure.
+ */
+#include <net/pfil.h>
+
+/* Forward declare these structures referenced from prototypes below. */
+struct mbuf;
+struct sockaddr;
+struct socket;
+struct sockopt;
+
+struct ipprotosw {
+ short pr_type; /* socket type used for */
+ struct domain *pr_domain; /* domain protocol a member of */
+ short pr_protocol; /* protocol number */
+ short pr_flags; /* see below */
+/* protocol-protocol hooks */
+ pr_in_input_t *pr_input; /* input to protocol (from below) */
+ pr_output_t *pr_output; /* output to protocol (from above) */
+ pr_ctlinput_t *pr_ctlinput; /* control input (from below) */
+ pr_ctloutput_t *pr_ctloutput; /* control output (from above) */
+/* user-protocol hook */
+ void *pr_ousrreq;
+/* utility hooks */
+ pr_init_t *pr_init;
+ pr_fasttimo_t *pr_fasttimo; /* fast timeout (200ms) */
+ pr_slowtimo_t *pr_slowtimo; /* slow timeout (500ms) */
+ pr_drain_t *pr_drain; /* flush any excess space possible */
+
+ struct pr_usrreqs *pr_usrreqs; /* supersedes pr_usrreq() */
+ struct pfil_head pr_pfh;
+};
+
+#endif /* !_NETINET_IPPROTOSW_H_ */
diff --git a/sys/netinet/libalias/HISTORY b/sys/netinet/libalias/HISTORY
new file mode 100644
index 0000000..c5bca59
--- /dev/null
+++ b/sys/netinet/libalias/HISTORY
@@ -0,0 +1,145 @@
+$FreeBSD$
+
+Version 1.0: August 11, 1996 (cjm)
+
+Version 1.1: August 20, 1996 (cjm)
+ - Host accepts incoming connections for ports 0 to 1023.
+
+Version 1.2: September 7, 1996 (cjm)
+ - Fragment handling error in alias_db.c corrected.
+
+Version 1.3: September 15, 1996 (cjm)
+ - Generalized mechanism for handling incoming
+ connections (no more 0 to 1023 restriction).
+
+ - Increased ICMP support (will handle traceroute now).
+
+ - Improved TCP close connection logic.
+
+Version 1.4: September 16, 1996 (cjm)
+
+Version 1.5: September 17, 1996 (cjm)
+ - Corrected error in handling incoming UDP packets
+ with zero checksum.
+
+Version 1.6: September 18, 1996
+ - Simplified ICMP data storage. Will now handle
+ tracert from Win95 and NT as well as FreeBSD
+ traceroute, which uses UDP packets to non-existent
+ ports.
+
+Version 1.7: January 9, 1997 (cjm)
+ - Reduced malloc() activity for ICMP echo and
+ timestamp requests.
+
+ - Added handling for out-of-order IP fragments.
+
+ - Switched to differential checksum computation
+ for IP headers (TCP, UDP and ICMP checksums
+ were already differential).
+
+ - Accepts FTP data connections from other than
+	  port 20.  This allows ftp connections
+ from two hosts which are both running packet
+ aliasing.
+
+	- Fixed a checksum error on FTP transfers.  Problem
+ in code located by Martin Renters and
+ Brian Somers.
+
+Version 1.8: January 14, 1997 (cjm)
+ - Fixed data type error in function StartPoint()
+ in alias_db.c (this bug did not exist before v1.7)
+ Problem in code located by Ari Suutari.
+
+Version 1.9: February 1, 1997 (Eivind Eklund <perhaps@yes.no>)
+ - Added support for IRC DCC (ee)
+
+ - Changed the aliasing routines to use ANSI style
+ throughout (ee)
+
+ - Minor API changes for integration with other
+ programs than PPP (ee)
+
+ - Fixed minor security hole in alias_ftp.c for
+ other applications of the aliasing software.
+ Hole could _not_ manifest in ppp+pktAlias, but
+ could potentially manifest in other applications
+ of the aliasing. (ee)
+
+ - Connections initiated from packet aliasing
+ host machine will not have their port number
+ aliased unless it conflicts with an aliasing
+ port already being used. (There is an option
+ to disable this for debugging) (cjm)
+
+ - Sockets will be allocated in cases where
+ there might be port interference with the
+ host machine. This can be disabled in cases
+ where the ppp host will be acting purely as a
+ masquerading router and not generate any
+ traffic of its own.
+ (cjm)
+
+Version 2.0: March, 1997 (cjm)
+ - Aliasing links are cleared only when a host interface address
+ changes.
+
+ - PacketAliasPermanentLink() API added.
+
+ - Option for only aliasing private, unregistered
+ IP addresses added.
+
+ - Substantial rework to the aliasing lookup engine.
+
+Version 2.1: May, 1997 (cjm)
+ - Continuing rework to the aliasing lookup engine
+ to support multiple incoming addresses and static
+ NAT. PacketAliasRedirectPort() and
+ PacketAliasRedirectAddr() added to API.
+
+ - Now supports outgoing as well as incoming ICMP
+ error messages.
+
+Version 2.2: July, 1997 (cjm)
+ - Rationalized API function names to all begin with
+ "PacketAlias..." Old function names are retained
+ for backwards compatibility.
+
+ - Packet aliasing engine will now free memory of
+ fragments which are never resolved after a timeout
+ period. Once a fragment is resolved, it becomes
+	  the user's responsibility to free the memory.
+
+Version 2.3: August 11, 1997 (cjm)
+ - Problem associated with socket file descriptor
+ accumulation in alias_db.c corrected. The sockets
+ had to be closed when a binding failed. Problem
+ in code located by Gordon Burditt.
+
+Version 2.4: September 1, 1997 (cjm)
+ - PKT_ALIAS_UNREGISTERED_ONLY option repaired.
+ This part of the code was incorrectly re-implemented
+ in version 2.1.
+
+Version 2.5: December, 1997 (ee)
+ - Added PKT_ALIAS_PUNCH_FW mode for firewall
+ bypass of FTP/IRC DCC data connections. Also added
+ improved TCP connection monitoring.
+
+Version 2.6: May, 1998 (amurai)
+ - Added supporting routine for NetBios over TCP/IP.
+
+Version 3.0: January 1, 1999
+ - Transparent proxying support added.
+ - PPTP redirecting support added based on patches
+ contributed by Dru Nelson <dnelson@redwoodsoft.com>.
+
+Version 3.1: May, 2000 (Erik Salander, erik@whistle.com)
+ - Added support to alias 227 replies, allows aliasing for
+ FTP servers in passive mode.
+ - Added support for PPTP aliasing.
+
+Version 3.2: July, 2000 (Erik Salander, erik@whistle.com and
+ Junichi Satoh, junichi@junichi.org)
+ - Added support for streaming media (RTSP and PNA) aliasing.
diff --git a/sys/netinet/libalias/Makefile b/sys/netinet/libalias/Makefile
new file mode 100644
index 0000000..a6f577d
--- /dev/null
+++ b/sys/netinet/libalias/Makefile
@@ -0,0 +1,13 @@
+# $FreeBSD$
+
+LIB= alias
+SHLIB_MAJOR= 4
+SHLIB_MINOR= 0
+CFLAGS+= -Wall -Wmissing-prototypes
+SRCS= alias.c alias_cuseeme.c alias_db.c alias_ftp.c alias_irc.c \
+ alias_nbt.c alias_pptp.c alias_proxy.c alias_smedia.c \
+ alias_util.c
+INCS= alias.h
+MAN= libalias.3
+
+.include <bsd.lib.mk>
diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c
new file mode 100644
index 0000000..320c5c2
--- /dev/null
+++ b/sys/netinet/libalias/alias.c
@@ -0,0 +1,1574 @@
+/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias.c provides supervisory control for the functions of the
+ packet aliasing software. It consists of routines to monitor
+ TCP connection state, protocol-specific aliasing routines,
+ fragment handling and the following outside world functional
+ interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn,
+ PacketAliasIn and PacketAliasOut.
+
+ The other C program files are briefly described. The data
+ structure framework which holds information needed to translate
+ packets is encapsulated in alias_db.c. Data is accessed by
+ function calls, so other segments of the program need not know
+ about the underlying data structures. Alias_ftp.c contains
+ special code for modifying the ftp PORT command used to establish
+ data connections, while alias_irc.c does the same for IRC
+ DCC. Alias_util.c contains a few utility routines.
+
+ Version 1.0 August, 1996 (cjm)
+
+ Version 1.1 August 20, 1996 (cjm)
+ PPP host accepts incoming connections for ports 0 to 1023.
+ (Gary Roberts pointed out the need to handle incoming
+ connections.)
+
+ Version 1.2 September 7, 1996 (cjm)
+ Fragment handling error in alias_db.c corrected.
+ (Tom Torrance helped fix this problem.)
+
+ Version 1.4 September 16, 1996 (cjm)
+ - A more generalized method for handling incoming
+ connections, without the 0-1023 restriction, is
+ implemented in alias_db.c
+ - Improved ICMP support in alias.c. Traceroute
+ packet streams can now be correctly aliased.
+ - TCP connection closing logic simplified in
+	  alias.c and now allows for an additional 1 minute
+ "grace period" after FIN or RST is observed.
+
+ Version 1.5 September 17, 1996 (cjm)
+ Corrected error in handling incoming UDP packets with 0 checksum.
+ (Tom Torrance helped fix this problem.)
+
+ Version 1.6 September 18, 1996 (cjm)
+ Simplified ICMP aliasing scheme. Should now support
+ traceroute from Win95 as well as FreeBSD.
+
+ Version 1.7 January 9, 1997 (cjm)
+ - Out-of-order fragment handling.
+ - IP checksum error fixed for ftp transfers
+ from aliasing host.
+ - Integer return codes added to all
+ aliasing/de-aliasing functions.
+ - Some obsolete comments cleaned up.
+ - Differential checksum computations for
+ IP header (TCP, UDP and ICMP were already
+ differential).
+
+ Version 2.1 May 1997 (cjm)
+ - Added support for outgoing ICMP error
+ messages.
+ - Added two functions PacketAliasIn2()
+ and PacketAliasOut2() for dynamic address
+ control (e.g. round-robin allocation of
+ incoming packets).
+
+ Version 2.2 July 1997 (cjm)
+ - Rationalized API function names to begin
+ with "PacketAlias..."
+ - Eliminated PacketAliasIn2() and
+ PacketAliasOut2() as poorly conceived.
+
+ Version 2.3 Dec 1998 (dillon)
+ - Major bounds checking additions, see FreeBSD/CVS
+
+ Version 3.1 May, 2000 (salander)
+ - Added hooks to handle PPTP.
+
+ Version 3.2 July, 2000 (salander and satoh)
+ - Added PacketUnaliasOut routine.
+ - Added hooks to handle RTSP/RTP.
+
+ See HISTORY file for additional revisions.
+*/
+
+#include <sys/types.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <stdio.h>
+
+#include "alias_local.h"
+#include "alias.h"
+
+#define NETBIOS_NS_PORT_NUMBER 137
+#define NETBIOS_DGM_PORT_NUMBER 138
+#define FTP_CONTROL_PORT_NUMBER 21
+#define IRC_CONTROL_PORT_NUMBER_1 6667
+#define IRC_CONTROL_PORT_NUMBER_2 6668
+#define CUSEEME_PORT_NUMBER 7648
+#define RTSP_CONTROL_PORT_NUMBER_1 554
+#define RTSP_CONTROL_PORT_NUMBER_2 7070
+#define TFTP_PORT_NUMBER 69
+#define PPTP_CONTROL_PORT_NUMBER 1723
+
+
+
+
+/* TCP Handling Routines
+
+ TcpMonitorIn() -- These routines monitor TCP connections, and
+ TcpMonitorOut() delete a link when a connection is closed.
+
+These routines look for SYN, FIN and RST flags to determine when TCP
+connections open and close. When a TCP connection closes, the data
+structure containing packet aliasing information is deleted after
+a timeout period.
+*/
+
+/* Local prototypes */
+static void TcpMonitorIn(struct ip *, struct alias_link *);
+
+static void TcpMonitorOut(struct ip *, struct alias_link *);
+
+
+static void
+TcpMonitorIn(struct ip *pip, struct alias_link *link)
+{
+ struct tcphdr *tc;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ switch (GetStateIn(link))
+ {
+ case ALIAS_TCP_STATE_NOT_CONNECTED:
+ if (tc->th_flags & TH_RST)
+ SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED);
+ else if (tc->th_flags & TH_SYN)
+ SetStateIn(link, ALIAS_TCP_STATE_CONNECTED);
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (tc->th_flags & (TH_FIN | TH_RST))
+ SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED);
+ break;
+ }
+}
+
+static void
+TcpMonitorOut(struct ip *pip, struct alias_link *link)
+{
+ struct tcphdr *tc;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ switch (GetStateOut(link))
+ {
+ case ALIAS_TCP_STATE_NOT_CONNECTED:
+ if (tc->th_flags & TH_RST)
+ SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED);
+ else if (tc->th_flags & TH_SYN)
+ SetStateOut(link, ALIAS_TCP_STATE_CONNECTED);
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (tc->th_flags & (TH_FIN | TH_RST))
+ SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED);
+ break;
+ }
+}
+
+
+
+
+
+/* Protocol Specific Packet Aliasing Routines
+
+ IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2()
+ IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2()
+ ProtoAliasIn(), ProtoAliasOut()
+ UdpAliasIn(), UdpAliasOut()
+ TcpAliasIn(), TcpAliasOut()
+
+These routines handle protocol specific details of packet aliasing.
+One may observe a certain amount of repetitive arithmetic in these
+functions, the purpose of which is to compute a revised checksum
+without actually summing over the entire data packet, which could be
+unnecessarily time consuming.
+
+The purpose of the packet aliasing routines is to replace the source
+address of the outgoing packet and then correctly put it back for
+any incoming packets. For TCP and UDP, ports are also re-mapped.
+
+For ICMP echo/timestamp requests and replies, the following scheme
+is used: the ID number is replaced by an alias for the outgoing
+packet.
+
+ICMP error messages are handled by looking at the IP fragment
+in the data section of the message.
+
+For TCP and UDP protocols, a port number is chosen for an outgoing
+packet, and then incoming packets are identified by IP address and
+port numbers. For TCP packets, there is additional logic in the event
+that sequence and ACK numbers have been altered (as in the case for
+FTP data port commands).
+
+The port numbers used by the packet aliasing module are not true
+ports in the Unix sense. No sockets are actually bound to ports.
+They are more correctly thought of as placeholders.
+
+All packets go through the aliasing mechanism, whether they come from
+the gateway machine or other machines on a local area network.
+*/
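
A standalone sketch of that differential-checksum idea, in the spirit of RFC 1624: when one 16-bit field changes, the checksum can be revised from the old and new values alone instead of re-summing the whole packet. The library's own ADJUST_CHECKSUM macro accumulates several such terms at once; the helper name below is illustrative only.

    #include <stdint.h>

    static uint16_t
    cksum_adjust(uint16_t cksum, uint16_t oldval, uint16_t newval)
    {
    	uint32_t sum;

    	sum = (uint32_t)(uint16_t)~cksum;	/* undo the final complement */
    	sum += (uint16_t)~oldval;		/* remove the old value ... */
    	sum += newval;				/* ... and add the new one */
    	while (sum >> 16)			/* fold carries */
    		sum = (sum & 0xffff) + (sum >> 16);
    	return ((uint16_t)~sum);
    }
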
+
+
+/* Local prototypes */
+static int IcmpAliasIn1(struct ip *);
+static int IcmpAliasIn2(struct ip *);
+static int IcmpAliasIn (struct ip *);
+
+static int IcmpAliasOut1(struct ip *);
+static int IcmpAliasOut2(struct ip *);
+static int IcmpAliasOut (struct ip *);
+
+static int ProtoAliasIn(struct ip *);
+static int ProtoAliasOut(struct ip *);
+
+static int UdpAliasOut(struct ip *);
+static int UdpAliasIn (struct ip *);
+
+static int TcpAliasOut(struct ip *, int);
+static int TcpAliasIn (struct ip *);
+
+
+static int
+IcmpAliasIn1(struct ip *pip)
+{
+/*
+ De-alias incoming echo and timestamp replies.
+ Alias incoming echo and timestamp requests.
+*/
+ struct alias_link *link;
+ struct icmp *ic;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+
+/* Get source address from ICMP data field and restore original data */
+ link = FindIcmpIn(pip->ip_src, pip->ip_dst, ic->icmp_id, 1);
+ if (link != NULL)
+ {
+ u_short original_id;
+ int accumulate;
+
+ original_id = GetOriginalPort(link);
+
+/* Adjust ICMP checksum */
+ accumulate = ic->icmp_id;
+ accumulate -= original_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Put original sequence number back in */
+ ic->icmp_id = original_id;
+
+/* Put original address back into IP header */
+ {
+ struct in_addr original_address;
+
+ original_address = GetOriginalAddress(link);
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+ }
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+static int
+IcmpAliasIn2(struct ip *pip)
+{
+/*
+ Alias incoming ICMP error messages containing
+ IP header and first 64 bits of datagram.
+*/
+ struct ip *ip;
+ struct icmp *ic, *ic2;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *link;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+ ip = &ic->icmp_ip;
+
+ ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2));
+ tc = (struct tcphdr *) ud;
+ ic2 = (struct icmp *) ud;
+
+ if (ip->ip_p == IPPROTO_UDP)
+ link = FindUdpTcpIn(ip->ip_dst, ip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (ip->ip_p == IPPROTO_TCP)
+ link = FindUdpTcpIn(ip->ip_dst, ip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
+ link = FindIcmpIn(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
+ else
+ link = NULL;
+ } else
+ link = NULL;
+
+ if (link != NULL)
+ {
+ if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP)
+ {
+ u_short *sptr;
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_port;
+
+ original_address = GetOriginalAddress(link);
+ original_port = GetOriginalPort(link);
+
+/* Adjust ICMP checksum */
+ sptr = (u_short *) &(ip->ip_src);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ accumulate += ud->uh_sport;
+ accumulate -= original_port;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Un-alias address in IP header */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+
+/* Un-alias address and port number of original IP packet
+fragment contained in ICMP data section */
+ ip->ip_src = original_address;
+ ud->uh_sport = original_port;
+ }
+ else if (ip->ip_p == IPPROTO_ICMP)
+ {
+ u_short *sptr;
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_id;
+
+ original_address = GetOriginalAddress(link);
+ original_id = GetOriginalPort(link);
+
+/* Adjust ICMP checksum */
+ sptr = (u_short *) &(ip->ip_src);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ accumulate += ic2->icmp_id;
+ accumulate -= original_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Un-alias address in IP header */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+
+/* Un-alias address of original IP packet and sequence number of
+ embedded ICMP datagram */
+ ip->ip_src = original_address;
+ ic2->icmp_id = original_id;
+ }
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasIn(struct ip *pip)
+{
+ int iresult;
+ struct icmp *ic;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+
+ iresult = PKT_ALIAS_IGNORED;
+ switch (ic->icmp_type)
+ {
+ case ICMP_ECHOREPLY:
+ case ICMP_TSTAMPREPLY:
+ if (ic->icmp_code == 0)
+ {
+ iresult = IcmpAliasIn1(pip);
+ }
+ break;
+ case ICMP_UNREACH:
+ case ICMP_SOURCEQUENCH:
+ case ICMP_TIMXCEED:
+ case ICMP_PARAMPROB:
+ iresult = IcmpAliasIn2(pip);
+ break;
+ case ICMP_ECHO:
+ case ICMP_TSTAMP:
+ iresult = IcmpAliasIn1(pip);
+ break;
+ }
+ return(iresult);
+}
+
+
+static int
+IcmpAliasOut1(struct ip *pip)
+{
+/*
+ Alias outgoing echo and timestamp requests.
+ De-alias outgoing echo and timestamp replies.
+*/
+ struct alias_link *link;
+ struct icmp *ic;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+
+/* Save overwritten data for when echo packet returns */
+ link = FindIcmpOut(pip->ip_src, pip->ip_dst, ic->icmp_id, 1);
+ if (link != NULL)
+ {
+ u_short alias_id;
+ int accumulate;
+
+ alias_id = GetAliasPort(link);
+
+/* Since data field is being modified, adjust ICMP checksum */
+ accumulate = ic->icmp_id;
+ accumulate -= alias_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/* Alias sequence number */
+ ic->icmp_id = alias_id;
+
+/* Change source address */
+ {
+ struct in_addr alias_address;
+
+ alias_address = GetAliasAddress(link);
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+ }
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasOut2(struct ip *pip)
+{
+/*
+ Alias outgoing ICMP error messages containing
+ IP header and first 64 bits of datagram.
+*/
+ struct ip *ip;
+ struct icmp *ic, *ic2;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *link;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+ ip = &ic->icmp_ip;
+
+ ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2));
+ tc = (struct tcphdr *) ud;
+ ic2 = (struct icmp *) ud;
+
+ if (ip->ip_p == IPPROTO_UDP)
+ link = FindUdpTcpOut(ip->ip_dst, ip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (ip->ip_p == IPPROTO_TCP)
+ link = FindUdpTcpOut(ip->ip_dst, ip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (ip->ip_p == IPPROTO_ICMP) {
+ if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP)
+ link = FindIcmpOut(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0);
+ else
+ link = NULL;
+ } else
+ link = NULL;
+
+ if (link != NULL)
+ {
+ if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP)
+ {
+ u_short *sptr;
+ int accumulate;
+ struct in_addr alias_address;
+ u_short alias_port;
+
+ alias_address = GetAliasAddress(link);
+ alias_port = GetAliasPort(link);
+
+/* Adjust ICMP checksum */
+ sptr = (u_short *) &(ip->ip_dst);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &alias_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ accumulate += ud->uh_dport;
+ accumulate -= alias_port;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/*
+ * Alias address in IP header if it comes from the host
+ * the original TCP/UDP packet was destined for.
+ */
+ if (pip->ip_src.s_addr == ip->ip_dst.s_addr) {
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+ }
+
+/* Alias address and port number of original IP packet
+fragment contained in ICMP data section */
+ ip->ip_dst = alias_address;
+ ud->uh_dport = alias_port;
+ }
+ else if (ip->ip_p == IPPROTO_ICMP)
+ {
+ u_short *sptr;
+ int accumulate;
+ struct in_addr alias_address;
+ u_short alias_id;
+
+ alias_address = GetAliasAddress(link);
+ alias_id = GetAliasPort(link);
+
+/* Adjust ICMP checksum */
+ sptr = (u_short *) &(ip->ip_dst);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &alias_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ accumulate += ic2->icmp_id;
+ accumulate -= alias_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+/*
+ * Alias address in IP header if it comes from the host
+ * the original ICMP message was destined for.
+ */
+ if (pip->ip_src.s_addr == ip->ip_dst.s_addr) {
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+ }
+
+/* Alias address of original IP packet and ID number of
+   embedded ICMP datagram */
+ ip->ip_dst = alias_address;
+ ic2->icmp_id = alias_id;
+ }
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+static int
+IcmpAliasOut(struct ip *pip)
+{
+ int iresult;
+ struct icmp *ic;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2));
+
+ iresult = PKT_ALIAS_IGNORED;
+ switch (ic->icmp_type)
+ {
+ case ICMP_ECHO:
+ case ICMP_TSTAMP:
+ if (ic->icmp_code == 0)
+ {
+ iresult = IcmpAliasOut1(pip);
+ }
+ break;
+ case ICMP_UNREACH:
+ case ICMP_SOURCEQUENCH:
+ case ICMP_TIMXCEED:
+ case ICMP_PARAMPROB:
+ iresult = IcmpAliasOut2(pip);
+ break;
+ case ICMP_ECHOREPLY:
+ case ICMP_TSTAMPREPLY:
+ iresult = IcmpAliasOut1(pip);
+ }
+ return(iresult);
+}
+
+
+
+static int
+ProtoAliasIn(struct ip *pip)
+{
+/*
+  Handle incoming IP packets (other than ICMP, UDP and TCP).
+  The only thing done here is to restore the destination IP
+  address of the packet to that of our inside machine.
+*/
+ struct alias_link *link;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ link = FindProtoIn(pip->ip_src, pip->ip_dst, pip->ip_p);
+ if (link != NULL)
+ {
+ struct in_addr original_address;
+
+ original_address = GetOriginalAddress(link);
+
+/* Restore original IP address */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+static int
+ProtoAliasOut(struct ip *pip)
+{
+/*
+  Handle outgoing IP packets (other than ICMP, UDP and TCP).
+  The only thing done here is to alias the source IP
+  address of the packet.
+*/
+ struct alias_link *link;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ link = FindProtoOut(pip->ip_src, pip->ip_dst, pip->ip_p);
+ if (link != NULL)
+ {
+ struct in_addr alias_address;
+
+ alias_address = GetAliasAddress(link);
+
+/* Change source address */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+static int
+UdpAliasIn(struct ip *pip)
+{
+ struct udphdr *ud;
+ struct alias_link *link;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ link = FindUdpTcpIn(pip->ip_src, pip->ip_dst,
+ ud->uh_sport, ud->uh_dport,
+ IPPROTO_UDP, 1);
+ if (link != NULL)
+ {
+ struct in_addr alias_address;
+ struct in_addr original_address;
+ u_short alias_port;
+ int accumulate;
+ u_short *sptr;
+ int r = 0;
+
+ alias_address = GetAliasAddress(link);
+ original_address = GetOriginalAddress(link);
+ alias_port = ud->uh_dport;
+ ud->uh_dport = GetOriginalPort(link);
+
+/* Special processing for IP encoding protocols */
+ if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER)
+ AliasHandleCUSeeMeIn(pip, original_address);
+/* For NetBIOS datagrams, the address embedded in the UDP data must be de-aliased too */
+ else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER
+ || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER)
+ r = AliasHandleUdpNbt(pip, link, &original_address, ud->uh_dport);
+ else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER
+ || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER)
+ r = AliasHandleUdpNbtNS(pip, link, &alias_address, &alias_port,
+ &original_address, &ud->uh_dport);
+
+/* If UDP checksum is not zero, then adjust since destination port */
+/* is being unaliased and destination address is being altered. */
+ if (ud->uh_sum != 0)
+ {
+ accumulate = alias_port;
+ accumulate -= ud->uh_dport;
+ sptr = (u_short *) &alias_address;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ }
+
+/* Restore original IP address */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+
+ /*
+ * If we cannot figure out the packet, ignore it.
+ */
+ if (r < 0)
+ return(PKT_ALIAS_IGNORED);
+ else
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+static int
+UdpAliasOut(struct ip *pip)
+{
+ struct udphdr *ud;
+ struct alias_link *link;
+
+/* Return if proxy-only mode is enabled */
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY)
+ return PKT_ALIAS_OK;
+
+ ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ link = FindUdpTcpOut(pip->ip_src, pip->ip_dst,
+ ud->uh_sport, ud->uh_dport,
+ IPPROTO_UDP, 1);
+ if (link != NULL)
+ {
+ u_short alias_port;
+ struct in_addr alias_address;
+
+ alias_address = GetAliasAddress(link);
+ alias_port = GetAliasPort(link);
+
+/* Special processing for IP encoding protocols */
+ if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER)
+ AliasHandleCUSeeMeOut(pip, link);
+/* For NetBIOS datagrams, the address embedded in the UDP data must be aliased too */
+ else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER
+ || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER)
+ AliasHandleUdpNbt(pip, link, &alias_address, alias_port);
+ else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER
+ || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER)
+ AliasHandleUdpNbtNS(pip, link, &pip->ip_src, &ud->uh_sport,
+ &alias_address, &alias_port);
+/*
+ * We don't know in advance what TID the TFTP server will choose,
+ * so we create a wildcard link (destination port is unspecified)
+ * that will match any TID from a given destination.
+ */
+ else if (ntohs(ud->uh_dport) == TFTP_PORT_NUMBER)
+ FindRtspOut(pip->ip_src, pip->ip_dst,
+ ud->uh_sport, alias_port, IPPROTO_UDP);
+
+/* If UDP checksum is not zero, adjust since source port is */
+/* being aliased and source address is being altered */
+ if (ud->uh_sum != 0)
+ {
+ int accumulate;
+ u_short *sptr;
+
+ accumulate = ud->uh_sport;
+ accumulate -= alias_port;
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &alias_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ }
+
+/* Put alias port in UDP header */
+ ud->uh_sport = alias_port;
+
+/* Change source address */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+
+static int
+TcpAliasIn(struct ip *pip)
+{
+ struct tcphdr *tc;
+ struct alias_link *link;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ link = FindUdpTcpIn(pip->ip_src, pip->ip_dst,
+ tc->th_sport, tc->th_dport,
+ IPPROTO_TCP,
+ !(packetAliasMode & PKT_ALIAS_PROXY_ONLY));
+ if (link != NULL)
+ {
+ struct in_addr alias_address;
+ struct in_addr original_address;
+ struct in_addr proxy_address;
+ u_short alias_port;
+ u_short proxy_port;
+ int accumulate;
+ u_short *sptr;
+
+/* Special processing for IP encoding protocols */
+ if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER
+ || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER)
+ AliasHandlePptpIn(pip, link);
+
+ alias_address = GetAliasAddress(link);
+ original_address = GetOriginalAddress(link);
+ proxy_address = GetProxyAddress(link);
+ alias_port = tc->th_dport;
+ tc->th_dport = GetOriginalPort(link);
+ proxy_port = GetProxyPort(link);
+
+/* Adjust TCP checksum since destination port is being unaliased */
+/* and destination address is being altered. */
+ accumulate = alias_port;
+ accumulate -= tc->th_dport;
+ sptr = (u_short *) &alias_address;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+/* If this is a proxy, then modify the TCP source port and
+ checksum accumulation */
+ if (proxy_port != 0)
+ {
+ accumulate += tc->th_sport;
+ tc->th_sport = proxy_port;
+ accumulate -= tc->th_sport;
+
+ sptr = (u_short *) &pip->ip_src;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &proxy_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ }
+
+/* See if ACK number needs to be modified */
+ if (GetAckModified(link) == 1)
+ {
+ int delta;
+
+ delta = GetDeltaAckIn(pip, link);
+ if (delta != 0)
+ {
+ sptr = (u_short *) &tc->th_ack;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ tc->th_ack = htonl(ntohl(tc->th_ack) - delta);
+ sptr = (u_short *) &tc->th_ack;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ }
+ }
+
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+/* Restore original IP address */
+ sptr = (u_short *) &pip->ip_dst;
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ pip->ip_dst = original_address;
+ sptr = (u_short *) &pip->ip_dst;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+/* If this is a transparent proxy packet, then modify the source
+ address */
+ if (proxy_address.s_addr != 0)
+ {
+ sptr = (u_short *) &pip->ip_src;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ pip->ip_src = proxy_address;
+ sptr = (u_short *) &pip->ip_src;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ }
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+
+/* Monitor TCP connection state */
+ TcpMonitorIn(pip, link);
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+static int
+TcpAliasOut(struct ip *pip, int maxpacketsize)
+{
+ int proxy_type;
+ u_short dest_port;
+ u_short proxy_server_port;
+ struct in_addr dest_address;
+ struct in_addr proxy_server_address;
+ struct tcphdr *tc;
+ struct alias_link *link;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ proxy_type = ProxyCheck(pip, &proxy_server_address, &proxy_server_port);
+
+ if (proxy_type == 0 && (packetAliasMode & PKT_ALIAS_PROXY_ONLY))
+ return PKT_ALIAS_OK;
+
+/* If this is a transparent proxy, save original destination,
+ then alter the destination and adjust checksums */
+ dest_port = tc->th_dport;
+ dest_address = pip->ip_dst;
+ if (proxy_type != 0)
+ {
+ int accumulate;
+ u_short *sptr;
+
+ accumulate = tc->th_dport;
+ tc->th_dport = proxy_server_port;
+ accumulate -= tc->th_dport;
+
+ sptr = (u_short *) &(pip->ip_dst);
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &proxy_server_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ sptr = (u_short *) &(pip->ip_dst);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ pip->ip_dst = proxy_server_address;
+ sptr = (u_short *) &(pip->ip_dst);
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+
+ link = FindUdpTcpOut(pip->ip_src, pip->ip_dst,
+ tc->th_sport, tc->th_dport,
+ IPPROTO_TCP, 1);
+ if (link !=NULL)
+ {
+ u_short alias_port;
+ struct in_addr alias_address;
+ int accumulate;
+ u_short *sptr;
+
+/* Save original destination address, if this is a proxy packet.
+ Also modify packet to include destination encoding. This may
+   change the size of the IP header. */
+ if (proxy_type != 0)
+ {
+ SetProxyPort(link, dest_port);
+ SetProxyAddress(link, dest_address);
+ ProxyModify(link, pip, maxpacketsize, proxy_type);
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ }
+
+/* Get alias address and port */
+ alias_port = GetAliasPort(link);
+ alias_address = GetAliasAddress(link);
+
+/* Monitor TCP connection state */
+ TcpMonitorOut(pip, link);
+
+/* Special processing for IP encoding protocols */
+ if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER
+ || ntohs(tc->th_sport) == FTP_CONTROL_PORT_NUMBER)
+ AliasHandleFtpOut(pip, link, maxpacketsize);
+ else if (ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_1
+ || ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_2)
+ AliasHandleIrcOut(pip, link, maxpacketsize);
+ else if (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1
+ || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_1
+ || ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2
+ || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_2)
+ AliasHandleRtspOut(pip, link, maxpacketsize);
+ else if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER
+ || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER)
+ AliasHandlePptpOut(pip, link);
+
+/* Adjust TCP checksum since source port is being aliased */
+/* and source address is being altered */
+ accumulate = tc->th_sport;
+ tc->th_sport = alias_port;
+ accumulate -= tc->th_sport;
+
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &alias_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+/* Modify sequence number if necessary */
+ if (GetAckModified(link) == 1)
+ {
+ int delta;
+
+ delta = GetDeltaSeqOut(pip, link);
+ if (delta != 0)
+ {
+ sptr = (u_short *) &tc->th_seq;
+ accumulate += *sptr++;
+ accumulate += *sptr;
+ tc->th_seq = htonl(ntohl(tc->th_seq) + delta);
+ sptr = (u_short *) &tc->th_seq;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ }
+ }
+
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+/* Change source address */
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ pip->ip_src = alias_address;
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_IGNORED);
+}
+
+
+
+
+/* Fragment Handling
+
+ FragmentIn()
+ FragmentOut()
+
+The packet aliasing module has a limited ability to handle IP
+fragments. If the ICMP, TCP or UDP header is in the first fragment
+received, then the ID number of the IP packet is saved, and other
+fragments are identified by their ID number and the IP address
+they were sent from. Pointers to unresolved fragments can also be
+saved and recalled when a header fragment is seen.
+*/
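+
+/*
+ * Illustrative sketch, not part of libalias: one way a caller might drive
+ * the fragment routines declared in alias.h.  Packet I/O and error
+ * handling are omitted and the names are hypothetical; compiled out.
+ */
+#if 0
+static void
+ExampleInboundPacket(char *pkt, int maxpacketsize)
+{
+    int status;
+    char *frag;
+
+    status = PacketAliasIn(pkt, maxpacketsize);
+    if (status == PKT_ALIAS_UNRESOLVED_FRAGMENT) {
+        /* Header fragment not seen yet: stash this one for later. */
+        PacketAliasSaveFragment(pkt);
+    } else if (status == PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+        /* De-alias and release any fragments saved earlier. */
+        while ((frag = PacketAliasGetFragment(pkt)) != NULL) {
+            PacketAliasFragmentIn(pkt, frag);
+            /* ... forward or free 'frag' ... */
+        }
+        /* ... forward 'pkt' ... */
+    } else if (status == PKT_ALIAS_OK) {
+        /* ... forward 'pkt' ... */
+    }
+}
+#endif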
+
+/* Local prototypes */
+static int FragmentIn(struct ip *);
+static int FragmentOut(struct ip *);
+
+
+static int
+FragmentIn(struct ip *pip)
+{
+ struct alias_link *link;
+
+ link = FindFragmentIn2(pip->ip_src, pip->ip_dst, pip->ip_id);
+ if (link != NULL)
+ {
+ struct in_addr original_address;
+
+ GetFragmentAddr(link, &original_address);
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_dst,
+ 2);
+ pip->ip_dst = original_address;
+
+ return(PKT_ALIAS_OK);
+ }
+ return(PKT_ALIAS_UNRESOLVED_FRAGMENT);
+}
+
+
+static int
+FragmentOut(struct ip *pip)
+{
+ struct in_addr alias_address;
+
+ alias_address = FindAliasAddress(pip->ip_src);
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &alias_address,
+ (u_short *) &pip->ip_src,
+ 2);
+ pip->ip_src = alias_address;
+
+ return(PKT_ALIAS_OK);
+}
+
+
+
+
+
+
+/* Outside World Access
+
+ PacketAliasSaveFragment()
+ PacketAliasGetFragment()
+ PacketAliasFragmentIn()
+ PacketAliasIn()
+ PacketAliasOut()
+ PacketUnaliasOut()
+
+(prototypes in alias.h)
+*/
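+
+/*
+ * Illustrative sketch, not part of libalias: minimal use of the public
+ * entry points for outgoing traffic.  The aliasing address and buffer
+ * handling are hypothetical; compiled out.
+ */
+#if 0
+static void
+ExampleSetup(void)
+{
+    struct in_addr alias_addr;
+
+    PacketAliasInit();
+    inet_aton("203.0.113.1", &alias_addr);  /* example public address */
+    PacketAliasSetAddress(alias_addr);
+}
+
+static void
+ExampleOutboundPacket(char *pkt, int buflen)
+{
+    /* buflen bounds how much FTP/IRC payload rewriting may grow the packet */
+    if (PacketAliasOut(pkt, buflen) == PKT_ALIAS_OK) {
+        /* ... transmit 'pkt' ... */
+    }
+}
+#endif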
+
+
+int
+PacketAliasSaveFragment(char *ptr)
+{
+ int iresult;
+ struct alias_link *link;
+ struct ip *pip;
+
+ pip = (struct ip *) ptr;
+ link = AddFragmentPtrLink(pip->ip_src, pip->ip_id);
+ iresult = PKT_ALIAS_ERROR;
+ if (link != NULL)
+ {
+ SetFragmentPtr(link, ptr);
+ iresult = PKT_ALIAS_OK;
+ }
+ return(iresult);
+}
+
+
+char *
+PacketAliasGetFragment(char *ptr)
+{
+ struct alias_link *link;
+ char *fptr;
+ struct ip *pip;
+
+ pip = (struct ip *) ptr;
+ link = FindFragmentPtr(pip->ip_src, pip->ip_id);
+ if (link != NULL)
+ {
+ GetFragmentPtr(link, &fptr);
+ SetFragmentPtr(link, NULL);
+ SetExpire(link, 0); /* Deletes link */
+
+ return(fptr);
+ }
+ else
+ {
+ return(NULL);
+ }
+}
+
+
+void
+PacketAliasFragmentIn(char *ptr, /* Points to correctly de-aliased
+ header fragment */
+ char *ptr_fragment /* Points to fragment which must
+ be de-aliased */
+ )
+{
+ struct ip *pip;
+ struct ip *fpip;
+
+ pip = (struct ip *) ptr;
+ fpip = (struct ip *) ptr_fragment;
+
+ DifferentialChecksum(&fpip->ip_sum,
+ (u_short *) &pip->ip_dst,
+ (u_short *) &fpip->ip_dst,
+ 2);
+ fpip->ip_dst = pip->ip_dst;
+}
+
+
+int
+PacketAliasIn(char *ptr, int maxpacketsize)
+{
+ struct in_addr alias_addr;
+ struct ip *pip;
+ int iresult;
+
+ if (packetAliasMode & PKT_ALIAS_REVERSE) {
+ packetAliasMode &= ~PKT_ALIAS_REVERSE;
+ iresult = PacketAliasOut(ptr, maxpacketsize);
+ packetAliasMode |= PKT_ALIAS_REVERSE;
+ return iresult;
+ }
+
+ HouseKeeping();
+ ClearCheckNewLink();
+ pip = (struct ip *) ptr;
+ alias_addr = pip->ip_dst;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl<<2) > maxpacketsize)
+ return PKT_ALIAS_IGNORED;
+
+ iresult = PKT_ALIAS_IGNORED;
+ if ( (ntohs(pip->ip_off) & IP_OFFMASK) == 0 )
+ {
+ switch (pip->ip_p)
+ {
+ case IPPROTO_ICMP:
+ iresult = IcmpAliasIn(pip);
+ break;
+ case IPPROTO_UDP:
+ iresult = UdpAliasIn(pip);
+ break;
+ case IPPROTO_TCP:
+ iresult = TcpAliasIn(pip);
+ break;
+ case IPPROTO_GRE:
+ if (packetAliasMode & PKT_ALIAS_PROXY_ONLY ||
+ AliasHandlePptpGreIn(pip) == 0)
+ iresult = PKT_ALIAS_OK;
+ else
+ iresult = ProtoAliasIn(pip);
+ break;
+ default:
+ iresult = ProtoAliasIn(pip);
+ break;
+ }
+
+ if (ntohs(pip->ip_off) & IP_MF)
+ {
+ struct alias_link *link;
+
+ link = FindFragmentIn1(pip->ip_src, alias_addr, pip->ip_id);
+ if (link != NULL)
+ {
+ iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT;
+ SetFragmentAddr(link, pip->ip_dst);
+ }
+ else
+ {
+ iresult = PKT_ALIAS_ERROR;
+ }
+ }
+ }
+ else
+ {
+ iresult = FragmentIn(pip);
+ }
+
+ return(iresult);
+}
+
+
+
+/* Unregistered address ranges */
+
+/* 10.0.0.0 -> 10.255.255.255 */
+#define UNREG_ADDR_A_LOWER 0x0a000000
+#define UNREG_ADDR_A_UPPER 0x0affffff
+
+/* 172.16.0.0 -> 172.31.255.255 */
+#define UNREG_ADDR_B_LOWER 0xac100000
+#define UNREG_ADDR_B_UPPER 0xac1fffff
+
+/* 192.168.0.0 -> 192.168.255.255 */
+#define UNREG_ADDR_C_LOWER 0xc0a80000
+#define UNREG_ADDR_C_UPPER 0xc0a8ffff
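+
+/*
+ * Illustrative note: the bounds above are simply the RFC 1918 private
+ * blocks written as 32-bit host-order values, e.g.
+ * 192.168.0.0 = (192 << 24) | (168 << 16) = 0xc0a80000.
+ */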
+
+int
+PacketAliasOut(char *ptr, /* valid IP packet */
+ int maxpacketsize /* How much the packet data may grow
+ (FTP and IRC inline changes) */
+ )
+{
+ int iresult;
+ struct in_addr addr_save;
+ struct ip *pip;
+
+ if (packetAliasMode & PKT_ALIAS_REVERSE) {
+ packetAliasMode &= ~PKT_ALIAS_REVERSE;
+ iresult = PacketAliasIn(ptr, maxpacketsize);
+ packetAliasMode |= PKT_ALIAS_REVERSE;
+ return iresult;
+ }
+
+ HouseKeeping();
+ ClearCheckNewLink();
+ pip = (struct ip *) ptr;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl<<2) > maxpacketsize)
+ return PKT_ALIAS_IGNORED;
+
+ addr_save = GetDefaultAliasAddress();
+ if (packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY)
+ {
+ u_long addr;
+ int iclass;
+
+ iclass = 0;
+ addr = ntohl(pip->ip_src.s_addr);
+ if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER)
+ iclass = 3;
+ else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER)
+ iclass = 2;
+ else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER)
+ iclass = 1;
+
+ if (iclass == 0)
+ {
+ SetDefaultAliasAddress(pip->ip_src);
+ }
+ }
+
+ iresult = PKT_ALIAS_IGNORED;
+ if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0)
+ {
+ switch (pip->ip_p)
+ {
+ case IPPROTO_ICMP:
+ iresult = IcmpAliasOut(pip);
+ break;
+ case IPPROTO_UDP:
+ iresult = UdpAliasOut(pip);
+ break;
+ case IPPROTO_TCP:
+ iresult = TcpAliasOut(pip, maxpacketsize);
+ break;
+ case IPPROTO_GRE:
+ if (AliasHandlePptpGreOut(pip) == 0)
+ iresult = PKT_ALIAS_OK;
+ else
+ iresult = ProtoAliasOut(pip);
+ break;
+ default:
+ iresult = ProtoAliasOut(pip);
+ break;
+ }
+ }
+ else
+ {
+ iresult = FragmentOut(pip);
+ }
+
+ SetDefaultAliasAddress(addr_save);
+ return(iresult);
+}
+
+int
+PacketUnaliasOut(char *ptr, /* valid IP packet */
+ int maxpacketsize /* for error checking */
+ )
+{
+ struct ip *pip;
+ struct icmp *ic;
+ struct udphdr *ud;
+ struct tcphdr *tc;
+ struct alias_link *link;
+ int iresult = PKT_ALIAS_IGNORED;
+
+ pip = (struct ip *) ptr;
+
+ /* Defense against mangled packets */
+ if (ntohs(pip->ip_len) > maxpacketsize
+ || (pip->ip_hl<<2) > maxpacketsize)
+ return(iresult);
+
+ ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2));
+ tc = (struct tcphdr *) ud;
+ ic = (struct icmp *) ud;
+
+ /* Find a link */
+ if (pip->ip_p == IPPROTO_UDP)
+ link = FindUdpTcpIn(pip->ip_dst, pip->ip_src,
+ ud->uh_dport, ud->uh_sport,
+ IPPROTO_UDP, 0);
+ else if (pip->ip_p == IPPROTO_TCP)
+ link = FindUdpTcpIn(pip->ip_dst, pip->ip_src,
+ tc->th_dport, tc->th_sport,
+ IPPROTO_TCP, 0);
+ else if (pip->ip_p == IPPROTO_ICMP)
+ link = FindIcmpIn(pip->ip_dst, pip->ip_src, ic->icmp_id, 0);
+ else
+ link = NULL;
+
+ /* Change it from an aliased packet to an unaliased packet */
+ if (link != NULL)
+ {
+ if (pip->ip_p == IPPROTO_UDP || pip->ip_p == IPPROTO_TCP)
+ {
+ u_short *sptr;
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_port;
+
+ original_address = GetOriginalAddress(link);
+ original_port = GetOriginalPort(link);
+
+ /* Adjust TCP/UDP checksum */
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+
+ if (pip->ip_p == IPPROTO_UDP) {
+ accumulate += ud->uh_sport;
+ accumulate -= original_port;
+ ADJUST_CHECKSUM(accumulate, ud->uh_sum);
+ } else {
+ accumulate += tc->th_sport;
+ accumulate -= original_port;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+ }
+
+ /* Adjust IP checksum */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_src,
+ 2);
+
+ /* Un-alias source address and port number */
+ pip->ip_src = original_address;
+ if (pip->ip_p == IPPROTO_UDP)
+ ud->uh_sport = original_port;
+ else
+ tc->th_sport = original_port;
+
+ iresult = PKT_ALIAS_OK;
+
+ } else if (pip->ip_p == IPPROTO_ICMP) {
+
+ u_short *sptr;
+ int accumulate;
+ struct in_addr original_address;
+ u_short original_id;
+
+ original_address = GetOriginalAddress(link);
+ original_id = GetOriginalPort(link);
+
+ /* Adjust ICMP checksum */
+ sptr = (u_short *) &(pip->ip_src);
+ accumulate = *sptr++;
+ accumulate += *sptr;
+ sptr = (u_short *) &original_address;
+ accumulate -= *sptr++;
+ accumulate -= *sptr;
+ accumulate += ic->icmp_id;
+ accumulate -= original_id;
+ ADJUST_CHECKSUM(accumulate, ic->icmp_cksum);
+
+ /* Adjust IP checksum */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *) &original_address,
+ (u_short *) &pip->ip_src,
+ 2);
+
+ /* Un-alias source address and port number */
+ pip->ip_src = original_address;
+ ic->icmp_id = original_id;
+
+ iresult = PKT_ALIAS_OK;
+ }
+ }
+ return(iresult);
+
+}
diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h
new file mode 100644
index 0000000..9df8929
--- /dev/null
+++ b/sys/netinet/libalias/alias.h
@@ -0,0 +1,182 @@
+/* lint -save -library Flexelint comment for external headers */
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * Alias.h defines the outside world interfaces for the packet aliasing
+ * software.
+ *
+ * This software is placed into the public domain with no restrictions on its
+ * distribution.
+ */
+
+#ifndef _ALIAS_H_
+#define _ALIAS_H_
+
+/* The external interface to libalias, the packet aliasing engine. */
+
+/* Initialization and control functions. */
+void PacketAliasInit(void);
+void PacketAliasSetAddress(struct in_addr _addr);
+void PacketAliasSetFWBase(unsigned int _base, unsigned int _num);
+unsigned int
+ PacketAliasSetMode(unsigned int _flags, unsigned int _mask);
+void PacketAliasUninit(void);
+
+/* Packet Handling functions. */
+int PacketAliasIn(char *_ptr, int _maxpacketsize);
+int PacketAliasOut(char *_ptr, int _maxpacketsize);
+int PacketUnaliasOut(char *_ptr, int _maxpacketsize);
+
+/* Port and address redirection functions. */
+
+/*
+ * An anonymous structure, a pointer to which is returned from
+ * PacketAliasRedirectAddr(), PacketAliasRedirectPort() or
+ * PacketAliasRedirectProto(), passed to PacketAliasAddServer(),
+ * and freed by PacketAliasRedirectDelete().
+ */
+struct alias_link;
+
+int PacketAliasAddServer(struct alias_link *_link,
+ struct in_addr _addr, unsigned short _port);
+struct alias_link *
+ PacketAliasRedirectAddr(struct in_addr _src_addr,
+ struct in_addr _alias_addr);
+void PacketAliasRedirectDelete(struct alias_link *_link);
+struct alias_link *
+ PacketAliasRedirectPort(struct in_addr _src_addr,
+ unsigned short _src_port, struct in_addr _dst_addr,
+ unsigned short _dst_port, struct in_addr _alias_addr,
+ unsigned short _alias_port, unsigned char _proto);
+struct alias_link *
+ PacketAliasRedirectProto(struct in_addr _src_addr,
+ struct in_addr _dst_addr, struct in_addr _alias_addr,
+ unsigned char _proto);
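+
+/*
+ * Usage example (illustration only, not part of the original header):
+ * redirect inbound TCP connections arriving on the aliasing address at
+ * port 8080 to an internal host at 10.0.0.5:80.  This assumes the usual
+ * natd-style convention that _src_addr/_src_port name the internal
+ * machine, _dst_addr/_dst_port name the remote machine (zero meaning
+ * "any"), and that ports are given in network byte order.  Here
+ * 203.0.113.1 stands for the address previously given to
+ * PacketAliasSetAddress():
+ *
+ *	struct in_addr local, any, alias;
+ *	struct alias_link *lnk;
+ *
+ *	inet_aton("10.0.0.5", &local);
+ *	any.s_addr = INADDR_ANY;
+ *	inet_aton("203.0.113.1", &alias);
+ *	lnk = PacketAliasRedirectPort(local, htons(80), any, 0,
+ *	    alias, htons(8080), IPPROTO_TCP);
+ */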
+
+/* Fragment Handling functions. */
+void PacketAliasFragmentIn(char *_ptr, char *_ptr_fragment);
+char *PacketAliasGetFragment(char *_ptr);
+int PacketAliasSaveFragment(char *_ptr);
+
+/* Miscellaneous functions. */
+int PacketAliasCheckNewLink(void);
+unsigned short
+ PacketAliasInternetChecksum(unsigned short *_ptr, int _nbytes);
+void PacketAliasSetTarget(struct in_addr _target_addr);
+
+/* Transparent proxying routines. */
+int PacketAliasProxyRule(const char *_cmd);
+
+/* Mode flags, set using PacketAliasSetMode() */
+
+/*
+ * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log
+ * every time a link is created or deleted. This is useful for debugging.
+ */
+#define PKT_ALIAS_LOG 0x01
+
+/*
+ * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp,
+ * telnet or web servers) will be prevented by the aliasing mechanism.
+ */
+#define PKT_ALIAS_DENY_INCOMING 0x02
+
+/*
+ * If PKT_ALIAS_SAME_PORTS is set, the engine will attempt to send packets from
+ * the same port they originated on. This allows e.g. rsh to work *99% of the
+ * time*, but _not_ 100% (it will be slightly flaky instead of not working
+ * at all). This mode bit is set by PacketAliasInit(), so it is a default
+ * mode of operation.
+ */
+#define PKT_ALIAS_SAME_PORTS 0x04
+
+/*
+ * If PKT_ALIAS_USE_SOCKETS is set, then for partially specified links (e.g.
+ * destination port and/or address is zero), the packet aliasing engine will
+ * attempt to allocate a socket for the aliasing port it chooses. This will
+ * avoid interference with the host machine. Fully specified links do not
+ * require this. This bit is set after a call to PacketAliasInit(), so it is
+ * a default mode of operation.
+ */
+#define PKT_ALIAS_USE_SOCKETS 0x08
+
+/*-
+ * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with
+ * unregistered source addresses will be aliased. Unregistered (RFC 1918
+ * private) addresses are those in the following ranges:
+ *
+ * 10.0.0.0 -> 10.255.255.255
+ * 172.16.0.0 -> 172.31.255.255
+ * 192.168.0.0 -> 192.168.255.255
+ */
+#define PKT_ALIAS_UNREGISTERED_ONLY 0x10
+
+/*
+ * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic
+ * aliasing links will be reset whenever PacketAliasSetAddress() changes the
+ * default aliasing address. If the default aliasing address is left
+ * unchanged by this function call, then the table of dynamic aliasing links
+ * will be left intact. This bit is set after a call to PacketAliasInit().
+ */
+#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20
+
+#ifndef NO_FW_PUNCH
+/*
+ * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will
+ * create a 'hole' in the firewall to allow the transfers to work. The
+ * ipfw rule number that the hole is created with is controlled by
+ * PacketAliasSetFWBase(). The hole will be attached to that
+ * particular alias_link, so when the link goes away the hole is deleted.
+ */
+#define PKT_ALIAS_PUNCH_FW 0x100
+#endif
+
+/*
+ * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only
+ * transparent proxying is performed.
+ */
+#define PKT_ALIAS_PROXY_ONLY 0x40
+
+/*
+ * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and
+ * PacketAliasOut() are reversed.
+ */
+#define PKT_ALIAS_REVERSE 0x80
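+
+/*
+ * Usage example (illustration only): PacketAliasSetMode() changes only the
+ * bits selected by its mask argument, so flags can be set or cleared
+ * individually without disturbing the others.
+ *
+ * Refuse unsolicited incoming connections:
+ *	PacketAliasSetMode(PKT_ALIAS_DENY_INCOMING, PKT_ALIAS_DENY_INCOMING);
+ *
+ * Turn off the default PKT_ALIAS_USE_SOCKETS behaviour:
+ *	PacketAliasSetMode(0, PKT_ALIAS_USE_SOCKETS);
+ */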
+
+/* Function return codes. */
+#define PKT_ALIAS_ERROR -1
+#define PKT_ALIAS_OK 1
+#define PKT_ALIAS_IGNORED 2
+#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3
+#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4
+
+#endif /* !_ALIAS_H_ */
+
+/* lint -restore */
diff --git a/sys/netinet/libalias/alias_cuseeme.c b/sys/netinet/libalias/alias_cuseeme.c
new file mode 100644
index 0000000..2c0587e
--- /dev/null
+++ b/sys/netinet/libalias/alias_cuseeme.c
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org>
+ * with the aid of code written by
+ * Junichi SATOH <junichi@astec.co.jp> 1996, 1997.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+
+#include "alias_local.h"
+
+/* CU-SeeMe Data Header */
+struct cu_header {
+ u_int16_t dest_family;
+ u_int16_t dest_port;
+ u_int32_t dest_addr;
+ int16_t family;
+ u_int16_t port;
+ u_int32_t addr;
+ u_int32_t seq;
+ u_int16_t msg;
+ u_int16_t data_type;
+ u_int16_t packet_len;
+};
+
+/* Open Continue Header */
+struct oc_header {
+ u_int16_t client_count; /* Number of client info structs */
+ u_int32_t seq_no;
+ char user_name[20];
+ char reserved[4]; /* flags, version stuff, etc */
+};
+
+/* client info structures */
+struct client_info {
+ u_int32_t address; /* Client address */
+ char reserved[8]; /* Flags, pruning bitfield, packet counts etc */
+};
+
+void
+AliasHandleCUSeeMeOut(struct ip *pip, struct alias_link *link)
+{
+ struct udphdr *ud;
+
+ ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2));
+ if (ntohs(ud->uh_ulen) - sizeof(struct udphdr) >= sizeof(struct cu_header)) {
+ struct cu_header *cu;
+ struct alias_link *cu_link;
+
+ cu = (struct cu_header *)(ud + 1);
+ if (cu->addr)
+ cu->addr = (u_int32_t)GetAliasAddress(link).s_addr;
+
+ cu_link = FindUdpTcpOut(pip->ip_src, GetDestAddress(link),
+ ud->uh_dport, 0, IPPROTO_UDP, 1);
+
+#ifndef NO_FW_PUNCH
+ if (cu_link)
+ PunchFWHole(cu_link);
+#endif
+ }
+}
+
+void
+AliasHandleCUSeeMeIn(struct ip *pip, struct in_addr original_addr)
+{
+ struct in_addr alias_addr;
+ struct udphdr *ud;
+ struct cu_header *cu;
+ struct oc_header *oc;
+ struct client_info *ci;
+ char *end;
+ int i;
+
+ alias_addr.s_addr = pip->ip_dst.s_addr;
+ ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2));
+ cu = (struct cu_header *)(ud + 1);
+ oc = (struct oc_header *)(cu + 1);
+ ci = (struct client_info *)(oc + 1);
+ end = (char *)ud + ntohs(ud->uh_ulen);
+
+ if ((char *)oc <= end) {
+ if(cu->dest_addr)
+ cu->dest_addr = (u_int32_t)original_addr.s_addr;
+ if(ntohs(cu->data_type) == 101)
+ /* Find and change our address */
+ for(i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++)
+ if(ci->address == (u_int32_t)alias_addr.s_addr) {
+ ci->address = (u_int32_t)original_addr.s_addr;
+ break;
+ }
+ }
+}
diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c
new file mode 100644
index 0000000..52384b3
--- /dev/null
+++ b/sys/netinet/libalias/alias_db.c
@@ -0,0 +1,2812 @@
+/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */
+
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_db.c encapsulates all data structures used for storing
+ packet aliasing data. Other parts of the aliasing software
+ access data through functions provided in this file.
+
+ Data storage is based on the notion of a "link", which is
+ established for ICMP echo/reply packets, UDP datagrams and
+ TCP stream connections. A link stores the original source
+ and destination addresses. For UDP and TCP, it also stores
+ source and destination port numbers, as well as an alias
+ port number. Links are also used to store information about
+ fragments.
+
+ There is a facility for sweeping through and deleting old
+ links as new packets are sent through. A simple timeout is
+ used for ICMP and UDP links. TCP links are left alone unless
+ there is an incomplete connection, in which case the link
+ can be deleted after a certain amount of time.
+
+
+ Initial version: August, 1996 (cjm)
+
+ Version 1.4: September 16, 1996 (cjm)
+ Facility for handling incoming links added.
+
+ Version 1.6: September 18, 1996 (cjm)
+ ICMP data handling simplified.
+
+ Version 1.7: January 9, 1997 (cjm)
+ Fragment handling simplified.
+ Saves pointers for unresolved fragments.
+ Permits links for unspecified remote ports
+ or unspecified remote addresses.
+ Fixed bug which did not properly zero port
+ table entries after a link was deleted.
+ Cleaned up some obsolete comments.
+
+ Version 1.8: January 14, 1997 (cjm)
+ Fixed data type error in StartPoint().
+ (This error did not exist prior to v1.7
+ and was discovered and fixed by Ari Suutari)
+
+ Version 1.9: February 1, 1997
+ Optionally, connections initiated from the packet aliasing host
+ machine will not have their port number aliased unless it
+ conflicts with an aliasing port already being used. (cjm)
+
+ All options that used to be #ifdef'ed are now available through
+ a new interface, SetPacketAliasMode(). This allows run time
+ control (which is now available in PPP+pktAlias through the
+ 'alias' keyword). (ee)
+
+ Added ability to create an alias port without
+ either destination address or port specified.
+ port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee)
+
+ Removed K&R style function headers
+ and general cleanup. (ee)
+
+ Added packetAliasMode to replace compiler #defines (ee)
+
+ Allocates sockets for partially specified
+ ports if ALIAS_USE_SOCKETS defined. (cjm)
+
+ Version 2.0: March, 1997
+ SetAliasAddress() will now clean up alias links
+ if the aliasing address is changed. (cjm)
+
+ PacketAliasPermanentLink() function added to support permanent
+ links. (J. Fortes suggested the need for this.)
+ Examples:
+
+ (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port
+
+ (192.168.0.2, port 21) <-> alias port 3604, known dest addr
+ unknown dest port
+
+ These permanent links allow for incoming connections to
+ machines on the local network. They can be given with a
+ user-chosen amount of specificity, with increasing specificity
+ meaning more security. (cjm)
+
+ Quite a bit of rework to the basic engine. The portTable[]
+ array, which kept track of which ports were in use, was replaced
+ by a table/linked list structure. (cjm)
+
+ SetExpire() function added. (cjm)
+
+ DeleteLink() no longer frees memory associated with a pointer
+ to a fragment (this bug was first recognized by E. Eklund in
+ v1.9).
+
+ Version 2.1: May, 1997 (cjm)
+ Packet aliasing engine reworked so that it can handle
+ multiple external addresses rather than just a single
+ host address.
+
+ PacketAliasRedirectPort() and PacketAliasRedirectAddr()
+ added to the API. The first function is a more generalized
+ version of PacketAliasPermanentLink(). The second function
+ implements static network address translation.
+
+ Version 3.2: July, 2000 (salander and satoh)
+ Added FindNewPortGroup to get contiguous range of port values.
+
+ Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing
+ link but not actually add one.
+
+ Added FindRtspOut, which is closely derived from FindUdpTcpOut,
+ except that the alias port (from FindNewPortGroup) is provided
+ as input.
+
+ See HISTORY file for additional revisions.
+*/
+
+
+/* System include files */
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+/* BSD network include files */
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "alias.h"
+#include "alias_local.h"
+
+
+
+/*
+ Constants (note: constants are also defined
+ near relevant functions or structs)
+*/
+
+/* Sizes of input and output link tables */
+#define LINK_TABLE_OUT_SIZE 101
+#define LINK_TABLE_IN_SIZE 4001
+
+/* Parameters used for cleanup of expired links */
+#define ALIAS_CLEANUP_INTERVAL_SECS 60
+#define ALIAS_CLEANUP_MAX_SPOKES 30
+
+/* Timeouts (in seconds) for different link types */
+#define ICMP_EXPIRE_TIME 60
+#define UDP_EXPIRE_TIME 60
+#define PROTO_EXPIRE_TIME 60
+#define FRAGMENT_ID_EXPIRE_TIME 10
+#define FRAGMENT_PTR_EXPIRE_TIME 30
+
+/* TCP link expire time for different cases */
+/* When the link has been used and closed - minimal grace time to
+ allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */
+#ifndef TCP_EXPIRE_DEAD
+# define TCP_EXPIRE_DEAD 10
+#endif
+
+/* When the link has been used and closed on one side - the other side
+ is allowed to still send data */
+#ifndef TCP_EXPIRE_SINGLEDEAD
+# define TCP_EXPIRE_SINGLEDEAD 90
+#endif
+
+/* When the link isn't yet up */
+#ifndef TCP_EXPIRE_INITIAL
+# define TCP_EXPIRE_INITIAL 300
+#endif
+
+/* When the link is up */
+#ifndef TCP_EXPIRE_CONNECTED
+# define TCP_EXPIRE_CONNECTED 86400
+#endif
+
+
+/* Dummy port number codes used for FindLinkIn/Out() and AddLink().
+ These constants can be anything except zero, which indicates an
+ unknown port number. */
+
+#define NO_DEST_PORT 1
+#define NO_SRC_PORT 1
+
+
+
+/* Data Structures
+
+ The fundamental data structure used in this program is
+ "struct alias_link". Whenever a TCP connection is made,
+ a UDP datagram is sent out, or an ICMP echo request is made,
+ a link record is made (if it has not already been created).
+ The link record is identified by the source address/port
+ and the destination address/port. In the case of an ICMP
+ echo request, the source port is treated as being equivalent
+ to the 16-bit ID number of the ICMP packet.
+
+ The link record also can store some auxiliary data. For
+ TCP connections that have had sequence and acknowledgment
+ modifications, data space is available to track these changes.
+ A state field is used to keep track of changes to the TCP
+ connection state. ID numbers of fragments can also be
+ stored in the auxiliary space. Pointers to unresolved
+ fragments can also be stored.
+
+ The link records support two independent chainings. Lookup
+ tables for input and output packets hold the initial pointers
+ to the link chains. On input, the lookup table indexes on alias
+ port and link type. On output, the lookup table indexes on
+ source address, destination address, source port, destination
+ port and link type.
+*/
+
+struct ack_data_record /* used to save changes to ACK/sequence numbers */
+{
+ u_long ack_old;
+ u_long ack_new;
+ int delta;
+ int active;
+};
+
+struct tcp_state /* Information about TCP connection */
+{
+ int in; /* State for outside -> inside */
+ int out; /* State for inside -> outside */
+ int index; /* Index to ACK data array */
+ int ack_modified; /* Indicates whether ACK and sequence numbers */
+ /* have been modified */
+};
+
+#define N_LINK_TCP_DATA 3 /* Number of distinct ACK number changes
+ saved for a modified TCP stream */
+struct tcp_dat
+{
+ struct tcp_state state;
+ struct ack_data_record ack[N_LINK_TCP_DATA];
+ int fwhole; /* Which firewall record is used for this hole? */
+};
+
+struct server /* LSNAT server pool (circular list) */
+{
+ struct in_addr addr;
+ u_short port;
+ struct server *next;
+};
+
+struct alias_link /* Main data structure */
+{
+ struct in_addr src_addr; /* Address and port information */
+ struct in_addr dst_addr;
+ struct in_addr alias_addr;
+ struct in_addr proxy_addr;
+ u_short src_port;
+ u_short dst_port;
+ u_short alias_port;
+ u_short proxy_port;
+ struct server *server;
+
+ int link_type; /* Type of link: TCP, UDP, ICMP, proto, frag */
+
+/* values for link_type */
+#define LINK_ICMP IPPROTO_ICMP
+#define LINK_UDP IPPROTO_UDP
+#define LINK_TCP IPPROTO_TCP
+#define LINK_FRAGMENT_ID (IPPROTO_MAX + 1)
+#define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2)
+#define LINK_ADDR (IPPROTO_MAX + 3)
+#define LINK_PPTP (IPPROTO_MAX + 4)
+
+ int flags; /* indicates special characteristics */
+ int pflags; /* protocol-specific flags */
+
+/* flag bits */
+#define LINK_UNKNOWN_DEST_PORT 0x01
+#define LINK_UNKNOWN_DEST_ADDR 0x02
+#define LINK_PERMANENT 0x04
+#define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */
+#define LINK_UNFIREWALLED 0x08
+
+ int timestamp; /* Time link was last accessed */
+ int expire_time; /* Expire time for link */
+
+ int sockfd; /* socket descriptor */
+
+ LIST_ENTRY(alias_link) list_out; /* Linked list of pointers for */
+ LIST_ENTRY(alias_link) list_in; /* input and output lookup tables */
+
+ union /* Auxiliary data */
+ {
+ char *frag_ptr;
+ struct in_addr frag_addr;
+ struct tcp_dat *tcp;
+ } data;
+};
+
+
+
+
+
+/* Global Variables
+
+ With the exception of packetAliasMode, the global variables
+ listed here are only accessed from within alias_db.c and so
+ are given the static designation.
+*/
+
+int packetAliasMode; /* Mode flags */
+ /* - documented in alias.h */
+
+static struct in_addr aliasAddress; /* Address written onto source */
+ /* field of IP packet. */
+
+static struct in_addr targetAddress; /* IP address incoming packets */
+ /* are sent to if no aliasing */
+ /* link already exists */
+
+static struct in_addr nullAddress; /* Used as a dummy parameter for */
+ /* some function calls */
+static LIST_HEAD(, alias_link)
+linkTableOut[LINK_TABLE_OUT_SIZE]; /* Lookup table of pointers to */
+ /* chains of link records. Each */
+static LIST_HEAD(, alias_link) /* link record is doubly indexed */
+linkTableIn[LINK_TABLE_IN_SIZE]; /* into input and output lookup */
+ /* tables. */
+
+static int icmpLinkCount; /* Link statistics */
+static int udpLinkCount;
+static int tcpLinkCount;
+static int pptpLinkCount;
+static int protoLinkCount;
+static int fragmentIdLinkCount;
+static int fragmentPtrLinkCount;
+static int sockCount;
+
+static int cleanupIndex; /* Index to chain of link table */
+ /* being inspected for old links */
+
+static int timeStamp; /* System time in seconds for */
+ /* current packet */
+
+static int lastCleanupTime; /* Last time IncrementalCleanup() */
+ /* was called */
+
+static int houseKeepingResidual; /* used by HouseKeeping() */
+
+static int deleteAllLinks; /* If equal to zero, DeleteLink() */
+ /* will not remove permanent links */
+
+static FILE *monitorFile; /* File descriptor for link */
+ /* statistics monitoring file */
+
+static int newDefaultLink; /* Indicates if a new aliasing */
+ /* link has been created after a */
+ /* call to PacketAliasIn/Out(). */
+
+#ifndef NO_FW_PUNCH
+static int fireWallFD = -1; /* File descriptor to be able to */
+ /* control firewall. Opened by */
+ /* PacketAliasSetMode on first */
+ /* setting the PKT_ALIAS_PUNCH_FW */
+ /* flag. */
+#endif
+
+
+
+
+
+
+
+/* Internal utility routines (used only in alias_db.c)
+
+Lookup table starting points:
+ StartPointIn() -- link table initial search point for
+ incoming packets
+ StartPointOut() -- link table initial search point for
+ outgoing packets
+
+Miscellaneous:
+ SeqDiff() -- difference between two TCP sequences
+ ShowAliasStats() -- send alias statistics to a monitor file
+*/
+
+
+/* Local prototypes */
+static u_int StartPointIn(struct in_addr, u_short, int);
+
+static u_int StartPointOut(struct in_addr, struct in_addr,
+ u_short, u_short, int);
+
+static int SeqDiff(u_long, u_long);
+
+static void ShowAliasStats(void);
+
+#ifndef NO_FW_PUNCH
+/* Firewall control */
+static void InitPunchFW(void);
+static void UninitPunchFW(void);
+static void ClearFWHole(struct alias_link *link);
+#endif
+
+/* Log file control */
+static void InitPacketAliasLog(void);
+static void UninitPacketAliasLog(void);
+
+static u_int
+StartPointIn(struct in_addr alias_addr,
+ u_short alias_port,
+ int link_type)
+{
+ u_int n;
+
+ n = alias_addr.s_addr;
+ if (link_type != LINK_PPTP)
+ n += alias_port;
+ n += link_type;
+ return(n % LINK_TABLE_IN_SIZE);
+}
+
+
+static u_int
+StartPointOut(struct in_addr src_addr, struct in_addr dst_addr,
+ u_short src_port, u_short dst_port, int link_type)
+{
+ u_int n;
+
+ n = src_addr.s_addr;
+ n += dst_addr.s_addr;
+ if (link_type != LINK_PPTP) {
+ n += src_port;
+ n += dst_port;
+ }
+ n += link_type;
+
+ return(n % LINK_TABLE_OUT_SIZE);
+}
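+
+/*
+ * Illustrative sketch, not part of libalias (compiled out): the two
+ * start-point functions hash on different keys, which is why every link
+ * is entered in both lookup tables.  Addresses and ports are hypothetical.
+ */
+#if 0
+static void
+ExampleStartPoints(void)
+{
+    struct in_addr src, dst, alias;
+    u_int in_bucket, out_bucket;
+
+    inet_aton("192.168.0.10", &src);        /* inside machine */
+    inet_aton("10.1.1.1", &dst);            /* remote machine */
+    inet_aton("198.51.100.1", &alias);      /* aliasing address */
+
+    /* Incoming packets are hashed on (alias addr, alias port, link type). */
+    in_bucket = StartPointIn(alias, htons(8000), LINK_TCP);
+
+    /* Outgoing packets are hashed on both endpoints as well. */
+    out_bucket = StartPointOut(src, dst, htons(1025), htons(80), LINK_TCP);
+
+    printf("in chain %u, out chain %u\n", in_bucket, out_bucket);
+}
+#endif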
+
+
+static int
+SeqDiff(u_long x, u_long y)
+{
+/* Return the difference between two TCP sequence numbers */
+
+/*
+ This function is encapsulated in case there are any unusual
+ arithmetic conditions that need to be considered.
+*/
+
+ return (ntohl(y) - ntohl(x));
+}
+
+
+static void
+ShowAliasStats(void)
+{
+/* Used for debugging */
+
+ if (monitorFile)
+ {
+ fprintf(monitorFile, "icmp=%d, udp=%d, tcp=%d, pptp=%d, proto=%d, frag_id=%d frag_ptr=%d",
+ icmpLinkCount,
+ udpLinkCount,
+ tcpLinkCount,
+ pptpLinkCount,
+ protoLinkCount,
+ fragmentIdLinkCount,
+ fragmentPtrLinkCount);
+
+ fprintf(monitorFile, " / tot=%d (sock=%d)\n",
+ icmpLinkCount + udpLinkCount
+ + tcpLinkCount
+ + pptpLinkCount
+ + protoLinkCount
+ + fragmentIdLinkCount
+ + fragmentPtrLinkCount,
+ sockCount);
+
+ fflush(monitorFile);
+ }
+}
+
+
+
+
+
+/* Internal routines for finding, deleting and adding links
+
+Port Allocation:
+ GetNewPort() -- find and reserve new alias port number
+ GetSocket() -- try to allocate a socket for a given port
+
+Link creation and deletion:
+ CleanupAliasData() - remove all link chains from lookup table
+ IncrementalCleanup() - look for stale links in a single chain
+ DeleteLink() - remove link
+ AddLink() - add link
+ ReLink() - change link
+
+Link search:
+ FindLinkOut() - find link for outgoing packets
+ FindLinkIn() - find link for incoming packets
+
+Port search:
+ FindNewPortGroup() - find an available group of ports
+*/
+
+/* Local prototypes */
+static int GetNewPort(struct alias_link *, int);
+
+static u_short GetSocket(u_short, int *, int);
+
+static void CleanupAliasData(void);
+
+static void IncrementalCleanup(void);
+
+static void DeleteLink(struct alias_link *);
+
+static struct alias_link *
+AddLink(struct in_addr, struct in_addr, struct in_addr,
+ u_short, u_short, int, int);
+
+static struct alias_link *
+ReLink(struct alias_link *,
+ struct in_addr, struct in_addr, struct in_addr,
+ u_short, u_short, int, int);
+
+static struct alias_link *
+FindLinkOut(struct in_addr, struct in_addr, u_short, u_short, int, int);
+
+static struct alias_link *
+FindLinkIn(struct in_addr, struct in_addr, u_short, u_short, int, int);
+
+
+#define ALIAS_PORT_BASE 0x08000
+#define ALIAS_PORT_MASK 0x07fff
+#define ALIAS_PORT_MASK_EVEN 0x07ffe
+#define GET_NEW_PORT_MAX_ATTEMPTS 20
+
+#define GET_ALIAS_PORT -1
+#define GET_ALIAS_ID GET_ALIAS_PORT
+
+#define FIND_EVEN_ALIAS_BASE 1
+
+/* GetNewPort() allocates port numbers. Note that if a port number
+ is already in use, that does not mean that it cannot be used by
+ another link concurrently. This is because GetNewPort() looks for
+ unused triplets: (dest addr, dest port, alias port). */
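+
+/* Illustration (hypothetical values): alias port 8000 may already be in
+   use by a link whose remote end is 10.1.1.1:80 and still be handed out
+   for a new link whose remote end is 10.2.2.2:80, because the triplets
+   (10.1.1.1, 80, 8000) and (10.2.2.2, 80, 8000) differ.  Incoming packets
+   remain unambiguous, since FindLinkIn() also matches on the remote
+   address and port. */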
+
+static int
+GetNewPort(struct alias_link *link, int alias_port_param)
+{
+ int i;
+ int max_trials;
+ u_short port_sys;
+ u_short port_net;
+
+/*
+ Description of alias_port_param for GetNewPort(). When
+ this parameter is zero or positive, it precisely specifies
+ the port number. GetNewPort() will return this number
+ without checking whether it is in use.
+
+ When this parameter is GET_ALIAS_PORT, it indicates to get a randomly
+ selected port number.
+*/
+
+ if (alias_port_param == GET_ALIAS_PORT)
+ {
+ /*
+ * The aliasing port is automatically selected
+ * by one of two methods below:
+ */
+ max_trials = GET_NEW_PORT_MAX_ATTEMPTS;
+
+ if (packetAliasMode & PKT_ALIAS_SAME_PORTS)
+ {
+ /*
+ * When the PKT_ALIAS_SAME_PORTS option is
+ * chosen, the first try will be the
+ * actual source port. If this is already
+ * in use, the remainder of the trials
+ * will be random.
+ */
+ port_net = link->src_port;
+ port_sys = ntohs(port_net);
+ }
+ else
+ {
+ /* First trial and all subsequent are random. */
+ port_sys = random() & ALIAS_PORT_MASK;
+ port_sys += ALIAS_PORT_BASE;
+ port_net = htons(port_sys);
+ }
+ }
+ else if (alias_port_param >= 0 && alias_port_param < 0x10000)
+ {
+ link->alias_port = (u_short) alias_port_param;
+ return(0);
+ }
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/GetNewPort(): ");
+ fprintf(stderr, "input parameter error\n");
+#endif
+ return(-1);
+ }
+
+
+/* Port number search */
+ for (i=0; i<max_trials; i++)
+ {
+ int go_ahead;
+ struct alias_link *search_result;
+
+ search_result = FindLinkIn(link->dst_addr, link->alias_addr,
+ link->dst_port, port_net,
+ link->link_type, 0);
+
+ if (search_result == NULL)
+ go_ahead = 1;
+ else if (!(link->flags & LINK_PARTIALLY_SPECIFIED)
+ && (search_result->flags & LINK_PARTIALLY_SPECIFIED))
+ go_ahead = 1;
+ else
+ go_ahead = 0;
+
+ if (go_ahead)
+ {
+ if ((packetAliasMode & PKT_ALIAS_USE_SOCKETS)
+ && (link->flags & LINK_PARTIALLY_SPECIFIED)
+ && ((link->link_type == LINK_TCP) ||
+ (link->link_type == LINK_UDP)))
+ {
+ if (GetSocket(port_net, &link->sockfd, link->link_type))
+ {
+ link->alias_port = port_net;
+ return(0);
+ }
+ }
+ else
+ {
+ link->alias_port = port_net;
+ return(0);
+ }
+ }
+
+ port_sys = random() & ALIAS_PORT_MASK;
+ port_sys += ALIAS_PORT_BASE;
+ port_net = htons(port_sys);
+ }
+
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/GetNewPort(): ");
+ fprintf(stderr, "could not find free port\n");
+#endif
+
+ return(-1);
+}
+
+
+static u_short
+GetSocket(u_short port_net, int *sockfd, int link_type)
+{
+ int err;
+ int sock;
+ struct sockaddr_in sock_addr;
+
+ if (link_type == LINK_TCP)
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+ else if (link_type == LINK_UDP)
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/GetSocket(): ");
+ fprintf(stderr, "incorrect link type\n");
+#endif
+ return(0);
+ }
+
+ if (sock < 0)
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/GetSocket(): ");
+ fprintf(stderr, "socket() error %d\n", errno);
+#endif
+ return(0);
+ }
+
+ sock_addr.sin_family = AF_INET;
+ sock_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ sock_addr.sin_port = port_net;
+
+ err = bind(sock,
+ (struct sockaddr *) &sock_addr,
+ sizeof(sock_addr));
+ if (err == 0)
+ {
+ sockCount++;
+ *sockfd = sock;
+ return(1);
+ }
+ else
+ {
+ close(sock);
+ return(0);
+ }
+}
+
+
+/* FindNewPortGroup() returns a base port number for an available
+ range of contiguous port numbers. Note that if a port number
+ is already in use, that does not mean that it cannot be used by
+ another link concurrently. This is because FindNewPortGroup()
+ looks for unused triplets: (dest addr, dest port, alias port). */
+
+int
+FindNewPortGroup(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ u_short port_count,
+ u_char proto,
+ u_char align)
+{
+ int i, j;
+ int max_trials;
+ u_short port_sys;
+ int link_type;
+
+ /*
+ * Get link_type from protocol
+ */
+
+ switch (proto)
+ {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return (0);
+ break;
+ }
+
+ /*
+ * The aliasing port is automatically selected
+ * by one of two methods below:
+ */
+ max_trials = GET_NEW_PORT_MAX_ATTEMPTS;
+
+ if (packetAliasMode & PKT_ALIAS_SAME_PORTS) {
+ /*
+ * When the ALIAS_SAME_PORTS option is
+ * chosen, the first try will be the
+ * actual source port. If this is already
+ * in use, the remainder of the trials
+ * will be random.
+ */
+ port_sys = ntohs(src_port);
+
+ } else {
+
+ /* First trial and all subsequent are random. */
+ if (align == FIND_EVEN_ALIAS_BASE)
+ port_sys = random() & ALIAS_PORT_MASK_EVEN;
+ else
+ port_sys = random() & ALIAS_PORT_MASK;
+
+ port_sys += ALIAS_PORT_BASE;
+ }
+
+/* Port number search */
+ for (i = 0; i < max_trials; i++) {
+
+ struct alias_link *search_result;
+
+ for (j = 0; j < port_count; j++)
+ if (0 != (search_result = FindLinkIn(dst_addr, alias_addr,
+ dst_port, htons(port_sys + j),
+ link_type, 0)))
+ break;
+
+ /* Found a good range, return base */
+ if (j == port_count)
+ return (htons(port_sys));
+
+ /* Find a new base to try */
+ if (align == FIND_EVEN_ALIAS_BASE)
+ port_sys = random() & ALIAS_PORT_MASK_EVEN;
+ else
+ port_sys = random() & ALIAS_PORT_MASK;
+
+ port_sys += ALIAS_PORT_BASE;
+ }
+
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/FindNewPortGroup(): ");
+ fprintf(stderr, "could not find free port(s)\n");
+#endif
+
+ return(0);
+}
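+
+/*
+ * Usage sketch (editor's note, not part of the library): a handler that
+ * needs two consecutive aliased ports with an even base -- for example an
+ * RTP/RTCP-style pair -- could call FindNewPortGroup() roughly like this
+ * (dst, alias and sport are hypothetical variables; the return value is
+ * the base port in network byte order, or 0 if no free range was found):
+ *
+ *     u_short base;
+ *
+ *     base = FindNewPortGroup(dst, alias, sport, 0, 2,
+ *                             IPPROTO_UDP, FIND_EVEN_ALIAS_BASE);
+ *     if (base == 0)
+ *             return (-1);    // no contiguous pair available
+ */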
+
+static void
+CleanupAliasData(void)
+{
+ struct alias_link *link;
+ int i, icount;
+
+ icount = 0;
+ for (i=0; i<LINK_TABLE_OUT_SIZE; i++)
+ {
+ link = LIST_FIRST(&linkTableOut[i]);
+ while (link != NULL)
+ {
+ struct alias_link *link_next;
+ link_next = LIST_NEXT(link, list_out);
+ icount++;
+ DeleteLink(link);
+ link = link_next;
+ }
+ }
+
+    cleanupIndex = 0;
+}
+
+
+static void
+IncrementalCleanup(void)
+{
+ int icount;
+ struct alias_link *link;
+
+ icount = 0;
+ link = LIST_FIRST(&linkTableOut[cleanupIndex++]);
+ while (link != NULL)
+ {
+ int idelta;
+ struct alias_link *link_next;
+
+ link_next = LIST_NEXT(link, list_out);
+ idelta = timeStamp - link->timestamp;
+ switch (link->link_type)
+ {
+ case LINK_TCP:
+ if (idelta > link->expire_time)
+ {
+ struct tcp_dat *tcp_aux;
+
+ tcp_aux = link->data.tcp;
+ if (tcp_aux->state.in != ALIAS_TCP_STATE_CONNECTED
+ || tcp_aux->state.out != ALIAS_TCP_STATE_CONNECTED)
+ {
+ DeleteLink(link);
+ icount++;
+ }
+ }
+ break;
+ default:
+ if (idelta > link->expire_time)
+ {
+ DeleteLink(link);
+ icount++;
+ }
+ break;
+ }
+ link = link_next;
+ }
+
+ if (cleanupIndex == LINK_TABLE_OUT_SIZE)
+ cleanupIndex = 0;
+}
+
+static void
+DeleteLink(struct alias_link *link)
+{
+
+/* Don't do anything if the link is marked permanent */
+ if (deleteAllLinks == 0 && link->flags & LINK_PERMANENT)
+ return;
+
+#ifndef NO_FW_PUNCH
+/* Delete associated firewall hole, if any */
+ ClearFWHole(link);
+#endif
+
+/* Free memory allocated for LSNAT server pool */
+ if (link->server != NULL) {
+ struct server *head, *curr, *next;
+
+ head = curr = link->server;
+ do {
+ next = curr->next;
+ free(curr);
+ } while ((curr = next) != head);
+ }
+
+/* Adjust output table pointers */
+ LIST_REMOVE(link, list_out);
+
+/* Adjust input table pointers */
+ LIST_REMOVE(link, list_in);
+
+/* Close socket, if one has been allocated */
+ if (link->sockfd != -1)
+ {
+ sockCount--;
+ close(link->sockfd);
+ }
+
+/* Link-type dependent cleanup */
+ switch(link->link_type)
+ {
+ case LINK_ICMP:
+ icmpLinkCount--;
+ break;
+ case LINK_UDP:
+ udpLinkCount--;
+ break;
+ case LINK_TCP:
+ tcpLinkCount--;
+ free(link->data.tcp);
+ break;
+ case LINK_PPTP:
+ pptpLinkCount--;
+ break;
+ case LINK_FRAGMENT_ID:
+ fragmentIdLinkCount--;
+ break;
+ case LINK_FRAGMENT_PTR:
+ fragmentPtrLinkCount--;
+ if (link->data.frag_ptr != NULL)
+ free(link->data.frag_ptr);
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ protoLinkCount--;
+ break;
+ }
+
+/* Free memory */
+ free(link);
+
+/* Write statistics, if logging enabled */
+ if (packetAliasMode & PKT_ALIAS_LOG)
+ {
+ ShowAliasStats();
+ }
+}
+
+
+static struct alias_link *
+AddLink(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ int alias_port_param, /* if less than zero, alias */
+ int link_type) /* port will be automatically */
+{ /* chosen. If greater than */
+ u_int start_point; /* zero, equal to alias port */
+ struct alias_link *link;
+
+ link = malloc(sizeof(struct alias_link));
+ if (link != NULL)
+ {
+ /* Basic initialization */
+ link->src_addr = src_addr;
+ link->dst_addr = dst_addr;
+ link->alias_addr = alias_addr;
+ link->proxy_addr.s_addr = INADDR_ANY;
+ link->src_port = src_port;
+ link->dst_port = dst_port;
+ link->proxy_port = 0;
+ link->server = NULL;
+ link->link_type = link_type;
+ link->sockfd = -1;
+ link->flags = 0;
+ link->pflags = 0;
+ link->timestamp = timeStamp;
+
+ /* Expiration time */
+ switch (link_type)
+ {
+ case LINK_ICMP:
+ link->expire_time = ICMP_EXPIRE_TIME;
+ break;
+ case LINK_UDP:
+ link->expire_time = UDP_EXPIRE_TIME;
+ break;
+ case LINK_TCP:
+ link->expire_time = TCP_EXPIRE_INITIAL;
+ break;
+ case LINK_PPTP:
+ link->flags |= LINK_PERMANENT; /* no timeout. */
+ break;
+ case LINK_FRAGMENT_ID:
+ link->expire_time = FRAGMENT_ID_EXPIRE_TIME;
+ break;
+ case LINK_FRAGMENT_PTR:
+ link->expire_time = FRAGMENT_PTR_EXPIRE_TIME;
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ link->expire_time = PROTO_EXPIRE_TIME;
+ break;
+ }
+
+ /* Determine alias flags */
+ if (dst_addr.s_addr == INADDR_ANY)
+ link->flags |= LINK_UNKNOWN_DEST_ADDR;
+ if (dst_port == 0)
+ link->flags |= LINK_UNKNOWN_DEST_PORT;
+
+ /* Determine alias port */
+ if (GetNewPort(link, alias_port_param) != 0)
+ {
+ free(link);
+ return(NULL);
+ }
+
+ /* Link-type dependent initialization */
+ switch(link_type)
+ {
+ struct tcp_dat *aux_tcp;
+
+ case LINK_ICMP:
+ icmpLinkCount++;
+ break;
+ case LINK_UDP:
+ udpLinkCount++;
+ break;
+ case LINK_TCP:
+ aux_tcp = malloc(sizeof(struct tcp_dat));
+ if (aux_tcp != NULL)
+ {
+ int i;
+
+ tcpLinkCount++;
+ aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED;
+ aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED;
+ aux_tcp->state.index = 0;
+ aux_tcp->state.ack_modified = 0;
+ for (i=0; i<N_LINK_TCP_DATA; i++)
+ aux_tcp->ack[i].active = 0;
+ aux_tcp->fwhole = -1;
+ link->data.tcp = aux_tcp;
+ }
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/AddLink: ");
+ fprintf(stderr, " cannot allocate auxiliary TCP data\n");
+#endif
+ free(link);
+ return (NULL);
+ }
+ break;
+ case LINK_PPTP:
+ pptpLinkCount++;
+ break;
+ case LINK_FRAGMENT_ID:
+ fragmentIdLinkCount++;
+ break;
+ case LINK_FRAGMENT_PTR:
+ fragmentPtrLinkCount++;
+ break;
+ case LINK_ADDR:
+ break;
+ default:
+ protoLinkCount++;
+ break;
+ }
+
+ /* Set up pointers for output lookup table */
+ start_point = StartPointOut(src_addr, dst_addr,
+ src_port, dst_port, link_type);
+ LIST_INSERT_HEAD(&linkTableOut[start_point], link, list_out);
+
+ /* Set up pointers for input lookup table */
+ start_point = StartPointIn(alias_addr, link->alias_port, link_type);
+ LIST_INSERT_HEAD(&linkTableIn[start_point], link, list_in);
+ }
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/AddLink(): ");
+ fprintf(stderr, "malloc() call failed.\n");
+#endif
+ }
+
+ if (packetAliasMode & PKT_ALIAS_LOG)
+ {
+ ShowAliasStats();
+ }
+
+ return(link);
+}
+
+static struct alias_link *
+ReLink(struct alias_link *old_link,
+ struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short src_port,
+ u_short dst_port,
+ int alias_port_param, /* if less than zero, alias */
+ int link_type) /* port will be automatically */
+{ /* chosen. If greater than */
+ struct alias_link *new_link; /* zero, equal to alias port */
+
+ new_link = AddLink(src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port_param,
+ link_type);
+#ifndef NO_FW_PUNCH
+ if (new_link != NULL &&
+ old_link->link_type == LINK_TCP &&
+ old_link->data.tcp->fwhole > 0) {
+ PunchFWHole(new_link);
+ }
+#endif
+ DeleteLink(old_link);
+ return new_link;
+}
+
+static struct alias_link *
+_FindLinkOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ int link_type,
+ int replace_partial_links)
+{
+ u_int i;
+ struct alias_link *link;
+
+ i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type);
+ LIST_FOREACH(link, &linkTableOut[i], list_out)
+ {
+ if (link->src_addr.s_addr == src_addr.s_addr
+ && link->server == NULL
+ && link->dst_addr.s_addr == dst_addr.s_addr
+ && link->dst_port == dst_port
+ && link->src_port == src_port
+ && link->link_type == link_type)
+ {
+ link->timestamp = timeStamp;
+ break;
+ }
+ }
+
+/* Search for partially specified links. */
+ if (link == NULL && replace_partial_links)
+ {
+ if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY)
+ {
+ link = _FindLinkOut(src_addr, dst_addr, src_port, 0,
+ link_type, 0);
+ if (link == NULL)
+ link = _FindLinkOut(src_addr, nullAddress, src_port,
+ dst_port, link_type, 0);
+ }
+ if (link == NULL &&
+ (dst_port != 0 || dst_addr.s_addr != INADDR_ANY))
+ {
+ link = _FindLinkOut(src_addr, nullAddress, src_port, 0,
+ link_type, 0);
+ }
+ if (link != NULL)
+ {
+ link = ReLink(link,
+ src_addr, dst_addr, link->alias_addr,
+ src_port, dst_port, link->alias_port,
+ link_type);
+ }
+ }
+
+ return(link);
+}
+
+static struct alias_link *
+FindLinkOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ int link_type,
+ int replace_partial_links)
+{
+ struct alias_link *link;
+
+ link = _FindLinkOut(src_addr, dst_addr, src_port, dst_port,
+ link_type, replace_partial_links);
+
+ if (link == NULL)
+ {
+ /* The following allows permanent links to be
+ specified as using the default source address
+ (i.e. device interface address) without knowing
+ in advance what that address is. */
+ if (aliasAddress.s_addr != 0 &&
+ src_addr.s_addr == aliasAddress.s_addr)
+ {
+ link = _FindLinkOut(nullAddress, dst_addr, src_port, dst_port,
+ link_type, replace_partial_links);
+ }
+ }
+
+ return(link);
+}
+
+
+static struct alias_link *
+_FindLinkIn(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ int link_type,
+ int replace_partial_links)
+{
+ int flags_in;
+ u_int start_point;
+ struct alias_link *link;
+ struct alias_link *link_fully_specified;
+ struct alias_link *link_unknown_all;
+ struct alias_link *link_unknown_dst_addr;
+ struct alias_link *link_unknown_dst_port;
+
+/* Initialize pointers */
+ link_fully_specified = NULL;
+ link_unknown_all = NULL;
+ link_unknown_dst_addr = NULL;
+ link_unknown_dst_port = NULL;
+
+/* If either the dest addr or port is unknown, the search
+ loop will have to know about this. */
+
+ flags_in = 0;
+ if (dst_addr.s_addr == INADDR_ANY)
+ flags_in |= LINK_UNKNOWN_DEST_ADDR;
+ if (dst_port == 0)
+ flags_in |= LINK_UNKNOWN_DEST_PORT;
+
+/* Search loop */
+ start_point = StartPointIn(alias_addr, alias_port, link_type);
+ LIST_FOREACH(link, &linkTableIn[start_point], list_in)
+ {
+ int flags;
+
+ flags = flags_in | link->flags;
+ if (!(flags & LINK_PARTIALLY_SPECIFIED))
+ {
+ if (link->alias_addr.s_addr == alias_addr.s_addr
+ && link->alias_port == alias_port
+ && link->dst_addr.s_addr == dst_addr.s_addr
+ && link->dst_port == dst_port
+ && link->link_type == link_type)
+ {
+ link_fully_specified = link;
+ break;
+ }
+ }
+ else if ((flags & LINK_UNKNOWN_DEST_ADDR)
+ && (flags & LINK_UNKNOWN_DEST_PORT))
+ {
+ if (link->alias_addr.s_addr == alias_addr.s_addr
+ && link->alias_port == alias_port
+ && link->link_type == link_type)
+ {
+ if (link_unknown_all == NULL)
+ link_unknown_all = link;
+ }
+ }
+ else if (flags & LINK_UNKNOWN_DEST_ADDR)
+ {
+ if (link->alias_addr.s_addr == alias_addr.s_addr
+ && link->alias_port == alias_port
+ && link->link_type == link_type
+ && link->dst_port == dst_port)
+ {
+ if (link_unknown_dst_addr == NULL)
+ link_unknown_dst_addr = link;
+ }
+ }
+ else if (flags & LINK_UNKNOWN_DEST_PORT)
+ {
+ if (link->alias_addr.s_addr == alias_addr.s_addr
+ && link->alias_port == alias_port
+ && link->link_type == link_type
+ && link->dst_addr.s_addr == dst_addr.s_addr)
+ {
+ if (link_unknown_dst_port == NULL)
+ link_unknown_dst_port = link;
+ }
+ }
+ }
+
+
+
+ if (link_fully_specified != NULL)
+ {
+ link_fully_specified->timestamp = timeStamp;
+ link = link_fully_specified;
+ }
+ else if (link_unknown_dst_port != NULL)
+ link = link_unknown_dst_port;
+ else if (link_unknown_dst_addr != NULL)
+ link = link_unknown_dst_addr;
+ else if (link_unknown_all != NULL)
+ link = link_unknown_all;
+ else
+ return (NULL);
+
+ if (replace_partial_links &&
+ (link->flags & LINK_PARTIALLY_SPECIFIED || link->server != NULL))
+ {
+ struct in_addr src_addr;
+ u_short src_port;
+
+ if (link->server != NULL) { /* LSNAT link */
+ src_addr = link->server->addr;
+ src_port = link->server->port;
+ link->server = link->server->next;
+ } else {
+ src_addr = link->src_addr;
+ src_port = link->src_port;
+ }
+
+ link = ReLink(link,
+ src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port,
+ link_type);
+ }
+
+ return (link);
+}
+
+static struct alias_link *
+FindLinkIn(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ int link_type,
+ int replace_partial_links)
+{
+ struct alias_link *link;
+
+ link = _FindLinkIn(dst_addr, alias_addr, dst_port, alias_port,
+ link_type, replace_partial_links);
+
+ if (link == NULL)
+ {
+ /* The following allows permanent links to be
+ specified as using the default aliasing address
+ (i.e. device interface address) without knowing
+ in advance what that address is. */
+ if (aliasAddress.s_addr != 0 &&
+ alias_addr.s_addr == aliasAddress.s_addr)
+ {
+ link = _FindLinkIn(dst_addr, nullAddress, dst_port, alias_port,
+ link_type, replace_partial_links);
+ }
+ }
+
+ return(link);
+}
+
+
+
+
+/* External routines for finding/adding links
+
+-- "external" means outside alias_db.c, but within alias*.c --
+
+ FindIcmpIn(), FindIcmpOut()
+ FindFragmentIn1(), FindFragmentIn2()
+ AddFragmentPtrLink(), FindFragmentPtr()
+ FindProtoIn(), FindProtoOut()
+ FindUdpTcpIn(), FindUdpTcpOut()
+ AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(),
+ FindPptpOutByPeerCallId(), FindPptpInByPeerCallId()
+ FindOriginalAddress(), FindAliasAddress()
+
+(prototypes in alias_local.h)
+*/
+
+
+struct alias_link *
+FindIcmpIn(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short id_alias,
+ int create)
+{
+ struct alias_link *link;
+
+ link = FindLinkIn(dst_addr, alias_addr,
+ NO_DEST_PORT, id_alias,
+ LINK_ICMP, 0);
+ if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING))
+ {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(alias_addr);
+ link = AddLink(target_addr, dst_addr, alias_addr,
+ id_alias, NO_DEST_PORT, id_alias,
+ LINK_ICMP);
+ }
+
+ return (link);
+}
+
+
+struct alias_link *
+FindIcmpOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short id,
+ int create)
+{
+ struct alias_link * link;
+
+ link = FindLinkOut(src_addr, dst_addr,
+ id, NO_DEST_PORT,
+ LINK_ICMP, 0);
+ if (link == NULL && create)
+ {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(src_addr);
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ id, NO_DEST_PORT, GET_ALIAS_ID,
+ LINK_ICMP);
+ }
+
+ return(link);
+}
+
+
+struct alias_link *
+FindFragmentIn1(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short ip_id)
+{
+ struct alias_link *link;
+
+ link = FindLinkIn(dst_addr, alias_addr,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID, 0);
+
+ if (link == NULL)
+ {
+ link = AddLink(nullAddress, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID);
+ }
+
+ return(link);
+}
+
+
+struct alias_link *
+FindFragmentIn2(struct in_addr dst_addr, /* Doesn't add a link if one */
+ struct in_addr alias_addr, /* is not found. */
+ u_short ip_id)
+{
+ return FindLinkIn(dst_addr, alias_addr,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_ID, 0);
+}
+
+
+struct alias_link *
+AddFragmentPtrLink(struct in_addr dst_addr,
+ u_short ip_id)
+{
+ return AddLink(nullAddress, dst_addr, nullAddress,
+ NO_SRC_PORT, NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_PTR);
+}
+
+
+struct alias_link *
+FindFragmentPtr(struct in_addr dst_addr,
+ u_short ip_id)
+{
+ return FindLinkIn(dst_addr, nullAddress,
+ NO_DEST_PORT, ip_id,
+ LINK_FRAGMENT_PTR, 0);
+}
+
+
+struct alias_link *
+FindProtoIn(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_char proto)
+{
+ struct alias_link *link;
+
+ link = FindLinkIn(dst_addr, alias_addr,
+ NO_DEST_PORT, 0,
+ proto, 1);
+
+ if (link == NULL && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING))
+ {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(alias_addr);
+ link = AddLink(target_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+ }
+
+ return (link);
+}
+
+
+struct alias_link *
+FindProtoOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_char proto)
+{
+ struct alias_link *link;
+
+ link = FindLinkOut(src_addr, dst_addr,
+ NO_SRC_PORT, NO_DEST_PORT,
+ proto, 1);
+
+ if (link == NULL)
+ {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(src_addr);
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+ }
+
+ return (link);
+}
+
+
+struct alias_link *
+FindUdpTcpIn(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_short dst_port,
+ u_short alias_port,
+ u_char proto,
+ int create)
+{
+ int link_type;
+ struct alias_link *link;
+
+ switch (proto)
+ {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return NULL;
+ break;
+ }
+
+ link = FindLinkIn(dst_addr, alias_addr,
+ dst_port, alias_port,
+ link_type, create);
+
+ if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING))
+ {
+ struct in_addr target_addr;
+
+ target_addr = FindOriginalAddress(alias_addr);
+ link = AddLink(target_addr, dst_addr, alias_addr,
+ alias_port, dst_port, alias_port,
+ link_type);
+ }
+
+ return(link);
+}
+
+
+struct alias_link *
+FindUdpTcpOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short dst_port,
+ u_char proto,
+ int create)
+{
+ int link_type;
+ struct alias_link *link;
+
+ switch (proto)
+ {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return NULL;
+ break;
+ }
+
+ link = FindLinkOut(src_addr, dst_addr, src_port, dst_port, link_type, create);
+
+ if (link == NULL && create)
+ {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(src_addr);
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ src_port, dst_port, GET_ALIAS_PORT,
+ link_type);
+ }
+
+ return(link);
+}
+
+
+struct alias_link *
+AddPptp(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t src_call_id)
+{
+ struct alias_link *link;
+
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ src_call_id, 0, GET_ALIAS_PORT,
+ LINK_PPTP);
+
+ return (link);
+}
+
+
+struct alias_link *
+FindPptpOutByCallId(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_int16_t src_call_id)
+{
+ u_int i;
+ struct alias_link *link;
+
+ i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP);
+ LIST_FOREACH(link, &linkTableOut[i], list_out)
+ if (link->link_type == LINK_PPTP &&
+ link->src_addr.s_addr == src_addr.s_addr &&
+ link->dst_addr.s_addr == dst_addr.s_addr &&
+ link->src_port == src_call_id)
+ break;
+
+ return (link);
+}
+
+
+struct alias_link *
+FindPptpOutByPeerCallId(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_int16_t dst_call_id)
+{
+ u_int i;
+ struct alias_link *link;
+
+ i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP);
+ LIST_FOREACH(link, &linkTableOut[i], list_out)
+ if (link->link_type == LINK_PPTP &&
+ link->src_addr.s_addr == src_addr.s_addr &&
+ link->dst_addr.s_addr == dst_addr.s_addr &&
+ link->dst_port == dst_call_id)
+ break;
+
+ return (link);
+}
+
+
+struct alias_link *
+FindPptpInByCallId(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t dst_call_id)
+{
+ u_int i;
+ struct alias_link *link;
+
+ i = StartPointIn(alias_addr, 0, LINK_PPTP);
+ LIST_FOREACH(link, &linkTableIn[i], list_in)
+ if (link->link_type == LINK_PPTP &&
+ link->dst_addr.s_addr == dst_addr.s_addr &&
+ link->alias_addr.s_addr == alias_addr.s_addr &&
+ link->dst_port == dst_call_id)
+ break;
+
+ return (link);
+}
+
+
+struct alias_link *
+FindPptpInByPeerCallId(struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_int16_t alias_call_id)
+{
+ struct alias_link *link;
+
+ link = FindLinkIn(dst_addr, alias_addr,
+ 0/* any */, alias_call_id,
+ LINK_PPTP, 0);
+
+
+ return (link);
+}
+
+
+struct alias_link *
+FindRtspOut(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ u_short src_port,
+ u_short alias_port,
+ u_char proto)
+{
+ int link_type;
+ struct alias_link *link;
+
+ switch (proto)
+ {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+ return NULL;
+ break;
+ }
+
+ link = FindLinkOut(src_addr, dst_addr, src_port, 0, link_type, 1);
+
+ if (link == NULL)
+ {
+ struct in_addr alias_addr;
+
+ alias_addr = FindAliasAddress(src_addr);
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ src_port, 0, alias_port,
+ link_type);
+ }
+
+ return(link);
+}
+
+
+struct in_addr
+FindOriginalAddress(struct in_addr alias_addr)
+{
+ struct alias_link *link;
+
+ link = FindLinkIn(nullAddress, alias_addr,
+ 0, 0, LINK_ADDR, 0);
+ if (link == NULL)
+ {
+ newDefaultLink = 1;
+ if (targetAddress.s_addr == INADDR_ANY)
+ return alias_addr;
+ else if (targetAddress.s_addr == INADDR_NONE)
+ return aliasAddress;
+ else
+ return targetAddress;
+ }
+ else
+ {
+ if (link->server != NULL) { /* LSNAT link */
+ struct in_addr src_addr;
+
+ src_addr = link->server->addr;
+ link->server = link->server->next;
+ return (src_addr);
+ } else if (link->src_addr.s_addr == INADDR_ANY)
+ return aliasAddress;
+ else
+ return link->src_addr;
+ }
+}
+
+
+struct in_addr
+FindAliasAddress(struct in_addr original_addr)
+{
+ struct alias_link *link;
+
+ link = FindLinkOut(original_addr, nullAddress,
+ 0, 0, LINK_ADDR, 0);
+ if (link == NULL)
+ {
+ return aliasAddress;
+ }
+ else
+ {
+ if (link->alias_addr.s_addr == INADDR_ANY)
+ return aliasAddress;
+ else
+ return link->alias_addr;
+ }
+}
+
+
+/* External routines for getting or changing link data
+ (external to alias_db.c, but internal to alias*.c)
+
+ SetFragmentData(), GetFragmentData()
+ SetFragmentPtr(), GetFragmentPtr()
+ SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut()
+ GetOriginalAddress(), GetDestAddress(), GetAliasAddress()
+ GetOriginalPort(), GetAliasPort()
+ SetAckModified(), GetAckModified()
+ GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq()
+ SetProtocolFlags(), GetProtocolFlags()
+ SetDestCallId()
+*/
+
+
+void
+SetFragmentAddr(struct alias_link *link, struct in_addr src_addr)
+{
+ link->data.frag_addr = src_addr;
+}
+
+
+void
+GetFragmentAddr(struct alias_link *link, struct in_addr *src_addr)
+{
+ *src_addr = link->data.frag_addr;
+}
+
+
+void
+SetFragmentPtr(struct alias_link *link, char *fptr)
+{
+ link->data.frag_ptr = fptr;
+}
+
+
+void
+GetFragmentPtr(struct alias_link *link, char **fptr)
+{
+ *fptr = link->data.frag_ptr;
+}
+
+
+void
+SetStateIn(struct alias_link *link, int state)
+{
+ /* TCP input state */
+ switch (state) {
+ case ALIAS_TCP_STATE_DISCONNECTED:
+ if (link->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED)
+ link->expire_time = TCP_EXPIRE_DEAD;
+ else
+ link->expire_time = TCP_EXPIRE_SINGLEDEAD;
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (link->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED)
+ link->expire_time = TCP_EXPIRE_CONNECTED;
+ break;
+ default:
+ abort();
+ }
+ link->data.tcp->state.in = state;
+}
+
+
+void
+SetStateOut(struct alias_link *link, int state)
+{
+ /* TCP output state */
+ switch (state) {
+ case ALIAS_TCP_STATE_DISCONNECTED:
+ if (link->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED)
+ link->expire_time = TCP_EXPIRE_DEAD;
+ else
+ link->expire_time = TCP_EXPIRE_SINGLEDEAD;
+ break;
+ case ALIAS_TCP_STATE_CONNECTED:
+ if (link->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED)
+ link->expire_time = TCP_EXPIRE_CONNECTED;
+ break;
+ default:
+ abort();
+ }
+ link->data.tcp->state.out = state;
+}
+
+
+int
+GetStateIn(struct alias_link *link)
+{
+ /* TCP input state */
+ return link->data.tcp->state.in;
+}
+
+
+int
+GetStateOut(struct alias_link *link)
+{
+ /* TCP output state */
+ return link->data.tcp->state.out;
+}
+
+
+struct in_addr
+GetOriginalAddress(struct alias_link *link)
+{
+ if (link->src_addr.s_addr == INADDR_ANY)
+ return aliasAddress;
+ else
+ return(link->src_addr);
+}
+
+
+struct in_addr
+GetDestAddress(struct alias_link *link)
+{
+ return(link->dst_addr);
+}
+
+
+struct in_addr
+GetAliasAddress(struct alias_link *link)
+{
+ if (link->alias_addr.s_addr == INADDR_ANY)
+ return aliasAddress;
+ else
+ return link->alias_addr;
+}
+
+
+struct in_addr
+GetDefaultAliasAddress()
+{
+ return aliasAddress;
+}
+
+
+void
+SetDefaultAliasAddress(struct in_addr alias_addr)
+{
+ aliasAddress = alias_addr;
+}
+
+
+u_short
+GetOriginalPort(struct alias_link *link)
+{
+ return(link->src_port);
+}
+
+
+u_short
+GetAliasPort(struct alias_link *link)
+{
+ return(link->alias_port);
+}
+
+#ifndef NO_FW_PUNCH
+static u_short
+GetDestPort(struct alias_link *link)
+{
+ return(link->dst_port);
+}
+#endif
+
+void
+SetAckModified(struct alias_link *link)
+{
+/* Indicate that ACK numbers have been modified in a TCP connection */
+ link->data.tcp->state.ack_modified = 1;
+}
+
+
+struct in_addr
+GetProxyAddress(struct alias_link *link)
+{
+ return link->proxy_addr;
+}
+
+
+void
+SetProxyAddress(struct alias_link *link, struct in_addr addr)
+{
+ link->proxy_addr = addr;
+}
+
+
+u_short
+GetProxyPort(struct alias_link *link)
+{
+ return link->proxy_port;
+}
+
+
+void
+SetProxyPort(struct alias_link *link, u_short port)
+{
+ link->proxy_port = port;
+}
+
+
+int
+GetAckModified(struct alias_link *link)
+{
+/* See if ACK numbers have been modified */
+ return link->data.tcp->state.ack_modified;
+}
+
+
+int
+GetDeltaAckIn(struct ip *pip, struct alias_link *link)
+{
+/*
+Find out how much the ACK number has been altered for an incoming
+TCP packet. To do this, a circular list of ACK numbers where the TCP
+packet size was altered is searched.
+*/
+
+ int i;
+ struct tcphdr *tc;
+ int delta, ack_diff_min;
+ u_long ack;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ ack = tc->th_ack;
+
+ delta = 0;
+ ack_diff_min = -1;
+ for (i=0; i<N_LINK_TCP_DATA; i++)
+ {
+ struct ack_data_record x;
+
+ x = link->data.tcp->ack[i];
+ if (x.active == 1)
+ {
+ int ack_diff;
+
+ ack_diff = SeqDiff(x.ack_new, ack);
+ if (ack_diff >= 0)
+ {
+ if (ack_diff_min >= 0)
+ {
+ if (ack_diff < ack_diff_min)
+ {
+ delta = x.delta;
+ ack_diff_min = ack_diff;
+ }
+ }
+ else
+ {
+ delta = x.delta;
+ ack_diff_min = ack_diff;
+ }
+ }
+ }
+ }
+ return (delta);
+}
+
+
+int
+GetDeltaSeqOut(struct ip *pip, struct alias_link *link)
+{
+/*
+Find out how much the sequence number has been altered for an outgoing
+TCP packet. To do this, a circular list of ACK numbers where the TCP
+packet size was altered is searched.
+*/
+
+ int i;
+ struct tcphdr *tc;
+ int delta, seq_diff_min;
+ u_long seq;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ seq = tc->th_seq;
+
+ delta = 0;
+ seq_diff_min = -1;
+ for (i=0; i<N_LINK_TCP_DATA; i++)
+ {
+ struct ack_data_record x;
+
+ x = link->data.tcp->ack[i];
+ if (x.active == 1)
+ {
+ int seq_diff;
+
+ seq_diff = SeqDiff(x.ack_old, seq);
+ if (seq_diff >= 0)
+ {
+ if (seq_diff_min >= 0)
+ {
+ if (seq_diff < seq_diff_min)
+ {
+ delta = x.delta;
+ seq_diff_min = seq_diff;
+ }
+ }
+ else
+ {
+ delta = x.delta;
+ seq_diff_min = seq_diff;
+ }
+ }
+ }
+ }
+ return (delta);
+}
+
+
+void
+AddSeq(struct ip *pip, struct alias_link *link, int delta)
+{
+/*
+When a TCP packet has been altered in length, save this
+information in a circular list. If enough packets have
+been altered, then this list will begin to overwrite itself.
+*/
+
+ struct tcphdr *tc;
+ struct ack_data_record x;
+ int hlen, tlen, dlen;
+ int i;
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ x.ack_old = htonl(ntohl(tc->th_seq) + dlen);
+ x.ack_new = htonl(ntohl(tc->th_seq) + dlen + delta);
+ x.delta = delta;
+ x.active = 1;
+
+ i = link->data.tcp->state.index;
+ link->data.tcp->ack[i] = x;
+
+ i++;
+ if (i == N_LINK_TCP_DATA)
+ link->data.tcp->state.index = 0;
+ else
+ link->data.tcp->state.index = i;
+}
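+
+/*
+ * Worked example (editor's note): if an outgoing message of 28 bytes
+ * starting at sequence number 1000 is rewritten to 33 bytes, AddSeq() is
+ * called with delta = 5 and records ack_old = 1028, ack_new = 1033,
+ * delta = 5.  Afterwards GetDeltaSeqOut() returns 5 for outgoing packets
+ * whose sequence number has reached 1028, and GetDeltaAckIn() returns 5
+ * for incoming packets acknowledging 1033 or beyond; the TCP aliasing
+ * code (in alias.c, not in this file) uses these deltas to shift the
+ * sequence and acknowledgment numbers so that neither endpoint notices
+ * the length change.
+ */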
+
+void
+SetExpire(struct alias_link *link, int expire)
+{
+ if (expire == 0)
+ {
+ link->flags &= ~LINK_PERMANENT;
+ DeleteLink(link);
+ }
+ else if (expire == -1)
+ {
+ link->flags |= LINK_PERMANENT;
+ }
+ else if (expire > 0)
+ {
+ link->expire_time = expire;
+ }
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/SetExpire(): ");
+ fprintf(stderr, "error in expire parameter\n");
+#endif
+ }
+}
+
+void
+ClearCheckNewLink(void)
+{
+ newDefaultLink = 0;
+}
+
+void
+SetProtocolFlags(struct alias_link *link, int pflags)
+{
+
+    link->pflags = pflags;
+}
+
+int
+GetProtocolFlags(struct alias_link *link)
+{
+
+ return (link->pflags);
+}
+
+void
+SetDestCallId(struct alias_link *link, u_int16_t cid)
+{
+
+ deleteAllLinks = 1;
+ link = ReLink(link, link->src_addr, link->dst_addr, link->alias_addr,
+ link->src_port, cid, link->alias_port, link->link_type);
+ deleteAllLinks = 0;
+}
+
+
+/* Miscellaneous Functions
+
+ HouseKeeping()
+ InitPacketAliasLog()
+ UninitPacketAliasLog()
+*/
+
+/*
+ Whenever an outgoing or incoming packet is handled, HouseKeeping()
+ is called to find and remove timed-out aliasing links. Logic exists
+ to sweep through the entire table and linked list structure
+ every 60 seconds.
+
+ (prototype in alias_local.h)
+*/
+
+void
+HouseKeeping(void)
+{
+ int i, n, n100;
+ struct timeval tv;
+ struct timezone tz;
+
+ /*
+ * Save system time (seconds) in global variable timeStamp for
+ * use by other functions. This is done so as not to unnecessarily
+     * waste time by making repeated system calls.
+ */
+ gettimeofday(&tv, &tz);
+ timeStamp = tv.tv_sec;
+
+ /* Compute number of spokes (output table link chains) to cover */
+ n100 = LINK_TABLE_OUT_SIZE * 100 + houseKeepingResidual;
+ n100 *= timeStamp - lastCleanupTime;
+ n100 /= ALIAS_CLEANUP_INTERVAL_SECS;
+
+ n = n100/100;
+
+ /* Handle different cases */
+ if (n > ALIAS_CLEANUP_MAX_SPOKES)
+ {
+ n = ALIAS_CLEANUP_MAX_SPOKES;
+ lastCleanupTime = timeStamp;
+ houseKeepingResidual = 0;
+
+ for (i=0; i<n; i++)
+ IncrementalCleanup();
+ }
+ else if (n > 0)
+ {
+ lastCleanupTime = timeStamp;
+ houseKeepingResidual = n100 - 100*n;
+
+ for (i=0; i<n; i++)
+ IncrementalCleanup();
+ }
+ else if (n < 0)
+ {
+#ifdef DEBUG
+ fprintf(stderr, "PacketAlias/HouseKeeping(): ");
+ fprintf(stderr, "something unexpected in time values\n");
+#endif
+ lastCleanupTime = timeStamp;
+ houseKeepingResidual = 0;
+ }
+}
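+
+/*
+ * Worked example (editor's note, assuming for illustration that
+ * LINK_TABLE_OUT_SIZE is 101 and ALIAS_CLEANUP_INTERVAL_SECS is 60): if
+ * two seconds have passed since the last cleanup and the residual is 0,
+ * then n100 = 101*100*2/60 = 336, so n = 3 spokes are swept and the
+ * remaining 36 hundredths are carried over in houseKeepingResidual.
+ * Over a full 60-second interval every spoke is therefore visited about
+ * once, provided n never exceeds ALIAS_CLEANUP_MAX_SPOKES.
+ */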
+
+
+/* Init the log file and enable logging */
+static void
+InitPacketAliasLog(void)
+{
+ if ((~packetAliasMode & PKT_ALIAS_LOG)
+ && (monitorFile = fopen("/var/log/alias.log", "w")))
+ {
+ packetAliasMode |= PKT_ALIAS_LOG;
+ fprintf(monitorFile,
+ "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n");
+ }
+}
+
+
+/* Close the log-file and disable logging. */
+static void
+UninitPacketAliasLog(void)
+{
+ if (monitorFile) {
+ fclose(monitorFile);
+ monitorFile = NULL;
+ }
+ packetAliasMode &= ~PKT_ALIAS_LOG;
+}
+
+
+
+
+
+
+/* Outside world interfaces
+
+-- "outside world" means other than alias*.c routines --
+
+ PacketAliasRedirectPort()
+ PacketAliasAddServer()
+ PacketAliasRedirectProto()
+ PacketAliasRedirectAddr()
+ PacketAliasRedirectDelete()
+ PacketAliasSetAddress()
+ PacketAliasInit()
+ PacketAliasUninit()
+ PacketAliasSetMode()
+
+(prototypes in alias.h)
+*/
+
+/* Redirection from a specific public addr:port to a
+ private addr:port */
+struct alias_link *
+PacketAliasRedirectPort(struct in_addr src_addr, u_short src_port,
+ struct in_addr dst_addr, u_short dst_port,
+ struct in_addr alias_addr, u_short alias_port,
+ u_char proto)
+{
+ int link_type;
+ struct alias_link *link;
+
+ switch(proto)
+ {
+ case IPPROTO_UDP:
+ link_type = LINK_UDP;
+ break;
+ case IPPROTO_TCP:
+ link_type = LINK_TCP;
+ break;
+ default:
+#ifdef DEBUG
+ fprintf(stderr, "PacketAliasRedirectPort(): ");
+ fprintf(stderr, "only TCP and UDP protocols allowed\n");
+#endif
+ return NULL;
+ }
+
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ src_port, dst_port, alias_port,
+ link_type);
+
+ if (link != NULL)
+ {
+ link->flags |= LINK_PERMANENT;
+ }
+#ifdef DEBUG
+ else
+ {
+ fprintf(stderr, "PacketAliasRedirectPort(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+ return link;
+}
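+
+/*
+ * Usage sketch (editor's note, hypothetical addresses): forward TCP
+ * connections arriving on the (default) alias address at port 8080 to
+ * 192.168.0.10 port 80.  Ports are given in network byte order, and
+ * INADDR_ANY wildcards the remote side and the alias address:
+ *
+ *     struct in_addr server, any, alias;
+ *
+ *     inet_aton("192.168.0.10", &server);
+ *     any.s_addr = INADDR_ANY;
+ *     alias.s_addr = INADDR_ANY;      // use the default alias address
+ *     PacketAliasRedirectPort(server, htons(80), any, 0,
+ *                             alias, htons(8080), IPPROTO_TCP);
+ */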
+
+/* Add server to the pool of servers */
+int
+PacketAliasAddServer(struct alias_link *link, struct in_addr addr, u_short port)
+{
+ struct server *server;
+
+ server = malloc(sizeof(struct server));
+
+ if (server != NULL) {
+ struct server *head;
+
+ server->addr = addr;
+ server->port = port;
+
+ head = link->server;
+ if (head == NULL)
+ server->next = server;
+ else {
+ struct server *s;
+
+ for (s = head; s->next != head; s = s->next);
+ s->next = server;
+ server->next = head;
+ }
+ link->server = server;
+ return (0);
+ } else
+ return (-1);
+}
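+
+/*
+ * Usage sketch (editor's note, hypothetical addresses): LSNAT-style
+ * round-robin load sharing.  Create a permanent redirection link and
+ * attach a pool of servers to it; _FindLinkIn() then hands incoming
+ * connections to the pool entries in turn:
+ *
+ *     struct alias_link *lnk;
+ *     struct in_addr any, alias, s1, s2;
+ *
+ *     any.s_addr = alias.s_addr = INADDR_ANY;
+ *     inet_aton("192.168.0.11", &s1);
+ *     inet_aton("192.168.0.12", &s2);
+ *     lnk = PacketAliasRedirectPort(s1, htons(80), any, 0,
+ *                                   alias, htons(80), IPPROTO_TCP);
+ *     PacketAliasAddServer(lnk, s1, htons(80));
+ *     PacketAliasAddServer(lnk, s2, htons(80));
+ */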
+
+/* Redirect packets of a given IP protocol from a specific
+ public address to a private address */
+struct alias_link *
+PacketAliasRedirectProto(struct in_addr src_addr,
+ struct in_addr dst_addr,
+ struct in_addr alias_addr,
+ u_char proto)
+{
+ struct alias_link *link;
+
+ link = AddLink(src_addr, dst_addr, alias_addr,
+ NO_SRC_PORT, NO_DEST_PORT, 0,
+ proto);
+
+ if (link != NULL)
+ {
+ link->flags |= LINK_PERMANENT;
+ }
+#ifdef DEBUG
+ else
+ {
+ fprintf(stderr, "PacketAliasRedirectProto(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+ return link;
+}
+
+/* Static address translation */
+struct alias_link *
+PacketAliasRedirectAddr(struct in_addr src_addr,
+ struct in_addr alias_addr)
+{
+ struct alias_link *link;
+
+ link = AddLink(src_addr, nullAddress, alias_addr,
+ 0, 0, 0,
+ LINK_ADDR);
+
+ if (link != NULL)
+ {
+ link->flags |= LINK_PERMANENT;
+ }
+#ifdef DEBUG
+ else
+ {
+ fprintf(stderr, "PacketAliasRedirectAddr(): "
+ "call to AddLink() failed\n");
+ }
+#endif
+
+ return link;
+}
+
+
+void
+PacketAliasRedirectDelete(struct alias_link *link)
+{
+/* This is a dangerous function to put in the API,
+ because an invalid pointer can crash the program. */
+
+ deleteAllLinks = 1;
+ DeleteLink(link);
+ deleteAllLinks = 0;
+}
+
+
+void
+PacketAliasSetAddress(struct in_addr addr)
+{
+ if (packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE
+ && aliasAddress.s_addr != addr.s_addr)
+ CleanupAliasData();
+
+ aliasAddress = addr;
+}
+
+
+void
+PacketAliasSetTarget(struct in_addr target_addr)
+{
+ targetAddress = target_addr;
+}
+
+
+void
+PacketAliasInit(void)
+{
+ int i;
+ struct timeval tv;
+ struct timezone tz;
+ static int firstCall = 1;
+
+ if (firstCall == 1)
+ {
+ gettimeofday(&tv, &tz);
+ timeStamp = tv.tv_sec;
+ lastCleanupTime = tv.tv_sec;
+ houseKeepingResidual = 0;
+
+ for (i=0; i<LINK_TABLE_OUT_SIZE; i++)
+ LIST_INIT(&linkTableOut[i]);
+ for (i=0; i<LINK_TABLE_IN_SIZE; i++)
+ LIST_INIT(&linkTableIn[i]);
+
+ atexit(PacketAliasUninit);
+ firstCall = 0;
+ }
+ else
+ {
+ deleteAllLinks = 1;
+ CleanupAliasData();
+ deleteAllLinks = 0;
+ }
+
+ aliasAddress.s_addr = INADDR_ANY;
+ targetAddress.s_addr = INADDR_ANY;
+
+ icmpLinkCount = 0;
+ udpLinkCount = 0;
+ tcpLinkCount = 0;
+ pptpLinkCount = 0;
+ protoLinkCount = 0;
+ fragmentIdLinkCount = 0;
+ fragmentPtrLinkCount = 0;
+ sockCount = 0;
+
+    cleanupIndex = 0;
+
+ packetAliasMode = PKT_ALIAS_SAME_PORTS
+ | PKT_ALIAS_USE_SOCKETS
+ | PKT_ALIAS_RESET_ON_ADDR_CHANGE;
+}
+
+void
+PacketAliasUninit(void) {
+ deleteAllLinks = 1;
+ CleanupAliasData();
+ deleteAllLinks = 0;
+ UninitPacketAliasLog();
+#ifndef NO_FW_PUNCH
+ UninitPunchFW();
+#endif
+}
+
+
+/* Change mode for some operations */
+unsigned int
+PacketAliasSetMode(
+ unsigned int flags, /* Which state to bring flags to */
+ unsigned int mask /* Mask of which flags to affect (use 0 to do a
+ probe for flag values) */
+)
+{
+/* Enable logging? */
+ if (flags & mask & PKT_ALIAS_LOG)
+ {
+ InitPacketAliasLog(); /* Do the enable */
+ } else
+/* _Disable_ logging? */
+ if (~flags & mask & PKT_ALIAS_LOG) {
+ UninitPacketAliasLog();
+ }
+
+#ifndef NO_FW_PUNCH
+/* Start punching holes in the firewall? */
+ if (flags & mask & PKT_ALIAS_PUNCH_FW) {
+ InitPunchFW();
+ } else
+/* Stop punching holes in the firewall? */
+ if (~flags & mask & PKT_ALIAS_PUNCH_FW) {
+ UninitPunchFW();
+ }
+#endif
+
+/* Other flags can be set/cleared without special action */
+ packetAliasMode = (flags & mask) | (packetAliasMode & ~mask);
+ return packetAliasMode;
+}
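+
+/*
+ * Usage sketch (editor's note): turn on logging and the dropping of
+ * unsolicited incoming connections while leaving all other mode bits
+ * alone, then probe the resulting mode with a zero mask:
+ *
+ *     unsigned int mode;
+ *
+ *     PacketAliasSetMode(PKT_ALIAS_LOG | PKT_ALIAS_DENY_INCOMING,
+ *                        PKT_ALIAS_LOG | PKT_ALIAS_DENY_INCOMING);
+ *     mode = PacketAliasSetMode(0, 0);        // read back current flags
+ */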
+
+
+int
+PacketAliasCheckNewLink(void)
+{
+ return newDefaultLink;
+}
+
+
+#ifndef NO_FW_PUNCH
+
+/*****************
+ Code to support firewall punching. This shouldn't really be in this
+ file, but making variables global is evil too.
+ ****************/
+
+/* Firewall include files */
+#include <net/if.h>
+#include <netinet/ip_fw.h>
+#include <string.h>
+#include <err.h>
+
+static void ClearAllFWHoles(void);
+
+static int fireWallBaseNum; /* The first firewall entry free for our use */
+static int fireWallNumNums; /* How many entries can we use? */
+static int fireWallActiveNum; /* Which entry did we last use? */
+static char *fireWallField; /* bool array for entries */
+
+#define fw_setfield(field, num) \
+do { \
+ (field)[(num) - fireWallBaseNum] = 1; \
+} /*lint -save -e717 */ while(0) /*lint -restore */
+#define fw_clrfield(field, num) \
+do { \
+ (field)[(num) - fireWallBaseNum] = 0; \
+} /*lint -save -e717 */ while(0) /*lint -restore */
+#define fw_tstfield(field, num) ((field)[(num) - fireWallBaseNum])
+
+static void
+InitPunchFW(void) {
+ fireWallField = malloc(fireWallNumNums);
+ if (fireWallField) {
+ memset(fireWallField, 0, fireWallNumNums);
+ if (fireWallFD < 0) {
+ fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+ }
+ ClearAllFWHoles();
+ fireWallActiveNum = fireWallBaseNum;
+ }
+}
+
+static void
+UninitPunchFW(void) {
+ ClearAllFWHoles();
+ if (fireWallFD >= 0)
+ close(fireWallFD);
+ fireWallFD = -1;
+ if (fireWallField)
+ free(fireWallField);
+ fireWallField = NULL;
+ packetAliasMode &= ~PKT_ALIAS_PUNCH_FW;
+}
+
+/* Make a certain link go through the firewall */
+void
+PunchFWHole(struct alias_link *link) {
+ int r; /* Result code */
+ struct ip_fw rule; /* On-the-fly built rule */
+ int fwhole; /* Where to punch hole */
+
+/* Don't do anything unless we are asked to */
+ if ( !(packetAliasMode & PKT_ALIAS_PUNCH_FW) ||
+ fireWallFD < 0 ||
+ link->link_type != LINK_TCP)
+ return;
+
+ memset(&rule, 0, sizeof rule);
+
+/** Build rule **/
+
+ /* Find empty slot */
+ for (fwhole = fireWallActiveNum;
+ fwhole < fireWallBaseNum + fireWallNumNums &&
+ fw_tstfield(fireWallField, fwhole);
+ fwhole++)
+ ;
+ if (fwhole == fireWallBaseNum + fireWallNumNums) {
+ for (fwhole = fireWallBaseNum;
+ fwhole < fireWallActiveNum &&
+ fw_tstfield(fireWallField, fwhole);
+ fwhole++)
+ ;
+ if (fwhole == fireWallActiveNum) {
+      /* No empty rule slot found - we can't punch any more holes. */
+ fireWallActiveNum = fireWallBaseNum;
+#ifdef DEBUG
+ fprintf(stderr, "libalias: Unable to create firewall hole!\n");
+#endif
+ return;
+ }
+ }
+ /* Start next search at next position */
+ fireWallActiveNum = fwhole+1;
+
+ /* Build generic part of the two rules */
+ rule.fw_number = fwhole;
+ IP_FW_SETNSRCP(&rule, 1); /* Number of source ports. */
+ IP_FW_SETNDSTP(&rule, 1); /* Number of destination ports. */
+ rule.fw_flg = IP_FW_F_ACCEPT | IP_FW_F_IN | IP_FW_F_OUT;
+ rule.fw_prot = IPPROTO_TCP;
+ rule.fw_smsk.s_addr = INADDR_BROADCAST;
+ rule.fw_dmsk.s_addr = INADDR_BROADCAST;
+
+ /* Build and apply specific part of the rules */
+ rule.fw_src = GetOriginalAddress(link);
+ rule.fw_dst = GetDestAddress(link);
+ rule.fw_uar.fw_pts[0] = ntohs(GetOriginalPort(link));
+ rule.fw_uar.fw_pts[1] = ntohs(GetDestPort(link));
+
+ /* Skip non-bound links - XXX should not be strictly necessary,
+ but seems to leave hole if not done. Leak of non-bound links?
+ (Code should be left even if the problem is fixed - it is a
+ clear optimization) */
+ if (rule.fw_uar.fw_pts[0] != 0 && rule.fw_uar.fw_pts[1] != 0) {
+ r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule);
+#ifdef DEBUG
+ if (r)
+ err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)");
+#endif
+ rule.fw_src = GetDestAddress(link);
+ rule.fw_dst = GetOriginalAddress(link);
+ rule.fw_uar.fw_pts[0] = ntohs(GetDestPort(link));
+ rule.fw_uar.fw_pts[1] = ntohs(GetOriginalPort(link));
+ r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule);
+#ifdef DEBUG
+ if (r)
+ err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)");
+#endif
+ }
+/* Indicate hole applied */
+ link->data.tcp->fwhole = fwhole;
+ fw_setfield(fireWallField, fwhole);
+}
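+
+/*
+ * Editor's note: the two rules installed above are roughly equivalent to
+ * a pair of ipfw(8) rules of the form (addresses and ports are
+ * placeholders taken from the link)
+ *
+ *     ipfw add <fwhole> allow tcp from <src-addr> <src-port> \
+ *              to <dst-addr> <dst-port>
+ *
+ * plus the mirror-image rule for the reverse direction; they are pushed
+ * into the kernel through the raw socket with the IP_FW_ADD socket
+ * option and removed again by ClearFWHole()/ClearAllFWHoles() with
+ * IP_FW_DEL.
+ */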
+
+/* Remove a hole in a firewall associated with a particular alias
+ link. Calling this too often is harmless. */
+static void
+ClearFWHole(struct alias_link *link) {
+ if (link->link_type == LINK_TCP) {
+ int fwhole = link->data.tcp->fwhole; /* Where is the firewall hole? */
+ struct ip_fw rule;
+
+ if (fwhole < 0)
+ return;
+
+ memset(&rule, 0, sizeof rule);
+ rule.fw_number = fwhole;
+ while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule))
+ ;
+ fw_clrfield(fireWallField, fwhole);
+ link->data.tcp->fwhole = -1;
+ }
+}
+
+/* Clear out the entire range dedicated to firewall holes. */
+static void
+ClearAllFWHoles(void) {
+ struct ip_fw rule; /* On-the-fly built rule */
+ int i;
+
+ if (fireWallFD < 0)
+ return;
+
+ memset(&rule, 0, sizeof rule);
+ for (i = fireWallBaseNum; i < fireWallBaseNum + fireWallNumNums; i++) {
+ rule.fw_number = i;
+ while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule))
+ ;
+ }
+ memset(fireWallField, 0, fireWallNumNums);
+}
+#endif
+
+void
+PacketAliasSetFWBase(unsigned int base, unsigned int num) {
+#ifndef NO_FW_PUNCH
+ fireWallBaseNum = base;
+ fireWallNumNums = num;
+#endif
+}
diff --git a/sys/netinet/libalias/alias_ftp.c b/sys/netinet/libalias/alias_ftp.c
new file mode 100644
index 0000000..efc78c7
--- /dev/null
+++ b/sys/netinet/libalias/alias_ftp.c
@@ -0,0 +1,583 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_ftp.c performs special processing for FTP sessions under
+ TCP. Specifically, when a PORT/EPRT command from the client
+ side or 227/229 reply from the server is sent, it is intercepted
+ and modified. The address is changed to the gateway machine
+ and an aliasing port is used.
+
+ For this routine to work, the message must fit entirely into a
+ single TCP packet. This is typically the case, but exceptions
+ can easily be envisioned under the actual specifications.
+
+ Probably the most troubling aspect of the approach taken here is
+ that the new message will typically be a different length, and
+ this causes a certain amount of bookkeeping to keep track of the
+ changes of sequence and acknowledgment numbers, since the client
+ machine is totally unaware of the modification to the TCP stream.
+
+
+ References: RFC 959, RFC 2428.
+
+ Initial version: August, 1996 (cjm)
+
+ Version 1.6
+ Brian Somers and Martin Renters identified an IP checksum
+ error for modified IP packets.
+
+ Version 1.7: January 9, 1996 (cjm)
+ Differential checksum computation for change
+ in IP packet length.
+
+ Version 2.1: May, 1997 (cjm)
+ Very minor changes to conform with
+ local/global/function naming conventions
+ within the packet aliasing module.
+
+ Version 3.1: May, 2000 (eds)
+ Add support for passive mode, alias the 227 replies.
+
+ See HISTORY file for record of revisions.
+*/
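+
+/*
+ * Editor's note -- an illustrative rewrite (all numbers hypothetical):
+ * a client behind the aliasing host sends
+ *
+ *     PORT 10,0,0,5,4,210\r\n          (10.0.0.5, port 4*256+210 = 1234)
+ *
+ * AliasHandleFtpOut() below replaces this with the alias address and an
+ * aliased data port obtained through FindUdpTcpOut(), for instance
+ *
+ *     PORT 192,0,2,1,230,220\r\n       (192.0.2.1, port 230*256+220 = 59100)
+ *
+ * and then adjusts ip_len, the checksums and the TCP sequence/ACK
+ * bookkeeping, since the new string usually differs in length.
+ */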
+
+/* Includes */
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include "alias_local.h"
+
+#define FTP_CONTROL_PORT_NUMBER 21
+#define MAX_MESSAGE_SIZE 128
+
+/* FTP protocol flags. */
+#define WAIT_CRLF 0x01
+
+enum ftp_message_type {
+ FTP_PORT_COMMAND,
+ FTP_EPRT_COMMAND,
+ FTP_227_REPLY,
+ FTP_229_REPLY,
+ FTP_UNKNOWN_MESSAGE
+};
+
+static int ParseFtpPortCommand(char *, int);
+static int ParseFtpEprtCommand(char *, int);
+static int ParseFtp227Reply(char *, int);
+static int ParseFtp229Reply(char *, int);
+static void NewFtpMessage(struct ip *, struct alias_link *, int, int);
+
+static struct in_addr true_addr; /* in network byte order. */
+static u_short true_port; /* in host byte order. */
+
+void
+AliasHandleFtpOut(
+struct ip *pip, /* IP packet to examine/patch */
+struct alias_link *link, /* The link to go through (aliased port) */
+int maxpacketsize /* The maximum size this packet can grow to (including headers) */)
+{
+ int hlen, tlen, dlen, pflags;
+ char *sptr;
+ struct tcphdr *tc;
+ int ftp_message_type;
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+/* Place string pointer and beginning of data */
+ sptr = (char *) pip;
+ sptr += hlen;
+
+/*
+ * Check that data length is not too long and previous message was
+ * properly terminated with CRLF.
+ */
+ pflags = GetProtocolFlags(link);
+ if (dlen <= MAX_MESSAGE_SIZE && !(pflags & WAIT_CRLF)) {
+ ftp_message_type = FTP_UNKNOWN_MESSAGE;
+
+ if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER) {
+/*
+ * When aliasing a client, check for the PORT/EPRT command.
+ */
+ if (ParseFtpPortCommand(sptr, dlen))
+ ftp_message_type = FTP_PORT_COMMAND;
+ else if (ParseFtpEprtCommand(sptr, dlen))
+ ftp_message_type = FTP_EPRT_COMMAND;
+ } else {
+/*
+ * When aliasing a server, check for the 227/229 reply.
+ */
+ if (ParseFtp227Reply(sptr, dlen))
+ ftp_message_type = FTP_227_REPLY;
+ else if (ParseFtp229Reply(sptr, dlen)) {
+ ftp_message_type = FTP_229_REPLY;
+ true_addr.s_addr = pip->ip_src.s_addr;
+ }
+ }
+
+ if (ftp_message_type != FTP_UNKNOWN_MESSAGE)
+ NewFtpMessage(pip, link, maxpacketsize, ftp_message_type);
+ }
+
+/* Track whether the message ends with CRLF.  If it does not, the parser
+   above must see the terminating CRLF before it will treat new data as a
+   fresh PORT/PASV (firewall-breaching) command. */
+
+ if (dlen) { /* only if there's data */
+ sptr = (char *) pip; /* start over at beginning */
+ tlen = ntohs(pip->ip_len); /* recalc tlen, pkt may have grown */
+ if (sptr[tlen-2] == '\r' && sptr[tlen-1] == '\n')
+ pflags &= ~WAIT_CRLF;
+ else
+ pflags |= WAIT_CRLF;
+ SetProtocolFlags(link, pflags);
+ }
+}
+
+static int
+ParseFtpPortCommand(char *sptr, int dlen)
+{
+ char ch;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "PORT A,D,D,R,PO,RT". */
+
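+    /* Worked example (editor's note): "PORT 192,168,0,2,4,21" yields
+       true_addr 192.168.0.2 and true_port 4*256 + 21 = 1045. */
+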
+ /* Return if data length is too short. */
+ if (dlen < 18)
+ return 0;
+
+ addr = port = octet = 0;
+ state = -4;
+ for (i = 0; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state) {
+ case -4: if (ch == 'P') state++; else return 0; break;
+ case -3: if (ch == 'O') state++; else return 0; break;
+ case -2: if (ch == 'R') state++; else return 0; break;
+ case -1: if (ch == 'T') state++; else return 0; break;
+
+ case 0:
+ if (isspace(ch))
+ break;
+ else
+ state++;
+ case 1: case 3: case 5: case 7: case 9: case 11:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return 0;
+ break;
+ case 2: case 4: case 6: case 8:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',') {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return 0;
+ break;
+ case 10: case 12:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',' || state == 12) {
+ port = (port << 8) + octet;
+ state++;
+ } else
+ return 0;
+ break;
+ }
+ }
+
+ if (state == 13) {
+ true_addr.s_addr = htonl(addr);
+ true_port = port;
+ return 1;
+ } else
+ return 0;
+}
+
+static int
+ParseFtpEprtCommand(char *sptr, int dlen)
+{
+ char ch, delim;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "EPRT |1|A.D.D.R|PORT|". */
+
+ /* Return if data length is too short. */
+ if (dlen < 18)
+ return 0;
+
+ addr = port = octet = 0;
+ delim = '|'; /* XXX gcc -Wuninitialized */
+ state = -4;
+ for (i = 0; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state)
+ {
+ case -4: if (ch == 'E') state++; else return 0; break;
+ case -3: if (ch == 'P') state++; else return 0; break;
+ case -2: if (ch == 'R') state++; else return 0; break;
+ case -1: if (ch == 'T') state++; else return 0; break;
+
+ case 0:
+ if (!isspace(ch)) {
+ delim = ch;
+ state++;
+ }
+ break;
+ case 1:
+ if (ch == '1') /* IPv4 address */
+ state++;
+ else
+ return 0;
+ break;
+ case 2:
+ if (ch == delim)
+ state++;
+ else
+ return 0;
+ break;
+ case 3: case 5: case 7: case 9:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return 0;
+ break;
+ case 4: case 6: case 8: case 10:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == '.' || state == 10) {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return 0;
+ break;
+ case 11:
+ if (isdigit(ch)) {
+ port = ch - '0';
+ state++;
+ } else
+ return 0;
+ break;
+ case 12:
+ if (isdigit(ch))
+ port = 10 * port + ch - '0';
+ else if (ch == delim)
+ state++;
+ else
+ return 0;
+ break;
+ }
+ }
+
+ if (state == 13) {
+ true_addr.s_addr = htonl(addr);
+ true_port = port;
+ return 1;
+ } else
+ return 0;
+}
+
+static int
+ParseFtp227Reply(char *sptr, int dlen)
+{
+ char ch;
+ int i, state;
+ u_int32_t addr;
+ u_short port;
+ u_int8_t octet;
+
+ /* Format: "227 Entering Passive Mode (A,D,D,R,PO,RT)" */
+
+ /* Return if data length is too short. */
+ if (dlen < 17)
+ return 0;
+
+ addr = port = octet = 0;
+
+ state = -3;
+ for (i = 0; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state)
+ {
+ case -3: if (ch == '2') state++; else return 0; break;
+ case -2: if (ch == '2') state++; else return 0; break;
+ case -1: if (ch == '7') state++; else return 0; break;
+
+ case 0:
+ if (ch == '(')
+ state++;
+ break;
+ case 1: case 3: case 5: case 7: case 9: case 11:
+ if (isdigit(ch)) {
+ octet = ch - '0';
+ state++;
+ } else
+ return 0;
+ break;
+ case 2: case 4: case 6: case 8:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',') {
+ addr = (addr << 8) + octet;
+ state++;
+ } else
+ return 0;
+ break;
+ case 10: case 12:
+ if (isdigit(ch))
+ octet = 10 * octet + ch - '0';
+ else if (ch == ',' || (state == 12 && ch == ')')) {
+ port = (port << 8) + octet;
+ state++;
+ } else
+ return 0;
+ break;
+ }
+ }
+
+ if (state == 13) {
+ true_port = port;
+ true_addr.s_addr = htonl(addr);
+ return 1;
+ } else
+ return 0;
+}
+
+static int
+ParseFtp229Reply(char *sptr, int dlen)
+{
+ char ch, delim;
+ int i, state;
+ u_short port;
+
+ /* Format: "229 Entering Extended Passive Mode (|||PORT|)" */
+
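+    /* Worked example (editor's note): "229 Entering Extended Passive Mode
+       (|||6446|)" yields true_port 6446; the address is taken from the
+       packet's source address in AliasHandleFtpOut() above. */
+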
+ /* Return if data length is too short. */
+ if (dlen < 11)
+ return 0;
+
+ port = 0;
+ delim = '|'; /* XXX gcc -Wuninitialized */
+
+ state = -3;
+ for (i = 0; i < dlen; i++) {
+ ch = sptr[i];
+ switch (state)
+ {
+ case -3: if (ch == '2') state++; else return 0; break;
+ case -2: if (ch == '2') state++; else return 0; break;
+ case -1: if (ch == '9') state++; else return 0; break;
+
+ case 0:
+ if (ch == '(')
+ state++;
+ break;
+ case 1:
+ delim = ch;
+ state++;
+ break;
+ case 2: case 3:
+ if (ch == delim)
+ state++;
+ else
+ return 0;
+ break;
+ case 4:
+ if (isdigit(ch)) {
+ port = ch - '0';
+ state++;
+ } else
+ return 0;
+ break;
+ case 5:
+ if (isdigit(ch))
+ port = 10 * port + ch - '0';
+ else if (ch == delim)
+ state++;
+ else
+ return 0;
+ break;
+ case 6:
+ if (ch == ')')
+ state++;
+ else
+ return 0;
+ break;
+ }
+ }
+
+ if (state == 7) {
+ true_port = port;
+ return 1;
+ } else
+ return 0;
+}
+
+static void
+NewFtpMessage(struct ip *pip,
+ struct alias_link *link,
+ int maxpacketsize,
+ int ftp_message_type)
+{
+ struct alias_link *ftp_link;
+
+/* Security checks. */
+ if (pip->ip_src.s_addr != true_addr.s_addr)
+ return;
+
+ if (true_port < IPPORT_RESERVED)
+ return;
+
+/* Establish link to address and port found in FTP control message. */
+ ftp_link = FindUdpTcpOut(true_addr, GetDestAddress(link),
+ htons(true_port), 0, IPPROTO_TCP, 1);
+
+ if (ftp_link != NULL)
+ {
+ int slen, hlen, tlen, dlen;
+ struct tcphdr *tc;
+
+#ifndef NO_FW_PUNCH
+ /* Punch hole in firewall */
+ PunchFWHole(ftp_link);
+#endif
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+/* Create new FTP message. */
+ {
+ char stemp[MAX_MESSAGE_SIZE + 1];
+ char *sptr;
+ u_short alias_port;
+ u_char *ptr;
+ int a1, a2, a3, a4, p1, p2;
+ struct in_addr alias_address;
+
+/* Decompose alias address into quad format */
+ alias_address = GetAliasAddress(link);
+ ptr = (u_char *) &alias_address.s_addr;
+ a1 = *ptr++; a2=*ptr++; a3=*ptr++; a4=*ptr;
+
+ alias_port = GetAliasPort(ftp_link);
+
+ switch (ftp_message_type)
+ {
+ case FTP_PORT_COMMAND:
+ case FTP_227_REPLY:
+ /* Decompose alias port into pair format. */
+              ptr = (u_char *) &alias_port;
+ p1 = *ptr++; p2=*ptr;
+
+ if (ftp_message_type == FTP_PORT_COMMAND) {
+ /* Generate PORT command string. */
+ sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n",
+ a1,a2,a3,a4,p1,p2);
+ } else {
+ /* Generate 227 reply string. */
+ sprintf(stemp,
+ "227 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n",
+ a1,a2,a3,a4,p1,p2);
+ }
+ break;
+ case FTP_EPRT_COMMAND:
+ /* Generate EPRT command string. */
+ sprintf(stemp, "EPRT |1|%d.%d.%d.%d|%d|\r\n",
+ a1,a2,a3,a4,ntohs(alias_port));
+ break;
+ case FTP_229_REPLY:
+ /* Generate 229 reply string. */
+ sprintf(stemp, "229 Entering Extended Passive Mode (|||%d|)\r\n",
+ ntohs(alias_port));
+ break;
+ }
+
+/* Save string length for IP header modification */
+ slen = strlen(stemp);
+
+/* Copy modified buffer into IP packet. */
+ sptr = (char *) pip; sptr += hlen;
+ strncpy(sptr, stemp, maxpacketsize-hlen);
+ }
+
+/* Save information regarding modified seq and ack numbers */
+ {
+ int delta;
+
+ SetAckModified(link);
+ delta = GetDeltaSeqOut(pip, link);
+ AddSeq(pip, link, delta+slen-dlen);
+ }
+
+/* Revise IP header */
+ {
+ u_short new_len;
+
+ new_len = htons(hlen + slen);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+ }
+
+/* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+ tc->th_sum = TcpChecksum(pip);
+ }
+ else
+ {
+#ifdef DEBUG
+ fprintf(stderr,
+ "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n");
+#endif
+ }
+}
diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c
new file mode 100644
index 0000000..82c39e3
--- /dev/null
+++ b/sys/netinet/libalias/alias_irc.c
@@ -0,0 +1,357 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* Alias_irc.c intercepts packets containing IRC CTCP commands, and
+ changes DCC commands to export a port on the aliasing host instead
+ of an aliased host.
+
+ For this routine to work, the DCC command must fit entirely into a
+ single TCP packet. This will usually happen, but is not
+ guaranteed.
+
+ The interception is likely to change the length of the packet.
+ The handling of this is copied more-or-less verbatim from
+ ftp_alias.c
+
+ Initial version: Eivind Eklund <perhaps@yes.no> (ee) 97-01-29
+
+ Version 2.1: May, 1997 (cjm)
+ Very minor changes to conform with
+ local/global/function naming conventions
+ within the packet aliasing module.
+*/
+
+/* Includes */
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <limits.h>
+
+#include "alias_local.h"
+
+/* Local defines */
+#define DBprintf(a)
+
+
+void
+AliasHandleIrcOut(struct ip *pip, /* IP packet to examine */
+ struct alias_link *link, /* Which link are we on? */
+ int maxsize /* Maximum size of IP packet including headers */
+ )
+{
+ int hlen, tlen, dlen;
+ struct in_addr true_addr;
+ u_short true_port;
+ char *sptr;
+ struct tcphdr *tc;
+ int i; /* Iterator through the source */
+
+/* Calculate data length of TCP packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /* Return if data length is too short - assume an entire PRIVMSG in each packet. */
+ if (dlen<sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a")-1)
+ return;
+
+/* Place string pointer at beginning of data */
+ sptr = (char *) pip;
+ sptr += hlen;
+ maxsize -= hlen; /* We're interested in maximum size of data, not packet */
+
+ /* Search for a CTCP command [Note 1] */
+ for( i=0; i<dlen; i++ ) {
+ if(sptr[i]=='\001')
+ goto lFOUND_CTCP;
+ }
+ return; /* No CTCP commands in packet */
+ /* Handle CTCP commands - the buffer may have to be copied */
+lFOUND_CTCP:
+ {
+ char newpacket[65536]; /* Estimate of maximum packet size :) */
+ int copyat = i; /* Same */
+ int iCopy = 0; /* How much data have we written to copy-back string? */
+ unsigned long org_addr; /* Original IP address */
+ unsigned short org_port; /* Original source port address */
+ lCTCP_START:
+ if( i >= dlen || iCopy >= sizeof(newpacket) )
+ goto lPACKET_DONE;
+ newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start character */
+ /* Start of a CTCP */
+ if( i+4 >= dlen ) /* Too short for DCC */
+ goto lBAD_CTCP;
+ if( sptr[i+0] != 'D' )
+ goto lBAD_CTCP;
+ if( sptr[i+1] != 'C' )
+ goto lBAD_CTCP;
+ if( sptr[i+2] != 'C' )
+ goto lBAD_CTCP;
+ if( sptr[i+3] != ' ' )
+ goto lBAD_CTCP;
+ /* We have a DCC command - handle it! */
+ i+= 4; /* Skip "DCC " */
+ if( iCopy+4 > sizeof(newpacket) )
+ goto lPACKET_DONE;
+ newpacket[iCopy++] = 'D';
+ newpacket[iCopy++] = 'C';
+ newpacket[iCopy++] = 'C';
+ newpacket[iCopy++] = ' ';
+
+ DBprintf(("Found DCC\n"));
+ /* Skip any extra spaces (should not occur according to
+ protocol, but DCC breaks CTCP protocol anyway) */
+ while(sptr[i] == ' ') {
+ if( ++i >= dlen) {
+ DBprintf(("DCC packet terminated in just spaces\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Transferring command...\n"));
+ while(sptr[i] != ' ') {
+ newpacket[iCopy++] = sptr[i];
+ if( ++i >= dlen || iCopy >= sizeof(newpacket) ) {
+ DBprintf(("DCC packet terminated during command\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ /* Copy _one_ space */
+ if( i+1 < dlen && iCopy < sizeof(newpacket) )
+ newpacket[iCopy++] = sptr[i++];
+
+ DBprintf(("Done command - removing spaces\n"));
+ /* Skip any extra spaces (should not occur according to
+ protocol, but DCC breaks CTCP protocol anyway) */
+ while(sptr[i] == ' ') {
+ if( ++i >= dlen ) {
+ DBprintf(("DCC packet terminated in just spaces (post-command)\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Transferring filename...\n"));
+ while(sptr[i] != ' ') {
+ newpacket[iCopy++] = sptr[i];
+ if( ++i >= dlen || iCopy >= sizeof(newpacket) ) {
+ DBprintf(("DCC packet terminated during filename\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ /* Copy _one_ space */
+ if( i+1 < dlen && iCopy < sizeof(newpacket) )
+ newpacket[iCopy++] = sptr[i++];
+
+ DBprintf(("Done filename - removing spaces\n"));
+ /* Skip any extra spaces (should not occur according to
+ protocol, but DCC breaks CTCP protocol anyway) */
+ while(sptr[i] == ' ') {
+ if( ++i >= dlen ) {
+ DBprintf(("DCC packet terminated in just spaces (post-filename)\n"));
+ goto lPACKET_DONE;
+ }
+ }
+
+ DBprintf(("Fetching IP address\n"));
+ /* Fetch IP address */
+ org_addr = 0;
+ while(i<dlen && isdigit(sptr[i])) {
+ if( org_addr > ULONG_MAX/10UL ) { /* Terminate on overflow */
+ DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i]));
+ goto lBAD_CTCP;
+ }
+ org_addr *= 10;
+ org_addr += sptr[i++]-'0';
+ }
+ DBprintf(("Skipping space\n"));
+ if( i+1 >= dlen || sptr[i] != ' ' ) {
+ DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i+1, dlen, sptr[i]));
+ goto lBAD_CTCP;
+ }
+ /* Skip any extra spaces (should not occur according to
+ protocol, but DCC breaks CTCP protocol anyway, so we might
+ as well play it safe) */
+ while(sptr[i] == ' ') {
+ if( ++i >= dlen ) {
+ DBprintf(("Packet failure - space overflow.\n"));
+ goto lPACKET_DONE;
+ }
+ }
+ DBprintf(("Fetching port number\n"));
+ /* Fetch source port */
+ org_port = 0;
+ while(i<dlen && isdigit(sptr[i])) {
+ if( org_port > 6554 ) { /* Terminate on overflow (65536/10 rounded up) */
+ DBprintf(("DCC: port number overflow\n"));
+ goto lBAD_CTCP;
+ }
+ org_port *= 10;
+ org_port += sptr[i++]-'0';
+ }
+ /* Skip illegal addresses (or early termination) */
+ if( i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ') ) {
+ DBprintf(("Bad port termination\n"));
+ goto lBAD_CTCP;
+ }
+ DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port));
+
+ /* We've got the address and port - now alias it */
+ {
+ struct alias_link *dcc_link;
+ struct in_addr destaddr;
+
+
+ true_port = htons(org_port);
+ true_addr.s_addr = htonl(org_addr);
+ destaddr.s_addr = 0;
+
+ /* Sanity/Security checking */
+ if (!org_addr || !org_port ||
+ pip->ip_src.s_addr != true_addr.s_addr ||
+ org_port < IPPORT_RESERVED)
+ goto lBAD_CTCP;
+
+ /* Steal the FTP_DATA_PORT - it doesn't really matter, and this
+ would probably allow it through at least _some_
+ firewalls. */
+ dcc_link = FindUdpTcpOut(true_addr, destaddr,
+ true_port, 0,
+ IPPROTO_TCP, 1);
+ DBprintf(("Got a DCC link\n"));
+ if ( dcc_link ) {
+ struct in_addr alias_address; /* Address from aliasing */
+ u_short alias_port; /* Port given by aliasing */
+ int n;
+
+#ifndef NO_FW_PUNCH
+ /* Generate firewall hole as appropriate */
+ PunchFWHole(dcc_link);
+#endif
+
+ alias_address = GetAliasAddress(link);
+ n = snprintf(&newpacket[iCopy],
+ sizeof(newpacket)-iCopy,
+ "%lu ", (u_long)htonl(alias_address.s_addr));
+ if( n < 0 ) {
+ DBprintf(("DCC packet construct failure.\n"));
+ goto lBAD_CTCP;
+ }
+ if( (iCopy += n) >= sizeof(newpacket) ) { /* Truncated/fit exactly - bad news */
+ DBprintf(("DCC constructed packet overflow.\n"));
+ goto lBAD_CTCP;
+ }
+ alias_port = GetAliasPort(dcc_link);
+ n = snprintf(&newpacket[iCopy],
+ sizeof(newpacket)-iCopy,
+ "%u", htons(alias_port) );
+ if( n < 0 ) {
+ DBprintf(("DCC packet construct failure.\n"));
+ goto lBAD_CTCP;
+ }
+ iCopy += n;
+ /* Done - truncated cases will be taken care of by lBAD_CTCP */
+ DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port));
+ }
+ }
+ /* An uninteresting CTCP - state entered right after '\001' has
+ been pushed. Also used to copy the rest of a DCC, after IP
+ address and port has been handled */
+ lBAD_CTCP:
+ for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) {
+ newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */
+ if(sptr[i] == '\001') {
+ goto lNORMAL_TEXT;
+ }
+ }
+ goto lPACKET_DONE;
+ /* Normal text */
+ lNORMAL_TEXT:
+ for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) {
+ newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */
+ if(sptr[i] == '\001') {
+ goto lCTCP_START;
+ }
+ }
+ /* Handle the end of a packet */
+ lPACKET_DONE:
+ iCopy = iCopy > maxsize-copyat ? maxsize-copyat : iCopy;
+ memcpy(sptr+copyat, newpacket, iCopy);
+
+/* Save information regarding modified seq and ack numbers */
+ {
+ int delta;
+
+ SetAckModified(link);
+ delta = GetDeltaSeqOut(pip, link);
+ AddSeq(pip, link, delta+copyat+iCopy-dlen);
+ }
+
+ /* Revise IP header */
+ {
+ u_short new_len;
+
+ new_len = htons(hlen + iCopy + copyat);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+ }
+
+ /* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+ tc->th_sum = TcpChecksum(pip);
+ return;
+ }
+}
+
+/* Notes:
+ [Note 1]
+ The initial search will most often fail; it could be replaced with a 32-bit specific search.
+ Such a search would be done for 32-bit unsigned value V:
+ V ^= 0x01010101; (Search is for null bytes)
+ if( ((V-0x01010101)^V) & 0x80808080 ) {
+ (a null byte here corresponds to a 01 byte in the original value)
+ }
+ To assert that the processor is 32-bits, do
+ extern int ircdccar[32]; (32 bits)
+ extern int ircdccar[CHAR_BIT*sizeof(unsigned int)];
+ which will generate a type-error on all but 32-bit machines.
+
+ [Note 2] This routine really ought to be replaced with one that
+ creates a transparent proxy on the aliasing host, to allow arbitrary
+ changes in the TCP stream. This should not be too difficult given
+ this base; I (ee) will try to do this some time later.
+ */
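The DCC rewrite above relies on the convention that the address argument of a DCC command is the IPv4 address written in decimal as a 32-bit value in host byte order; the handler converts the network-order s_addr before printing it, and the parser converts the accumulated digits back. A small stand-alone sketch of that round trip (illustration only, not part of libalias):

    #include <stdio.h>
    #include <sys/types.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int
    main(void)
    {
        struct in_addr a;
        unsigned long dcc;

        inet_aton("192.0.2.1", &a);
        dcc = (unsigned long)ntohl(a.s_addr);   /* decimal form used in DCC */
        printf("DCC SEND file %lu 4000\n", dcc);

        /* Reverse direction, as the parser above does after reading digits. */
        a.s_addr = htonl((u_int32_t)dcc);
        printf("decoded back to %s\n", inet_ntoa(a));
        return 0;
    }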
diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h
new file mode 100644
index 0000000..c25259a
--- /dev/null
+++ b/sys/netinet/libalias/alias_local.h
@@ -0,0 +1,229 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias_local.h contains the function prototypes for alias.c,
+ * alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well
+ * as any future add-ons). It also includes macros, globals and
+ * struct definitions shared by more than one alias*.c file.
+ *
+ * This include file is intended to be used only within the aliasing
+ * software. Outside world interfaces are defined in alias.h
+ *
+ * This software is placed into the public domain with no restrictions
+ * on its distribution.
+ *
+ * Initial version: August, 1996 (cjm)
+ *
+ * <updated several times by original author and Eivind Eklund>
+ */
+
+#ifndef _ALIAS_LOCAL_H_
+#define _ALIAS_LOCAL_H_
+
+/* Macros */
+
+/*
+ * The following macro is used to update an
+ * internet checksum. "delta" is a 32-bit
+ * accumulation of all the changes to the
+ * checksum (adding in new 16-bit words and
+ * subtracting out old words), and "cksum"
+ * is the checksum value to be updated.
+ */
+#define ADJUST_CHECKSUM(acc, cksum) \
+ do { \
+ acc += cksum; \
+ if (acc < 0) { \
+ acc = -acc; \
+ acc = (acc >> 16) + (acc & 0xffff); \
+ acc += acc >> 16; \
+ cksum = (u_short) ~acc; \
+ } else { \
+ acc = (acc >> 16) + (acc & 0xffff); \
+ acc += acc >> 16; \
+ cksum = (u_short) acc; \
+ } \
+ } while (0)
+
+/* Globals */
+
+extern int packetAliasMode;
+
+/* Prototypes */
+
+/* General utilities */
+u_short IpChecksum(struct ip *_pip);
+u_short TcpChecksum(struct ip *_pip);
+void DifferentialChecksum(u_short *_cksum, u_short *_new, u_short *_old,
+ int _n);
+
+/* Internal data access */
+struct alias_link *
+ FindIcmpIn(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _id_alias, int _create);
+struct alias_link *
+ FindIcmpOut(struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _id, int _create);
+struct alias_link *
+ FindFragmentIn1(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _ip_id);
+struct alias_link *
+ FindFragmentIn2(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _ip_id);
+struct alias_link *
+ AddFragmentPtrLink(struct in_addr _dst_addr, u_short _ip_id);
+struct alias_link *
+ FindFragmentPtr(struct in_addr _dst_addr, u_short _ip_id);
+struct alias_link *
+ FindProtoIn(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_char _proto);
+struct alias_link *
+ FindProtoOut(struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_char _proto);
+struct alias_link *
+ FindUdpTcpIn(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _dst_port, u_short _alias_port, u_char _proto, int _create);
+struct alias_link *
+ FindUdpTcpOut(struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _src_port, u_short _dst_port, u_char _proto, int _create);
+struct alias_link *
+ AddPptp(struct in_addr _src_addr, struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _src_call_id);
+struct alias_link *
+ FindPptpOutByCallId(struct in_addr _src_addr,
+ struct in_addr _dst_addr, u_int16_t _src_call_id);
+struct alias_link *
+ FindPptpInByCallId(struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _dst_call_id);
+struct alias_link *
+ FindPptpOutByPeerCallId(struct in_addr _src_addr,
+ struct in_addr _dst_addr, u_int16_t _dst_call_id);
+struct alias_link *
+ FindPptpInByPeerCallId(struct in_addr _dst_addr,
+ struct in_addr _alias_addr, u_int16_t _alias_call_id);
+struct alias_link *
+ FindRtspOut(struct in_addr _src_addr, struct in_addr _dst_addr,
+ u_short _src_port, u_short _alias_port, u_char _proto);
+struct in_addr
+ FindOriginalAddress(struct in_addr _alias_addr);
+struct in_addr
+ FindAliasAddress(struct in_addr _original_addr);
+
+/* External data access/modification */
+int FindNewPortGroup(struct in_addr _dst_addr, struct in_addr _alias_addr,
+ u_short _src_port, u_short _dst_port, u_short _port_count,
+ u_char _proto, u_char _align);
+void GetFragmentAddr(struct alias_link *_link, struct in_addr *_src_addr);
+void SetFragmentAddr(struct alias_link *_link, struct in_addr _src_addr);
+void GetFragmentPtr(struct alias_link *_link, char **_fptr);
+void SetFragmentPtr(struct alias_link *_link, char *fptr);
+void SetStateIn(struct alias_link *_link, int _state);
+void SetStateOut(struct alias_link *_link, int _state);
+int GetStateIn(struct alias_link *_link);
+int GetStateOut(struct alias_link *_link);
+struct in_addr
+ GetOriginalAddress(struct alias_link *_link);
+struct in_addr
+ GetDestAddress(struct alias_link *_link);
+struct in_addr
+ GetAliasAddress(struct alias_link *_link);
+struct in_addr
+ GetDefaultAliasAddress(void);
+void SetDefaultAliasAddress(struct in_addr _alias_addr);
+u_short GetOriginalPort(struct alias_link *_link);
+u_short GetAliasPort(struct alias_link *_link);
+struct in_addr
+ GetProxyAddress(struct alias_link *_link);
+void SetProxyAddress(struct alias_link *_link, struct in_addr _addr);
+u_short GetProxyPort(struct alias_link *_link);
+void SetProxyPort(struct alias_link *_link, u_short _port);
+void SetAckModified(struct alias_link *_link);
+int GetAckModified(struct alias_link *_link);
+int GetDeltaAckIn(struct ip *_pip, struct alias_link *_link);
+int GetDeltaSeqOut(struct ip *_pip, struct alias_link *_link);
+void AddSeq(struct ip *_pip, struct alias_link *_link, int _delta);
+void SetExpire(struct alias_link *_link, int _expire);
+void ClearCheckNewLink(void);
+void SetProtocolFlags(struct alias_link *_link, int _pflags);
+int GetProtocolFlags(struct alias_link *_link);
+void SetDestCallId(struct alias_link *_link, u_int16_t _cid);
+#ifndef NO_FW_PUNCH
+void PunchFWHole(struct alias_link *_link);
+#endif
+
+/* Housekeeping function */
+void HouseKeeping(void);
+
+/* TCP specific routines */
+/* lint -save -library Suppress flexelint warnings */
+
+/* FTP routines */
+void AliasHandleFtpOut(struct ip *_pip, struct alias_link *_link,
+ int _maxpacketsize);
+
+/* IRC routines */
+void AliasHandleIrcOut(struct ip *_pip, struct alias_link *_link,
+ int _maxsize);
+
+/* RTSP routines */
+void AliasHandleRtspOut(struct ip *_pip, struct alias_link *_link,
+ int _maxpacketsize);
+
+/* PPTP routines */
+void AliasHandlePptpOut(struct ip *_pip, struct alias_link *_link);
+void AliasHandlePptpIn(struct ip *_pip, struct alias_link *_link);
+int AliasHandlePptpGreOut(struct ip *_pip);
+int AliasHandlePptpGreIn(struct ip *_pip);
+
+/* NetBIOS routines */
+int AliasHandleUdpNbt(struct ip *_pip, struct alias_link *_link,
+ struct in_addr *_alias_address, u_short _alias_port);
+int AliasHandleUdpNbtNS(struct ip *_pip, struct alias_link *_link,
+ struct in_addr *_alias_address, u_short *_alias_port,
+ struct in_addr *_original_address, u_short *_original_port);
+
+/* CUSeeMe routines */
+void AliasHandleCUSeeMeOut(struct ip *_pip, struct alias_link *_link);
+void AliasHandleCUSeeMeIn(struct ip *_pip, struct in_addr _original_addr);
+
+/* Transparent proxy routines */
+int ProxyCheck(struct ip *_pip, struct in_addr *_proxy_server_addr,
+ u_short *_proxy_server_port);
+void ProxyModify(struct alias_link *_link, struct ip *_pip,
+ int _maxpacketsize, int _proxy_type);
+
+enum alias_tcp_state {
+ ALIAS_TCP_STATE_NOT_CONNECTED,
+ ALIAS_TCP_STATE_CONNECTED,
+ ALIAS_TCP_STATE_DISCONNECTED
+};
+
+/*lint -restore */
+
+#endif /* !_ALIAS_LOCAL_H_ */
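The ADJUST_CHECKSUM macro above is used by the protocol handlers to patch an IP, TCP or UDP checksum incrementally after a field changes: the caller accumulates the old 16-bit word(s), subtracts the new word(s), and then applies the macro to the stored checksum. A minimal sketch of that call pattern (the helper name replace_u_short() is hypothetical; it only mirrors the convention used by the callers in alias_pptp.c and alias_nbt.c):

    #include <sys/types.h>
    #include <netinet/in_systm.h>
    #include <netinet/in.h>
    #include <netinet/ip.h>
    #include "alias_local.h"

    /* Replace one 16-bit field in a packet and patch the given checksum,
       accumulating old-minus-new before applying ADJUST_CHECKSUM. */
    static void
    replace_u_short(u_short *field, u_short new_val, u_short *cksum)
    {
        int acc;

        acc = *field;       /* add in the old 16-bit word */
        acc -= new_val;     /* subtract the new one */
        *field = new_val;
        ADJUST_CHECKSUM(acc, *cksum);
    }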
diff --git a/sys/netinet/libalias/alias_nbt.c b/sys/netinet/libalias/alias_nbt.c
new file mode 100644
index 0000000..89e8089
--- /dev/null
+++ b/sys/netinet/libalias/alias_nbt.c
@@ -0,0 +1,704 @@
+/*-
+ * Written by Atsushi Murai <amurai@spec.co.jp>
+ * Copyright (c) 1998, System Planning and Engineering Co.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * TODO:
+ * o Clean up.
+ * o Consider word alignment for other platforms.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ alias_nbt.c performs special processing for NetBIOS over TCP/IP
+ sessions carried over UDP.
+
+ Initial version: May, 1998 (Atsushi Murai <amurai@spec.co.jp>)
+
+ See HISTORY file for record of revisions.
+*/
+
+/* Includes */
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+
+#include "alias_local.h"
+
+typedef struct {
+ struct in_addr oldaddr;
+ u_short oldport;
+ struct in_addr newaddr;
+ u_short newport;
+ u_short *uh_sum;
+} NBTArguments;
+
+typedef struct {
+ unsigned char type;
+ unsigned char flags;
+ u_short id;
+ struct in_addr source_ip;
+ u_short source_port;
+ u_short len;
+ u_short offset;
+} NbtDataHeader;
+
+#define OpQuery 0
+#define OpUnknown 4
+#define OpRegist 5
+#define OpRelease 6
+#define OpWACK 7
+#define OpRefresh 8
+typedef struct {
+ u_short nametrid;
+ u_short dir:1, opcode:4, nmflags:7, rcode:4;
+ u_short qdcount;
+ u_short ancount;
+ u_short nscount;
+ u_short arcount;
+} NbtNSHeader;
+
+#define FMT_ERR 0x1
+#define SRV_ERR 0x2
+#define IMP_ERR 0x4
+#define RFS_ERR 0x5
+#define ACT_ERR 0x6
+#define CFT_ERR 0x7
+
+
+#ifdef DEBUG
+static void PrintRcode( u_char rcode ) {
+
+ switch (rcode) {
+ case FMT_ERR:
+ printf("\nFormat Error.\n");
+ break;
+ case SRV_ERR:
+ printf("\nServer failure.\n");
+ break;
+ case IMP_ERR:
+ printf("\nUnsupported request error.\n");
+ break;
+ case RFS_ERR:
+ printf("\nRefused error.\n");
+ break;
+ case ACT_ERR:
+ printf("\nActive error.\n");
+ break;
+ case CFT_ERR:
+ printf("\nName in conflict error.\n");
+ break;
+ default:
+ printf("\n?%c?=%0x\n", '?', rcode );
+ break;
+ }
+}
+#endif
+
+
+/* Handling Name field */
+static u_char *AliasHandleName ( u_char *p, char *pmax ) {
+
+ u_char *s;
+ u_char c;
+ int compress;
+
+ /* Following length field */
+
+ if (p == NULL || (char *)p >= pmax)
+ return(NULL);
+
+ if (*p & 0xc0 ) {
+ p = p + 2;
+ if ((char *)p > pmax)
+ return(NULL);
+ return ((u_char *)p);
+ }
+ while ( ( *p & 0x3f) != 0x00 ) {
+ s = p + 1;
+ if ( *p == 0x20 )
+ compress = 1;
+ else
+ compress = 0;
+
+ /* Get next length field */
+ p = (u_char *)(p + (*p & 0x3f) + 1);
+ if ((char *)p > pmax) {
+ p = NULL;
+ break;
+ }
+#ifdef DEBUG
+ printf(":");
+#endif
+ while (s < p) {
+ if ( compress == 1 ) {
+ c = (u_char )(((((*s & 0x0f) << 4) | (*(s+1) & 0x0f)) - 0x11));
+#ifdef DEBUG
+ if (isprint( c ) )
+ printf("%c", c );
+ else
+ printf("<0x%02x>", c );
+#endif
+ s +=2;
+ } else {
+#ifdef DEBUG
+ printf("%c", *s);
+#endif
+ s++;
+ }
+ }
+#ifdef DEBUG
+ printf(":");
+#endif
+ fflush(stdout);
+ }
+
+ /* Advance pointer past the Name field */
+ if (p == NULL || (char *)p >= pmax)
+ p = NULL;
+ else
+ p++;
+ return ((u_char *)p);
+}
+
+/*
+ * NetBIOS Datagram Handler (IP/UDP)
+ */
+#define DGM_DIRECT_UNIQ 0x10
+#define DGM_DIRECT_GROUP 0x11
+#define DGM_BROADCAST 0x12
+#define DGM_ERROR 0x13
+#define DGM_QUERY 0x14
+#define DGM_POSITIVE_RES 0x15
+#define DGM_NEGATIVE_RES 0x16
+
+int AliasHandleUdpNbt(
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *link,
+ struct in_addr *alias_address,
+ u_short alias_port
+) {
+ struct udphdr * uh;
+ NbtDataHeader *ndh;
+ u_char *p = NULL;
+ char *pmax;
+
+ /* Calculate data length of UDP packet */
+ uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2));
+ pmax = (char *)uh + ntohs( uh->uh_ulen );
+
+ ndh = (NbtDataHeader *)((char *)uh + (sizeof (struct udphdr)));
+ if ((char *)(ndh + 1) > pmax)
+ return(-1);
+#ifdef DEBUG
+ printf("\nType=%02x,", ndh->type );
+#endif
+ switch ( ndh->type ) {
+ case DGM_DIRECT_UNIQ:
+ case DGM_DIRECT_GROUP:
+ case DGM_BROADCAST:
+ p = (u_char *)ndh + 14;
+ p = AliasHandleName ( p, pmax ); /* Source Name */
+ p = AliasHandleName ( p, pmax ); /* Destination Name */
+ break;
+ case DGM_ERROR:
+ p = (u_char *)ndh + 11;
+ break;
+ case DGM_QUERY:
+ case DGM_POSITIVE_RES:
+ case DGM_NEGATIVE_RES:
+ p = (u_char *)ndh + 10;
+ p = AliasHandleName ( p, pmax ); /* Destination Name */
+ break;
+ }
+ if (p == NULL || (char *)p > pmax)
+ p = NULL;
+#ifdef DEBUG
+ printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) );
+#endif
+ /* Do the IP address and port number translation */
+ if ( uh->uh_sum != 0 ) {
+ int acc;
+ u_short *sptr;
+ acc = ndh->source_port;
+ acc -= alias_port;
+ sptr = (u_short *) &(ndh->source_ip);
+ acc += *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) alias_address;
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, uh->uh_sum);
+ }
+ ndh->source_ip = *alias_address;
+ ndh->source_port = alias_port;
+#ifdef DEBUG
+ printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) );
+ fflush(stdout);
+#endif
+ return((p == NULL) ? -1 : 0);
+}
+/* Question Section */
+#define QS_TYPE_NB 0x0020
+#define QS_TYPE_NBSTAT 0x0021
+#define QS_CLAS_IN 0x0001
+typedef struct {
+ u_short type; /* The type of Request */
+ u_short class; /* The class of Request */
+} NBTNsQuestion;
+
+static u_char *
+AliasHandleQuestion(
+ u_short count,
+ NBTNsQuestion *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+
+ while ( count != 0 ) {
+ /* Name Field */
+ q = (NBTNsQuestion *)AliasHandleName((u_char *)q, pmax);
+
+ if (q == NULL || (char *)(q + 1) > pmax) {
+ q = NULL;
+ break;
+ }
+
+ /* Type and Class field */
+ switch ( ntohs(q->type) ) {
+ case QS_TYPE_NB:
+ case QS_TYPE_NBSTAT:
+ q= q+1;
+ break;
+ default:
+#ifdef DEBUG
+ printf("\nUnknown Type on Question %0x\n", ntohs(q->type) );
+#endif
+ break;
+ }
+ count--;
+ }
+
+ /* Advance past the Question Section */
+ return ((u_char *)q);
+}
+
+/* Resource Record */
+#define RR_TYPE_A 0x0001
+#define RR_TYPE_NS 0x0002
+#define RR_TYPE_NULL 0x000a
+#define RR_TYPE_NB 0x0020
+#define RR_TYPE_NBSTAT 0x0021
+#define RR_CLAS_IN 0x0001
+#define SizeOfNsResource 8
+typedef struct {
+ u_short type;
+ u_short class;
+ unsigned int ttl;
+ u_short rdlen;
+} NBTNsResource;
+
+#define SizeOfNsRNB 6
+typedef struct {
+ u_short g:1, ont:2, resv:13;
+ struct in_addr addr;
+} NBTNsRNB;
+
+static u_char *
+AliasHandleResourceNB(
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+ NBTNsRNB *nb;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return(NULL);
+ /* Get the resource record data length */
+ bcount = ntohs(q->rdlen);
+
+ /* Forward to Resource NB position */
+ nb = (NBTNsRNB *)((u_char *)q + SizeOfNsResource);
+
+ /* Process each address entry in the resource data */
+#ifdef DEBUG
+ printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr));
+ printf("->%s, %dbytes] ",inet_ntoa(nbtarg->newaddr ), bcount);
+#endif
+ while ( nb != NULL && bcount != 0 ) {
+ if ((char *)(nb + 1) > pmax) {
+ nb = NULL;
+ break;
+ }
+#ifdef DEBUG
+ printf("<%s>", inet_ntoa(nb->addr) );
+#endif
+ if (!bcmp(&nbtarg->oldaddr,&nb->addr, sizeof(struct in_addr) ) ) {
+ if ( *nbtarg->uh_sum != 0 ) {
+ int acc;
+ u_short *sptr;
+
+ sptr = (u_short *) &(nb->addr);
+ acc = *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) &(nbtarg->newaddr);
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, *nbtarg->uh_sum);
+ }
+
+ nb->addr = nbtarg->newaddr;
+#ifdef DEBUG
+ printf("O");
+#endif
+ }
+#ifdef DEBUG
+ else {
+ printf(".");
+ }
+#endif
+ nb=(NBTNsRNB *)((u_char *)nb + SizeOfNsRNB);
+ bcount -= SizeOfNsRNB;
+ }
+ if (nb == NULL || (char *)(nb + 1) > pmax) {
+ nb = NULL;
+ }
+
+ return ((u_char *)nb);
+}
+
+#define SizeOfResourceA 6
+typedef struct {
+ struct in_addr addr;
+} NBTNsResourceA;
+
+static u_char *
+AliasHandleResourceA(
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+ NBTNsResourceA *a;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return(NULL);
+
+ /* Forward to Resource A position */
+ a = (NBTNsResourceA *)( (u_char *)q + sizeof(NBTNsResource) );
+
+ /* Get the resource record data length */
+ bcount = ntohs(q->rdlen);
+
+ /* Process each address entry in the resource data */
+#ifdef DEBUG
+ printf("Arec [%s", inet_ntoa(nbtarg->oldaddr));
+ printf("->%s]",inet_ntoa(nbtarg->newaddr ));
+#endif
+ while ( bcount != 0 ) {
+ if (a == NULL || (char *)(a + 1) > pmax)
+ return(NULL);
+#ifdef DEBUG
+ printf("..%s", inet_ntoa(a->addr) );
+#endif
+ if ( !bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr) ) ) {
+ if ( *nbtarg->uh_sum != 0 ) {
+ int acc;
+ u_short *sptr;
+
+ sptr = (u_short *) &(a->addr); /* Old */
+ acc = *sptr++;
+ acc += *sptr;
+ sptr = (u_short *) &nbtarg->newaddr; /* New */
+ acc -= *sptr++;
+ acc -= *sptr;
+ ADJUST_CHECKSUM(acc, *nbtarg->uh_sum);
+ }
+
+ a->addr = nbtarg->newaddr;
+ }
+ a++; /*XXXX*/
+ bcount -= SizeOfResourceA;
+ }
+ if (a == NULL || (char *)(a + 1) > pmax)
+ a = NULL;
+ return ((u_char *)a);
+}
+
+typedef struct {
+ u_short opcode:4, flags:8, resv:4;
+} NBTNsResourceNULL;
+
+static u_char *
+AliasHandleResourceNULL(
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+ NBTNsResourceNULL *n;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return(NULL);
+
+ /* Forward to Resource NULL position */
+ n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) );
+
+ /* Get the resource record data length */
+ bcount = ntohs(q->rdlen);
+
+ /* Skip over the resource data */
+ while ( bcount != 0 ) {
+ if ((char *)(n + 1) > pmax) {
+ n = NULL;
+ break;
+ }
+ n++;
+ bcount -= sizeof(NBTNsResourceNULL);
+ }
+ if ((char *)(n + 1) > pmax)
+ n = NULL;
+
+ return ((u_char *)n);
+}
+
+static u_char *
+AliasHandleResourceNS(
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+ NBTNsResourceNULL *n;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return(NULL);
+
+ /* Forward to Resource NULL position */
+ n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) );
+
+ /* Get the resource record data length */
+ bcount = ntohs(q->rdlen);
+
+ /* Resource Record Name Field */
+ q = (NBTNsResource *)AliasHandleName( (u_char *)n, pmax ); /* XXX */
+
+ if (q == NULL || (char *)((u_char *)n + bcount) > pmax)
+ return(NULL);
+ else
+ return ((u_char *)n + bcount);
+}
+
+typedef struct {
+ u_short numnames;
+} NBTNsResourceNBSTAT;
+
+static u_char *
+AliasHandleResourceNBSTAT(
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments *nbtarg)
+{
+ NBTNsResourceNBSTAT *n;
+ u_short bcount;
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ return(NULL);
+
+ /* Forward to Resource NBSTAT position */
+ n = (NBTNsResourceNBSTAT *)( (u_char *)q + sizeof(NBTNsResource) );
+
+ /* Get the resource record data length */
+ bcount = ntohs(q->rdlen);
+
+ if (q == NULL || (char *)((u_char *)n + bcount) > pmax)
+ return(NULL);
+ else
+ return ((u_char *)n + bcount);
+}
+
+static u_char *
+AliasHandleResource(
+ u_short count,
+ NBTNsResource *q,
+ char *pmax,
+ NBTArguments
+ *nbtarg)
+{
+ while ( count != 0 ) {
+ /* Resource Record Name Field */
+ q = (NBTNsResource *)AliasHandleName( (u_char *)q, pmax );
+
+ if (q == NULL || (char *)(q + 1) > pmax)
+ break;
+#ifdef DEBUG
+ printf("type=%02x, count=%d\n", ntohs(q->type), count );
+#endif
+
+ /* Type and Class field */
+ switch ( ntohs(q->type) ) {
+ case RR_TYPE_NB:
+ q = (NBTNsResource *)AliasHandleResourceNB(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_A:
+ q = (NBTNsResource *)AliasHandleResourceA(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NS:
+ q = (NBTNsResource *)AliasHandleResourceNS(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NULL:
+ q = (NBTNsResource *)AliasHandleResourceNULL(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ case RR_TYPE_NBSTAT:
+ q = (NBTNsResource *)AliasHandleResourceNBSTAT(
+ q,
+ pmax,
+ nbtarg
+ );
+ break;
+ default:
+#ifdef DEBUG
+ printf(
+ "\nUnknown Type of Resource %0x\n",
+ ntohs(q->type)
+ );
+#endif
+ break;
+ }
+ count--;
+ }
+ fflush(stdout);
+ return ((u_char *)q);
+}
+
+int AliasHandleUdpNbtNS(
+ struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *link,
+ struct in_addr *alias_address,
+ u_short *alias_port,
+ struct in_addr *original_address,
+ u_short *original_port )
+{
+ struct udphdr * uh;
+ NbtNSHeader * nsh;
+ u_char * p;
+ char *pmax;
+ NBTArguments nbtarg;
+
+ /* Set up Common Parameter */
+ nbtarg.oldaddr = *alias_address;
+ nbtarg.oldport = *alias_port;
+ nbtarg.newaddr = *original_address;
+ nbtarg.newport = *original_port;
+
+ /* Calculate data length of UDP packet */
+ uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2));
+ nbtarg.uh_sum = &(uh->uh_sum);
+ nsh = (NbtNSHeader *)((char *)uh + (sizeof(struct udphdr)));
+ p = (u_char *)(nsh + 1);
+ pmax = (char *)uh + ntohs( uh->uh_ulen );
+
+ if ((char *)(nsh + 1) > pmax)
+ return(-1);
+
+#ifdef DEBUG
+ printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x"
+ ", an=%04x, ns=%04x, ar=%04x, [%d]-->",
+ nsh->dir ? "Response": "Request",
+ nsh->nametrid,
+ nsh->opcode,
+ nsh->nmflags,
+ nsh->rcode,
+ ntohs(nsh->qdcount),
+ ntohs(nsh->ancount),
+ ntohs(nsh->nscount),
+ ntohs(nsh->arcount),
+ (u_char *)p -(u_char *)nsh
+ );
+#endif
+
+ /* Question Entries */
+ if (ntohs(nsh->qdcount) !=0 ) {
+ p = AliasHandleQuestion(
+ ntohs(nsh->qdcount),
+ (NBTNsQuestion *)p,
+ pmax,
+ &nbtarg
+ );
+ }
+
+ /* Answer Resource Records */
+ if (ntohs(nsh->ancount) !=0 ) {
+ p = AliasHandleResource(
+ ntohs(nsh->ancount),
+ (NBTNsResource *)p,
+ pmax,
+ &nbtarg
+ );
+ }
+
+ /* Authority Resource Records */
+ if (ntohs(nsh->nscount) !=0 ) {
+ p = AliasHandleResource(
+ ntohs(nsh->nscount),
+ (NBTNsResource *)p,
+ pmax,
+ &nbtarg
+ );
+ }
+
+ /* Additional Resource Records */
+ if (ntohs(nsh->arcount) !=0 ) {
+ p = AliasHandleResource(
+ ntohs(nsh->arcount),
+ (NBTNsResource *)p,
+ pmax,
+ &nbtarg
+ );
+ }
+
+#ifdef DEBUG
+ PrintRcode(nsh->rcode);
+#endif
+ return ((p == NULL) ? -1 : 0);
+}
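AliasHandleName() above walks over NetBIOS name fields, which use the RFC 1001 "first level" encoding: each byte of the 16-byte name (15 characters padded with spaces plus a one-byte type suffix) is split into two nibbles and each nibble is offset by 'A'. A stand-alone sketch of that encoding (illustration only; the nb_encode() helper is hypothetical and not part of libalias):

    #include <stdio.h>

    /* First-level encode a NetBIOS name into 32 half-ASCII characters. */
    static void
    nb_encode(const char *name, char out[33])
    {
        int i, done = 0;

        for (i = 0; i < 16; i++) {
            unsigned char c;

            if (i == 15)
                c = 0x00;               /* name type suffix */
            else if (done || name[i] == '\0') {
                done = 1;
                c = ' ';                /* space padding */
            } else
                c = (unsigned char)name[i];
            out[2 * i] = 'A' + (c >> 4);
            out[2 * i + 1] = 'A' + (c & 0x0f);
        }
        out[32] = '\0';
    }

    int
    main(void)
    {
        char enc[33];

        nb_encode("WORKGROUP", enc);
        printf("%s\n", enc);    /* 32 half-ASCII characters */
        return 0;
    }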
diff --git a/sys/netinet/libalias/alias_pptp.c b/sys/netinet/libalias/alias_pptp.c
new file mode 100644
index 0000000..2d3d9ac
--- /dev/null
+++ b/sys/netinet/libalias/alias_pptp.c
@@ -0,0 +1,369 @@
+/*
+ * alias_pptp.c
+ *
+ * Copyright (c) 2000 Whistle Communications, Inc.
+ * All rights reserved.
+ *
+ * Subject to the following obligations and disclaimer of warranty, use and
+ * redistribution of this software, in source or object code forms, with or
+ * without modifications are expressly permitted by Whistle Communications;
+ * provided, however, that:
+ * 1. Any and all reproductions of the source or object code must include the
+ * copyright notice above and the following disclaimer of warranties; and
+ * 2. No rights are granted, in any manner or form, to use Whistle
+ * Communications, Inc. trademarks, including the mark "WHISTLE
+ * COMMUNICATIONS" on advertising, endorsements, or otherwise except as
+ * such appears in the above copyright notice or in the software.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
+ * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
+ * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
+ * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
+ * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
+ * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
+ * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
+ * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ *
+ * Author: Erik Salander <erik@whistle.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ Alias_pptp.c performs special processing for PPTP sessions under TCP.
+ Specifically, watch PPTP control messages and alias the Call ID or the
+ Peer's Call ID in the appropriate messages. Note that PPTP requires
+ "de-aliasing" of incoming packets; this is different from the other
+ TCP applications that are currently aliased (i.e., FTP, IRC and RTSP).
+
+ For Call IDs encountered for the first time, a PPTP alias link is created.
+ The PPTP alias link uses the Call ID in place of the original port number.
+ An alias Call ID is created.
+
+ For this routine to work, the PPTP control messages must fit entirely
+ into a single TCP packet. This is typically the case, but is not
+ required by the spec.
+
+ Unlike some of the other TCP applications that are aliased (i.e., FTP,
+ IRC and RTSP), the PPTP control messages that need to be aliased are
+ guaranteed to remain the same length. The aliased Call ID is a fixed
+ length field.
+
+ Reference: RFC 2637
+
+ Initial version: May, 2000 (eds)
+
+*/
+
+/* Includes */
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <stdio.h>
+
+#include "alias_local.h"
+
+/*
+ * PPTP definitions
+ */
+
+struct grehdr /* Enhanced GRE header. */
+{
+ u_int16_t gh_flags; /* Flags. */
+ u_int16_t gh_protocol; /* Protocol type. */
+ u_int16_t gh_length; /* Payload length. */
+ u_int16_t gh_call_id; /* Call ID. */
+ u_int32_t gh_seq_no; /* Sequence number (optional). */
+ u_int32_t gh_ack_no; /* Acknowledgment number (optional). */
+};
+typedef struct grehdr GreHdr;
+
+/* The PPTP protocol ID used in the GRE 'proto' field. */
+#define PPTP_GRE_PROTO 0x880b
+
+/* Bits that must be set a certain way in all PPTP/GRE packets. */
+#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO)
+#define PPTP_INIT_MASK 0xef7fffff
+
+#define PPTP_MAGIC 0x1a2b3c4d
+#define PPTP_CTRL_MSG_TYPE 1
+
+enum {
+ PPTP_StartCtrlConnRequest = 1,
+ PPTP_StartCtrlConnReply = 2,
+ PPTP_StopCtrlConnRequest = 3,
+ PPTP_StopCtrlConnReply = 4,
+ PPTP_EchoRequest = 5,
+ PPTP_EchoReply = 6,
+ PPTP_OutCallRequest = 7,
+ PPTP_OutCallReply = 8,
+ PPTP_InCallRequest = 9,
+ PPTP_InCallReply = 10,
+ PPTP_InCallConn = 11,
+ PPTP_CallClearRequest = 12,
+ PPTP_CallDiscNotify = 13,
+ PPTP_WanErrorNotify = 14,
+ PPTP_SetLinkInfo = 15
+};
+
+ /* Message structures */
+ struct pptpMsgHead {
+ u_int16_t length; /* total length */
+ u_int16_t msgType; /* PPTP message type */
+ u_int32_t magic; /* magic cookie */
+ u_int16_t type; /* control message type */
+ u_int16_t resv0; /* reserved */
+ };
+ typedef struct pptpMsgHead *PptpMsgHead;
+
+ struct pptpCodes {
+ u_int8_t resCode; /* Result Code */
+ u_int8_t errCode; /* Error Code */
+ };
+ typedef struct pptpCodes *PptpCode;
+
+ struct pptpCallIds {
+ u_int16_t cid1; /* Call ID field #1 */
+ u_int16_t cid2; /* Call ID field #2 */
+ };
+ typedef struct pptpCallIds *PptpCallId;
+
+static PptpCallId AliasVerifyPptp(struct ip *, u_int16_t *);
+
+
+void
+AliasHandlePptpOut(struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *link) /* The PPTP control link */
+{
+ struct alias_link *pptp_link;
+ PptpCallId cptr;
+ PptpCode codes;
+ u_int16_t ctl_type; /* control message type */
+ struct tcphdr *tc;
+
+ /* Verify valid PPTP control message */
+ if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL)
+ return;
+
+ /* Modify certain PPTP messages */
+ switch (ctl_type) {
+ case PPTP_OutCallRequest:
+ case PPTP_OutCallReply:
+ case PPTP_InCallRequest:
+ case PPTP_InCallReply:
+ /* Establish PPTP link for address and Call ID found in control message. */
+ pptp_link = AddPptp(GetOriginalAddress(link), GetDestAddress(link),
+ GetAliasAddress(link), cptr->cid1);
+ break;
+ case PPTP_CallClearRequest:
+ case PPTP_CallDiscNotify:
+ /* Find PPTP link for address and Call ID found in control message. */
+ pptp_link = FindPptpOutByCallId(GetOriginalAddress(link),
+ GetDestAddress(link),
+ cptr->cid1);
+ break;
+ default:
+ return;
+ }
+
+ if (pptp_link != NULL) {
+ int accumulate = cptr->cid1;
+
+ /* alias the Call Id */
+ cptr->cid1 = GetAliasPort(pptp_link);
+
+ /* Compute TCP checksum for revised packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ accumulate -= cptr->cid1;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ switch (ctl_type) {
+ case PPTP_OutCallReply:
+ case PPTP_InCallReply:
+ codes = (PptpCode)(cptr + 1);
+ if (codes->resCode == 1) /* Connection established, */
+ SetDestCallId(pptp_link, /* note the Peer's Call ID. */
+ cptr->cid2);
+ else
+ SetExpire(pptp_link, 0); /* Connection refused. */
+ break;
+ case PPTP_CallDiscNotify: /* Connection closed. */
+ SetExpire(pptp_link, 0);
+ break;
+ }
+ }
+}
+
+void
+AliasHandlePptpIn(struct ip *pip, /* IP packet to examine/patch */
+ struct alias_link *link) /* The PPTP control link */
+{
+ struct alias_link *pptp_link;
+ PptpCallId cptr;
+ u_int16_t *pcall_id;
+ u_int16_t ctl_type; /* control message type */
+ struct tcphdr *tc;
+
+ /* Verify valid PPTP control message */
+ if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL)
+ return;
+
+ /* Modify certain PPTP messages */
+ switch (ctl_type)
+ {
+ case PPTP_InCallConn:
+ case PPTP_WanErrorNotify:
+ case PPTP_SetLinkInfo:
+ pcall_id = &cptr->cid1;
+ break;
+ case PPTP_OutCallReply:
+ case PPTP_InCallReply:
+ pcall_id = &cptr->cid2;
+ break;
+ case PPTP_CallDiscNotify: /* Connection closed. */
+ pptp_link = FindPptpInByCallId(GetDestAddress(link),
+ GetAliasAddress(link),
+ cptr->cid1);
+ if (pptp_link != NULL)
+ SetExpire(pptp_link, 0);
+ return;
+ default:
+ return;
+ }
+
+ /* Find PPTP link for address and Call ID found in PPTP Control Msg */
+ pptp_link = FindPptpInByPeerCallId(GetDestAddress(link),
+ GetAliasAddress(link),
+ *pcall_id);
+
+ if (pptp_link != NULL) {
+ int accumulate = *pcall_id;
+
+ /* De-alias the Peer's Call Id. */
+ *pcall_id = GetOriginalPort(pptp_link);
+
+ /* Compute TCP checksum for modified packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ accumulate -= *pcall_id;
+ ADJUST_CHECKSUM(accumulate, tc->th_sum);
+
+ if (ctl_type == PPTP_OutCallReply || ctl_type == PPTP_InCallReply) {
+ PptpCode codes = (PptpCode)(cptr + 1);
+
+ if (codes->resCode == 1) /* Connection established, */
+ SetDestCallId(pptp_link, /* note the Call ID. */
+ cptr->cid1);
+ else
+ SetExpire(pptp_link, 0); /* Connection refused. */
+ }
+ }
+}
+
+static PptpCallId
+AliasVerifyPptp(struct ip *pip, u_int16_t *ptype) /* IP packet to examine/patch */
+{
+ int hlen, tlen, dlen;
+ PptpMsgHead hptr;
+ struct tcphdr *tc;
+
+ /* Calculate some lengths */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /* Verify data length */
+ if (dlen < (sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds)))
+ return(NULL);
+
+ /* Move up to PPTP message header */
+ hptr = (PptpMsgHead)(((char *) pip) + hlen);
+
+ /* Return the control message type */
+ *ptype = ntohs(hptr->type);
+
+ /* Verify PPTP Control Message */
+ if ((ntohs(hptr->msgType) != PPTP_CTRL_MSG_TYPE) ||
+ (ntohl(hptr->magic) != PPTP_MAGIC))
+ return(NULL);
+
+ /* Verify data length. */
+ if ((*ptype == PPTP_OutCallReply || *ptype == PPTP_InCallReply) &&
+ (dlen < sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds) +
+ sizeof(struct pptpCodes)))
+ return (NULL);
+ else
+ return (PptpCallId)(hptr + 1);
+}
+
+
+int
+AliasHandlePptpGreOut(struct ip *pip)
+{
+ GreHdr *gr;
+ struct alias_link *link;
+
+ gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2));
+
+ /* Check GRE header bits. */
+ if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE)
+ return (-1);
+
+ link = FindPptpOutByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id);
+ if (link != NULL) {
+ struct in_addr alias_addr = GetAliasAddress(link);
+
+ /* Change source IP address. */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *)&alias_addr,
+ (u_short *)&pip->ip_src,
+ 2);
+ pip->ip_src = alias_addr;
+ }
+
+ return (0);
+}
+
+
+int
+AliasHandlePptpGreIn(struct ip *pip)
+{
+ GreHdr *gr;
+ struct alias_link *link;
+
+ gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2));
+
+ /* Check GRE header bits. */
+ if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE)
+ return (-1);
+
+ link = FindPptpInByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id);
+ if (link != NULL) {
+ struct in_addr src_addr = GetOriginalAddress(link);
+
+ /* De-alias the Peer's Call Id. */
+ gr->gh_call_id = GetOriginalPort(link);
+
+ /* Restore original IP address. */
+ DifferentialChecksum(&pip->ip_sum,
+ (u_short *)&src_addr,
+ (u_short *)&pip->ip_dst,
+ 2);
+ pip->ip_dst = src_addr;
+ }
+
+ return (0);
+}
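AliasHandlePptpGreOut() and AliasHandlePptpGreIn() above recognize PPTP data traffic by masking the first 32 bits of the enhanced GRE header and comparing against PPTP_INIT_VALUE: the flags/version word 0x2001 (Key Present bit set, version 1) followed by the PPTP protocol type 0x880b. A stand-alone sketch of that check, reusing the same constants:

    #include <stdio.h>
    #include <sys/types.h>
    #include <netinet/in.h>

    #define PPTP_GRE_PROTO  0x880b
    #define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO)
    #define PPTP_INIT_MASK  0xef7fffff

    int
    main(void)
    {
        /* First 32 bits of an enhanced GRE header as they appear on the
           wire: flags/version 0x2001, protocol type 0x880b (PPTP). */
        u_int32_t first_word = htonl((0x2001 << 16) | PPTP_GRE_PROTO);

        if ((ntohl(first_word) & PPTP_INIT_MASK) == PPTP_INIT_VALUE)
            printf("packet would be treated as PPTP GRE\n");
        return 0;
    }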
diff --git a/sys/netinet/libalias/alias_proxy.c b/sys/netinet/libalias/alias_proxy.c
new file mode 100644
index 0000000..3bd007c
--- /dev/null
+++ b/sys/netinet/libalias/alias_proxy.c
@@ -0,0 +1,837 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* file: alias_proxy.c
+
+ This file encapsulates special operations related to transparent
+ proxy redirection. This is where packets with a particular destination,
+ usually tcp port 80, are redirected to a proxy server.
+
+ When packets are proxied, the destination address and port are
+ modified. In certain cases, it is necessary to somehow encode
+ the original address/port info into the packet. Two methods are
+ presently supported: addition of a [DEST addr port] string at the
+ beginning a of tcp stream, or inclusion of an optional field
+ in the IP header.
+
+ There is one public API function:
+
+ PacketAliasProxyRule() -- Adds and deletes proxy
+ rules.
+
+ Rules are stored in a linear linked list, so lookup efficiency
+ won't be too good for large lists.
+
+
+ Initial development: April, 1998 (cjm)
+*/
+
+
+/* System includes */
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <netdb.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+/* BSD IPV4 includes */
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <arpa/inet.h>
+
+#include "alias_local.h" /* Functions used by alias*.c */
+#include "alias.h" /* Public API functions for libalias */
+
+
+
+/*
+ Data structures
+ */
+
+/*
+ * A linked list of arbitrary length, based on struct proxy_entry is
+ * used to store proxy rules.
+ */
+struct proxy_entry
+{
+#define PROXY_TYPE_ENCODE_NONE 1
+#define PROXY_TYPE_ENCODE_TCPSTREAM 2
+#define PROXY_TYPE_ENCODE_IPHDR 3
+ int rule_index;
+ int proxy_type;
+ u_char proto;
+ u_short proxy_port;
+ u_short server_port;
+
+ struct in_addr server_addr;
+
+ struct in_addr src_addr;
+ struct in_addr src_mask;
+
+ struct in_addr dst_addr;
+ struct in_addr dst_mask;
+
+ struct proxy_entry *next;
+ struct proxy_entry *last;
+};
+
+
+
+/*
+ File scope variables
+*/
+
+static struct proxy_entry *proxyList;
+
+
+
+/* Local (static) functions:
+
+ IpMask() -- Utility function for creating IP
+ masks from integer (1-32) specification.
+ IpAddr() -- Utility function for converting string
+ to IP address
+ IpPort() -- Utility function for converting string
+ to port number
+ RuleAdd() -- Adds an element to the rule list.
+ RuleDelete() -- Removes an element from the rule list.
+ RuleNumberDelete() -- Removes all elements from the rule list
+ having a certain rule number.
+ ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning
+ of a TCP stream.
+ ProxyEncodeIpHeader() -- Adds an IP option indicating the true
+ destination of a proxied IP packet
+*/
+
+static int IpMask(int, struct in_addr *);
+static int IpAddr(char *, struct in_addr *);
+static int IpPort(char *, int, int *);
+static void RuleAdd(struct proxy_entry *);
+static void RuleDelete(struct proxy_entry *);
+static int RuleNumberDelete(int);
+static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int);
+static void ProxyEncodeIpHeader(struct ip *, int);
+
+static int
+IpMask(int nbits, struct in_addr *mask)
+{
+ int i;
+ u_int imask;
+
+ if (nbits < 0 || nbits > 32)
+ return -1;
+
+ imask = 0;
+ for (i=0; i<nbits; i++)
+ imask = (imask >> 1) + 0x80000000;
+ mask->s_addr = htonl(imask);
+
+ return 0;
+}
+
+static int
+IpAddr(char *s, struct in_addr *addr)
+{
+ if (inet_aton(s, addr) == 0)
+ return -1;
+ else
+ return 0;
+}
+
+static int
+IpPort(char *s, int proto, int *port)
+{
+ int n;
+
+ n = sscanf(s, "%d", port);
+ if (n != 1)
+ {
+ struct servent *se;
+
+ if (proto == IPPROTO_TCP)
+ se = getservbyname(s, "tcp");
+ else if (proto == IPPROTO_UDP)
+ se = getservbyname(s, "udp");
+ else
+ return -1;
+
+ if (se == NULL)
+ return -1;
+
+ *port = (u_int) ntohs(se->s_port);
+ }
+
+ return 0;
+}
+
+static void
+RuleAdd(struct proxy_entry *entry)
+{
+ int rule_index;
+ struct proxy_entry *ptr;
+ struct proxy_entry *ptr_last;
+
+ if (proxyList == NULL)
+ {
+ proxyList = entry;
+ entry->last = NULL;
+ entry->next = NULL;
+ return;
+ }
+
+ rule_index = entry->rule_index;
+ ptr = proxyList;
+ ptr_last = NULL;
+ while (ptr != NULL)
+ {
+ if (ptr->rule_index >= rule_index)
+ {
+ if (ptr_last == NULL)
+ {
+ entry->next = proxyList;
+ entry->last = NULL;
+ proxyList->last = entry;
+ proxyList = entry;
+ return;
+ }
+
+ ptr_last->next = entry;
+ entry->last = ptr_last;
+ entry->next = ptr;
+ ptr->last = entry;
+ return;
+ }
+ ptr_last = ptr;
+ ptr = ptr->next;
+ }
+
+ ptr_last->next = entry;
+ entry->last = ptr_last;
+ entry->next = NULL;
+}
+
+static void
+RuleDelete(struct proxy_entry *entry)
+{
+ if (entry->last != NULL)
+ entry->last->next = entry->next;
+ else
+ proxyList = entry->next;
+
+ if (entry->next != NULL)
+ entry->next->last = entry->last;
+
+ free(entry);
+}
+
+static int
+RuleNumberDelete(int rule_index)
+{
+ int err;
+ struct proxy_entry *ptr;
+
+ err = -1;
+ ptr = proxyList;
+ while (ptr != NULL)
+ {
+ struct proxy_entry *ptr_next;
+
+ ptr_next = ptr->next;
+ if (ptr->rule_index == rule_index)
+ {
+ err = 0;
+ RuleDelete(ptr);
+ }
+
+ ptr = ptr_next;
+ }
+
+ return err;
+}
+
+static void
+ProxyEncodeTcpStream(struct alias_link *link,
+ struct ip *pip,
+ int maxpacketsize)
+{
+ int slen;
+ char buffer[40];
+ struct tcphdr *tc;
+
+/* Compute pointer to tcp header */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+
+/* Don't modify a stream that has already been modified once */
+
+ if (GetAckModified (link))
+ return;
+
+/* Translate destination address and port to string form */
+ snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]",
+ inet_ntoa(GetProxyAddress (link)), (u_int) ntohs(GetProxyPort (link)));
+
+/* Pad string out to a multiple of two in length */
+ slen = strlen(buffer);
+ switch (slen % 2)
+ {
+ case 0:
+ strcat(buffer, " \n");
+ slen += 2;
+ break;
+ case 1:
+ strcat(buffer, "\n");
+ slen += 1;
+ }
+
+/* Check for packet overflow */
+ if ((ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize)
+ return;
+
+/* Shift existing TCP data and insert destination string */
+ {
+ int dlen;
+ int hlen;
+ u_char *p;
+
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ dlen = ntohs (pip->ip_len) - hlen;
+
+/* Modify first packet that has data in it */
+
+ if (dlen == 0)
+ return;
+
+ p = (u_char *) pip;
+ p += hlen;
+
+ memmove(p + slen, p, dlen);
+ memcpy(p, buffer, slen);
+ }
+
+/* Save information about modified sequence number */
+ {
+ int delta;
+
+ SetAckModified(link);
+ delta = GetDeltaSeqOut(pip, link);
+ AddSeq(pip, link, delta+slen);
+ }
+
+/* Update IP header packet length and checksum */
+ {
+ int accumulate;
+
+ accumulate = pip->ip_len;
+ pip->ip_len = htons(ntohs(pip->ip_len) + slen);
+ accumulate -= pip->ip_len;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+
+/* Update the TCP checksum; use TcpChecksum() since so many things have
+ already changed. */
+
+ tc->th_sum = 0;
+ tc->th_sum = TcpChecksum (pip);
+}
+
+static void
+ProxyEncodeIpHeader(struct ip *pip,
+ int maxpacketsize)
+{
+#define OPTION_LEN_BYTES 8
+#define OPTION_LEN_INT16 4
+#define OPTION_LEN_INT32 2
+ u_char option[OPTION_LEN_BYTES];
+
+#ifdef DEBUG
+ fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip));
+ fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip));
+#endif
+
+/* Check to see that there is room to add an IP option */
+ if (pip->ip_hl > (0x0f - OPTION_LEN_INT32))
+ return;
+
+/* Build option and copy into packet */
+ {
+ u_char *ptr;
+ struct tcphdr *tc;
+
+ ptr = (u_char *) pip;
+ ptr += 20;
+ memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20);
+
+ option[0] = 0x64; /* class: 3 (reserved), option 4 */
+ option[1] = OPTION_LEN_BYTES;
+
+ memcpy(&option[2], (u_char *) &pip->ip_dst, 4);
+
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ memcpy(&option[6], (u_char *) &tc->th_sport, 2);
+
+ memcpy(ptr, option, 8);
+ }
+
+/* Update checksum, header length and packet length */
+ {
+ int i;
+ int accumulate;
+ u_short *sptr;
+
+ sptr = (u_short *) option;
+ accumulate = 0;
+ for (i=0; i<OPTION_LEN_INT16; i++)
+ accumulate -= *(sptr++);
+
+ sptr = (u_short *) pip;
+ accumulate += *sptr;
+ pip->ip_hl += OPTION_LEN_INT32;
+ accumulate -= *sptr;
+
+ accumulate += pip->ip_len;
+ pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES);
+ accumulate -= pip->ip_len;
+
+ ADJUST_CHECKSUM(accumulate, pip->ip_sum);
+ }
+#undef OPTION_LEN_BYTES
+#undef OPTION_LEN_INT16
+#undef OPTION_LEN_INT32
+#ifdef DEBUG
+ fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip));
+ fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip));
+#endif
+}
+
+
+/* Functions used by other packet alias source files
+
+ ProxyCheck() -- Checks whether an outgoing packet should
+ be proxied.
+ ProxyModify() -- Encodes the original destination address/port
+ for a packet which is to be redirected to
+ a proxy server.
+*/
+
+int
+ProxyCheck(struct ip *pip,
+ struct in_addr *proxy_server_addr,
+ u_short *proxy_server_port)
+{
+ u_short dst_port;
+ struct in_addr src_addr;
+ struct in_addr dst_addr;
+ struct proxy_entry *ptr;
+
+ src_addr = pip->ip_src;
+ dst_addr = pip->ip_dst;
+ dst_port = ((struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)))
+ ->th_dport;
+
+ ptr = proxyList;
+ while (ptr != NULL)
+ {
+ u_short proxy_port;
+
+ proxy_port = ptr->proxy_port;
+ if ((dst_port == proxy_port || proxy_port == 0)
+ && pip->ip_p == ptr->proto
+ && src_addr.s_addr != ptr->server_addr.s_addr)
+ {
+ struct in_addr src_addr_masked;
+ struct in_addr dst_addr_masked;
+
+ src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr;
+ dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr;
+
+ if ((src_addr_masked.s_addr == ptr->src_addr.s_addr)
+ && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr))
+ {
+ if ((*proxy_server_port = ptr->server_port) == 0)
+ *proxy_server_port = dst_port;
+ *proxy_server_addr = ptr->server_addr;
+ return ptr->proxy_type;
+ }
+ }
+ ptr = ptr->next;
+ }
+
+ return 0;
+}
+
+void
+ProxyModify(struct alias_link *link,
+ struct ip *pip,
+ int maxpacketsize,
+ int proxy_type)
+{
+ switch (proxy_type)
+ {
+ case PROXY_TYPE_ENCODE_IPHDR:
+ ProxyEncodeIpHeader(pip, maxpacketsize);
+ break;
+
+ case PROXY_TYPE_ENCODE_TCPSTREAM:
+ ProxyEncodeTcpStream(link, pip, maxpacketsize);
+ break;
+ }
+}
+
+
+/*
+ Public API functions
+*/
+
+int
+PacketAliasProxyRule(const char *cmd)
+{
+/*
+ * This function takes command strings of the form:
+ *
+ * server <addr>[:<port>]
+ * [port <port>]
+ * [rule n]
+ * [proto tcp|udp]
+ * [src <addr>[/n]]
+ * [dst <addr>[/n]]
+ * [type encode_tcp_stream|encode_ip_hdr|no_encode]
+ *
+ * delete <rule number>
+ *
+ * Subfields can be in arbitrary order. Port numbers and addresses
+ * must be in either numeric or symbolic form. An optional rule number
+ * is used to control the order in which rules are searched. If two
+ * rules have the same number, then search order cannot be guaranteed,
+ * and the rules should be disjoint. If no rule number is specified,
+ * then 0 is used, and group 0 rules are always checked before any
+ * others.
+ */
+ int i, n, len;
+ int cmd_len;
+ int token_count;
+ int state;
+ char *token;
+ char buffer[256];
+ char str_port[sizeof(buffer)];
+ char str_server_port[sizeof(buffer)];
+ char *res = buffer;
+
+ int rule_index;
+ int proto;
+ int proxy_type;
+ int proxy_port;
+ int server_port;
+ struct in_addr server_addr;
+ struct in_addr src_addr, src_mask;
+ struct in_addr dst_addr, dst_mask;
+ struct proxy_entry *proxy_entry;
+
+/* Copy command line into a buffer */
+ cmd += strspn(cmd, " \t");
+ cmd_len = strlen(cmd);
+ if (cmd_len > (sizeof(buffer) - 1))
+ return -1;
+ strcpy(buffer, cmd);
+
+/* Convert to lower case */
+ len = strlen(buffer);
+ for (i=0; i<len; i++)
+ buffer[i] = tolower((unsigned char)buffer[i]);
+
+/* Set up default values */
+ rule_index = 0;
+ proxy_type = PROXY_TYPE_ENCODE_NONE;
+ proto = IPPROTO_TCP;
+ proxy_port = 0;
+ server_addr.s_addr = 0;
+ server_port = 0;
+ src_addr.s_addr = 0;
+ IpMask(0, &src_mask);
+ dst_addr.s_addr = 0;
+ IpMask(0, &dst_mask);
+
+ str_port[0] = 0;
+ str_server_port[0] = 0;
+
+/* Parse command string with state machine */
+#define STATE_READ_KEYWORD 0
+#define STATE_READ_TYPE 1
+#define STATE_READ_PORT 2
+#define STATE_READ_SERVER 3
+#define STATE_READ_RULE 4
+#define STATE_READ_DELETE 5
+#define STATE_READ_PROTO 6
+#define STATE_READ_SRC 7
+#define STATE_READ_DST 8
+ state = STATE_READ_KEYWORD;
+ token = strsep(&res, " \t");
+ token_count = 0;
+ while (token != NULL)
+ {
+ token_count++;
+ switch (state)
+ {
+ case STATE_READ_KEYWORD:
+ if (strcmp(token, "type") == 0)
+ state = STATE_READ_TYPE;
+ else if (strcmp(token, "port") == 0)
+ state = STATE_READ_PORT;
+ else if (strcmp(token, "server") == 0)
+ state = STATE_READ_SERVER;
+ else if (strcmp(token, "rule") == 0)
+ state = STATE_READ_RULE;
+ else if (strcmp(token, "delete") == 0)
+ state = STATE_READ_DELETE;
+ else if (strcmp(token, "proto") == 0)
+ state = STATE_READ_PROTO;
+ else if (strcmp(token, "src") == 0)
+ state = STATE_READ_SRC;
+ else if (strcmp(token, "dst") == 0)
+ state = STATE_READ_DST;
+ else
+ return -1;
+ break;
+
+ case STATE_READ_TYPE:
+ if (strcmp(token, "encode_ip_hdr") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_IPHDR;
+ else if (strcmp(token, "encode_tcp_stream") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM;
+ else if (strcmp(token, "no_encode") == 0)
+ proxy_type = PROXY_TYPE_ENCODE_NONE;
+ else
+ return -1;
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_PORT:
+ strcpy(str_port, token);
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_SERVER:
+ {
+ int err;
+ char *p;
+ char s[sizeof(buffer)];
+
+ p = token;
+ while (*p != ':' && *p != 0)
+ p++;
+
+ if (*p != ':')
+ {
+ err = IpAddr(token, &server_addr);
+ if (err)
+ return -1;
+ }
+ else
+ {
+ *p = ' ';
+
+ n = sscanf(token, "%s %s", s, str_server_port);
+ if (n != 2)
+ return -1;
+
+ err = IpAddr(s, &server_addr);
+ if (err)
+ return -1;
+ }
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_RULE:
+ n = sscanf(token, "%d", &rule_index);
+ if (n != 1 || rule_index < 0)
+ return -1;
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_DELETE:
+ {
+ int err;
+ int rule_to_delete;
+
+ if (token_count != 2)
+ return -1;
+
+ n = sscanf(token, "%d", &rule_to_delete);
+ if (n != 1)
+ return -1;
+ err = RuleNumberDelete(rule_to_delete);
+ if (err)
+ return -1;
+ return 0;
+ }
+
+ case STATE_READ_PROTO:
+ if (strcmp(token, "tcp") == 0)
+ proto = IPPROTO_TCP;
+ else if (strcmp(token, "udp") == 0)
+ proto = IPPROTO_UDP;
+ else
+ return -1;
+ state = STATE_READ_KEYWORD;
+ break;
+
+ case STATE_READ_SRC:
+ case STATE_READ_DST:
+ {
+ int err;
+ char *p;
+ struct in_addr mask;
+ struct in_addr addr;
+
+ p = token;
+ while (*p != '/' && *p != 0)
+ p++;
+
+ if (*p != '/')
+ {
+ IpMask(32, &mask);
+ err = IpAddr(token, &addr);
+ if (err)
+ return -1;
+ }
+ else
+ {
+ int nbits;
+ char s[sizeof(buffer)];
+
+ *p = ' ';
+ n = sscanf(token, "%s %d", s, &nbits);
+ if (n != 2)
+ return -1;
+
+ err = IpAddr(s, &addr);
+ if (err)
+ return -1;
+
+ err = IpMask(nbits, &mask);
+ if (err)
+ return -1;
+ }
+
+ if (state == STATE_READ_SRC)
+ {
+ src_addr = addr;
+ src_mask = mask;
+ }
+ else
+ {
+ dst_addr = addr;
+ dst_mask = mask;
+ }
+ }
+ state = STATE_READ_KEYWORD;
+ break;
+
+ default:
+ return -1;
+ break;
+ }
+
+ do {
+ token = strsep(&res, " \t");
+ } while (token != NULL && !*token);
+ }
+#undef STATE_READ_KEYWORD
+#undef STATE_READ_TYPE
+#undef STATE_READ_PORT
+#undef STATE_READ_SERVER
+#undef STATE_READ_RULE
+#undef STATE_READ_DELETE
+#undef STATE_READ_PROTO
+#undef STATE_READ_SRC
+#undef STATE_READ_DST
+
+/* Convert port strings to numbers.  This needs to be done after
+   the string is parsed, because the protocol might not be designated
+   before the ports (which might be symbolic entries in /etc/services). */
+
+ if (strlen(str_port) != 0)
+ {
+ int err;
+
+ err = IpPort(str_port, proto, &proxy_port);
+ if (err)
+ return -1;
+ }
+ else
+ {
+ proxy_port = 0;
+ }
+
+ if (strlen(str_server_port) != 0)
+ {
+ int err;
+
+ err = IpPort(str_server_port, proto, &server_port);
+ if (err)
+ return -1;
+ }
+ else
+ {
+ server_port = 0;
+ }
+
+/* Check that at least the server address has been defined */
+ if (server_addr.s_addr == 0)
+ return -1;
+
+/* Add to linked list */
+ proxy_entry = malloc(sizeof(struct proxy_entry));
+ if (proxy_entry == NULL)
+ return -1;
+
+ proxy_entry->proxy_type = proxy_type;
+ proxy_entry->rule_index = rule_index;
+ proxy_entry->proto = proto;
+ proxy_entry->proxy_port = htons(proxy_port);
+ proxy_entry->server_port = htons(server_port);
+ proxy_entry->server_addr = server_addr;
+ proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr;
+ proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr;
+ proxy_entry->src_mask = src_mask;
+ proxy_entry->dst_mask = dst_mask;
+
+ RuleAdd(proxy_entry);
+
+ return 0;
+}
diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c
new file mode 100644
index 0000000..027a724
--- /dev/null
+++ b/sys/netinet/libalias/alias_smedia.c
@@ -0,0 +1,433 @@
+/*
+ * alias_smedia.c
+ *
+ * Copyright (c) 2000 Whistle Communications, Inc.
+ * All rights reserved.
+ *
+ * Subject to the following obligations and disclaimer of warranty, use and
+ * redistribution of this software, in source or object code forms, with or
+ * without modifications are expressly permitted by Whistle Communications;
+ * provided, however, that:
+ * 1. Any and all reproductions of the source or object code must include the
+ * copyright notice above and the following disclaimer of warranties; and
+ * 2. No rights are granted, in any manner or form, to use Whistle
+ * Communications, Inc. trademarks, including the mark "WHISTLE
+ * COMMUNICATIONS" on advertising, endorsements, or otherwise except as
+ * such appears in the above copyright notice or in the software.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
+ * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
+ * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
+ * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
+ * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
+ * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
+ * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
+ * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
+ * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
+ * OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2000 Junichi SATOH <junichi@astec.co.jp>
+ * <junichi@junichi.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors: Erik Salander <erik@whistle.com>
+ * Junichi SATOH <junichi@astec.co.jp>
+ * <junichi@junichi.org>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+  Alias_smedia.c is meant to contain the aliasing code for streaming media
+  protocols.  It performs special processing for RTSP sessions under TCP.
+  Specifically, when a SETUP request is sent by a client, or a 200 reply
+  is sent by a server, it is intercepted and modified.  The address is
+  changed to the gateway machine and an aliasing port is used.
+
+  More specifically, the "client_port" configuration parameter is
+  parsed for SETUP requests.  The "server_port" configuration parameter is
+  parsed for 200 replies emanating from a server.  This is intended to
+  handle the unicast case.
+
+  RTSP also allows a stream to be redirected to another client by using
+  the "destination" configuration parameter, which would indicate a
+  different IP address.  This feature is NOT supported by the RTSP
+  translation code below.
+
+  RTSP multicast sessions work without any address translation
+  intervention.
+
+  For this routine to work, the SETUP/200 must fit entirely
+  into a single TCP packet.  This is typically the case, but exceptions
+  can easily be envisioned under the actual specifications.
+
+  Probably the most troubling aspect of the approach taken here is
+  that the new SETUP/200 will typically be a different length, and
+  this causes a certain amount of bookkeeping to keep track of the
+  changes of sequence and acknowledgment numbers, since the client
+  machine is totally unaware of the modification to the TCP stream.
+
+  Initial version: May, 2000 (eds)
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include "alias_local.h"
+
+#define RTSP_CONTROL_PORT_NUMBER_1 554
+#define RTSP_CONTROL_PORT_NUMBER_2 7070
+#define RTSP_PORT_GROUP 2
+
+#define ISDIGIT(a) (((a) >= '0') && ((a) <= '9'))
+
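+/* Case-insensitive substring search (search_str is assumed to be lower
+   case).  Returns the index just past the last matched character in
+   data, or -1 if search_str is not found. */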
+static int
+search_string(char *data, int dlen, const char *search_str)
+{
+ int i, j, k;
+ int search_str_len;
+
+ search_str_len = strlen(search_str);
+ for (i = 0; i < dlen - search_str_len; i++) {
+ for (j = i, k = 0; j < dlen - search_str_len; j++, k++) {
+ if (data[j] != search_str[k] &&
+ data[j] != search_str[k] - ('a' - 'A')) {
+ break;
+ }
+ if (k == search_str_len - 1) {
+ return j + 1;
+ }
+ }
+ }
+ return -1;
+}
+
+static int
+alias_rtsp_out(struct ip *pip,
+ struct alias_link *link,
+ char *data,
+ const char *port_str)
+{
+ int hlen, tlen, dlen;
+ struct tcphdr *tc;
+ int i, j, pos, state, port_dlen, new_dlen, delta;
+ u_short p[2], new_len;
+ u_short sport, eport, base_port;
+ u_short salias = 0, ealias = 0, base_alias = 0;
+ const char *transport_str = "transport:";
+ char newdata[2048], *port_data, *port_newdata, stemp[80];
+ int links_created = 0, pkt_updated = 0;
+ struct alias_link *rtsp_link = NULL;
+ struct in_addr null_addr;
+
+ /* Calculate data length of TCP packet */
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ /* Find keyword, "Transport: " */
+ pos = search_string(data, dlen, transport_str);
+ if (pos < 0) {
+ return -1;
+ }
+ port_data = data + pos;
+ port_dlen = dlen - pos;
+
+ memcpy(newdata, data, pos);
+ port_newdata = newdata + pos;
+
+ while (port_dlen > strlen(port_str)) {
+ /* Find keyword, appropriate port string */
+ pos = search_string(port_data, port_dlen, port_str);
+ if (pos < 0) {
+ break;
+ }
+
+ memcpy (port_newdata, port_data, pos + 1);
+ port_newdata += (pos + 1);
+
+ p[0] = p[1] = 0;
+ sport = eport = 0;
+ state = 0;
+ for (i = pos; i < port_dlen; i++) {
+ switch(state) {
+ case 0:
+ if (port_data[i] == '=') {
+ state++;
+ }
+ break;
+ case 1:
+ if (ISDIGIT(port_data[i])) {
+ p[0] = p[0] * 10 + port_data[i] - '0';
+ } else {
+ if (port_data[i] == ';') {
+ state = 3;
+ }
+ if (port_data[i] == '-') {
+ state++;
+ }
+ }
+ break;
+ case 2:
+ if (ISDIGIT(port_data[i])) {
+ p[1] = p[1] * 10 + port_data[i] - '0';
+ } else {
+ state++;
+ }
+ break;
+ case 3:
+ base_port = p[0];
+ sport = htons(p[0]);
+ eport = htons(p[1]);
+
+ if (!links_created) {
+
+ links_created = 1;
+ /* Find an even numbered port number base that
+ satisfies the contiguous number of ports we need */
+ null_addr.s_addr = 0;
+ if (0 == (salias = FindNewPortGroup(null_addr,
+ FindAliasAddress(pip->ip_src),
+ sport, 0,
+ RTSP_PORT_GROUP,
+ IPPROTO_UDP, 1))) {
+#ifdef DEBUG
+ fprintf(stderr,
+ "PacketAlias/RTSP: Cannot find contiguous RTSP data ports\n");
+#endif
+ } else {
+
+ base_alias = ntohs(salias);
+ for (j = 0; j < RTSP_PORT_GROUP; j++) {
+ /* Establish link to port found in RTSP packet */
+ rtsp_link = FindRtspOut(GetOriginalAddress(link), null_addr,
+ htons(base_port + j), htons(base_alias + j),
+ IPPROTO_UDP);
+ if (rtsp_link != NULL) {
+#ifndef NO_FW_PUNCH
+ /* Punch hole in firewall */
+ PunchFWHole(rtsp_link);
+#endif
+ } else {
+#ifdef DEBUG
+ fprintf(stderr,
+ "PacketAlias/RTSP: Cannot allocate RTSP data ports\n");
+#endif
+ break;
+ }
+ }
+ }
+ ealias = htons(base_alias + (RTSP_PORT_GROUP - 1));
+ }
+
+ if (salias && rtsp_link) {
+
+ pkt_updated = 1;
+
+ /* Copy into IP packet */
+ sprintf(stemp, "%d", ntohs(salias));
+ memcpy(port_newdata, stemp, strlen(stemp));
+ port_newdata += strlen(stemp);
+
+ if (eport != 0) {
+ *port_newdata = '-';
+ port_newdata++;
+
+ /* Copy into IP packet */
+ sprintf(stemp, "%d", ntohs(ealias));
+ memcpy(port_newdata, stemp, strlen(stemp));
+ port_newdata += strlen(stemp);
+ }
+
+ *port_newdata = ';';
+ port_newdata++;
+ }
+ state++;
+ break;
+ }
+ if (state > 3) {
+ break;
+ }
+ }
+ port_data += i;
+ port_dlen -= i;
+ }
+
+ if (!pkt_updated)
+ return -1;
+
+ memcpy (port_newdata, port_data, port_dlen);
+ port_newdata += port_dlen;
+ *port_newdata = '\0';
+
+ /* Create new packet */
+ new_dlen = port_newdata - newdata;
+ memcpy (data, newdata, new_dlen);
+
+ SetAckModified(link);
+ delta = GetDeltaSeqOut(pip, link);
+ AddSeq(pip, link, delta + new_dlen - dlen);
+
+ new_len = htons(hlen + new_dlen);
+ DifferentialChecksum(&pip->ip_sum,
+ &new_len,
+ &pip->ip_len,
+ 1);
+ pip->ip_len = new_len;
+
+ tc->th_sum = 0;
+ tc->th_sum = TcpChecksum(pip);
+
+ return 0;
+}
+
+/* Support the protocol used by early versions of RealPlayer */
+
+static int
+alias_pna_out(struct ip *pip,
+ struct alias_link *link,
+ char *data,
+ int dlen)
+{
+ struct alias_link *pna_links;
+ u_short msg_id, msg_len;
+ char *work;
+ u_short alias_port, port;
+ struct tcphdr *tc;
+
+ work = data;
+ work += 5;
+ while (work + 4 < data + dlen) {
+ memcpy(&msg_id, work, 2);
+ work += 2;
+ memcpy(&msg_len, work, 2);
+ work += 2;
+ if (ntohs(msg_id) == 0) {
+ /* end of options */
+ return 0;
+ }
+ if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) {
+ memcpy(&port, work, 2);
+ pna_links = FindUdpTcpOut(pip->ip_src, GetDestAddress(link),
+ port, 0, IPPROTO_UDP, 1);
+ if (pna_links != NULL) {
+#ifndef NO_FW_PUNCH
+ /* Punch hole in firewall */
+ PunchFWHole(pna_links);
+#endif
+ tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2));
+ alias_port = GetAliasPort(pna_links);
+ memcpy(work, &alias_port, 2);
+
+ /* Compute TCP checksum for revised packet */
+ tc->th_sum = 0;
+ tc->th_sum = TcpChecksum(pip);
+ }
+ }
+ work += ntohs(msg_len);
+ }
+
+ return 0;
+}
+
+void
+AliasHandleRtspOut(struct ip *pip, struct alias_link *link, int maxpacketsize)
+{
+ int hlen, tlen, dlen;
+ struct tcphdr *tc;
+ char *data;
+ const char *setup = "SETUP", *pna = "PNA", *str200 = "200";
+ const char *okstr = "OK", *client_port_str = "client_port";
+ const char *server_port_str = "server_port";
+ int i, parseOk;
+
+ tc = (struct tcphdr *)((char *)pip + (pip->ip_hl << 2));
+ hlen = (pip->ip_hl + tc->th_off) << 2;
+ tlen = ntohs(pip->ip_len);
+ dlen = tlen - hlen;
+
+ data = (char*)pip;
+ data += hlen;
+
+ /* When aliasing a client, check for the SETUP request */
+ if ((ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1) ||
+ (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2)) {
+
+ if (dlen >= strlen(setup)) {
+ if (memcmp(data, setup, strlen(setup)) == 0) {
+ alias_rtsp_out(pip, link, data, client_port_str);
+ return;
+ }
+ }
+ if (dlen >= strlen(pna)) {
+ if (memcmp(data, pna, strlen(pna)) == 0) {
+ alias_pna_out(pip, link, data, dlen);
+ }
+ }
+
+ } else {
+
+        /* When aliasing a server, check for the 200 reply.
+           Accommodate a varying number of blanks between 200 & OK. */
+
+ if (dlen >= strlen(str200)) {
+
+ for (parseOk = 0, i = 0;
+ i <= dlen - strlen(str200);
+ i++) {
+ if (memcmp(&data[i], str200, strlen(str200)) == 0) {
+ parseOk = 1;
+ break;
+ }
+ }
+ if (parseOk) {
+
+ i += strlen(str200); /* skip string found */
+ while(data[i] == ' ') /* skip blank(s) */
+ i++;
+
+ if ((dlen - i) >= strlen(okstr)) {
+
+ if (memcmp(&data[i], okstr, strlen(okstr)) == 0)
+ alias_rtsp_out(pip, link, data, server_port_str);
+
+ }
+ }
+ }
+ }
+}
diff --git a/sys/netinet/libalias/alias_util.c b/sys/netinet/libalias/alias_util.c
new file mode 100644
index 0000000..787c859
--- /dev/null
+++ b/sys/netinet/libalias/alias_util.c
@@ -0,0 +1,169 @@
+/*-
+ * Copyright (c) 2001 Charles Mott <cm@linktel.net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+/*
+ Alias_util.c contains general utilities used by other functions
+ in the packet aliasing module. At the moment, there are functions
+ for computing IP header and TCP packet checksums.
+
+ The checksum routines are based upon example code in a Unix networking
+ text written by Stevens (sorry, I can't remember the title -- but
+ at least this is a good author).
+
+ Initial Version: August, 1996 (cjm)
+
+ Version 1.7: January 9, 1997
+ Added differential checksum update function.
+*/
+
+/*
+Note: the checksum routines assume that the actual checksum word has
+been zeroed out. If the checksum word is filled with the proper value,
+then these routines will give a result of zero (useful for testing
+purposes).
+*/
+
+#include <sys/types.h>
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include "alias.h"
+#include "alias_local.h"
+
+u_short
+PacketAliasInternetChecksum(u_short *ptr, int nbytes)
+{
+ int sum, oddbyte;
+
+ sum = 0;
+ while (nbytes > 1)
+ {
+ sum += *ptr++;
+ nbytes -= 2;
+ }
+ if (nbytes == 1)
+ {
+ oddbyte = 0;
+ ((u_char *) &oddbyte)[0] = *(u_char *) ptr;
+ ((u_char *) &oddbyte)[1] = 0;
+ sum += oddbyte;
+ }
+ sum = (sum >> 16) + (sum & 0xffff);
+ sum += (sum >> 16);
+ return(~sum);
+}
+
+u_short
+IpChecksum(struct ip *pip)
+{
+ return( PacketAliasInternetChecksum((u_short *) pip,
+ (pip->ip_hl << 2)) );
+
+}
+
+u_short
+TcpChecksum(struct ip *pip)
+{
+ u_short *ptr;
+ struct tcphdr *tc;
+ int nhdr, ntcp, nbytes;
+ int sum, oddbyte;
+
+ nhdr = pip->ip_hl << 2;
+ ntcp = ntohs(pip->ip_len) - nhdr;
+
+ tc = (struct tcphdr *) ((char *) pip + nhdr);
+ ptr = (u_short *) tc;
+
+/* Add up TCP header and data */
+ nbytes = ntcp;
+ sum = 0;
+ while (nbytes > 1)
+ {
+ sum += *ptr++;
+ nbytes -= 2;
+ }
+ if (nbytes == 1)
+ {
+ oddbyte = 0;
+ ((u_char *) &oddbyte)[0] = *(u_char *) ptr;
+ ((u_char *) &oddbyte)[1] = 0;
+ sum += oddbyte;
+ }
+
+/* "Pseudo-header" data */
+ ptr = (u_short *) &(pip->ip_dst);
+ sum += *ptr++;
+ sum += *ptr;
+ ptr = (u_short *) &(pip->ip_src);
+ sum += *ptr++;
+ sum += *ptr;
+ sum += htons((u_short) ntcp);
+ sum += htons((u_short) pip->ip_p);
+
+/* Roll over carry bits */
+ sum = (sum >> 16) + (sum & 0xffff);
+ sum += (sum >> 16);
+
+/* Return checksum */
+ return((u_short) ~sum);
+}
+
+
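+/* Incrementally adjust the checksum pointed to by cksum after n 16-bit
+   words have changed from the values in old[] to the values in new[]. */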
+void
+DifferentialChecksum(u_short *cksum, u_short *new, u_short *old, int n)
+{
+ int i;
+ int accumulate;
+
+ accumulate = *cksum;
+ for (i=0; i<n; i++)
+ {
+ accumulate -= *new++;
+ accumulate += *old++;
+ }
+
+ if (accumulate < 0)
+ {
+ accumulate = -accumulate;
+ accumulate = (accumulate >> 16) + (accumulate & 0xffff);
+ accumulate += accumulate >> 16;
+ *cksum = (u_short) ~accumulate;
+ }
+ else
+ {
+ accumulate = (accumulate >> 16) + (accumulate & 0xffff);
+ accumulate += accumulate >> 16;
+ *cksum = (u_short) accumulate;
+ }
+}
+
diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3
new file mode 100644
index 0000000..cd8b97c
--- /dev/null
+++ b/sys/netinet/libalias/libalias.3
@@ -0,0 +1,981 @@
+.\"-
+.\" Copyright (c) 2001 Charles Mott <cm@linktel.net>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd April 13, 2000
+.Dt LIBALIAS 3
+.Os
+.Sh NAME
+.Nm libalias
+.Nd packet aliasing library for masquerading and network address translation
+.Sh SYNOPSIS
+.In sys/types.h
+.In netinet/in.h
+.In alias.h
+.Pp
+Function prototypes are given in the main body of the text.
+.Sh DESCRIPTION
+The
+.Nm
+library is a collection of functions for aliasing and de-aliasing of IP
+packets, intended for masquerading and network address translation (NAT).
+.Sh INTRODUCTION
+This library is a moderately portable set of functions designed to assist
+in the process of IP masquerading and network address translation.
+Outgoing packets from a local network with unregistered IP addresses can
+be aliased to appear as if they came from an accessible IP address.
+Incoming packets are then de-aliased so that they are sent to the correct
+machine on the local network.
+.Pp
+A certain amount of flexibility is built into the packet aliasing engine.
+In the simplest mode of operation, a many-to-one address mapping takes
+place between local network and the packet aliasing host.
+This is known as IP masquerading.
+In addition, one-to-one mappings between local and public addresses can
+also be implemented, which is known as static NAT.
+In between these extremes, different groups of private addresses can be
+linked to different public addresses, comprising several distinct
+many-to-one mappings.
+Also, a given public address and port can be statically redirected to a
+private address/port.
+.Pp
+The packet aliasing engine was designed to operate in user space outside
+of the kernel, without any access to private kernel data structures, but
+the source code can also be ported to a kernel environment.
+.Sh INITIALIZATION AND CONTROL
+Two special functions,
+.Fn PacketAliasInit
+and
+.Fn PacketAliasSetAddress ,
+must always be called before any packet handling may be performed.
+In addition, the operating mode of the packet aliasing engine can be
+customized by calling
+.Fn PacketAliasSetMode .
+.Pp
+.Ft void
+.Fn PacketAliasInit void
+.Bd -ragged -offset indent
+This function has no arguments or return value and is used to initialize
+internal data structures.
+The following mode bits are always set after calling
+.Fn PacketAliasInit .
+See the description of
+.Fn PacketAliasSetMode
+below for the meaning of these mode bits.
+.Pp
+.Bl -item -offset indent -compact
+.It
+.Dv PKT_ALIAS_SAME_PORTS
+.It
+.Dv PKT_ALIAS_USE_SOCKETS
+.It
+.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE
+.El
+.Pp
+This function will always return the packet aliasing engine to the same
+initial state.
+.Fn PacketAliasSetAddress
+must be called afterwards, and any desired changes from the default mode
+bits listed above require a call to
+.Fn PacketAliasSetMode .
+.Pp
+It is mandatory that this function be called at the beginning of a program
+prior to any packet handling.
+.Ed
+.Pp
+.Ft void
+.Fn PacketAliasUninit void
+.Bd -ragged -offset indent
+This function has no arguments or return value and is used to clear any
+resources attached to internal data structures.
+.Pp
+This function should be called when a program stops using the aliasing
+engine; it does, amongst other things, clear out any firewall holes.
+To provide backwards compatibility and extra security, it is added to
+the
+.Xr atexit 3
+chain by
+.Fn PacketAliasInit .
+Calling it multiple times is harmless.
+.Ed
+.Pp
+.Ft void
+.Fn PacketAliasSetAddress "struct in_addr addr"
+.Bd -ragged -offset indent
+This function sets the source address to which outgoing packets from the
+local area network are aliased.
+All outgoing packets are re-mapped to this address unless overridden by a
+static address mapping established by
+.Fn PacketAliasRedirectAddr .
+.Pp
+If the
+.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE
+mode bit is set (the default mode of operation), then the internal aliasing
+link tables will be reset any time the aliasing address changes.
+This is useful for interfaces such as
+.Xr ppp 8 ,
+where the IP
+address may or may not change on successive dial-up attempts.
+.Pp
+If the
+.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE
+mode bit is set to zero, this function can also be used to dynamically change
+the aliasing address on a packet to packet basis (it is a low overhead call).
+.Pp
+It is mandatory that this function be called prior to any packet handling.
+.Ed
+.Pp
+.Ft unsigned int
+.Fn PacketAliasSetMode "unsigned int flags" "unsigned int mask"
+.Bd -ragged -offset indent
+This function sets or clears mode bits
+according to the value of
+.Fa flags .
+Only bits marked in
+.Fa mask
+are affected.
+The following mode bits are defined in
+.Aq Pa alias.h :
+.Bl -tag -width indent
+.It Dv PKT_ALIAS_LOG
+Enables logging into
+.Pa /var/log/alias.log .
+Each time an aliasing link is created or deleted, the log file is appended
+with the current number of ICMP, TCP and UDP links.
+Mainly useful for debugging when the log file is viewed continuously with
+.Xr tail 1 .
+.It Dv PKT_ALIAS_DENY_INCOMING
+If this mode bit is set, all incoming packets associated with new TCP
+connections or new UDP transactions will be marked for being ignored
+.Fn ( PacketAliasIn
+returns
+.Dv PKT_ALIAS_IGNORED
+code)
+by the calling program.
+Response packets to connections or transactions initiated from the packet
+aliasing host or local network will be unaffected.
+This mode bit is useful for implementing a one-way firewall.
+.It Dv PKT_ALIAS_SAME_PORTS
+If this mode bit is set, the packet aliasing engine will attempt to leave
+the alias port numbers unchanged from the actual local port numbers.
+This can be done as long as the quintuple (proto, alias addr, alias port,
+remote addr, remote port) is unique.
+If a conflict exists, a new aliasing port number is chosen even if this
+mode bit is set.
+.It Dv PKT_ALIAS_USE_SOCKETS
+This bit should be set when the packet aliasing host originates network
+traffic as well as forwards it.
+When the packet aliasing host is waiting for a connection from an unknown
+host address or unknown port number (e.g. an FTP data connection), this
+mode bit specifies that a socket be allocated as a place holder to prevent
+port conflicts.
+Once a connection is established, usually within a minute or so, the socket
+is closed.
+.It Dv PKT_ALIAS_UNREGISTERED_ONLY
+If this mode bit is set, traffic on the local network which does not
+originate from unregistered address spaces will be ignored.
+Standard Class A, B and C unregistered addresses are:
+.Bd -literal -offset indent
+10.0.0.0 -> 10.255.255.255 (Class A subnet)
+172.16.0.0 -> 172.31.255.255 (Class B subnets)
+192.168.0.0 -> 192.168.255.255 (Class C subnets)
+.Ed
+.Pp
+This option is useful in the case that the packet aliasing host has both
+registered and unregistered subnets on different interfaces.
+The registered subnet is fully accessible to the outside world, so traffic
+from it does not need to be passed through the packet aliasing engine.
+.It Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE
+When this mode bit is set and
+.Fn PacketAliasSetAddress
+is called to change the aliasing address, the internal link table of the
+packet aliasing engine will be cleared.
+This operating mode is useful for
+.Xr ppp 8
+links where the interface address can sometimes change or remain the same
+between dial-up attempts.
+If this mode bit is not set, the link table will never be reset in the event
+of an address change.
+.It Dv PKT_ALIAS_PUNCH_FW
+This option makes
+.Nm
+`punch holes' in an
+.Xr ipfirewall 4
+based firewall for FTP/IRC DCC connections.
+The holes punched are bound by from/to IP address and port; it will not be
+possible to use a hole for another connection.
+A hole is removed when the connection that uses it dies.
+To cater to unexpected death of a program using
+.Nm
+(e.g. kill -9),
+changing the state of the flag will clear the entire firewall range
+allocated for holes.
+This will also happen on the initial call to
+.Fn PacketAliasSetFWBase .
+This call must happen prior to setting this flag.
+.It Dv PKT_ALIAS_REVERSE
+This option makes
+.Nm
+reverse the way it handles incoming and outgoing packets, allowing it
+to be fed with data that passes through the internal interface rather
+than the external one.
+.It Dv PKT_ALIAS_PROXY_ONLY
+This option tells
+.Nm
+to obey transparent proxy rules only.
+Normal packet aliasing is not performed.
+See
+.Fn PacketAliasProxyRule
+below for details.
+.El
+.Ed
+.Pp
+.Ft void
+.Fn PacketAliasSetFWBase "unsigned int base" "unsigned int num"
+.Bd -ragged -offset indent
+Set firewall range allocated for punching firewall holes (with the
+.Dv PKT_ALIAS_PUNCH_FW
+flag).
+The range will be cleared for all rules on initialization.
+.Ed
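+.Pp
+Putting these together, a minimal initialization sequence might look like
+the following sketch (the aliasing address 192.0.2.1 is only a placeholder,
+and error checking is omitted):
+.Bd -literal -offset indent
+struct in_addr alias_addr;
+
+PacketAliasInit();
+inet_aton("192.0.2.1", &alias_addr);
+PacketAliasSetAddress(alias_addr);
+PacketAliasSetMode(PKT_ALIAS_DENY_INCOMING, PKT_ALIAS_DENY_INCOMING);
+.Ed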
+.Sh PACKET HANDLING
+The packet handling functions are used to modify incoming (remote to local)
+and outgoing (local to remote) packets.
+The calling program is responsible for receiving and sending packets via
+network interfaces.
+.Pp
+Along with
+.Fn PacketAliasInit
+and
+.Fn PacketAliasSetAddress ,
+the two packet handling functions,
+.Fn PacketAliasIn
+and
+.Fn PacketAliasOut ,
+comprise a minimal set of functions needed for a basic IP masquerading
+implementation.
+.Pp
+.Ft int
+.Fn PacketAliasIn "char *buffer" "int maxpacketsize"
+.Bd -ragged -offset indent
+An incoming packet coming from a remote machine to the local network is
+de-aliased by this function.
+The IP packet is pointed to by
+.Fa buffer ,
+and
+.Fa maxpacketsize
+indicates the size of the data structure containing the packet and should
+be at least as large as the actual packet size.
+.Pp
+Return codes:
+.Bl -tag -width indent
+.It Dv PKT_ALIAS_OK
+The packet aliasing process was successful.
+.It Dv PKT_ALIAS_IGNORED
+The packet was ignored and not de-aliased.
+This can happen if the protocol is unrecognized, possibly an ICMP message
+type is not handled or if incoming packets for new connections are being
+ignored (if
+.Dv PKT_ALIAS_DENY_INCOMING
+mode bit was set by
+.Fn PacketAliasSetMode ) .
+.It Dv PKT_ALIAS_UNRESOLVED_FRAGMENT
+This is returned when a fragment cannot be resolved because the header
+fragment has not been sent yet.
+In this situation, fragments must be saved with
+.Fn PacketAliasSaveFragment
+until a header fragment is found.
+.It Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT
+The packet aliasing process was successful, and a header fragment was found.
+This is a signal to retrieve any unresolved fragments with
+.Fn PacketAliasGetFragment
+and de-alias them with
+.Fn PacketAliasFragmentIn .
+.It Dv PKT_ALIAS_ERROR
+An internal error within the packet aliasing engine occurred.
+.El
+.Ed
+.Pp
+.Ft int
+.Fn PacketAliasOut "char *buffer" "int maxpacketsize"
+.Bd -ragged -offset indent
+An outgoing packet coming from the local network to a remote machine is
+aliased by this function.
+The IP packet is pointed to by
+.Fa buffer ,
+and
+.Fa maxpacketsize
+indicates the maximum packet size permissible should the packet length be
+changed.
+IP encoding protocols place address and port information in the encapsulated
+data stream which has to be modified and can account for changes in packet
+length.
+Well known examples of such protocols are FTP and IRC DCC.
+.Pp
+Return codes:
+.Bl -tag -width indent
+.It Dv PKT_ALIAS_OK
+The packet aliasing process was successful.
+.It Dv PKT_ALIAS_IGNORED
+The packet was ignored and not aliased.
+This can happen if the protocol is unrecognized, or possibly an ICMP message
+type is not handled.
+.It Dv PKT_ALIAS_ERROR
+An internal error within the packet aliasing engine occurred.
+.El
+.Ed
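+.Pp
+The following is only an illustrative sketch of how a calling program might
+drive these two functions; the packet I/O itself (for example via a divert
+socket or a tunnel interface) is assumed to be handled by the caller and the
+variable names are placeholders:
+.Bd -literal -offset indent
+char pkt[65536];   /* large enough for any IP packet */
+int status;
+
+/* ... read an outgoing packet from the local network into pkt ... */
+status = PacketAliasOut(pkt, sizeof(pkt));
+if (status == PKT_ALIAS_OK) {
+        /* forward the now-aliased packet to the external interface */
+}
+
+/* ... read an incoming packet from the external network into pkt ... */
+status = PacketAliasIn(pkt, sizeof(pkt));
+if (status == PKT_ALIAS_OK) {
+        /* forward the de-aliased packet to the local network */
+}
+.Ed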
+.Sh PORT AND ADDRESS REDIRECTION
+The functions described in this section allow machines on the local network
+to be accessible in some degree to new incoming connections from the external
+network.
+Individual ports can be re-mapped or static network address translations can
+be designated.
+.Pp
+.Ft struct alias_link *
+.Fo PacketAliasRedirectPort
+.Fa "struct in_addr local_addr"
+.Fa "u_short local_port"
+.Fa "struct in_addr remote_addr"
+.Fa "u_short remote_port"
+.Fa "struct in_addr alias_addr"
+.Fa "u_short alias_port"
+.Fa "u_char proto"
+.Fc
+.Bd -ragged -offset indent
+This function specifies that traffic from a given remote address/port to
+an alias address/port be redirected to a specified local address/port.
+The parameter
+.Fa proto
+can be either
+.Dv IPPROTO_TCP
+or
+.Dv IPPROTO_UDP ,
+as defined in
+.Aq Pa netinet/in.h .
+.Pp
+If
+.Fa local_addr
+or
+.Fa alias_addr
+is zero, this indicates that the packet aliasing address as established
+by
+.Fn PacketAliasSetAddress
+is to be used.
+Even if
+.Fn PacketAliasSetAddress
+is called to change the address after
+.Fn PacketAliasRedirectPort
+is called, a zero reference will track this change.
+.Pp
+If the link is further set up to operate for load sharing, then
+.Fa local_addr
+and
+.Fa local_port
+are ignored, and are selected dynamically from the server pool, as described in
+.Fn PacketAliasAddServer
+below.
+.Pp
+If
+.Fa remote_addr
+is zero, this indicates to redirect packets from any remote address.
+Likewise, if
+.Fa remote_port
+is zero, this indicates to redirect packets originating from any remote
+port number.
+Almost always, the remote port specification will be zero, but non-zero
+remote addresses can sometimes be useful for firewalling.
+If two calls to
+.Fn PacketAliasRedirectPort
+overlap in their address/port specifications, then the most recent call
+will have precedence.
+.Pp
+This function returns a pointer which can subsequently be used by
+.Fn PacketAliasRedirectDelete .
+If
+.Dv NULL
+is returned, then the function call did not complete successfully.
+.Pp
+All port numbers should be in network byte order, so it is necessary
+to use
+.Xr htons 3
+to convert these parameters from internally readable numbers to network byte
+order.
+Addresses are also in network byte order, which is implicit in the use of the
+.Fa struct in_addr
+data type.
+.Ed
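+.Pp
+For example, the following sketch (addresses are placeholders and error
+checking is omitted) redirects TCP traffic arriving at port 8080 of the
+aliasing address to port 80 of the internal machine 192.168.0.5:
+.Bd -literal -offset indent
+struct alias_link *link;
+struct in_addr local_addr, null_addr;
+
+inet_aton("192.168.0.5", &local_addr);
+null_addr.s_addr = 0;              /* "use default / match any" */
+
+link = PacketAliasRedirectPort(local_addr, htons(80),
+    null_addr, 0,                  /* any remote address and port */
+    null_addr, htons(8080),        /* aliasing address, port 8080 */
+    IPPROTO_TCP);
+.Ed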
+.Pp
+.Ft struct alias_link *
+.Fo PacketAliasRedirectAddr
+.Fa "struct in_addr local_addr"
+.Fa "struct in_addr alias_addr"
+.Fc
+.Bd -ragged -offset indent
+This function designates that all incoming traffic to
+.Fa alias_addr
+be redirected to
+.Fa local_addr .
+Similarly, all outgoing traffic from
+.Fa local_addr
+is aliased to
+.Fa alias_addr .
+.Pp
+If
+.Fa local_addr
+or
+.Fa alias_addr
+is zero, this indicates that the packet aliasing address as established by
+.Fn PacketAliasSetAddress
+is to be used.
+Even if
+.Fn PacketAliasSetAddress
+is called to change the address after
+.Fn PacketAliasRedirectAddr
+is called, a zero reference will track this change.
+.Pp
+If the link is further set up to operate for load sharing, then
+.Fa local_addr
+is ignored, and is selected dynamically from the server pool, as described in
+.Fn PacketAliasAddServer
+below.
+.Pp
+If subsequent calls to
+.Fn PacketAliasRedirectAddr
+use the same aliasing address, all new incoming traffic to this aliasing
+address will be redirected to the local address made in the last function
+call.
+New traffic generated by any of the local machines, designated in the
+several function calls, will be aliased to the same address.
+Consider the following example:
+.Bd -literal -offset indent
+struct in_addr local1, local2, local3, alias;
+
+inet_aton("192.168.0.2", &local1);
+inet_aton("192.168.0.3", &local2);
+inet_aton("192.168.0.4", &local3);
+inet_aton("141.221.254.101", &alias);
+
+PacketAliasRedirectAddr(local1, alias);
+PacketAliasRedirectAddr(local2, alias);
+PacketAliasRedirectAddr(local3, alias);
+.Ed
+.Pp
+Any outgoing connections such as
+.Xr telnet 1
+or
+.Xr ftp 1
+from 192.168.0.2, 192.168.0.3 and 192.168.0.4 will appear to come from
+141.221.254.101.
+Any incoming connections to 141.221.254.101 will be directed to 192.168.0.4.
+.Pp
+Any calls to
+.Fn PacketAliasRedirectPort
+will have precedence over address mappings designated by
+.Fn PacketAliasRedirectAddr .
+.Pp
+This function returns a pointer which can subsequently be used by
+.Fn PacketAliasRedirectDelete .
+If
+.Dv NULL
+is returned, then the function call did not complete successfully.
+.Ed
+.Pp
+.Ft int
+.Fo PacketAliasAddServer
+.Fa "struct alias_link *link"
+.Fa "struct in_addr addr"
+.Fa "u_short port"
+.Fc
+.Bd -ragged -offset indent
+This function sets the
+.Fa link
+up for Load Sharing using IP Network Address Translation (RFC 2391, LSNAT).
+LSNAT operates as follows.
+A client attempts to access a server by using the server virtual address.
+The LSNAT router transparently redirects the request to one of the hosts
+in the server pool, selected using a real-time load sharing algorithm.
+Multiple sessions may be initiated from the same client, and each session
+could be directed to a different host based on load balance across server
+pool hosts at the time.
+If load share is desired for just a few specific services, the configuration
+on LSNAT could be defined to restrict load share for just the services
+desired.
+.Pp
+Currently, only the simplest selection algorithm is implemented, where a
+host is selected on a round-robin basis only, without regard to load on
+the host.
+.Pp
+First, the
+.Fa link
+is created by either
+.Fn PacketAliasRedirectPort
+or
+.Fn PacketAliasRedirectAddr .
+Then,
+.Fn PacketAliasAddServer
+is called multiple times to add entries to the
+.Fa link Ns 's
+server pool.
+.Pp
+For links created with
+.Fn PacketAliasRedirectAddr ,
+the
+.Fa port
+argument is ignored and could have any value, e.g. htons(~0).
+.Pp
+This function returns 0 on success, -1 otherwise.
+.Ed
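+.Pp
+A sketch of a simple round-robin pool for incoming web traffic (addresses
+are placeholders and error checking is omitted) might look like:
+.Bd -literal -offset indent
+struct alias_link *link;
+struct in_addr null_addr, server1, server2;
+
+null_addr.s_addr = 0;
+inet_aton("192.168.0.10", &server1);
+inet_aton("192.168.0.11", &server2);
+
+link = PacketAliasRedirectPort(null_addr, 0, null_addr, 0,
+    null_addr, htons(80), IPPROTO_TCP);
+PacketAliasAddServer(link, server1, htons(80));
+PacketAliasAddServer(link, server2, htons(80));
+.Ed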
+.Pp
+.Ft void
+.Fn PacketAliasRedirectDelete "struct alias_link *link"
+.Bd -ragged -offset indent
+This function will delete a specific static redirect rule entered by
+.Fn PacketAliasRedirectPort
+or
+.Fn PacketAliasRedirectAddr .
+The parameter
+.Fa link
+is the pointer returned by either of the redirection functions.
+If an invalid pointer is passed to
+.Fn PacketAliasRedirectDelete ,
+then a program crash or unpredictable operation could result, so it is
+necessary to be careful using this function.
+.Ed
+.Pp
+.Ft int
+.Fn PacketAliasProxyRule "const char *cmd"
+.Bd -ragged -offset indent
+The passed
+.Fa cmd
+string consists of one or more pairs of words.
+The first word in each pair is a token and the second is the value that
+should be applied for that token.
+Tokens and their argument types are as follows:
+.Bl -tag -width indent
+.It Cm type encode_ip_hdr | encode_tcp_stream | no_encode
+In order to support transparent proxying, it is necessary to somehow
+pass the original address and port information into the new destination
+server.
+If
+.Cm encode_ip_hdr
+is specified, the original address and port is passed as an extra IP
+option.
+If
+.Cm encode_tcp_stream
+is specified, the original address and port is passed as the first
+piece of data in the TCP stream in the format
+.Dq DEST Ar IP port .
+.It Cm port Ar portnum
+Only packets with the destination port
+.Ar portnum
+are proxied.
+.It Cm server Ar host Ns Xo
+.Op : Ns Ar portnum
+.Xc
+This specifies the
+.Ar host
+and
+.Ar portnum
+that the data is to be redirected to.
+.Ar host
+must be an IP address rather than a DNS host name.
+If
+.Ar portnum
+is not specified, the destination port number is not changed.
+.Pp
+The
+.Ar server
+specification is mandatory unless the
+.Cm delete
+command is being used.
+.It Cm rule Ar index
+Normally, each call to
+.Fn PacketAliasProxyRule
+inserts the next rule at the start of a linear list of rules.
+If an
+.Ar index
+is specified, the new rule will be checked after all rules with lower
+indices.
+Calls to
+.Fn PacketAliasProxyRule
+that do not specify a rule are assigned rule 0.
+.It Cm delete Ar index
+This token and its argument MUST NOT be used with any other tokens.
+When used, all existing rules with the given
+.Ar index
+are deleted.
+.It Cm proto tcp | udp
+If specified, only packets of the given protocol type are matched.
+.It Cm src Ar IP Ns Xo
+.Op / Ns Ar bits
+.Xc
+If specified, only packets with a source address matching the given
+.Ar IP
+are matched.
+If
+.Ar bits
+is also specified, then the first
+.Ar bits
+bits of
+.Ar IP
+are taken as a network specification, and all IP addresses from that
+network will be matched.
+.It Cm dst Ar IP Ns Xo
+.Op / Ns Ar bits
+.Xc
+If specified, only packets with a destination address matching the given
+.Ar IP
+are matched.
+If
+.Ar bits
+is also specified, then the first
+.Ar bits
+bits of
+.Ar IP
+are taken as a network specification, and all IP addresses from that
+network will be matched.
+.El
+.Pp
+This function is usually used to redirect outgoing connections for
+internal machines that are not permitted certain types of internet
+access, or to restrict access to certain external machines.
+.Ed
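+.Pp
+For instance, the following sketch (the proxy address 192.0.2.10 and port
+3128 are placeholders) transparently redirects outgoing HTTP connections to
+a proxy server, passing the original destination at the start of the TCP
+stream:
+.Bd -literal -offset indent
+PacketAliasProxyRule(
+    "proto tcp port 80 server 192.0.2.10:3128 type encode_tcp_stream");
+.Ed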
+.Pp
+.Ft struct alias_link *
+.Fo PacketAliasRedirectProto
+.Fa "struct in_addr local_addr"
+.Fa "struct in_addr remote_addr"
+.Fa "struct in_addr alias_addr"
+.Fa "u_char proto"
+.Fc
+.Bd -ragged -offset indent
+This function specifies that any IP packet with protocol number of
+.Fa proto
+from a given remote address to an alias address be
+redirected to a specified local address.
+.Pp
+If
+.Fa local_addr
+or
+.Fa alias_addr
+is zero, this indicates that the packet aliasing address as established
+by
+.Fn PacketAliasSetAddress
+is to be used.
+Even if
+.Fn PacketAliasSetAddress
+is called to change the address after
+.Fn PacketAliasRedirectProto
+is called, a zero reference will track this change.
+.Pp
+If
+.Fa remote_addr
+is zero, this indicates to redirect packets from any remote address.
+Non-zero remote addresses can sometimes be useful for firewalling.
+.Pp
+If two calls to
+.Fn PacketAliasRedirectProto
+overlap in their address specifications, then the most recent call
+will have precedence.
+.Pp
+This function returns a pointer which can subsequently be used by
+.Fn PacketAliasRedirectDelete .
+If
+.Dv NULL
+is returned, then the function call did not complete successfully.
+.Ed
+.Sh FRAGMENT HANDLING
+The functions in this section are used to deal with incoming fragments.
+.Pp
+Outgoing fragments are handled within
+.Fn PacketAliasOut
+by changing the address according to any applicable mapping set by
+.Fn PacketAliasRedirectAddr ,
+or the default aliasing address set by
+.Fn PacketAliasSetAddress .
+.Pp
+Incoming fragments are handled in one of two ways.
+If the header of a fragmented IP packet has already been seen, then all
+subsequent fragments will be re-mapped in the same manner the header
+fragment was.
+Fragments which arrive before the header are saved and then retrieved
+once the header fragment has been resolved.
+.Pp
+.Ft int
+.Fn PacketAliasSaveFragment "char *ptr"
+.Bd -ragged -offset indent
+When
+.Fn PacketAliasIn
+returns
+.Dv PKT_ALIAS_UNRESOLVED_FRAGMENT ,
+this function can be used to save the pointer to the unresolved fragment.
+.Pp
+It is implicitly assumed that
+.Fa ptr
+points to a block of memory allocated by
+.Xr malloc 3 .
+If the fragment is never resolved, the packet aliasing engine will
+automatically free the memory after a timeout period.
+[Eventually this function should be modified so that a callback function
+for freeing memory is passed as an argument.]
+.Pp
+This function returns
+.Dv PKT_ALIAS_OK
+if it was successful and
+.Dv PKT_ALIAS_ERROR
+if there was an error.
+.Ed
+.Pp
+.Ft char *
+.Fn PacketAliasGetFragment "char *buffer"
+.Bd -ragged -offset indent
+This function can be used to retrieve fragment pointers saved by
+.Fn PacketAliasSaveFragment .
+The IP header fragment pointed to by
+.Fa buffer
+is the header fragment indicated when
+.Fn PacketAliasIn
+returns
+.Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT .
+Once a fragment pointer is retrieved, it becomes the calling program's
+responsibility to free the dynamically allocated memory for the fragment.
+.Pp
+.Fn PacketAliasGetFragment
+can be called sequentially until there are no more fragments available,
+at which time it returns
+.Dv NULL .
+.Ed
+.Pp
+.Ft void
+.Fn PacketAliasFragmentIn "char *header" "char *fragment"
+.Bd -ragged -offset indent
+When a fragment is retrieved with
+.Fn PacketAliasGetFragment ,
+it can then be de-aliased with a call to
+.Fn PacketAliasFragmentIn .
+The
+.Fa header
+argument is the pointer to a header fragment used as a template, and
+.Fa fragment
+is the pointer to the packet to be de-aliased.
+.Ed
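+.Pp
+A sketch of how these functions fit into the receive path, assuming
+.Va pkt
+points to a dynamically allocated buffer holding one incoming packet
+(variable names are only illustrative):
+.Bd -literal -offset indent
+status = PacketAliasIn(pkt, maxpacketsize);
+if (status == PKT_ALIAS_UNRESOLVED_FRAGMENT) {
+        PacketAliasSaveFragment(pkt);  /* the engine now owns pkt */
+} else if (status == PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
+        char *frag;
+
+        while ((frag = PacketAliasGetFragment(pkt)) != NULL) {
+                PacketAliasFragmentIn(pkt, frag);
+                /* ... forward frag, then free(frag) ... */
+        }
+}
+.Ed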
+.Sh MISCELLANEOUS FUNCTIONS
+.Ft void
+.Fn PacketAliasSetTarget "struct in_addr addr"
+.Bd -ragged -offset indent
+When an incoming packet not associated with any pre-existing aliasing link
+arrives at the host machine, it will be sent to the address indicated by a
+call to
+.Fn PacketAliasSetTarget .
+.Pp
+If this function is called with an
+.Dv INADDR_NONE
+address argument, then all new incoming packets go to the address set by
+.Fn PacketAliasSetAddress .
+.Pp
+If this function is not called, or is called with an
+.Dv INADDR_ANY
+address argument, then all new incoming packets go to the address specified
+in the packet.
+This allows external machines to talk directly to internal machines if they
+can route packets to the machine in question.
+.Ed
+.Pp
+.Ft int
+.Fn PacketAliasCheckNewLink void
+.Bd -ragged -offset indent
+This function returns a non-zero value when a new aliasing link is created.
+In circumstances where incoming traffic is being sequentially sent to
+different local servers, this function can be used to trigger when
+.Fn PacketAliasSetTarget
+is called to change the default target address.
+.Ed
+.Pp
+.Ft u_short
+.Fn PacketAliasInternetChecksum "u_short *buffer" "int nbytes"
+.Bd -ragged -offset indent
+This is a utility function that does not seem to be available elsewhere and
+is included as a convenience.
+It computes the internet checksum, which is used in both IP and
+protocol-specific headers (TCP, UDP, ICMP).
+.Pp
+The
+.Fa buffer
+argument points to the data block to be checksummed, and
+.Fa nbytes
+is the number of bytes.
+The 16-bit checksum field should be zeroed before computing the checksum.
+.Pp
+Checksums can also be verified by operating on a block of data including
+its checksum.
+If the checksum is valid,
+.Fn PacketAliasInternetChecksum
+will return zero.
+.Ed
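+.Pp
+For illustration, assuming
+.Va pip
+points to a received IP packet (the name is only a placeholder), its header
+checksum could be verified as follows:
+.Bd -literal -offset indent
+if (PacketAliasInternetChecksum((u_short *) pip, pip->ip_hl << 2) != 0) {
+        /* the IP header checksum is invalid */
+}
+.Ed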
+.Pp
+.Ft int
+.Fn PacketUnaliasOut "char *buffer" "int maxpacketsize"
+.Bd -ragged -offset indent
+An outgoing packet, which has already been aliased,
+has its private address/port information restored by this function.
+The IP packet is pointed to by
+.Fa buffer ,
+and
+.Fa maxpacketsize
+is provided for error checking purposes.
+This function can be used if an already-aliased packet needs to have its
+original IP header restored for further processing (e.g. logging).
+.Ed
+.Sh BUGS
+PPTP aliasing does not work when more than one internal client
+connects to the same external server at the same time, because
+PPTP requires a single TCP control connection to be established
+between any two IP addresses.
+.Sh AUTHORS
+.An Charles Mott Aq cm@linktel.net ,
+versions 1.0 - 1.8, 2.0 - 2.4.
+.An Eivind Eklund Aq eivind@FreeBSD.org ,
+versions 1.8b, 1.9 and 2.5.
+Added IRC DCC support as well as contributing a number of architectural
+improvements; added the firewall bypass for FTP/IRC DCC.
+.An Erik Salander Aq erik@whistle.com
+added support for PPTP and RTSP.
+.An Junichi Satoh Aq junichi@junichi.org
+added support for RTSP/PNA.
+.Sh ACKNOWLEDGMENTS
+Listed below, in approximate chronological order, are individuals who
+have provided valuable comments and/or debugging assistance.
+.Pp
+.Bd -ragged -offset indent
+.An -split
+.An Gary Roberts
+.An Tom Torrance
+.An Reto Burkhalter
+.An Martin Renters
+.An Brian Somers
+.An Paul Traina
+.An Ari Suutari
+.An Dave Remien
+.An J. Fortes
+.An Andrzej Bialecki
+.An Gordon Burditt
+.Ed
+.Sh CONCEPTUAL BACKGROUND
+This section is intended for those who are planning to modify the source
+code or want to create somewhat esoteric applications using the packet
+aliasing functions.
+.Pp
+The conceptual framework under which the packet aliasing engine operates
+is described here.
+Central to the discussion is the idea of an
+.Em aliasing link
+which describes the relationship for a given packet transaction between
+the local machine, aliased identity and remote machine.
+It is discussed how such links come into existence and are destroyed.
+.Ss ALIASING LINKS
+There is a notion of an
+.Em aliasing link ,
+which is a 7-tuple describing a specific translation:
+.Bd -literal -offset indent
+(local addr, local port, alias addr, alias port,
+ remote addr, remote port, protocol)
+.Ed
+.Pp
+Outgoing packets have the local address and port number replaced with the
+alias address and port number.
+Incoming packets undergo the reverse process.
+The packet aliasing engine attempts to match packets against an internal
+table of aliasing links to determine how to modify a given IP packet.
+Both the IP header and protocol dependent headers are modified as necessary.
+Aliasing links are created and deleted as necessary according to network
+traffic.
+.Pp
+Protocols can be TCP, UDP or even ICMP in certain circumstances.
+(Some types of ICMP packets can be aliased according to sequence or ID
+number which acts as an equivalent port number for identifying how
+individual packets should be handled.)
+.Pp
+Each aliasing link must have a unique combination of the following five
+quantities: alias address/port, remote address/port and protocol.
+This ensures that several machines on a local network can share the
+same aliasing IP address.
+In cases where conflicts might arise, the aliasing port is chosen so that
+uniqueness is maintained.
+.Ss STATIC AND DYNAMIC LINKS
+Aliasing links can either be static or dynamic.
+Static links persist indefinitely and represent fixed rules for translating
+IP packets.
+Dynamic links come into existence for a specific TCP connection or UDP
+transaction or ICMP ECHO sequence.
+For the case of TCP, the connection can be monitored to see when the
+associated aliasing link should be deleted.
+Aliasing links for UDP transactions (and ICMP ECHO and TIMESTAMP requests)
+work on a simple timeout rule.
+When no activity is observed on a dynamic link for a certain amount of time
+it is automatically deleted.
+Timeout rules also apply to TCP connections which do not open or close
+properly.
+.Ss PARTIALLY SPECIFIED ALIASING LINKS
+Aliasing links can be partially specified, meaning that the remote address
+and/or remote port are unknown.
+In this case, when a packet matching the incomplete specification is found,
+a fully specified dynamic link is created.
+If the original partially specified link is dynamic, it will be deleted
+after the fully specified link is created, otherwise it will persist.
+.Pp
+For instance, a partially specified link might be
+.Bd -literal -offset indent
+(192.168.0.4, 23, 204.228.203.215, 8066, 0, 0, tcp)
+.Ed
+.Pp
+The zeros denote unspecified components for the remote address and port.
+If this link were static, it would have the effect of redirecting all
+incoming traffic arriving at port 8066 of 204.228.203.215 to port 23 (telnet)
+of machine 192.168.0.4 on the local network.
+Each individual telnet connection would initiate the creation of a distinct
+dynamic link.
+.Ss DYNAMIC LINK CREATION
+In addition to aliasing links, there are also address mappings that can be
+stored within the internal data table of the packet aliasing mechanism.
+.Bd -literal -offset indent
+(local addr, alias addr)
+.Ed
+.Pp
+Address mappings are searched when creating new dynamic links.
+.Pp
+All outgoing packets from the local network automatically create a dynamic
+link if they do not match an already existing fully specified link.
+If an address mapping exists for the outgoing packet, this determines
+the alias address to be used.
+If no mapping exists, then a default address, usually the address of the
+packet aliasing host, is used.
+If necessary, this default address can be changed as frequently as on a
+per-packet basis.
+.Pp
+The aliasing port number is determined such that the new dynamic link does
+not conflict with any existing links.
+In the default operating mode, the packet aliasing engine attempts to set
+the aliasing port equal to the local port number.
+If this results in a conflict, then port numbers are randomly chosen until
+a unique aliasing link can be established.
+In an alternate operating mode, the first choice of an aliasing port is also
+random and unrelated to the local port number.
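+.Pp
+The port selection rule can be summarized by the following C-like
+sketch.
+It is illustrative only; the helper functions
+.Fn port_in_use
+and
+.Fn random_port
+are hypothetical and do not correspond to the library's internals:
+.Bd -literal -offset indent
+u_short
+choose_alias_port(u_short local_port, int default_mode)
+{
+        u_short port;
+
+        /* Default mode: first try to reuse the local port number. */
+        if (default_mode && !port_in_use(local_port))
+                return (local_port);
+
+        /* Otherwise try random ports until an unused one is found. */
+        do {
+                port = random_port();
+        } while (port_in_use(port));
+
+        return (port);
+}
+.Ed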
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
new file mode 100644
index 0000000..f104cfc
--- /dev/null
+++ b/sys/netinet/raw_ip.c
@@ -0,0 +1,707 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95
+ * $FreeBSD$
+ */
+
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_random_ip_id.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#define _IP_VHL
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_mroute.h>
+
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#endif /*IPSEC*/
+
+struct inpcbhead ripcb;
+struct inpcbinfo ripcbinfo;
+
+/* control hooks for ipfw and dummynet */
+ip_fw_ctl_t *ip_fw_ctl_ptr;
+ip_dn_ctl_t *ip_dn_ctl_ptr;
+
+/*
+ * Nominal space allocated to a raw ip socket.
+ */
+#define RIPSNDQ 8192
+#define RIPRCVQ 8192
+
+/*
+ * Raw interface to IP protocol.
+ */
+
+/*
+ * Initialize raw connection block q.
+ */
+void
+rip_init()
+{
+ INP_INFO_LOCK_INIT(&ripcbinfo, "rip");
+ LIST_INIT(&ripcb);
+ ripcbinfo.listhead = &ripcb;
+ /*
+ * XXX We don't use the hash list for raw IP, but it's easier
+ * to allocate a one entry hash list than it is to check all
+ * over the place for hashbase == NULL.
+ */
+ ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask);
+ ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask);
+ ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets);
+}
+
+static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET };
+/*
+ * Set up generic address and protocol structures
+ * for raw_input routine, then pass them along with
+ * mbuf chain.
+ */
+void
+rip_input(m, off)
+ struct mbuf *m;
+ int off;
+{
+ register struct ip *ip = mtod(m, struct ip *);
+ register struct inpcb *inp;
+ struct inpcb *last = 0;
+ struct mbuf *opts = 0;
+ int proto = ip->ip_p;
+
+ ripsrc.sin_addr = ip->ip_src;
+ LIST_FOREACH(inp, &ripcb, inp_list) {
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_ip_p && inp->inp_ip_p != proto)
+ continue;
+ if (inp->inp_laddr.s_addr &&
+ inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ continue;
+ if (inp->inp_faddr.s_addr &&
+ inp->inp_faddr.s_addr != ip->ip_src.s_addr)
+ continue;
+ if (last) {
+ struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
+
+#ifdef IPSEC
+ /* check AH/ESP integrity. */
+ if (n && ipsec4_in_reject_so(n, last->inp_socket)) {
+ m_freem(n);
+ ipsecstat.in_polvio++;
+ /* do not inject data to pcb */
+ } else
+#endif /*IPSEC*/
+ if (n) {
+ if (last->inp_flags & INP_CONTROLOPTS ||
+ last->inp_socket->so_options & SO_TIMESTAMP)
+ ip_savecontrol(last, &opts, ip, n);
+ if (sbappendaddr(&last->inp_socket->so_rcv,
+ (struct sockaddr *)&ripsrc, n,
+ opts) == 0) {
+ /* should notify about lost packet */
+ m_freem(n);
+ if (opts)
+ m_freem(opts);
+ } else
+ sorwakeup(last->inp_socket);
+ opts = 0;
+ }
+ }
+ last = inp;
+ }
+#ifdef IPSEC
+ /* check AH/ESP integrity. */
+ if (last && ipsec4_in_reject_so(m, last->inp_socket)) {
+ m_freem(m);
+ ipsecstat.in_polvio++;
+ ipstat.ips_delivered--;
+ /* do not inject data to pcb */
+ } else
+#endif /*IPSEC*/
+ if (last) {
+ if (last->inp_flags & INP_CONTROLOPTS ||
+ last->inp_socket->so_options & SO_TIMESTAMP)
+ ip_savecontrol(last, &opts, ip, m);
+ if (sbappendaddr(&last->inp_socket->so_rcv,
+ (struct sockaddr *)&ripsrc, m, opts) == 0) {
+ m_freem(m);
+ if (opts)
+ m_freem(opts);
+ } else
+ sorwakeup(last->inp_socket);
+ } else {
+ m_freem(m);
+ ipstat.ips_noproto++;
+ ipstat.ips_delivered--;
+ }
+}
+
+/*
+ * Generate IP header and pass packet to ip_output.
+ * Tack on any options the user may have set up with a control call.
+ */
+int
+rip_output(m, so, dst)
+ struct mbuf *m;
+ struct socket *so;
+ u_long dst;
+{
+ register struct ip *ip;
+ register struct inpcb *inp = sotoinpcb(so);
+ int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+
+ /*
+ * If the user handed us a complete IP packet, use it.
+ * Otherwise, allocate an mbuf for a header and fill it in.
+ */
+ if ((inp->inp_flags & INP_HDRINCL) == 0) {
+ if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
+ m_freem(m);
+ return(EMSGSIZE);
+ }
+ M_PREPEND(m, sizeof(struct ip), M_TRYWAIT);
+ ip = mtod(m, struct ip *);
+ ip->ip_tos = inp->inp_ip_tos;
+ ip->ip_off = 0;
+ ip->ip_p = inp->inp_ip_p;
+ ip->ip_len = m->m_pkthdr.len;
+ ip->ip_src = inp->inp_laddr;
+ ip->ip_dst.s_addr = dst;
+ ip->ip_ttl = inp->inp_ip_ttl;
+ } else {
+ if (m->m_pkthdr.len > IP_MAXPACKET) {
+ m_freem(m);
+ return(EMSGSIZE);
+ }
+ ip = mtod(m, struct ip *);
+ /* don't allow both user specified and setsockopt options,
+ and don't allow packet length sizes that will crash */
+ if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2))
+ && inp->inp_options)
+ || (ip->ip_len > m->m_pkthdr.len)
+ || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) {
+ m_freem(m);
+ return EINVAL;
+ }
+ if (ip->ip_id == 0)
+#ifdef RANDOM_IP_ID
+ ip->ip_id = ip_randomid();
+#else
+ ip->ip_id = htons(ip_id++);
+#endif
+ /* XXX prevent ip_output from overwriting header fields */
+ flags |= IP_RAWOUTPUT;
+ ipstat.ips_rawout++;
+ }
+
+#ifdef IPSEC
+ if (ipsec_setsocket(m, so) != 0) {
+ m_freem(m);
+ return ENOBUFS;
+ }
+#endif /*IPSEC*/
+
+ return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
+ inp->inp_moptions));
+}
+
+/*
+ * Raw IP socket option processing.
+ */
+int
+rip_ctloutput(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ struct inpcb *inp = sotoinpcb(so);
+ int error, optval;
+
+ if (sopt->sopt_level != IPPROTO_IP)
+ return (EINVAL);
+
+ error = 0;
+
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case IP_HDRINCL:
+ optval = inp->inp_flags & INP_HDRINCL;
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case IP_FW_ADD: /* ADD actually returns the body... */
+ case IP_FW_GET:
+ if (IPFW_LOADED)
+ error = ip_fw_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break;
+
+ case IP_DUMMYNET_GET:
+ if (DUMMYNET_LOADED)
+ error = ip_dn_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break ;
+
+ case MRT_INIT:
+ case MRT_DONE:
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ case MRT_VERSION:
+ case MRT_ASSERT:
+ error = ip_mrouter_get(so, sopt);
+ break;
+
+ default:
+ error = ip_ctloutput(so, sopt);
+ break;
+ }
+ break;
+
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case IP_HDRINCL:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+ if (optval)
+ inp->inp_flags |= INP_HDRINCL;
+ else
+ inp->inp_flags &= ~INP_HDRINCL;
+ break;
+
+ case IP_FW_ADD:
+ case IP_FW_DEL:
+ case IP_FW_FLUSH:
+ case IP_FW_ZERO:
+ case IP_FW_RESETLOG:
+ if (IPFW_LOADED)
+ error = ip_fw_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT;
+ break;
+
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL:
+ case IP_DUMMYNET_FLUSH:
+ if (DUMMYNET_LOADED)
+ error = ip_dn_ctl_ptr(sopt);
+ else
+ error = ENOPROTOOPT ;
+ break ;
+
+ case IP_RSVP_ON:
+ error = ip_rsvp_init(so);
+ break;
+
+ case IP_RSVP_OFF:
+ error = ip_rsvp_done();
+ break;
+
+ /* XXX - should be combined */
+ case IP_RSVP_VIF_ON:
+ error = ip_rsvp_vif_init(so, sopt);
+ break;
+
+ case IP_RSVP_VIF_OFF:
+ error = ip_rsvp_vif_done(so, sopt);
+ break;
+
+ case MRT_INIT:
+ case MRT_DONE:
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ case MRT_VERSION:
+ case MRT_ASSERT:
+ error = ip_mrouter_set(so, sopt);
+ break;
+
+ default:
+ error = ip_ctloutput(so, sopt);
+ break;
+ }
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * This function exists solely to receive the PRC_IFDOWN messages which
+ * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa,
+ * and calls in_ifadown() to remove all routes corresponding to that address.
+ * It also receives the PRC_IFUP messages from if_up() and reinstalls the
+ * interface routes.
+ */
+void
+rip_ctlinput(cmd, sa, vip)
+ int cmd;
+ struct sockaddr *sa;
+ void *vip;
+{
+ struct in_ifaddr *ia;
+ struct ifnet *ifp;
+ int err;
+ int flags;
+
+ switch (cmd) {
+ case PRC_IFDOWN:
+ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
+ if (ia->ia_ifa.ifa_addr == sa
+ && (ia->ia_flags & IFA_ROUTE)) {
+ /*
+ * in_ifscrub kills the interface route.
+ */
+ in_ifscrub(ia->ia_ifp, ia);
+ /*
+ * in_ifadown gets rid of all the rest of
+ * the routes. This is not quite the right
+ * thing to do, but at least if we are running
+ * a routing process they will come back.
+ */
+ in_ifadown(&ia->ia_ifa, 0);
+ break;
+ }
+ }
+ break;
+
+ case PRC_IFUP:
+ TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
+ if (ia->ia_ifa.ifa_addr == sa)
+ break;
+ }
+ if (ia == 0 || (ia->ia_flags & IFA_ROUTE))
+ return;
+ flags = RTF_UP;
+ ifp = ia->ia_ifa.ifa_ifp;
+
+ if ((ifp->if_flags & IFF_LOOPBACK)
+ || (ifp->if_flags & IFF_POINTOPOINT))
+ flags |= RTF_HOST;
+
+ err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
+ if (err == 0)
+ ia->ia_flags |= IFA_ROUTE;
+ break;
+ }
+}
+
+u_long rip_sendspace = RIPSNDQ;
+u_long rip_recvspace = RIPRCVQ;
+
+SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
+SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
+ &rip_recvspace, 0, "Maximum incoming raw IP datagram size");
+
+static int
+rip_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int error, s;
+
+ inp = sotoinpcb(so);
+ if (inp)
+ panic("rip_attach");
+ if (td && (error = suser(td)) != 0)
+ return error;
+
+ error = soreserve(so, rip_sendspace, rip_recvspace);
+ if (error)
+ return error;
+ s = splnet();
+ error = in_pcballoc(so, &ripcbinfo, td);
+ splx(s);
+ if (error)
+ return error;
+ inp = (struct inpcb *)so->so_pcb;
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_ip_p = proto;
+ inp->inp_ip_ttl = ip_defttl;
+ return 0;
+}
+
+static int
+rip_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ if (inp == 0)
+ panic("rip_detach");
+ if (so == ip_mrouter)
+ ip_mrouter_done();
+ ip_rsvp_force_done(so);
+ if (so == ip_rsvpd)
+ ip_rsvp_done();
+ in_pcbdetach(inp);
+ return 0;
+}
+
+static int
+rip_abort(struct socket *so)
+{
+ soisdisconnected(so);
+ return rip_detach(so);
+}
+
+static int
+rip_disconnect(struct socket *so)
+{
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return ENOTCONN;
+ return rip_abort(so);
+}
+
+static int
+rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ struct sockaddr_in *addr = (struct sockaddr_in *)nam;
+
+ if (nam->sa_len != sizeof(*addr))
+ return EINVAL;
+
+ if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) &&
+ (addr->sin_family != AF_IMPLINK)) ||
+ (addr->sin_addr.s_addr &&
+ ifa_ifwithaddr((struct sockaddr *)addr) == 0))
+ return EADDRNOTAVAIL;
+ inp->inp_laddr = addr->sin_addr;
+ return 0;
+}
+
+static int
+rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ struct sockaddr_in *addr = (struct sockaddr_in *)nam;
+
+ if (nam->sa_len != sizeof(*addr))
+ return EINVAL;
+ if (TAILQ_EMPTY(&ifnet))
+ return EADDRNOTAVAIL;
+ if ((addr->sin_family != AF_INET) &&
+ (addr->sin_family != AF_IMPLINK))
+ return EAFNOSUPPORT;
+ inp->inp_faddr = addr->sin_addr;
+ soisconnected(so);
+ return 0;
+}
+
+static int
+rip_shutdown(struct socket *so)
+{
+ socantsendmore(so);
+ return 0;
+}
+
+static int
+rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ register u_long dst;
+
+ if (so->so_state & SS_ISCONNECTED) {
+ if (nam) {
+ m_freem(m);
+ return EISCONN;
+ }
+ dst = inp->inp_faddr.s_addr;
+ } else {
+ if (nam == NULL) {
+ m_freem(m);
+ return ENOTCONN;
+ }
+ dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
+ }
+ return rip_output(m, so, dst);
+}
+
+static int
+rip_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n, s;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+	 * The process of preparing the PCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = ripcbinfo.ipi_count;
+ req->oldidx = 2 * (sizeof xig)
+ + (n + n/8) * sizeof(struct xinpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ s = splnet();
+ gencnt = ripcbinfo.ipi_gencnt;
+ n = ripcbinfo.ipi_count;
+ splx(s);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ s = splnet();
+ for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ if (inp->inp_gencnt <= gencnt) {
+ if (cr_canseesocket(req->td->td_ucred,
+ inp->inp_socket))
+ continue;
+ inp_list[i++] = inp;
+ }
+ }
+ splx(s);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ }
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ s = splnet();
+ xig.xig_gen = ripcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = ripcbinfo.ipi_count;
+ splx(s);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+/*
+ * This is the wrapper function for in_setsockaddr. We just pass down
+ * the pcbinfo for in_setsockaddr to lock.
+ */
+static int
+rip_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setsockaddr(so, nam, &ripcbinfo));
+}
+
+/*
+ * This is the wrapper function for in_setpeeraddr. We just pass down
+ * the pcbinfo for in_setpeeraddr to lock.
+ */
+static int
+rip_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setpeeraddr(so, nam, &ripcbinfo));
+}
+
+
+SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
+ rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
+
+struct pr_usrreqs rip_usrreqs = {
+ rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect,
+ pru_connect2_notsupp, in_control, rip_detach, rip_disconnect,
+ pru_listen_notsupp, rip_peeraddr, pru_rcvd_notsupp,
+ pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown,
+ rip_sockaddr, sosend, soreceive, sopoll
+};
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
new file mode 100644
index 0000000..fee449f
--- /dev/null
+++ b/sys/netinet/tcp.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_H_
+#define _NETINET_TCP_H_
+
+typedef u_int32_t tcp_seq;
+typedef u_int32_t tcp_cc; /* connection count per rfc1644 */
+
+#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */
+#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */
+
+/*
+ * TCP header.
+ * Per RFC 793, September, 1981.
+ */
+struct tcphdr {
+ u_short th_sport; /* source port */
+ u_short th_dport; /* destination port */
+ tcp_seq th_seq; /* sequence number */
+ tcp_seq th_ack; /* acknowledgement number */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ u_int th_x2:4, /* (unused) */
+ th_off:4; /* data offset */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+ u_int th_off:4, /* data offset */
+ th_x2:4; /* (unused) */
+#endif
+ u_char th_flags;
+#define TH_FIN 0x01
+#define TH_SYN 0x02
+#define TH_RST 0x04
+#define TH_PUSH 0x08
+#define TH_ACK 0x10
+#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
+#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR)
+
+ u_short th_win; /* window */
+ u_short th_sum; /* checksum */
+ u_short th_urp; /* urgent pointer */
+};
+
+#define TCPOPT_EOL 0
+#define TCPOPT_NOP 1
+#define TCPOPT_MAXSEG 2
+#define TCPOLEN_MAXSEG 4
+#define TCPOPT_WINDOW 3
+#define TCPOLEN_WINDOW 3
+#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
+#define TCPOLEN_SACK_PERMITTED 2
+#define TCPOPT_SACK 5 /* Experimental */
+#define TCPOPT_TIMESTAMP 8
+#define TCPOLEN_TIMESTAMP 10
+#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
+#define TCPOPT_TSTAMP_HDR \
+ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+
+#define TCPOPT_CC 11 /* CC options: RFC-1644 */
+#define TCPOPT_CCNEW 12
+#define TCPOPT_CCECHO 13
+#define TCPOLEN_CC 6
+#define TCPOLEN_CC_APPA (TCPOLEN_CC+2)
+#define TCPOPT_CC_HDR(ccopt) \
+ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|(ccopt)<<8|TCPOLEN_CC)
+
+/*
+ * Default maximum segment size for TCP.
+ * With an IP MSS of 576, this is 536,
+ * but 512 is probably more convenient.
+ * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
+ */
+#define TCP_MSS 512
+
+/*
+ * Default maximum segment size for TCP6.
+ * With an IP6 MSS of 1280, this is 1220,
+ * but 1024 is probably more convenient. (xxx kazu in doubt)
+ * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr))
+ */
+#define TCP6_MSS 1024
+
+#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */
+#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */
+
+#define TCP_MAX_WINSHIFT 14 /* maximum window shift */
+
+#define TCP_MAXBURST 4 /* maximum segments in a burst */
+
+#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */
+#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr))
+ /* max space left for options */
+
+/*
+ * User-settable options (used with setsockopt).
+ */
+#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */
+#define TCP_MAXSEG 0x02 /* set maximum segment size */
+#define TCP_NOPUSH 0x04 /* don't push last block of write */
+#define TCP_NOOPT 0x08 /* don't use TCP options */
+
+#endif
diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c
new file mode 100644
index 0000000..89e9d7c
--- /dev/null
+++ b/sys/netinet/tcp_debug.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_tcpdebug.h"
+
+#ifndef INET
+#error The option TCPDEBUG requires option INET.
+#endif
+
+#ifdef TCPDEBUG
+/* load symbolic names */
+#define PRUREQUESTS
+#define TCPSTATES
+#define TCPTIMERS
+#define TANAMES
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_debug.h>
+
+#ifdef TCPDEBUG
+static int tcpconsdebug = 0;
+#endif
+
+static struct tcp_debug tcp_debug[TCP_NDEBUG];
+static int tcp_debx;
+
+/*
+ * Tcp debug routines
+ */
+void
+tcp_trace(act, ostate, tp, ipgen, th, req)
+ short act, ostate;
+ struct tcpcb *tp;
+ void *ipgen;
+ struct tcphdr *th;
+ int req;
+{
+#ifdef INET6
+ int isipv6;
+#endif /* INET6 */
+ tcp_seq seq, ack;
+ int len, flags;
+ struct tcp_debug *td = &tcp_debug[tcp_debx++];
+
+#ifdef INET6
+ isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0;
+#endif /* INET6 */
+ td->td_family =
+#ifdef INET6
+ (isipv6 != 0) ? AF_INET6 :
+#endif
+ AF_INET;
+ if (tcp_debx == TCP_NDEBUG)
+ tcp_debx = 0;
+ td->td_time = iptime();
+ td->td_act = act;
+ td->td_ostate = ostate;
+ td->td_tcb = (caddr_t)tp;
+ if (tp)
+ td->td_cb = *tp;
+ else
+ bzero((caddr_t)&td->td_cb, sizeof (*tp));
+ if (ipgen) {
+ switch (td->td_family) {
+ case AF_INET:
+ bcopy((caddr_t)ipgen, (caddr_t)&td->td_ti.ti_i,
+ sizeof(td->td_ti.ti_i));
+ bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf));
+ break;
+#ifdef INET6
+ case AF_INET6:
+ bcopy((caddr_t)ipgen, (caddr_t)td->td_ip6buf,
+ sizeof(td->td_ip6buf));
+ bzero((caddr_t)&td->td_ti.ti_i,
+ sizeof(td->td_ti.ti_i));
+ break;
+#endif
+ default:
+ bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf));
+ bzero((caddr_t)&td->td_ti.ti_i,
+ sizeof(td->td_ti.ti_i));
+ break;
+ }
+ } else {
+ bzero((caddr_t)&td->td_ti.ti_i, sizeof(td->td_ti.ti_i));
+ bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf));
+ }
+ if (th) {
+ switch (td->td_family) {
+ case AF_INET:
+ td->td_ti.ti_t = *th;
+ bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th));
+ break;
+#ifdef INET6
+ case AF_INET6:
+ td->td_ti6.th = *th;
+ bzero((caddr_t)&td->td_ti.ti_t,
+ sizeof(td->td_ti.ti_t));
+ break;
+#endif
+ default:
+ bzero((caddr_t)&td->td_ti.ti_t,
+ sizeof(td->td_ti.ti_t));
+ bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th));
+ break;
+ }
+ } else {
+ bzero((caddr_t)&td->td_ti.ti_t, sizeof(td->td_ti.ti_t));
+ bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th));
+ }
+ td->td_req = req;
+#ifdef TCPDEBUG
+ if (tcpconsdebug == 0)
+ return;
+ if (tp)
+ printf("%p %s:", tp, tcpstates[ostate]);
+ else
+ printf("???????? ");
+ printf("%s ", tanames[act]);
+ switch (act) {
+
+ case TA_INPUT:
+ case TA_OUTPUT:
+ case TA_DROP:
+ if (ipgen == NULL || th == NULL)
+ break;
+ seq = th->th_seq;
+ ack = th->th_ack;
+ len =
+#ifdef INET6
+ isipv6 ? ((struct ip6_hdr *)ipgen)->ip6_plen :
+#endif
+ ((struct ip *)ipgen)->ip_len;
+ if (act == TA_OUTPUT) {
+ seq = ntohl(seq);
+ ack = ntohl(ack);
+ len = ntohs((u_short)len);
+ }
+ if (act == TA_OUTPUT)
+ len -= sizeof (struct tcphdr);
+ if (len)
+ printf("[%x..%x)", seq, seq+len);
+ else
+ printf("%x", seq);
+ printf("@%x, urp=%x", ack, th->th_urp);
+ flags = th->th_flags;
+ if (flags) {
+ char *cp = "<";
+#define pf(f) { \
+ if (th->th_flags & TH_##f) { \
+ printf("%s%s", cp, #f); \
+ cp = ","; \
+ } \
+}
+ pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG);
+ printf(">");
+ }
+ break;
+
+ case TA_USER:
+ printf("%s", prurequests[req&0xff]);
+ if ((req & 0xff) == PRU_SLOWTIMO)
+ printf("<%s>", tcptimers[req>>8]);
+ break;
+ }
+ if (tp)
+ printf(" -> %s", tcpstates[tp->t_state]);
+ /* print out internal state of tp !?! */
+ printf("\n");
+ if (tp == 0)
+ return;
+ printf(
+ "\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n",
+ (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up,
+ (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max);
+ printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n",
+ (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd);
+#endif /* TCPDEBUG */
+}
diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h
new file mode 100644
index 0000000..773d3e4
--- /dev/null
+++ b/sys/netinet/tcp_debug.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_DEBUG_H_
+#define _NETINET_TCP_DEBUG_H_
+
+struct tcp_debug {
+ n_time td_time;
+ short td_act;
+ short td_ostate;
+ caddr_t td_tcb;
+ int td_family;
+ /*
+	 * Co-existence of td_ti and td_ti6 below is ugly, but it is necessary
+	 * to achieve backward compatibility to some extent.
+ */
+ struct tcpiphdr td_ti;
+ struct {
+#if !defined(_KERNEL) && defined(INET6)
+ struct ip6_hdr ip6;
+#else
+ u_char ip6buf[40]; /* sizeof(struct ip6_hdr) */
+#endif
+ struct tcphdr th;
+ } td_ti6;
+#define td_ip6buf td_ti6.ip6buf
+ short td_req;
+ struct tcpcb td_cb;
+};
+
+#define TA_INPUT 0
+#define TA_OUTPUT 1
+#define TA_USER 2
+#define TA_RESPOND 3
+#define TA_DROP 4
+
+#ifdef TANAMES
+static char *tanames[] =
+ { "input", "output", "user", "respond", "drop" };
+#endif
+
+#define TCP_NDEBUG 100
+
+#ifndef _KERNEL
+/* XXX common variables for broken applications. */
+struct tcp_debug tcp_debug[TCP_NDEBUG];
+int tcp_debx;
+#endif
+
+#endif /* !_NETINET_TCP_DEBUG_H_ */
diff --git a/sys/netinet/tcp_fsm.h b/sys/netinet/tcp_fsm.h
new file mode 100644
index 0000000..c8f87f6
--- /dev/null
+++ b/sys/netinet/tcp_fsm.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_FSM_H_
+#define _NETINET_TCP_FSM_H_
+
+/*
+ * TCP FSM state definitions.
+ * Per RFC793, September, 1981.
+ */
+
+#define TCP_NSTATES 11
+
+#define TCPS_CLOSED 0 /* closed */
+#define TCPS_LISTEN 1 /* listening for connection */
+#define TCPS_SYN_SENT 2 /* active, have sent syn */
+#define	TCPS_SYN_RECEIVED	3	/* have sent and received syn */
+/* states < TCPS_ESTABLISHED are those where connections are not established */
+#define TCPS_ESTABLISHED 4 /* established */
+#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */
+/* states > TCPS_CLOSE_WAIT are those where user has closed */
+#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */
+#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */
+#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */
+/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
+#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */
+#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */
+
+/* for KAME src sync over BSD*'s */
+#define TCP6_NSTATES TCP_NSTATES
+#define TCP6S_CLOSED TCPS_CLOSED
+#define TCP6S_LISTEN TCPS_LISTEN
+#define TCP6S_SYN_SENT TCPS_SYN_SENT
+#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED
+#define TCP6S_ESTABLISHED TCPS_ESTABLISHED
+#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT
+#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1
+#define TCP6S_CLOSING TCPS_CLOSING
+#define TCP6S_LAST_ACK TCPS_LAST_ACK
+#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2
+#define TCP6S_TIME_WAIT TCPS_TIME_WAIT
+
+#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED)
+#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED)
+#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT)
+
+#ifdef TCPOUTFLAGS
+/*
+ * Flags used when sending segments in tcp_output.
+ * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally
+ * determined by state, with the proviso that TH_FIN is sent only
+ * if all data queued for output is included in the segment.
+ */
+static u_char tcp_outflags[TCP_NSTATES] = {
+ TH_RST|TH_ACK, /* 0, CLOSED */
+ 0, /* 1, LISTEN */
+ TH_SYN, /* 2, SYN_SENT */
+ TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */
+ TH_ACK, /* 4, ESTABLISHED */
+ TH_ACK, /* 5, CLOSE_WAIT */
+ TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */
+ TH_FIN|TH_ACK, /* 7, CLOSING */
+ TH_FIN|TH_ACK, /* 8, LAST_ACK */
+ TH_ACK, /* 9, FIN_WAIT_2 */
+ TH_ACK, /* 10, TIME_WAIT */
+};
+#endif
+
+#ifdef KPROF
+int tcp_acounts[TCP_NSTATES][PRU_NREQ];
+#endif
+
+#ifdef TCPSTATES
+const char *tcpstates[] = {
+ "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD",
+ "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING",
+ "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT",
+};
+#endif
+
+#endif
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
new file mode 100644
index 0000000..0fb62e0
--- /dev/null
+++ b/sys/netinet/tcp_input.c
@@ -0,0 +1,2785 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_ipfw.h" /* for ipfw_fwd */
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_tcp_input.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+
+#endif /* TCPDEBUG */
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#ifdef INET6
+#include <netinet6/ipsec6.h>
+#endif
+#include <netkey/key.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
+
+static int tcprexmtthresh = 3;
+tcp_cc tcp_ccgen;
+
+struct tcpstat tcpstat;
+SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
+ &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+
+static int log_in_vain = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+ &log_in_vain, 0, "Log all incoming TCP connections");
+
+static int blackhole = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+ &blackhole, 0, "Do not send RST when dropping refused connections");
+
+int tcp_delack_enabled = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+ &tcp_delack_enabled, 0,
+ "Delay ACK to try and piggyback it onto a data packet");
+
+#ifdef TCP_DROP_SYNFIN
+static int drop_synfin = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+ &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
+#endif
+
+struct inpcbhead tcb;
+#define tcb6 tcb /* for KAME src sync over BSD*'s */
+struct inpcbinfo tcbinfo;
+struct mtx *tcbinfo_mtx;
+
+static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
+ struct mbuf *);
+static void tcp_xmit_timer(struct tcpcb *, int);
+static int tcp_newreno(struct tcpcb *, struct tcphdr *);
+
+/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
+#ifdef INET6
+#define ND6_HINT(tp) \
+do { \
+ if ((tp) && (tp)->t_inpcb && \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
+ (tp)->t_inpcb->in6p_route.ro_rt) \
+ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+} while (0)
+#else
+#define ND6_HINT(tp)
+#endif
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * - delayed acks are enabled and
+ * - there is no delayed ack timer in progress and
+ * - our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ */
+#define DELAY_ACK(tp) \
+ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
+ (tp->t_flags & TF_RXWIN0SENT) == 0)
+
+static int
+tcp_reass(tp, th, tlenp, m)
+ register struct tcpcb *tp;
+ register struct tcphdr *th;
+ int *tlenp;
+ struct mbuf *m;
+{
+ struct tseg_qent *q;
+ struct tseg_qent *p = NULL;
+ struct tseg_qent *nq;
+ struct tseg_qent *te;
+ struct socket *so = tp->t_inpcb->inp_socket;
+ int flags;
+
+ /*
+	 * Call with th==0 after becoming established to
+ * force pre-ESTABLISHED data up to user socket.
+ */
+ if (th == 0)
+ goto present;
+
+ /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
+ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
+ M_NOWAIT);
+ if (te == NULL) {
+ tcpstat.tcps_rcvmemdrop++;
+ m_freem(m);
+ return (0);
+ }
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+ if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+ break;
+ p = q;
+ }
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us.
+ */
+ if (p != NULL) {
+ register int i;
+ /* conversion to int (in i) handles seq wraparound */
+ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+ if (i > 0) {
+ if (i >= *tlenp) {
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += *tlenp;
+ m_freem(m);
+ FREE(te, M_TSEGQ);
+ /*
+ * Try to present any queued data
+ * at the left window edge to the user.
+ * This is needed after the 3-WHS
+ * completes.
+ */
+ goto present; /* ??? */
+ }
+ m_adj(m, i);
+ *tlenp -= i;
+ th->th_seq += i;
+ }
+ }
+ tcpstat.tcps_rcvoopack++;
+ tcpstat.tcps_rcvoobyte += *tlenp;
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ while (q) {
+ register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+ if (i <= 0)
+ break;
+ if (i < q->tqe_len) {
+ q->tqe_th->th_seq += i;
+ q->tqe_len -= i;
+ m_adj(q->tqe_m, i);
+ break;
+ }
+
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ FREE(q, M_TSEGQ);
+ q = nq;
+ }
+
+ /* Insert the new segment queue entry into place. */
+ te->tqe_m = m;
+ te->tqe_th = th;
+ te->tqe_len = *tlenp;
+
+ if (p == NULL) {
+ LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ } else {
+ LIST_INSERT_AFTER(p, te, tqe_q);
+ }
+
+present:
+ /*
+ * Present data to user, advancing rcv_nxt through
+ * completed sequence space.
+ */
+ if (!TCPS_HAVEESTABLISHED(tp->t_state))
+ return (0);
+ q = LIST_FIRST(&tp->t_segq);
+ if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+ return (0);
+ do {
+ tp->rcv_nxt += q->tqe_len;
+ flags = q->tqe_th->th_flags & TH_FIN;
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ if (so->so_state & SS_CANTRCVMORE)
+ m_freem(q->tqe_m);
+ else
+ sbappend(&so->so_rcv, q->tqe_m);
+ FREE(q, M_TSEGQ);
+ q = nq;
+ } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+ ND6_HINT(tp);
+ sorwakeup(so);
+ return (flags);
+}
+
+/*
+ * TCP input routine, follows pages 65-76 of the
+ * protocol specification dated September, 1981 very closely.
+ */
+#ifdef INET6
+int
+tcp6_input(mp, offp, proto)
+ struct mbuf **mp;
+ int *offp, proto;
+{
+ register struct mbuf *m = *mp;
+ struct in6_ifaddr *ia6;
+
+ IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
+
+ /*
+ * draft-itojun-ipv6-tcp-to-anycast
+ * better place to put this in?
+ */
+ ia6 = ip6_getdstifaddr(m);
+ if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
+ struct ip6_hdr *ip6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
+ (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
+ return IPPROTO_DONE;
+ }
+
+ tcp_input(m, *offp);
+ return IPPROTO_DONE;
+}
+#endif
+
+void
+tcp_input(m, off0)
+ register struct mbuf *m;
+ int off0;
+{
+ register struct tcphdr *th;
+ register struct ip *ip = NULL;
+ register struct ipovly *ipov;
+ register struct inpcb *inp = NULL;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int len, tlen, off;
+ int drop_hdrlen;
+ register struct tcpcb *tp = 0;
+ register int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ u_long tiwin;
+ struct tcpopt to; /* options in this segment */
+ struct rmxp_tao *taop; /* pointer to our TAO cache entry */
+ struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ int headlocked = 0;
+
+#ifdef TCPDEBUG
+ u_char tcp_saveipgen[40];
+	/* the above must be as large as the largest IP header, now IPv6 */
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+#endif /* INET6 */
+ struct sockaddr_in *next_hop = NULL;
+ int rstreason; /* For badport_bandlim accounting purposes */
+
+ /* Grab info from MT_TAG mbufs prepended to the chain. */
+ for (;m && m->m_type == MT_TAG; m = m->m_next) {
+ if (m->m_tag_id == PACKET_TAG_IPFORWARD)
+ next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
+ }
+
+#ifdef INET6
+ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+#endif
+ bzero((char *)&to, sizeof(to));
+
+ tcpstat.tcps_rcvtotal++;
+
+#ifdef INET6
+ if (isipv6) {
+ /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
+ ip6 = mtod(m, struct ip6_hdr *);
+ tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
+ tcpstat.tcps_rcvbadsum++;
+ goto drop;
+ }
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+
+ /*
+		 * Be proactive about an unspecified IPv6 address in the source.
+		 * Since we use the all-zero address to indicate an unbound or
+		 * unconnected pcb, an unspecified IPv6 source address can be
+		 * used to confuse us.
+		 *
+		 * Note that packets with an unspecified IPv6 destination are
+		 * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ goto drop;
+ }
+ } else
+#endif /* INET6 */
+ {
+ /*
+ * Get IP and TCP header together in first mbuf.
+ * Note: IP leaves IP header in first mbuf.
+ */
+ if (off0 > sizeof (struct ip)) {
+ ip_stripoptions(m, (struct mbuf *)0);
+ off0 = sizeof(struct ip);
+ }
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
+ tcpstat.tcps_rcvshort++;
+ return;
+ }
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ tlen = ip->ip_len;
+
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
+ ip->ip_len + IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = sizeof (struct ip) + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = (u_short)tlen;
+ ipov->ih_len = htons(ipov->ih_len);
+ th->th_sum = in_cksum(m, len);
+ }
+ if (th->th_sum) {
+ tcpstat.tcps_rcvbadsum++;
+ goto drop;
+ }
+#ifdef INET6
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+#endif
+ }
+
+ /*
+ * Check that TCP offset makes sense,
+ * pull out TCP options and adjust length. XXX
+ */
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ tcpstat.tcps_rcvbadoff++;
+ goto drop;
+ }
+ tlen -= off; /* tlen is used instead of ti->ti_len */
+ if (off > sizeof (struct tcphdr)) {
+#ifdef INET6
+ if (isipv6) {
+ IP6_EXTHDR_CHECK(m, off0, off, );
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+ } else
+#endif /* INET6 */
+ {
+ if (m->m_len < sizeof(struct ip) + off) {
+ if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
+ tcpstat.tcps_rcvshort++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ }
+ }
+ optlen = off - sizeof (struct tcphdr);
+ optp = (u_char *)(th + 1);
+ }
+ thflags = th->th_flags;
+
+#ifdef TCP_DROP_SYNFIN
+ /*
+ * If the drop_synfin option is enabled, drop all packets with
+ * both the SYN and FIN bits set. This prevents e.g. nmap from
+ * identifying the TCP/IP stack.
+ *
+ * This is a violation of the TCP specification.
+ */
+ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
+ goto drop;
+#endif
+
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ th->th_seq = ntohl(th->th_seq);
+ th->th_ack = ntohl(th->th_ack);
+ th->th_win = ntohs(th->th_win);
+ th->th_urp = ntohs(th->th_urp);
+
+ /*
+	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
+ * until after ip6_savecontrol() is called and before other functions
+ * which don't want those proto headers.
+ * Because ip6_savecontrol() is going to parse the mbuf to
+ * search for data to be passed up to user-land, it wants mbuf
+ * parameters to be unchanged.
+ * XXX: the call of ip6_savecontrol() has been obsoleted based on
+ * latest version of the advanced API (20020110).
+ */
+ drop_hdrlen = off0 + off;
+
+ /*
+ * Locate pcb for segment.
+ */
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+findpcb:
+ /* IPFIREWALL_FORWARD section */
+ if (next_hop != NULL
+#ifdef INET6
+	    && isipv6 == 0 /* IPv6 support is not here yet */
+#endif /* INET6 */
+ ) {
+ /*
+ * Transparently forwarded. Pretend to be the destination.
+ * already got one like this?
+ */
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
+ if (!inp) {
+ /*
+			 * No, then it's new. Try to find the ambushing socket
+ */
+ if (next_hop->sin_port == 0) {
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
+ th->th_sport, next_hop->sin_addr,
+ th->th_dport, 1, m->m_pkthdr.rcvif);
+ } else {
+ inp = in_pcblookup_hash(&tcbinfo,
+ ip->ip_src, th->th_sport,
+ next_hop->sin_addr,
+ ntohs(next_hop->sin_port), 1,
+ m->m_pkthdr.rcvif);
+ }
+ }
+ } else
+ {
+#ifdef INET6
+ if (isipv6)
+ inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
+ &ip6->ip6_dst, th->th_dport, 1,
+ m->m_pkthdr.rcvif);
+ else
+#endif /* INET6 */
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
+ }
+
+#ifdef IPSEC
+#ifdef INET6
+ if (isipv6) {
+ if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
+ ipsec6stat.in_polvio++;
+ goto drop;
+ }
+ } else
+#endif /* INET6 */
+ if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
+ ipsecstat.in_polvio++;
+ goto drop;
+ }
+#endif /*IPSEC*/
+
+ /*
+ * If the state is CLOSED (i.e., TCB does not exist) then
+ * all data in the incoming segment is discarded.
+ * If the TCB exists but is in CLOSED state, it is embryonic,
+ * but should either do a listen or a connect soon.
+ */
+ if (inp == NULL) {
+ if (log_in_vain) {
+#ifdef INET6
+ char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
+#else /* INET6 */
+ char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
+#endif /* INET6 */
+
+#ifdef INET6
+ if (isipv6) {
+ strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
+ strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
+ } else
+#endif
+ {
+ strcpy(dbuf, inet_ntoa(ip->ip_dst));
+ strcpy(sbuf, inet_ntoa(ip->ip_src));
+ }
+ switch (log_in_vain) {
+ case 1:
+ if(thflags & TH_SYN)
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d\n",
+ dbuf, ntohs(th->th_dport),
+ sbuf,
+ ntohs(th->th_sport));
+ break;
+ case 2:
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
+ dbuf, ntohs(th->th_dport), sbuf,
+ ntohs(th->th_sport), thflags);
+ break;
+ default:
+ break;
+ }
+ }
+ if (blackhole) {
+ switch (blackhole) {
+ case 1:
+ if (thflags & TH_SYN)
+ goto drop;
+ break;
+ case 2:
+ goto drop;
+ default:
+ goto drop;
+ }
+ }
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ if (tp == 0) {
+ INP_UNLOCK(inp);
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+ if (tp->t_state == TCPS_CLOSED)
+ goto drop;
+
+ /* Unscale the window into a 32-bit value. */
+ if ((thflags & TH_SYN) == 0)
+ tiwin = th->th_win << tp->snd_scale;
+ else
+ tiwin = th->th_win;
+
+ so = inp->inp_socket;
+ if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
+ struct in_conninfo inc;
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG) {
+ ostate = tp->t_state;
+#ifdef INET6
+ if (isipv6)
+ bcopy((char *)ip6, (char *)tcp_saveipgen,
+ sizeof(*ip6));
+ else
+#endif /* INET6 */
+ bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
+ tcp_savetcp = *th;
+ }
+#endif
+ /* skip if this isn't a listen socket */
+ if ((so->so_options & SO_ACCEPTCONN) == 0)
+ goto after_listen;
+#ifdef INET6
+ inc.inc_isipv6 = isipv6;
+ if (isipv6) {
+ inc.inc6_faddr = ip6->ip6_src;
+ inc.inc6_laddr = ip6->ip6_dst;
+ inc.inc6_route.ro_rt = NULL; /* XXX */
+
+ } else
+#endif /* INET6 */
+ {
+ inc.inc_faddr = ip->ip_src;
+ inc.inc_laddr = ip->ip_dst;
+ inc.inc_route.ro_rt = NULL; /* XXX */
+ }
+ inc.inc_fport = th->th_sport;
+ inc.inc_lport = th->th_dport;
+
+ /*
+ * If the state is LISTEN then ignore segment if it contains
+ * a RST. If the segment contains an ACK then it is bad and
+ * send a RST. If it does not contain a SYN then it is not
+ * interesting; drop it.
+ *
+ * If the state is SYN_RECEIVED (syncache) and seg contains
+ * an ACK, but not for our SYN/ACK, send a RST. If the seg
+ * contains a RST, check the sequence number to see if it
+ * is a valid reset segment.
+ */
+ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
+ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ if (!syncache_expand(&inc, th, &so, m)) {
+ /*
+ * No syncache entry, or ACK was not
+ * for our SYN/ACK. Send a RST.
+ */
+ tcpstat.tcps_badsyn++;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ if (so == NULL) {
+ /*
+ * Could not complete 3-way handshake,
+ * connection is being closed down, and
+ * syncache will free mbuf.
+ */
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+ }
+ /*
+ * Socket is created in state SYN_RECEIVED.
+ * Continue processing segment.
+ */
+ INP_UNLOCK(inp);
+ inp = sotoinpcb(so);
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ /*
+ * This is what would have happened in
+ * tcp_output() when the SYN,ACK was sent.
+ */
+ tp->snd_up = tp->snd_una;
+ tp->snd_max = tp->snd_nxt = tp->iss + 1;
+ tp->last_ack_sent = tp->rcv_nxt;
+/*
+ * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled
+ * until the _second_ ACK is received:
+ * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window.
+ * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale,
+ * move to ESTAB, set snd_wnd to tiwin.
+ */
+ tp->snd_wnd = tiwin; /* unscaled */
+ goto after_listen;
+ }
+ if (thflags & TH_RST) {
+ syncache_chkrst(&inc, th);
+ goto drop;
+ }
+ if (thflags & TH_ACK) {
+ syncache_badack(&inc);
+ tcpstat.tcps_badsyn++;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ goto drop;
+ }
+
+ /*
+ * Segment's flags are (SYN) or (SYN|FIN).
+ */
+#ifdef INET6
+ /*
+ * If deprecated address is forbidden,
+ * we do not accept SYN to deprecated interface
+ * address to prevent any new inbound connection from
+ * getting established.
+ * When we do not accept SYN, we send a TCP RST,
+ * with deprecated source address (instead of dropping
+		 * it).  We compromise because it is much better for the peer
+		 * to receive a RST: the RST will be the final packet
+		 * of the exchange.
+ *
+ * If we do not forbid deprecated addresses, we accept
+ * the SYN packet. RFC2462 does not suggest dropping
+ * SYN in this case.
+		 * Reading RFC2462 5.5.4 carefully, it says this:
+ * 1. use of deprecated addr with existing
+ * communication is okay - "SHOULD continue to be
+ * used"
+ * 2. use of it with new communication:
+ * (2a) "SHOULD NOT be used if alternate address
+ * with sufficient scope is available"
+ * (2b) nothing mentioned otherwise.
+ * Here we fall into (2b) case as we have no choice in
+ * our source address selection - we must obey the peer.
+ *
+		 * The wording in RFC2462 is confusing: it describes
+		 * deprecated address handling in several places and,
+		 * worse, the descriptions are not exactly the same.
+ * I believe 5.5.4 is the best one, so we follow 5.5.4.
+ */
+ if (isipv6 && !ip6_use_deprecated) {
+ struct in6_ifaddr *ia6;
+
+ if ((ia6 = ip6_getdstifaddr(m)) &&
+ (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
+ INP_UNLOCK(inp);
+ tp = NULL;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ }
+#endif
+ /*
+		 * If it is from this socket it must be forged; drop it.
+ * Don't bother responding if the destination was a broadcast.
+ */
+ if (th->th_dport == th->th_sport) {
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
+ &ip6->ip6_src))
+ goto drop;
+ } else
+#endif /* INET6 */
+ if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
+ goto drop;
+ }
+ /*
+ * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
+ *
+ * Note that it is quite possible to receive unicast
+ * link-layer packets with a broadcast IP address. Use
+ * in_broadcast() to find them.
+ */
+ if (m->m_flags & (M_BCAST|M_MCAST))
+ goto drop;
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
+ goto drop;
+ } else
+#endif
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
+ goto drop;
+ /*
+ * SYN appears to be valid; create compressed TCP state
+ * for syncache, or perform t/tcp connection.
+ */
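+		/*
+		 * Hand the SYN to the syncache only if the listen queue
+		 * still has room; otherwise it is simply dropped below.
+		 */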
+ if (so->so_qlen <= so->so_qlimit) {
+ tcp_dooptions(&to, optp, optlen, 1);
+ if (!syncache_add(&inc, &to, th, &so, m))
+ goto drop;
+ if (so == NULL) {
+ /*
+ * Entry added to syncache, mbuf used to
+ * send SYN,ACK packet.
+ */
+ KASSERT(headlocked, ("headlocked"));
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+ }
+ /*
+ * Segment passed TAO tests.
+ */
+ INP_UNLOCK(inp);
+ inp = sotoinpcb(so);
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ tp->snd_wnd = tiwin;
+ tp->t_starttime = ticks;
+ tp->t_state = TCPS_ESTABLISHED;
+
+ /*
+ * If there is a FIN, or if there is data and the
+ * connection is local, then delay SYN,ACK(SYN) in
+ * the hope of piggy-backing it on a response
+ * segment. Otherwise must send ACK now in case
+ * the other side is slow starting.
+ */
+ if (DELAY_ACK(tp) && ((thflags & TH_FIN) ||
+ (tlen != 0 &&
+#ifdef INET6
+ ((isipv6 && in6_localaddr(&inp->in6p_faddr))
+ ||
+ (!isipv6 &&
+#endif
+ in_localaddr(inp->inp_faddr)
+#ifdef INET6
+ ))
+#endif
+ ))) {
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ tp->t_flags |= TF_NEEDSYN;
+ } else
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ goto trimthenstep6;
+ }
+ goto drop;
+ }
+after_listen:
+
+/* XXX temp debugging */
+ /* should not happen - syncache should pick up these connections */
+ if (tp->t_state == TCPS_LISTEN)
+ panic("tcp_input: TCPS_LISTEN");
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+
+ /*
+ * Process options.
+	 * XXX this is traditional behavior, may need to be cleaned up.
+ */
+ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+ if (thflags & TH_SYN) {
+ if (to.to_flags & TOF_SCALE) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->requested_s_scale = to.to_requested_s_scale;
+ }
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = ticks;
+ }
+ if (to.to_flags & (TOF_CC|TOF_CCNEW))
+ tp->t_flags |= TF_RCVD_CC;
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ }
+
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED above, it can only
+	 * be TF_NEEDSYN.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
+ /*
+	     * Using the CC option is compulsory once T/TCP has been negotiated:
+ * the segment is OK if no T/TCP was negotiated or
+ * if the segment has a CC option equal to CCrecv
+ */
+ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
+ ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
+ th->th_seq == tp->rcv_nxt &&
+ tiwin && tiwin == tp->snd_wnd &&
+ tp->snd_nxt == tp->snd_max) {
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ if (tlen == 0) {
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ tp->snd_cwnd >= tp->snd_wnd &&
+ tp->t_dupacks < tcprexmtthresh) {
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * this is a pure ack for outstanding data.
+ */
+ ++tcpstat.tcps_predack;
+ /*
+ * "bad retransmit" recovery
+ */
+ if (tp->t_rxtshift == 1 &&
+ ticks < tp->t_badrxtwin) {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh =
+ tp->snd_ssthresh_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0;
+ }
+ if ((to.to_flags & TOF_TS) != 0)
+ tcp_xmit_timer(tp,
+ ticks - to.to_tsecr + 1);
+ else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq))
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ acked = th->th_ack - tp->snd_una;
+ tcpstat.tcps_rcvackpack++;
+ tcpstat.tcps_rcvackbyte += acked;
+ sbdrop(&so->so_snd, acked);
+ tp->snd_una = th->th_ack;
+ m_freem(m);
+				ND6_HINT(tp); /* some progress has been made */
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+ if (tp->snd_una == tp->snd_max)
+ callout_stop(tp->tt_rexmt);
+ else if (!callout_active(tp->tt_persist))
+ callout_reset(tp->tt_rexmt,
+ tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+
+ sowwakeup(so);
+ if (so->so_snd.sb_cc)
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+ }
+ } else if (th->th_ack == tp->snd_una &&
+ LIST_EMPTY(&tp->t_segq) &&
+ tlen <= sbspace(&so->so_rcv)) {
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * this is a pure, in-sequence data packet
+ * with nothing on the reassembly queue and
+ * we have enough buffer space to take it.
+ */
+ ++tcpstat.tcps_preddat;
+ tp->rcv_nxt += tlen;
+ tcpstat.tcps_rcvpack++;
+ tcpstat.tcps_rcvbyte += tlen;
+			ND6_HINT(tp);	/* some progress has been made */
+ /*
+ * Add data to socket buffer.
+ */
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappend(&so->so_rcv, m);
+ sorwakeup(so);
+ if (DELAY_ACK(tp)) {
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+ INP_UNLOCK(inp);
+ return;
+ }
+ }
+
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ { int win;
+
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+ }
+
+ switch (tp->t_state) {
+
+ /*
+ * If the state is SYN_RECEIVED:
+ * if seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
+ taop = &tao_noncached;
+ bzero(taop, sizeof(*taop));
+ }
+
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ /*
+			 * If we have a cached CCsent for the remote host
+			 * (which means we haven't just crashed and restarted),
+ * do not send a RST. This may be a retransmission
+ * from the other side after our earlier ACK was lost.
+ * Our new SYN, when it arrives, will serve as the
+ * needed ACK.
+ */
+ if (taop->tao_ccsent != 0)
+ goto drop;
+ else {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ }
+ if (thflags & TH_RST) {
+ if (thflags & TH_ACK)
+ tp = tcp_drop(tp, ECONNREFUSED);
+ goto drop;
+ }
+ if ((thflags & TH_SYN) == 0)
+ goto drop;
+ tp->snd_wnd = th->th_win; /* initial send window */
+ tp->cc_recv = to.to_cc; /* foreign CC */
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ /*
+ * Our SYN was acked. If segment contains CC.ECHO
+ * option, check it to make sure this segment really
+ * matches our SYN. If not, just drop it as old
+ * duplicate, but send an RST if we're still playing
+ * by the old rules. If no CC.ECHO option, make sure
+ * we don't get fooled into using T/TCP.
+ */
+ if (to.to_flags & TOF_CCECHO) {
+ if (tp->cc_send != to.to_ccecho) {
+ if (taop->tao_ccsent != 0)
+ goto drop;
+ else {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ }
+ } else
+ tp->t_flags &= ~TF_RCVD_CC;
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ /* Segment is acceptable, update cache if undefined. */
+ if (taop->tao_ccsent == 0)
+ taop->tao_ccsent = to.to_ccecho;
+
+ tp->rcv_adv += tp->rcv_wnd;
+ tp->snd_una++; /* SYN is acked */
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ if (DELAY_ACK(tp) && tlen != 0)
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep, tcp_keepidle,
+ tcp_timer_keep, tp);
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state => simul-
+ * taneous open. If segment contains CC option and there is
+ * a cached CC, apply TAO test; if it succeeds, connection is
+ * half-synchronized. Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ * If there was no CC option, clear cached CC value.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ callout_stop(tp->tt_rexmt);
+ if (to.to_flags & TOF_CC) {
+ if (taop->tao_cc != 0 &&
+ CC_GT(to.to_cc, taop->tao_cc)) {
+ /*
+ * update cache and make transition:
+ * SYN-SENT -> ESTABLISHED*
+ * SYN-SENT* -> FIN-WAIT-1*
+ */
+ taop->tao_cc = to.to_cc;
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep,
+ tcp_keepidle,
+ tcp_timer_keep,
+ tp);
+ }
+ tp->t_flags |= TF_NEEDSYN;
+ } else
+ tp->t_state = TCPS_SYN_RECEIVED;
+ } else {
+ /* CC.NEW or no option => invalidate cache */
+ taop->tao_cc = 0;
+ tp->t_state = TCPS_SYN_RECEIVED;
+ }
+ }
+
+trimthenstep6:
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ tcpstat.tcps_rcvpackafterwin++;
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK)
+ goto process_ACK;
+ goto step6;
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * if segment contains a SYN and CC [not CC.NEW] option:
+ * if state == TIME_WAIT and connection duration > MSL,
+ * drop packet and send RST;
+ *
+ * if SEG.CC > CCrecv then is new SYN, and can implicitly
+ * ack the FIN (and data) in retransmission queue.
+ * Complete close and delete TCPCB. Then reprocess
+ * segment, hoping to find new TCPCB in LISTEN state;
+ *
+ * else must be old SYN; drop it.
+ * else do normal processing.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ case TCPS_TIME_WAIT:
+ if ((thflags & TH_SYN) &&
+ (to.to_flags & TOF_CC) && tp->cc_recv != 0) {
+ if (tp->t_state == TCPS_TIME_WAIT &&
+ (ticks - tp->t_starttime) > tcp_msl) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ if (CC_GT(to.to_cc, tp->cc_recv)) {
+ tp = tcp_close(tp);
+ goto findpcb;
+ }
+ else
+ goto drop;
+ }
+ break; /* continue normal processing */
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+	 * to fall through and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ *
+ *
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ * Note: this does not take into account delayed ACKs, so
+ * we should test against last_ack_sent instead of rcv_nxt.
+ * The sequence number in the reset segment is normally an
+	 * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+	 * If we have multiple segments in flight, the initial reset
+ * segment sequence numbers will be to the left of last_ack_sent,
+ * but they will eventually catch up.
+ * In any case, it never made sense to trim reset segments to
+ * fit the receive window since RFC 1122 says:
+ * 4.2.2.12 RST Segment: RFC-793 Section 3.4
+ *
+ * A TCP SHOULD allow a received RST segment to include data.
+ *
+ * DISCUSSION
+ * It has been suggested that a RST segment could contain
+ * ASCII text that encoded and explained the cause of the
+ * RST. No standard has yet been established for such
+ * data.
+ *
+ * If the reset segment passes the sequence number test examine
+ * the state:
+ * SYN_RECEIVED STATE:
+ * If passive open, return to LISTEN state.
+ * If active open, inform user that connection was refused.
+ * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
+ * Inform user that connection was reset, and close tcb.
+ * CLOSING, LAST_ACK STATES:
+ * Close the tcb.
+ * TIME_WAIT STATE:
+ * Drop the segment - see Stevens, vol. 2, p. 964 and
+ * RFC 1337.
+ */
+ if (thflags & TH_RST) {
+ if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ switch (tp->t_state) {
+
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tp->t_state = TCPS_CLOSED;
+ tcpstat.tcps_drops++;
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_TIME_WAIT:
+ break;
+ }
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to.to_tsval, tp->ts_recent)) {
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates
+ * ts_recent, the age will be reset later and ts_recent
+ * will get a valid value. If it does not, setting
+ * ts_recent to zero will at least satisfy the
+ * requirement that zero be placed in the timestamp
+ * echo reply when ts_recent isn't valid. The
+ * age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be
+ * dropped when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += tlen;
+ tcpstat.tcps_pawsdrop++;
+ goto dropafterack;
+ }
+ }
+
+ /*
+ * T/TCP mechanism
+ * If T/TCP was negotiated and the segment doesn't have CC,
+ * or if its CC is wrong then drop the segment.
+ * RST segments do not have to comply with this.
+ */
+ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
+ ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
+ goto dropafterack;
+
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know
+ * the sequence numbers haven't wrapped. This is a partial fix
+ * for the "LAND" DoS attack.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += todrop;
+ } else {
+ tcpstat.tcps_rcvpartduppack++;
+ tcpstat.tcps_rcvpartdupbyte += todrop;
+ }
+ drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+
+ /*
+ * If new data are received on a connection after the
+ * user processes are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tp->t_state > TCPS_CLOSE_WAIT && tlen) {
+ tp = tcp_close(tp);
+ tcpstat.tcps_rcvafterclose++;
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If segment ends after window, drop trailing data
+ * (and PUSH and FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
+ if (todrop > 0) {
+ tcpstat.tcps_rcvpackafterwin++;
+ if (todrop >= tlen) {
+ tcpstat.tcps_rcvbyteafterwin += tlen;
+ /*
+ * If a new connection request is received
+ * while in TIME_WAIT, drop the old connection
+ * and start over if the sequence numbers
+ * are above the previous ones.
+ */
+ if (thflags & TH_SYN &&
+ tp->t_state == TCPS_TIME_WAIT &&
+ SEQ_GT(th->th_seq, tp->rcv_nxt)) {
+ tp = tcp_close(tp);
+ goto findpcb;
+ }
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment
+ * and ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ tcpstat.tcps_rcvwinprobe++;
+ } else
+ goto dropafterack;
+ } else
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH|TH_FIN);
+ }
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record its timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ /*
+ * If a SYN is in the window, then this is an
+ * error and we send an RST and drop the connection.
+ */
+ if (thflags & TH_SYN) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
+ * flag is on (half-synchronized state), then queue data for
+ * later processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ (tp->t_flags & TF_NEEDSYN))
+ goto step6;
+ else
+ goto drop;
+ }
+
+ /*
+ * Ack processing.
+ */
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
+ * ESTABLISHED state and continue processing.
+ * The ACK was checked above.
+ */
+ case TCPS_SYN_RECEIVED:
+
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ /*
+ * Upon successful completion of 3-way handshake,
+ * update cache.CC if it was undefined, pass any queued
+ * data to the user, and advance state appropriately.
+ */
+ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
+ taop->tao_cc == 0)
+ taop->tao_cc = tp->cc_recv;
+
+ /*
+ * Make transitions:
+ * SYN-RECEIVED -> ESTABLISHED
+ * SYN-RECEIVED* -> FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep, tcp_keepidle,
+ tcp_timer_keep, tp);
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass()
+ * later; if not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void) tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ /* fall into ... */
+
+ /*
+ * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
+ * ACKs. If the ack is in the range
+ * tp->snd_una < th->th_ack <= tp->snd_max
+ * then advance tp->snd_una to th->th_ack and drop
+ * data from the retransmission queue. If this ACK reflects
+	 * more up-to-date window information, we update our window.
+ */
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ case TCPS_TIME_WAIT:
+
+ if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
+ if (tlen == 0 && tiwin == tp->snd_wnd) {
+ tcpstat.tcps_rcvdupack++;
+ /*
+ * If we have outstanding data (other than
+ * a window probe), this is a completely
+ * duplicate ack (ie, window info didn't
+ * change), the ack is the biggest we've
+ * seen and we've seen exactly our rexmt
+				 * threshold of them, assume a packet
+ * has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion
+ * window so we send only this one
+ * packet.
+ *
+ * We know we're losing at the current
+ * window size so do congestion avoidance
+ * (set ssthresh to half the current window
+ * and pull our congestion window back to
+ * the new ssthresh).
+ *
+ * Dup acks mean that packets have left the
+ * network (they're now cached at the receiver)
+ * so bump cwnd by the amount in the receiver
+ * to keep a constant cwnd packets in the
+ * network.
+ */
+ if (!callout_active(tp->tt_rexmt) ||
+ th->th_ack != tp->snd_una)
+ tp->t_dupacks = 0;
+ else if (++tp->t_dupacks == tcprexmtthresh) {
+ tcp_seq onxt = tp->snd_nxt;
+ u_int win =
+ min(tp->snd_wnd, tp->snd_cwnd) / 2 /
+ tp->t_maxseg;
+ if (tcp_do_newreno && SEQ_LT(th->th_ack,
+ tp->snd_recover)) {
+ /* False retransmit, should not
+ * cut window
+ */
+ tp->snd_cwnd += tp->t_maxseg;
+ tp->t_dupacks = 0;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+ if (win < 2)
+ win = 2;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ tp->snd_recover = tp->snd_max;
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = tp->snd_ssthresh +
+ tp->t_maxseg * tp->t_dupacks;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ goto drop;
+ } else if (tp->t_dupacks > tcprexmtthresh) {
+ tp->snd_cwnd += tp->t_maxseg;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+ } else
+ tp->t_dupacks = 0;
+ break;
+ }
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
+ if (tcp_do_newreno == 0) {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = 0;
+ } else if (tp->t_dupacks >= tcprexmtthresh &&
+ !tcp_newreno(tp, th)) {
+ /*
+ * Window inflation should have left us with approx.
+ * snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do
+ * it via the slow start mechanism.
+ */
+ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+ tp->snd_cwnd =
+ tp->snd_max - th->th_ack + tp->t_maxseg;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = 0;
+ }
+ if (tp->t_dupacks < tcprexmtthresh)
+ tp->t_dupacks = 0;
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ tcpstat.tcps_rcvacktoomuch++;
+ goto dropafterack;
+ }
+ /*
+ * If we reach this point, ACK is not a duplicate,
+ * i.e., it ACKs something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our
+ * SYN has been ACK'd (so connection is now fully
+ * synchronized). Go to non-starred state,
+ * increment snd_una for ACK of SYN, and check if
+ * we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ }
+
+process_ACK:
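+	/* Amount of sequence space (data and possibly SYN/FIN) newly acked. */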
+ acked = th->th_ack - tp->snd_una;
+ tcpstat.tcps_rcvackpack++;
+ tcpstat.tcps_rcvackbyte += acked;
+
+ /*
+ * If we just performed our first retransmit, and the ACK
+ * arrives within our recovery window, then it was a mistake
+ * to do the retransmit in the first place. Recover our
+ * original cwnd and ssthresh, and proceed to transmit where
+ * we left off.
+ */
+ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0; /* XXX probably not required */
+ }
+
+ /*
+ * If we have a timestamp reply, update smoothed
+ * round trip time. If no timestamp is present but
+ * transmit timer is running and timed sequence
+ * number was acked, update smoothed round trip time.
+ * Since we now have an rtt measurement, cancel the
+ * timer backoff (cf., Phil Karn's retransmit alg.).
+ * Recompute the initial retransmit timer.
+ */
+ if (to.to_flags & TOF_TS)
+ tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+ else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+
+ /*
+ * If all outstanding data is acked, stop retransmit
+ * timer and remember to restart (more output or persist).
+ * If there is more data to be acked, restart retransmit
+ * timer, using current (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ callout_stop(tp->tt_rexmt);
+ needoutput = 1;
+ } else if (!callout_active(tp->tt_persist))
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+
+ /*
+ * If no data (only SYN) was ACK'd,
+ * skip rest of ACK processing.
+ */
+ if (acked == 0)
+ goto step6;
+
+ /*
+ * When new data is acked, open the congestion window.
+ * If the window gives us less than ssthresh packets
+ * in flight, open exponentially (maxseg per packet).
+ * Otherwise open linearly: maxseg per window
+ * (maxseg^2 / cwnd per packet).
+ */
+ {
+ register u_int cw = tp->snd_cwnd;
+ register u_int incr = tp->t_maxseg;
+
+ if (cw > tp->snd_ssthresh)
+ incr = incr * incr / cw;
+ /*
+ * If t_dupacks != 0 here, it indicates that we are still
+ * in NewReno fast recovery mode, so we leave the congestion
+ * window alone.
+ */
+ if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
+ tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+ }
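+	/*
+	 * If the ACK covers more than what is in the send buffer, the
+	 * extra sequence space must be our FIN, so note that our FIN
+	 * has been acknowledged.
+	 */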
+ if (acked > so->so_snd.sb_cc) {
+ tp->snd_wnd -= so->so_snd.sb_cc;
+ sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
+ ourfinisacked = 1;
+ } else {
+ sbdrop(&so->so_snd, acked);
+ tp->snd_wnd -= acked;
+ ourfinisacked = 0;
+ }
+ sowwakeup(so);
+ tp->snd_una = th->th_ack;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+
+ switch (tp->t_state) {
+
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now acknowledged
+ * then enter FIN_WAIT_2.
+ */
+ case TCPS_FIN_WAIT_1:
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more
+			 * data, then the closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ */
+ if (so->so_state & SS_CANTRCVMORE) {
+ soisdisconnected(so);
+ callout_reset(tp->tt_2msl, tcp_maxidle,
+ tcp_timer_2msl, tp);
+ }
+ tp->t_state = TCPS_FIN_WAIT_2;
+ }
+ break;
+
+ /*
+ * In CLOSING STATE in addition to the processing for
+ * the ESTABLISHED state if the ACK acknowledges our FIN
+ * then enter the TIME-WAIT state, otherwise ignore
+ * the segment.
+ */
+ case TCPS_CLOSING:
+ if (ourfinisacked) {
+ tp->t_state = TCPS_TIME_WAIT;
+ tcp_canceltimers(tp);
+ /* Shorten TIME_WAIT [RFC-1644, p.28] */
+ if (tp->cc_recv != 0 &&
+ (ticks - tp->t_starttime) < tcp_msl)
+ callout_reset(tp->tt_2msl,
+ tp->t_rxtcur *
+ TCPTV_TWTRUNC,
+ tcp_timer_2msl, tp);
+ else
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ soisdisconnected(so);
+ }
+ break;
+
+ /*
+ * In LAST_ACK, we may still be waiting for data to drain
+ * and/or to be acked, as well as for the ack of our FIN.
+ * If our FIN is now acknowledged, delete the TCB,
+ * enter the closed state and return.
+ */
+ case TCPS_LAST_ACK:
+ if (ourfinisacked) {
+ tp = tcp_close(tp);
+ goto drop;
+ }
+ break;
+
+ /*
+ * In TIME_WAIT state the only thing that should arrive
+ * is a retransmission of the remote FIN. Acknowledge
+ * it and restart the finack timer.
+ */
+ case TCPS_TIME_WAIT:
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ goto dropafterack;
+ }
+ }
+
+step6:
+ /*
+ * Update window information.
+	 * Don't look at window if no ACK: TACs send garbage on first SYN.
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ tcpstat.tcps_rcvwinupd++;
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ needoutput = 1;
+ }
+
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept
+ * random urgent pointers, we'll crash in
+ * soreceive. It's hard to imagine someone
+ * actually wanting to send this much urgent data.
+ */
+ if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer,
+ * then mark the data stream. This should not happen
+ * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
+ * a FIN has been received from the remote side.
+ * In these states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = so->so_rcv.sb_cc +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_state |= SS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ /*
+		 * Remove out-of-band data so it doesn't get presented to the
+		 * user.  This can happen independently of advancing the URG
+		 * pointer, but if two URGs are pending at once, some
+		 * out-of-band data may creep in... ick.
+ */
+ if (th->th_urp <= (u_long)tlen
+#ifdef SO_OOBINLINE
+ && (so->so_options & SO_OOBINLINE) == 0
+#endif
+ )
+ tcp_pulloutofband(so, th, m,
+ drop_hdrlen); /* hdr drop is delayed */
+ } else
+ /*
+ * If no out of band data is expected,
+ * pull receive urgent pointer along
+ * with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+dodata: /* XXX */
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * Process the segment text, merging it into the TCP sequencing queue,
+ * and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data
+ * is presented to the user (this happens in tcp_usrreq.c,
+ * case PRU_RCVD). If a FIN has already been received on this
+ * connection then we just ignore the text.
+ */
+ if ((tlen || (thflags&TH_FIN)) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+		 * Insert segment which includes th into the reassembly queue of tcp with
+		 * control block tp.  Return TH_FIN if reassembly now includes
+		 * a segment with FIN.  This handles the common case inline (segment
+ * is the next to be received on an established connection, and the
+ * queue is empty), avoiding linkage into and removal from the queue
+ * and repetition of various conversions.
+ * Set DELACK for segments received in order, but ack immediately
+ * when segments are out of order (so fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (DELAY_ACK(tp))
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ tcpstat.tcps_rcvpack++;
+ tcpstat.tcps_rcvbyte += tlen;
+ ND6_HINT(tp);
+ sbappend(&so->so_rcv, m);
+ sorwakeup(so);
+ } else {
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+
+ /*
+		 * Note the amount of data that the peer has sent into
+ * our window, in order to estimate the sender's
+ * buffer size.
+ */
+ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know
+ * that the connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN))
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES
+ * enter the CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /*FALLTHROUGH*/
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been acked so
+ * enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the other
+ * standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ tp->t_state = TCPS_TIME_WAIT;
+ tcp_canceltimers(tp);
+ /* Shorten TIME_WAIT [RFC-1644, p.28] */
+ if (tp->cc_recv != 0 &&
+ (ticks - tp->t_starttime) < tcp_msl) {
+ callout_reset(tp->tt_2msl,
+ tp->t_rxtcur * TCPTV_TWTRUNC,
+ tcp_timer_2msl, tp);
+ /* For transaction client, force ACK now. */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ else
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ soisdisconnected(so);
+ break;
+
+ /*
+ * In TIME_WAIT state restart the 2 MSL time_wait timer.
+ */
+ case TCPS_TIME_WAIT:
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ break;
+ }
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+
+ /*
+ * Return any desired output.
+ */
+ if (needoutput || (tp->t_flags & TF_ACKNOW))
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+
+dropafterack:
+ /*
+ * Generate an ACK dropping incoming segment if it occupies
+ * sequence space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all
+ * paths to this code happen after packets containing
+ * RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the
+ * segment we received passes the SYN-RECEIVED ACK test.
+	 * If it fails, send a RST.  This breaks the loop in the
+ * "LAND" DoS attack, and also prevents an ACK storm
+ * between two listening ports that have been sent forged
+ * SYN segments, each with the source address of the other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max)) ) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ m_freem(m);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+
+dropwithreset:
+ /*
+ * Generate a RST, dropping incoming segment.
+ * Make ACK acceptable to originator of segment.
+ * Don't bother to respond if destination was broadcast/multicast.
+ */
+ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
+ goto drop;
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
+ goto drop;
+ } else
+#endif /* INET6 */
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
+ goto drop;
+ /* IPv6 anycast check is done at tcp6_input() */
+
+ /*
+ * Perform bandwidth limiting.
+ */
+ if (badport_bandlim(rstreason) < 0)
+ goto drop;
+
+#ifdef TCPDEBUG
+ if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+
+ if (tp)
+ INP_UNLOCK(inp);
+
+ if (thflags & TH_ACK)
+ /* mtod() below is safe as long as hdr dropping is delayed */
+ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
+ TH_RST);
+ else {
+ if (thflags & TH_SYN)
+ tlen++;
+ /* mtod() below is safe as long as hdr dropping is delayed */
+ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
+ (tcp_seq)0, TH_RST|TH_ACK);
+ }
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+
+drop:
+ /*
+ * Drop space held by incoming segment and return.
+ */
+#ifdef TCPDEBUG
+ if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp)
+ INP_UNLOCK(inp);
+ m_freem(m);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+}
+
+/*
+ * Parse TCP options and place in tcpopt.
+ */
+static void
+tcp_dooptions(to, cp, cnt, is_syn)
+ struct tcpopt *to;
+ u_char *cp;
+ int cnt;
+	int is_syn;
+{
+ int opt, optlen;
+
+ to->to_flags = 0;
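+	/*
+	 * Each option starts with a kind byte; except for EOL and NOP,
+	 * a length byte (covering kind, length and data) follows.  Stop
+	 * at EOL, at a truncated option, or at a bad length.
+	 */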
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < 2)
+ break;
+ optlen = cp[1];
+ if (optlen < 2 || optlen > cnt)
+ break;
+ }
+ switch (opt) {
+ case TCPOPT_MAXSEG:
+ if (optlen != TCPOLEN_MAXSEG)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_MSS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_mss, sizeof(to->to_mss));
+ to->to_mss = ntohs(to->to_mss);
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen != TCPOLEN_WINDOW)
+ continue;
+			if (!is_syn)
+ continue;
+ to->to_flags |= TOF_SCALE;
+ to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen != TCPOLEN_TIMESTAMP)
+ continue;
+ to->to_flags |= TOF_TS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_tsval, sizeof(to->to_tsval));
+ to->to_tsval = ntohl(to->to_tsval);
+ bcopy((char *)cp + 6,
+ (char *)&to->to_tsecr, sizeof(to->to_tsecr));
+ to->to_tsecr = ntohl(to->to_tsecr);
+ break;
+ case TCPOPT_CC:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ to->to_flags |= TOF_CC;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_cc, sizeof(to->to_cc));
+ to->to_cc = ntohl(to->to_cc);
+ break;
+ case TCPOPT_CCNEW:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_CCNEW;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_cc, sizeof(to->to_cc));
+ to->to_cc = ntohl(to->to_cc);
+ break;
+ case TCPOPT_CCECHO:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_CCECHO;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_ccecho, sizeof(to->to_ccecho));
+ to->to_ccecho = ntohl(to->to_ccecho);
+ break;
+ default:
+ continue;
+ }
+ }
+}
+
+/*
+ * Pull out of band byte out of a segment so
+ * it doesn't appear in the user's data queue.
+ * It is still reflected in the segment length for
+ * sequencing purposes.
+ */
+static void
+tcp_pulloutofband(so, th, m, off)
+ struct socket *so;
+ struct tcphdr *th;
+ register struct mbuf *m;
+	int off;		/* hdrlen to be dropped later */
+{
+ int cnt = off + th->th_urp - 1;
+
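+	/*
+	 * The urgent byte sits th_urp - 1 bytes into the TCP data, which
+	 * itself begins off bytes into the chain since the header drop
+	 * has been delayed.  Walk the mbuf chain to find it.
+	 */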
+ while (cnt >= 0) {
+ if (m->m_len > cnt) {
+ char *cp = mtod(m, caddr_t) + cnt;
+ struct tcpcb *tp = sototcpcb(so);
+
+ tp->t_iobc = *cp;
+ tp->t_oobflags |= TCPOOB_HAVEDATA;
+ bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
+ m->m_len--;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len--;
+ return;
+ }
+ cnt -= m->m_len;
+ m = m->m_next;
+ if (m == 0)
+ break;
+ }
+ panic("tcp_pulloutofband");
+}
+
+/*
+ * Collect new round-trip time estimate
+ * and update averages and current timeout.
+ */
+static void
+tcp_xmit_timer(tp, rtt)
+ register struct tcpcb *tp;
+ int rtt;
+{
+ register int delta;
+
+ tcpstat.tcps_rttupdated++;
+ tp->t_rttupdated++;
+ if (tp->t_srtt != 0) {
+ /*
+ * srtt is stored as fixed point with 5 bits after the
+		 * binary point (i.e., scaled by 32).  The following magic
+ * is equivalent to the smoothing algorithm in rfc793 with
+ * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
+ * point). Adjust rtt to origin 0.
+ */
+ delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+
+ if ((tp->t_srtt += delta) <= 0)
+ tp->t_srtt = 1;
+
+ /*
+ * We accumulate a smoothed rtt variance (actually, a
+ * smoothed mean difference), then set the retransmit
+ * timer to smoothed rtt + 4 times the smoothed variance.
+ * rttvar is stored as fixed point with 4 bits after the
+ * binary point (scaled by 16). The following is
+ * equivalent to rfc793 smoothing with an alpha of .75
+ * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
+ * rfc793's wired-in beta.
+ */
+ if (delta < 0)
+ delta = -delta;
+ delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
+ if ((tp->t_rttvar += delta) <= 0)
+ tp->t_rttvar = 1;
+ } else {
+ /*
+ * No rtt measurement yet - use the unsmoothed rtt.
+ * Set the variance to half the rtt (so our first
+ * retransmit happens at 3*rtt).
+ */
+ tp->t_srtt = rtt << TCP_RTT_SHIFT;
+ tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ }
+ tp->t_rtttime = 0;
+ tp->t_rxtshift = 0;
+
+ /*
+ * the retransmit should happen at rtt + 4 * rttvar.
+ * Because of the way we do the smoothing, srtt and rttvar
+ * will each average +1/2 tick of bias. When we compute
+ * the retransmit timer, we want 1/2 tick of rounding and
+ * 1 extra tick because of +-1/2 tick uncertainty in the
+ * firing of the timer. The bias will give us exactly the
+ * 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below
+ * the minimum feasible timer (which is 2 ticks).
+ */
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+ max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+
+ /*
+ * We received an ack for a packet that wasn't retransmitted;
+ * it is probably safe to discard any error indications we've
+ * received recently. This isn't quite right, but close enough
+ * for now (a route might have failed after we sent a segment,
+ * and the return path might not be symmetrical).
+ */
+ tp->t_softerror = 0;
+}
+
+/*
+ * Determine a reasonable value for maxseg size.
+ * If the route is known, check route for mtu.
+ * If none, use an mss that can be handled on the outgoing
+ * interface without forcing IP to fragment; if bigger than
+ * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
+ * to utilize large mbufs. If no route is found, route has no mtu,
+ * or the destination isn't local, use a default, hopefully conservative
+ * size (usually 512 or the default IP max size, but no more than the mtu
+ * of the interface), as we can't discover anything about intervening
+ * gateways or networks. We also initialize the congestion/slow start
+ * window to be a single segment if the destination isn't local.
+ * While looking at the routing entry, we also initialize other path-dependent
+ * parameters from pre-set or cached values in the routing entry.
+ *
+ * Also take into account the space needed for options that we
+ * send regularly. Make maxseg shorter by that amount to assure
+ * that we can send maxseg amount of data even when the options
+ * are present. Store the upper limit of the length of options plus
+ * data in maxopd.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment; for outgoing segments only tcp_mssopt is called.
+ *
+ * In case of T/TCP, we call this routine during implicit connection
+ * setup as well (offer = -1), to initialize maxseg from the cached
+ * MSS of our peer.
+ */
+void
+tcp_mss(tp, offer)
+ struct tcpcb *tp;
+ int offer;
+{
+ register struct rtentry *rt;
+ struct ifnet *ifp;
+ register int rtt, mss;
+ u_long bufsize;
+ struct inpcb *inp;
+ struct socket *so;
+ struct rmxp_tao *taop;
+ int origoffer = offer;
+#ifdef INET6
+ int isipv6;
+ int min_protoh;
+#endif
+
+ inp = tp->t_inpcb;
+#ifdef INET6
+ isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
+ : sizeof (struct tcpiphdr);
+#else
+#define min_protoh (sizeof (struct tcpiphdr))
+#endif
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&inp->inp_inc);
+ else
+#endif
+ rt = tcp_rtlookup(&inp->inp_inc);
+ if (rt == NULL) {
+ tp->t_maxopd = tp->t_maxseg =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ return;
+ }
+ ifp = rt->rt_ifp;
+ so = inp->inp_socket;
+
+ taop = rmx_taop(rt->rt_rmx);
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+	 * use the cached value in that case.
+ */
+ if (offer == -1)
+ offer = taop->tao_mssopt;
+ /*
+	 * Offer == 0 means that there was no MSS on the SYN segment;
+ * in this case we use tcp_mssdflt.
+ */
+ if (offer == 0)
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ else
+ /*
+ * Sanity check: make sure that maxopd will be large
+		 * enough to allow some data on segments even if all
+		 * the option space is used (40 bytes).  Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ taop->tao_mssopt = offer;
+
+ /*
+ * While we're here, check if there's an initial rtt
+ * or rttvar. Convert from the route-table units
+ * to scaled multiples of the slow timeout timer.
+ */
+ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
+ /*
+ * XXX the lock bit for RTT indicates that the value
+ * is also a minimum value; this is subject to time.
+ */
+ if (rt->rt_rmx.rmx_locks & RTV_RTT)
+ tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
+ tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+ tcpstat.tcps_usedrtt++;
+ if (rt->rt_rmx.rmx_rttvar) {
+ tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
+ (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ /*
+	 * If there's an MTU associated with the route, use it;
+	 * else use the link MTU.
+ */
+ if (rt->rt_rmx.rmx_mtu)
+ mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ else
+ {
+ mss =
+#ifdef INET6
+ (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
+#endif
+ ifp->if_mtu
+#ifdef INET6
+ )
+#endif
+ - min_protoh;
+#ifdef INET6
+ if (isipv6) {
+ if (!in6_localaddr(&inp->in6p_faddr))
+ mss = min(mss, tcp_v6mssdflt);
+ } else
+#endif
+ if (!in_localaddr(inp->inp_faddr))
+ mss = min(mss, tcp_mssdflt);
+ }
+ mss = min(mss, offer);
+ /*
+ * maxopd stores the maximum length of data AND options
+ * in a segment; maxseg is the amount of data in a normal
+ * segment. We need to store this value (maxopd) apart
+ * from maxseg, because now every segment carries options
+ * and thus we normally have somewhat less data in segments.
+ */
+ tp->t_maxopd = mss;
+
+ /*
+	 * In case of T/TCP, origoffer==-1 indicates that no segments
+ * were received yet. In this case we just guess, otherwise
+ * we do the same as before T/TCP.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (origoffer == -1 ||
+ (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
+ mss -= TCPOLEN_TSTAMP_APPA;
+ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
+ (origoffer == -1 ||
+ (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
+ mss -= TCPOLEN_CC_APPA;
+
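+	/*
+	 * Round mss down to a multiple of the mbuf cluster size so that
+	 * full-sized segments fit cleanly into clusters; the bit mask is
+	 * used when MCLBYTES is a power of two.
+	 */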
+#if (MCLBYTES & (MCLBYTES - 1)) == 0
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
+#else
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
+#endif
+ /*
+ * If there's a pipesize, change the socket buffer
+ * to that size. Make the socket buffers an integral
+ * number of mss units; if the mss is larger than
+ * the socket buffer, decrease the mss.
+ */
+#ifdef RTV_SPIPE
+ if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
+#endif
+ bufsize = so->so_snd.sb_hiwat;
+ if (bufsize < mss)
+ mss = bufsize;
+ else {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ (void)sbreserve(&so->so_snd, bufsize, so, NULL);
+ }
+ tp->t_maxseg = mss;
+
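+	/*
+	 * Likewise size the receive buffer from the cached receive pipe
+	 * value, rounded to a multiple of the mss.
+	 */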
+#ifdef RTV_RPIPE
+ if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
+#endif
+ bufsize = so->so_rcv.sb_hiwat;
+ if (bufsize > mss) {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
+ }
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ */
+ if (
+#ifdef INET6
+ (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 &&
+#endif
+ in_localaddr(inp->inp_faddr)
+#ifdef INET6
+ )
+#endif
+ )
+ tp->snd_cwnd = mss * ss_fltsz_local;
+ else
+ tp->snd_cwnd = mss * ss_fltsz;
+
+ if (rt->rt_rmx.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+		 * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+}
+
+/*
+ * Determine the MSS option to send on an outgoing SYN.
+ */
+int
+tcp_mssopt(tp)
+ struct tcpcb *tp;
+{
+ struct rtentry *rt;
+#ifdef INET6
+ int isipv6;
+ int min_protoh;
+#endif
+
+#ifdef INET6
+ isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
+ : sizeof (struct tcpiphdr);
+#else
+#define min_protoh (sizeof (struct tcpiphdr))
+#endif
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
+ if (rt == NULL)
+ return
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+
+ return rt->rt_ifp->if_mtu - min_protoh;
+}
+
+
+/*
+ * Checks for partial ack. If partial ack arrives, force the retransmission
+ * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
+ * 1. By setting snd_nxt to th->th_ack, this forces the retransmission timer to
+ * be started again. If the ack advances at least to tp->snd_recover, return 0.
+ */
+static int
+tcp_newreno(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ tcp_seq onxt = tp->snd_nxt;
+ u_long ocwnd = tp->snd_cwnd;
+
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ /*
+ * Set snd_cwnd to one segment beyond acknowledged offset
+ * (tp->snd_una has not yet been updated when this function
+ * is called)
+ */
+ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
+ return (1);
+ }
+ return (0);
+}
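+/*
+ * Worked example of the partial window deflation above (hypothetical
+ * numbers): with t_maxseg = 1460, snd_una = 1000, snd_recover = 10000
+ * and a partial ACK of 4000, snd_nxt is pulled back to 4000 and
+ * snd_cwnd is temporarily set to 1460 + (4000 - 1000) = 4460 so that
+ * exactly one retransmission fits; after the send, the saved cwnd is
+ * restored and then deflated by 4000 - 1000 - 1460 = 1540, the amount
+ * of data the partial ACK removed from the network.
+ */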
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
new file mode 100644
index 0000000..47a1873
--- /dev/null
+++ b/sys/netinet/tcp_output.c
@@ -0,0 +1,970 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#ifdef notyet
+extern struct mbuf *m_copypack();
+#endif
+
+int path_mtu_discovery = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
+ &path_mtu_discovery, 1, "Enable Path MTU Discovery");
+
+int ss_fltsz = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
+ &ss_fltsz, 1, "Slow start flight size");
+
+int ss_fltsz_local = 4;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
+ &ss_fltsz_local, 1, "Slow start flight size for local networks");
+
+int tcp_do_newreno = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
+ 0, "Enable NewReno Algorithms");
+/*
+ * TCP output routine: figure out what should be sent and send it.
+ */
+int
+tcp_output(struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ long len, win;
+ int off, flags, error;
+ struct mbuf *m;
+ struct ip *ip = NULL;
+ struct ipovly *ipov = NULL;
+ struct tcphdr *th;
+ u_char opt[TCP_MAXOLEN];
+ unsigned ipoptlen, optlen, hdrlen;
+ int idle, sendalot;
+#if 0
+ int maxburst = TCP_MAXBURST;
+#endif
+ struct rmxp_tao *taop;
+ struct rmxp_tao tao_noncached;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+
+ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+#endif
+
+ /*
+ * Determine length of data that should be transmitted,
+ * and flags that will be used.
+ * If there is some data or critical controls (SYN, RST)
+ * to send, then transmit; otherwise, investigate further.
+ */
+ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
+ if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
+ /*
+ * We have been idle for "a while" and no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ *
+ * Set the slow-start flight size depending on whether
+ * this is a local network or not.
+ */
+ int ss = ss_fltsz;
+#ifdef INET6
+ if (isipv6) {
+ if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
+ ss = ss_fltsz_local;
+ } else
+#endif /* INET6 */
+ if (in_localaddr(tp->t_inpcb->inp_faddr))
+ ss = ss_fltsz_local;
+ tp->snd_cwnd = tp->t_maxseg * ss;
+ }
+ tp->t_flags &= ~TF_LASTIDLE;
+ if (idle) {
+ if (tp->t_flags & TF_MORETOCOME) {
+ tp->t_flags |= TF_LASTIDLE;
+ idle = 0;
+ }
+ }
+again:
+ sendalot = 0;
+ off = tp->snd_nxt - tp->snd_una;
+ win = min(tp->snd_wnd, tp->snd_cwnd);
+
+ flags = tcp_outflags[tp->t_state];
+ /*
+ * Get standard flags, and add SYN or FIN if requested by 'hidden'
+ * state flags.
+ */
+ if (tp->t_flags & TF_NEEDFIN)
+ flags |= TH_FIN;
+ if (tp->t_flags & TF_NEEDSYN)
+ flags |= TH_SYN;
+
+ /*
+ * If in persist timeout with window of 0, send 1 byte.
+ * Otherwise, if window is small but nonzero
+ * and timer expired, we will send what we can
+ * and go to transmit state.
+ */
+ if (tp->t_force) {
+ if (win == 0) {
+ /*
+ * If we still have some data to send, then
+ * clear the FIN bit. Usually this would
+ * happen below when it realizes that we
+ * aren't sending all the data. However,
+ * if we have exactly 1 byte of unsent data,
+ * then it won't clear the FIN bit below,
+ * and if we are in persist state, we wind
+ * up sending the packet without recording
+ * that we sent the FIN bit.
+ *
+ * We can't just blindly clear the FIN bit,
+ * because if we don't have any more data
+ * to send then the probe will be the FIN
+ * itself.
+ */
+ if (off < so->so_snd.sb_cc)
+ flags &= ~TH_FIN;
+ win = 1;
+ } else {
+ callout_stop(tp->tt_persist);
+ tp->t_rxtshift = 0;
+ }
+ }
+
+ len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+
+ if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
+ taop = &tao_noncached;
+ bzero(taop, sizeof(*taop));
+ }
+
+ /*
+ * Lop off SYN bit if it has already been sent. However, if this
+ * is SYN-SENT state and if segment contains data and if we don't
+ * know that foreign host supports TAO, suppress sending segment.
+ */
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+ flags &= ~TH_SYN;
+ off--, len++;
+ if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
+ taop->tao_ccsent == 0)
+ return 0;
+ }
+
+ /*
+ * Be careful not to send data and/or FIN on SYN segments
+ * in cases when no CC option will be sent.
+ * This measure is needed to prevent interoperability problems
+ * with not fully conformant TCP implementations.
+ */
+ if ((flags & TH_SYN) &&
+ ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
+ ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
+ len = 0;
+ flags &= ~TH_FIN;
+ }
+
+ if (len < 0) {
+ /*
+ * If FIN has been sent but not acked,
+ * but we haven't been called to retransmit,
+ * len will be -1. Otherwise, window shrank
+ * after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back
+ * to (closed) window, and set the persist timer
+ * if it isn't already going. If the window didn't
+ * close completely, just wait for an ACK.
+ */
+ len = 0;
+ if (win == 0) {
+ callout_stop(tp->tt_rexmt);
+ tp->t_rxtshift = 0;
+ tp->snd_nxt = tp->snd_una;
+ if (!callout_active(tp->tt_persist))
+ tcp_setpersist(tp);
+ }
+ }
+ if (len > tp->t_maxseg) {
+ len = tp->t_maxseg;
+ sendalot = 1;
+ }
+ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
+ flags &= ~TH_FIN;
+
+ win = sbspace(&so->so_rcv);
+
+ /*
+ * Sender silly window avoidance. We transmit under the following
+ * conditions when len is non-zero:
+ *
+ * - We have a full segment
+ * - This is the last buffer in a write()/send() and we are
+ * either idle or running NODELAY
+ * - we've timed out (e.g. persist timer)
+ * - we have more than 1/2 the maximum send window's worth of
+ * data (the receiver may be limiting the window size)
+ * - we need to retransmit
+ */
+ if (len) {
+ if (len == tp->t_maxseg)
+ goto send;
+ /*
+ * NOTE! on localhost connections an 'ack' from the remote
+ * end may occur synchronously with the output and cause
+ * us to flush a buffer queued with moretocome. XXX
+ *
+ * note: the len + off check is almost certainly unnecessary.
+ */
+ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
+ (idle || (tp->t_flags & TF_NODELAY)) &&
+ len + off >= so->so_snd.sb_cc &&
+ (tp->t_flags & TF_NOPUSH) == 0) {
+ goto send;
+ }
+ if (tp->t_force) /* typ. timeout case */
+ goto send;
+ if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
+ goto send;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
+ goto send;
+ }
+
+ /*
+ * Compare available window to amount of window
+ * known to peer (as advertised window less
+ * next expected input). If the difference is at least two
+ * max size segments, or at least 50% of the maximum possible
+ * window, then want to send a window update to peer.
+ */
+ if (win > 0) {
+ /*
+ * "adv" is the amount we can increase the window,
+ * taking into account that we are limited by
+ * TCP_MAXWIN << tp->rcv_scale.
+ */
+ long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
+ (tp->rcv_adv - tp->rcv_nxt);
+
+ if (adv >= (long) (2 * tp->t_maxseg))
+ goto send;
+ if (2 * adv >= (long) so->so_rcv.sb_hiwat)
+ goto send;
+ }
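+ /*
+ * Illustrative numbers for the window-update test above (assumed, not
+ * from this commit): with t_maxseg = 1460 and so_rcv.sb_hiwat = 65536,
+ * a window update is sent once the window can be opened by at least
+ * 2 * 1460 = 2920 bytes, or by at least 65536 / 2 = 32768 bytes.
+ */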
+
+ /*
+ * Send if we owe peer an ACK.
+ */
+ if (tp->t_flags & TF_ACKNOW)
+ goto send;
+ if ((flags & TH_RST) ||
+ ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
+ goto send;
+ if (SEQ_GT(tp->snd_up, tp->snd_una))
+ goto send;
+ /*
+ * If our state indicates that FIN should be sent
+ * and we have not yet done so, or we're retransmitting the FIN,
+ * then we need to send.
+ */
+ if (flags & TH_FIN &&
+ ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
+ goto send;
+
+ /*
+ * TCP window updates are not reliable, rather a polling protocol
+ * using ``persist'' packets is used to ensure receipt of window
+ * updates. The three ``states'' for the output side are:
+ * idle not doing retransmits or persists
+ * persisting to move a small or zero window
+ * (re)transmitting and thereby not persisting
+ *
+ * callout_active(tp->tt_persist)
+ * is true when we are in persist state.
+ * tp->t_force
+ * is set when we are called to send a persist packet.
+ * callout_active(tp->tt_rexmt)
+ * is set when we are retransmitting
+ * The output side is idle when both timers are zero.
+ *
+ * If send window is too small, there is data to transmit, and no
+ * retransmit or persist is pending, then go to persist state.
+ * If nothing happens soon, send when timer expires:
+ * if window is nonzero, transmit what we can,
+ * otherwise force out a byte.
+ */
+ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
+ !callout_active(tp->tt_persist)) {
+ tp->t_rxtshift = 0;
+ tcp_setpersist(tp);
+ }
+
+ /*
+ * No reason to send a segment, just return.
+ */
+ return (0);
+
+send:
+ /*
+ * Before ESTABLISHED, force sending of initial options
+ * unless TCP set not to do any options.
+ * NOTE: we assume that the IP/TCP header plus TCP options
+ * always fit in a single mbuf, leaving room for a maximum
+ * link header, i.e.
+ * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
+ */
+ optlen = 0;
+#ifdef INET6
+ if (isipv6)
+ hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ else
+#endif
+ hdrlen = sizeof (struct tcpiphdr);
+ if (flags & TH_SYN) {
+ tp->snd_nxt = tp->iss;
+ if ((tp->t_flags & TF_NOOPT) == 0) {
+ u_short mss;
+
+ opt[0] = TCPOPT_MAXSEG;
+ opt[1] = TCPOLEN_MAXSEG;
+ mss = htons((u_short) tcp_mssopt(tp));
+ (void)memcpy(opt + 2, &mss, sizeof(mss));
+ optlen = TCPOLEN_MAXSEG;
+
+ if ((tp->t_flags & TF_REQ_SCALE) &&
+ ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_RCVD_SCALE))) {
+ *((u_int32_t *)(opt + optlen)) = htonl(
+ TCPOPT_NOP << 24 |
+ TCPOPT_WINDOW << 16 |
+ TCPOLEN_WINDOW << 8 |
+ tp->request_r_scale);
+ optlen += 4;
+ }
+ }
+ }
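+ /*
+ * Note on the window scale encoding above (derived from the option
+ * constants, not new behaviour): the single htonl() emits the four
+ * option bytes 0x01 (NOP), 0x03 (kind: window scale), 0x03 (length)
+ * and the shift count, so request_r_scale == 2 appears on the wire
+ * as 01 03 03 02.
+ */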
+
+ /*
+ * Send a timestamp and echo-reply if this is a SYN and our side
+ * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
+ * and our peer have sent timestamps in our SYN's.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (flags & TH_RST) == 0 &&
+ ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_RCVD_TSTMP))) {
+ u_int32_t *lp = (u_int32_t *)(opt + optlen);
+
+ /* Form timestamp option as shown in appendix A of RFC 1323. */
+ *lp++ = htonl(TCPOPT_TSTAMP_HDR);
+ *lp++ = htonl(ticks);
+ *lp = htonl(tp->ts_recent);
+ optlen += TCPOLEN_TSTAMP_APPA;
+ }
+
+ /*
+ * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
+ * options are allowed (!TF_NOOPT) and it's not a RST.
+ */
+ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
+ (flags & TH_RST) == 0) {
+ switch (flags & (TH_SYN|TH_ACK)) {
+ /*
+ * This is a normal ACK, send CC if we received CC before
+ * from our peer.
+ */
+ case TH_ACK:
+ if (!(tp->t_flags & TF_RCVD_CC))
+ break;
+ /*FALLTHROUGH*/
+
+ /*
+ * We can only get here in T/TCP's SYN_SENT* state, when
+ * we're sending a non-SYN segment without waiting for
+ * the ACK of our SYN. A check above assures that we only
+ * do this if our peer understands T/TCP.
+ */
+ case 0:
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_CC;
+ opt[optlen++] = TCPOLEN_CC;
+ *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
+
+ optlen += 4;
+ break;
+
+ /*
+ * This is our initial SYN, check whether we have to use
+ * CC or CC.new.
+ */
+ case TH_SYN:
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
+ TCPOPT_CCNEW : TCPOPT_CC;
+ opt[optlen++] = TCPOLEN_CC;
+ *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
+ optlen += 4;
+ break;
+
+ /*
+ * This is a SYN,ACK; send CC and CC.echo if we received
+ * CC from our peer.
+ */
+ case (TH_SYN|TH_ACK):
+ if (tp->t_flags & TF_RCVD_CC) {
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_CC;
+ opt[optlen++] = TCPOLEN_CC;
+ *(u_int32_t *)&opt[optlen] =
+ htonl(tp->cc_send);
+ optlen += 4;
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_NOP;
+ opt[optlen++] = TCPOPT_CCECHO;
+ opt[optlen++] = TCPOLEN_CC;
+ *(u_int32_t *)&opt[optlen] =
+ htonl(tp->cc_recv);
+ optlen += 4;
+ }
+ break;
+ }
+ }
+
+ hdrlen += optlen;
+
+#ifdef INET6
+ if (isipv6)
+ ipoptlen = ip6_optlen(tp->t_inpcb);
+ else
+#endif
+ if (tp->t_inpcb->inp_options)
+ ipoptlen = tp->t_inpcb->inp_options->m_len -
+ offsetof(struct ipoption, ipopt_list);
+ else
+ ipoptlen = 0;
+#ifdef IPSEC
+ ipoptlen += ipsec_hdrsiz_tcp(tp);
+#endif
+
+ /*
+ * Adjust data length if insertion of options will
+ * bump the packet length beyond the t_maxopd length.
+ * Clear the FIN bit because we cut off the tail of
+ * the segment.
+ */
+ if (len + optlen + ipoptlen > tp->t_maxopd) {
+ /*
+ * If there is still more to send, don't close the connection.
+ */
+ flags &= ~TH_FIN;
+ len = tp->t_maxopd - optlen - ipoptlen;
+ sendalot = 1;
+ }
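+ /*
+ * Illustrative example (assumed values): if t_maxopd is 1460 and the
+ * segment carries a 12-byte timestamp option (TCPOLEN_TSTAMP_APPA)
+ * with no IP options, the payload is trimmed to 1460 - 12 = 1448
+ * bytes and sendalot forces another pass for the remainder.
+ */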
+
+/*#ifdef DIAGNOSTIC*/
+#ifdef INET6
+ if (max_linkhdr + hdrlen > MCLBYTES)
+#else
+ if (max_linkhdr + hdrlen > MHLEN)
+#endif
+ panic("tcphdr too big");
+/*#endif*/
+
+ /*
+ * Grab a header mbuf, attaching a copy of data to
+ * be transmitted, and initialize the header from
+ * the template for sends on this connection.
+ */
+ if (len) {
+ if (tp->t_force && len == 1)
+ tcpstat.tcps_sndprobe++;
+ else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+ tcpstat.tcps_sndrexmitpack++;
+ tcpstat.tcps_sndrexmitbyte += len;
+ } else {
+ tcpstat.tcps_sndpack++;
+ tcpstat.tcps_sndbyte += len;
+ }
+#ifdef notyet
+ if ((m = m_copypack(so->so_snd.sb_mb, off,
+ (int)len, max_linkhdr + hdrlen)) == 0) {
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * m_copypack left space for our hdr; use it.
+ */
+ m->m_len += hdrlen;
+ m->m_data -= hdrlen;
+#else
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr) {
+ MCLGET(m, M_DONTWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ if (len <= MHLEN - hdrlen - max_linkhdr) {
+ m_copydata(so->so_snd.sb_mb, off, (int) len,
+ mtod(m, caddr_t) + hdrlen);
+ m->m_len += len;
+ } else {
+ m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
+ if (m->m_next == 0) {
+ (void) m_free(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+#endif
+ /*
+ * If we're sending everything we've got, set PUSH.
+ * (This will keep happy those implementations which only
+ * give data to the user when a buffer fills or
+ * a PUSH comes in.)
+ */
+ if (off + len == so->so_snd.sb_cc)
+ flags |= TH_PUSH;
+ } else {
+ if (tp->t_flags & TF_ACKNOW)
+ tcpstat.tcps_sndacks++;
+ else if (flags & (TH_SYN|TH_FIN|TH_RST))
+ tcpstat.tcps_sndctrl++;
+ else if (SEQ_GT(tp->snd_up, tp->snd_una))
+ tcpstat.tcps_sndurg++;
+ else
+ tcpstat.tcps_sndwinup++;
+
+ MGETHDR(m, M_DONTWAIT, MT_HEADER);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+#ifdef INET6
+ if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
+ MHLEN >= hdrlen) {
+ MH_ALIGN(m, hdrlen);
+ } else
+#endif
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ }
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+#ifdef INET6
+ if (isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ tcp_fillheaders(tp, ip6, th);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)(ip + 1);
+ /* this picks up the pseudo header (w/o the length) */
+ tcp_fillheaders(tp, ip, th);
+ }
+
+ /*
+ * Fill in fields, remembering maximum advertised
+ * window for use in delaying messages about window sizes.
+ * If resending a FIN, be sure not to use a new sequence number.
+ */
+ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
+ tp->snd_nxt == tp->snd_max)
+ tp->snd_nxt--;
+ /*
+ * If we are doing retransmissions, then snd_nxt will
+ * not reflect the first unsent octet. For ACK only
+ * packets, we do not want the sequence number of the
+ * retransmitted packet, we want the sequence number
+ * of the next unsent octet. So, if there is no data
+ * (and no SYN or FIN), use snd_max instead of snd_nxt
+ * when filling in ti_seq. But if we are in persist
+ * state, snd_max might reflect one byte beyond the
+ * right edge of the window, so use snd_nxt in that
+ * case, since we know we aren't doing a retransmission.
+ * (retransmit and persist are mutually exclusive...)
+ */
+ if (len || (flags & (TH_SYN|TH_FIN))
+ || callout_active(tp->tt_persist))
+ th->th_seq = htonl(tp->snd_nxt);
+ else
+ th->th_seq = htonl(tp->snd_max);
+ th->th_ack = htonl(tp->rcv_nxt);
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
+ }
+ th->th_flags = flags;
+ /*
+ * Calculate receive window. Don't shrink window,
+ * but avoid silly window syndrome.
+ */
+ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
+ win = 0;
+ if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
+ win = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (win > (long)TCP_MAXWIN << tp->rcv_scale)
+ win = (long)TCP_MAXWIN << tp->rcv_scale;
+ th->th_win = htons((u_short) (win>>tp->rcv_scale));
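+ /*
+ * Example of the scaling above (hypothetical values): with
+ * rcv_scale == 2, a computed window of 131072 bytes is placed in the
+ * header as 131072 >> 2 == 32768; the peer shifts it back up by the
+ * scale factor learned from our window scale option.
+ */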
+
+
+ /*
+ * Adjust the RXWIN0SENT flag - indicate that we have advertised
+ * a 0 window. This may cause the remote transmitter to stall. This
+ * flag tells soreceive() to disable delayed acknowledgements when
+ * draining the buffer. This can occur if the receiver is attempting
+ * to read more data than can be buffered prior to transmitting on
+ * the connection.
+ */
+ if (win == 0)
+ tp->t_flags |= TF_RXWIN0SENT;
+ else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
+ th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
+ th->th_flags |= TH_URG;
+ } else
+ /*
+ * If no urgent pointer to send, then we pull
+ * the urgent pointer to the left edge of the send window
+ * so that it doesn't drift into the send window on sequence
+ * number wraparound.
+ */
+ tp->snd_up = tp->snd_una; /* drag it along */
+
+ /*
+ * Put TCP length in extended header, and then
+ * checksum extended header and data.
+ */
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
+#ifdef INET6
+ if (isipv6)
+ /*
+ * ip6_plen need not be filled in now; it will be filled
+ * in by ip6_output.
+ */
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
+ sizeof(struct tcphdr) + optlen + len);
+ else
+#endif /* INET6 */
+ {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ if (len + optlen)
+ th->th_sum = in_addword(th->th_sum,
+ htons((u_short)(optlen + len)));
+
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+
+ /*
+ * In transmit state, time the transmission and arrange for
+ * the retransmit. In persist state, just set snd_max.
+ */
+ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
+ tcp_seq startseq = tp->snd_nxt;
+
+ /*
+ * Advance snd_nxt over sequence space of this segment.
+ */
+ if (flags & (TH_SYN|TH_FIN)) {
+ if (flags & TH_SYN)
+ tp->snd_nxt++;
+ if (flags & TH_FIN) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ tp->snd_nxt += len;
+ if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
+ tp->snd_max = tp->snd_nxt;
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ tcpstat.tcps_segstimed++;
+ }
+ }
+
+ /*
+ * Set retransmit timer if not currently set,
+ * and not doing an ack or a keep-alive probe.
+ * Initial value for retransmit timer is smoothed
+ * round-trip time + 2 * round-trip time variance.
+ * Initialize shift counter which is used for backoff
+ * of retransmit time.
+ */
+ if (!callout_active(tp->tt_rexmt) &&
+ tp->snd_nxt != tp->snd_una) {
+ if (callout_active(tp->tt_persist)) {
+ callout_stop(tp->tt_persist);
+ tp->t_rxtshift = 0;
+ }
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ }
+ } else
+ if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
+ tp->snd_max = tp->snd_nxt + len;
+
+#ifdef TCPDEBUG
+ /*
+ * Trace.
+ */
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
+#endif
+
+ /*
+ * Fill in IP length and desired time to live and
+ * send to IP level. There should be a better way
+ * to handle ttl and tos; we could keep them in
+ * the template, but need a way to checksum without them.
+ */
+ /*
+ * m->m_pkthdr.len should have been set before the checksum
+ * calculation, because in6_cksum() needs it.
+ */
+#ifdef INET6
+ if (isipv6) {
+ /*
+ * We set the hop limit separately for every segment, since the
+ * user might want to change the value via setsockopt.
+ * Also, desired default hop limit might be changed via
+ * Neighbor Discovery.
+ */
+ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
+ tp->t_inpcb->in6p_route.ro_rt ?
+ tp->t_inpcb->in6p_route.ro_rt->rt_ifp
+ : NULL);
+
+ /* TODO: IPv6 IP6TOS_ECT bit on */
+#ifdef IPSEC
+ if (ipsec_setsocket(m, so) != 0) {
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+#endif /*IPSEC*/
+ error = ip6_output(m,
+ tp->t_inpcb->in6p_outputopts,
+ &tp->t_inpcb->in6p_route,
+ (so->so_options & SO_DONTROUTE), NULL, NULL);
+ } else
+#endif /* INET6 */
+ {
+ struct rtentry *rt;
+ ip->ip_len = m->m_pkthdr.len;
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
+ tp->t_inpcb->in6p_route.ro_rt ?
+ tp->t_inpcb->in6p_route.ro_rt->rt_ifp
+ : NULL);
+ else
+#endif /* INET6 */
+ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */
+ ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */
+ /*
+ * See if we should do MTU discovery. We do it only if the following
+ * are true:
+ * 1) we have a valid route to the destination
+ * 2) the MTU is not locked (if it is, then discovery has been
+ * disabled)
+ */
+ if (path_mtu_discovery
+ && (rt = tp->t_inpcb->inp_route.ro_rt)
+ && rt->rt_flags & RTF_UP
+ && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
+ ip->ip_off |= IP_DF;
+ }
+#ifdef IPSEC
+ ipsec_setsocket(m, so);
+#endif /*IPSEC*/
+ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
+ (so->so_options & SO_DONTROUTE), 0);
+ }
+ if (error) {
+
+ /*
+ * We know that the packet was lost, so back out the
+ * sequence number advance, if any.
+ */
+ if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
+ /*
+ * No need to check for TH_FIN here because
+ * the TF_SENTFIN flag handles that case.
+ */
+ if ((flags & TH_SYN) == 0)
+ tp->snd_nxt -= len;
+ }
+
+out:
+ if (error == ENOBUFS) {
+ if (!callout_active(tp->tt_rexmt) &&
+ !callout_active(tp->tt_persist))
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ tcp_quench(tp->t_inpcb, 0);
+ return (0);
+ }
+ if (error == EMSGSIZE) {
+ /*
+ * ip_output() will have already fixed the route
+ * for us. tcp_mtudisc() will, as its last action,
+ * initiate retransmission, so it is important to
+ * not do so here.
+ */
+ tcp_mtudisc(tp->t_inpcb, 0);
+ return 0;
+ }
+ if ((error == EHOSTUNREACH || error == ENETDOWN)
+ && TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_softerror = error;
+ return (0);
+ }
+ return (error);
+ }
+ tcpstat.tcps_sndtotal++;
+
+ /*
+ * Data sent (as far as we can tell).
+ * If this advertises a larger window than any other segment,
+ * then remember the size of the advertised window.
+ * Any pending ACK has now been sent.
+ */
+ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
+ tp->rcv_adv = tp->rcv_nxt + win;
+ tp->last_ack_sent = tp->rcv_nxt;
+ tp->t_flags &= ~TF_ACKNOW;
+ if (tcp_delack_enabled)
+ callout_stop(tp->tt_delack);
+#if 0
+ /*
+ * This completely breaks TCP if newreno is turned on. What happens
+ * is that if delayed-acks are turned on on the receiver, this code
+ * on the transmitter effectively destroys the TCP window, forcing
+ * it to four packets (1.5Kx4 = 6K window).
+ */
+ if (sendalot && (!tcp_do_newreno || --maxburst))
+ goto again;
+#endif
+ if (sendalot)
+ goto again;
+ return (0);
+}
+
+void
+tcp_setpersist(tp)
+ register struct tcpcb *tp;
+{
+ int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
+ int tt;
+
+ if (callout_active(tp->tt_rexmt))
+ panic("tcp_setpersist: retransmit pending");
+ /*
+ * Start/restart persistence timer.
+ */
+ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
+ TCPTV_PERSMIN, TCPTV_PERSMAX);
+ callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp);
+ if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+ tp->t_rxtshift++;
+}
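+/*
+ * Worked example for tcp_setpersist() (assumed tick values): if the
+ * smoothed RTT term gives t == 100 ticks and t_rxtshift == 2, the
+ * persist timer is armed for 100 * tcp_backoff[2] ticks (400 with the
+ * standard 1,2,4,8,... backoff table), clamped into
+ * [TCPTV_PERSMIN, TCPTV_PERSMAX] by TCPT_RANGESET().
+ */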
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
new file mode 100644
index 0000000..0fb62e0
--- /dev/null
+++ b/sys/netinet/tcp_reass.c
@@ -0,0 +1,2785 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_ipfw.h" /* for ipfw_fwd */
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_tcp_input.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+
+#endif /* TCPDEBUG */
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#ifdef INET6
+#include <netinet6/ipsec6.h>
+#endif
+#include <netkey/key.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
+
+static int tcprexmtthresh = 3;
+tcp_cc tcp_ccgen;
+
+struct tcpstat tcpstat;
+SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
+ &tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+
+static int log_in_vain = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+ &log_in_vain, 0, "Log all incoming TCP connections");
+
+static int blackhole = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+ &blackhole, 0, "Do not send RST when dropping refused connections");
+
+int tcp_delack_enabled = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+ &tcp_delack_enabled, 0,
+ "Delay ACK to try and piggyback it onto a data packet");
+
+#ifdef TCP_DROP_SYNFIN
+static int drop_synfin = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+ &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
+#endif
+
+struct inpcbhead tcb;
+#define tcb6 tcb /* for KAME src sync over BSD*'s */
+struct inpcbinfo tcbinfo;
+struct mtx *tcbinfo_mtx;
+
+static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_pulloutofband(struct socket *,
+ struct tcphdr *, struct mbuf *, int);
+static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
+ struct mbuf *);
+static void tcp_xmit_timer(struct tcpcb *, int);
+static int tcp_newreno(struct tcpcb *, struct tcphdr *);
+
+/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
+#ifdef INET6
+#define ND6_HINT(tp) \
+do { \
+ if ((tp) && (tp)->t_inpcb && \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
+ (tp)->t_inpcb->in6p_route.ro_rt) \
+ nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+} while (0)
+#else
+#define ND6_HINT(tp)
+#endif
+
+/*
+ * Indicate whether this ack should be delayed. We can delay the ack if
+ * - delayed acks are enabled and
+ * - there is no delayed ack timer in progress and
+ * - our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ */
+#define DELAY_ACK(tp) \
+ (tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
+ (tp->t_flags & TF_RXWIN0SENT) == 0)
+
+static int
+tcp_reass(tp, th, tlenp, m)
+ register struct tcpcb *tp;
+ register struct tcphdr *th;
+ int *tlenp;
+ struct mbuf *m;
+{
+ struct tseg_qent *q;
+ struct tseg_qent *p = NULL;
+ struct tseg_qent *nq;
+ struct tseg_qent *te;
+ struct socket *so = tp->t_inpcb->inp_socket;
+ int flags;
+
+ /*
+ * Call with th==0 after becoming established to
+ * force pre-ESTABLISHED data up to the user socket.
+ */
+ if (th == 0)
+ goto present;
+
+ /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
+ MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ,
+ M_NOWAIT);
+ if (te == NULL) {
+ tcpstat.tcps_rcvmemdrop++;
+ m_freem(m);
+ return (0);
+ }
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+ if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+ break;
+ p = q;
+ }
+
+ /*
+ * If there is a preceding segment, it may provide some of
+ * our data already. If so, drop the data from the incoming
+ * segment. If it provides all of our data, drop us.
+ */
+ if (p != NULL) {
+ register int i;
+ /* conversion to int (in i) handles seq wraparound */
+ i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+ if (i > 0) {
+ if (i >= *tlenp) {
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += *tlenp;
+ m_freem(m);
+ FREE(te, M_TSEGQ);
+ /*
+ * Try to present any queued data
+ * at the left window edge to the user.
+ * This is needed after the 3-WHS
+ * completes.
+ */
+ goto present; /* ??? */
+ }
+ m_adj(m, i);
+ *tlenp -= i;
+ th->th_seq += i;
+ }
+ }
+ tcpstat.tcps_rcvoopack++;
+ tcpstat.tcps_rcvoobyte += *tlenp;
+
+ /*
+ * While we overlap succeeding segments trim them or,
+ * if they are completely covered, dequeue them.
+ */
+ while (q) {
+ register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+ if (i <= 0)
+ break;
+ if (i < q->tqe_len) {
+ q->tqe_th->th_seq += i;
+ q->tqe_len -= i;
+ m_adj(q->tqe_m, i);
+ break;
+ }
+
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ FREE(q, M_TSEGQ);
+ q = nq;
+ }
+
+ /* Insert the new segment queue entry into place. */
+ te->tqe_m = m;
+ te->tqe_th = th;
+ te->tqe_len = *tlenp;
+
+ if (p == NULL) {
+ LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ } else {
+ LIST_INSERT_AFTER(p, te, tqe_q);
+ }
+
+present:
+ /*
+ * Present data to user, advancing rcv_nxt through
+ * completed sequence space.
+ */
+ if (!TCPS_HAVEESTABLISHED(tp->t_state))
+ return (0);
+ q = LIST_FIRST(&tp->t_segq);
+ if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+ return (0);
+ do {
+ tp->rcv_nxt += q->tqe_len;
+ flags = q->tqe_th->th_flags & TH_FIN;
+ nq = LIST_NEXT(q, tqe_q);
+ LIST_REMOVE(q, tqe_q);
+ if (so->so_state & SS_CANTRCVMORE)
+ m_freem(q->tqe_m);
+ else
+ sbappend(&so->so_rcv, q->tqe_m);
+ FREE(q, M_TSEGQ);
+ q = nq;
+ } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+ ND6_HINT(tp);
+ sorwakeup(so);
+ return (flags);
+}
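+/*
+ * Illustrative example of the overlap trimming in tcp_reass()
+ * (hypothetical sequence numbers): if a queued segment covers
+ * 1000-1999 and a new segment arrives for 1500-2499, then
+ * i = 1000 + 1000 - 1500 = 500, so the first 500 bytes of the new
+ * segment are dropped with m_adj(), its length shrinks to 500 and its
+ * sequence number advances to 2000 before it is linked into t_segq.
+ */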
+
+/*
+ * TCP input routine, follows pages 65-76 of the
+ * protocol specification dated September, 1981 very closely.
+ */
+#ifdef INET6
+int
+tcp6_input(mp, offp, proto)
+ struct mbuf **mp;
+ int *offp, proto;
+{
+ register struct mbuf *m = *mp;
+ struct in6_ifaddr *ia6;
+
+ IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
+
+ /*
+ * draft-itojun-ipv6-tcp-to-anycast
+ * is there a better place to put this?
+ */
+ ia6 = ip6_getdstifaddr(m);
+ if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
+ struct ip6_hdr *ip6;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+ icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
+ (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
+ return IPPROTO_DONE;
+ }
+
+ tcp_input(m, *offp);
+ return IPPROTO_DONE;
+}
+#endif
+
+void
+tcp_input(m, off0)
+ register struct mbuf *m;
+ int off0;
+{
+ register struct tcphdr *th;
+ register struct ip *ip = NULL;
+ register struct ipovly *ipov;
+ register struct inpcb *inp = NULL;
+ u_char *optp = NULL;
+ int optlen = 0;
+ int len, tlen, off;
+ int drop_hdrlen;
+ register struct tcpcb *tp = 0;
+ register int thflags;
+ struct socket *so = 0;
+ int todrop, acked, ourfinisacked, needoutput = 0;
+ u_long tiwin;
+ struct tcpopt to; /* options in this segment */
+ struct rmxp_tao *taop; /* pointer to our TAO cache entry */
+ struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ int headlocked = 0;
+
+#ifdef TCPDEBUG
+ u_char tcp_saveipgen[40];
+ /* the size of the above must be that of the largest IP header, now IPv6 */
+ struct tcphdr tcp_savetcp;
+ short ostate = 0;
+#endif
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6;
+#endif /* INET6 */
+ struct sockaddr_in *next_hop = NULL;
+ int rstreason; /* For badport_bandlim accounting purposes */
+
+ /* Grab info from MT_TAG mbufs prepended to the chain. */
+ for (;m && m->m_type == MT_TAG; m = m->m_next) {
+ if (m->m_tag_id == PACKET_TAG_IPFORWARD)
+ next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
+ }
+
+#ifdef INET6
+ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+#endif
+ bzero((char *)&to, sizeof(to));
+
+ tcpstat.tcps_rcvtotal++;
+
+#ifdef INET6
+ if (isipv6) {
+ /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
+ ip6 = mtod(m, struct ip6_hdr *);
+ tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
+ tcpstat.tcps_rcvbadsum++;
+ goto drop;
+ }
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate an unbound/unconnected pcb,
+ * an unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with an unspecified IPv6 destination are
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ goto drop;
+ }
+ } else
+#endif /* INET6 */
+ {
+ /*
+ * Get IP and TCP header together in first mbuf.
+ * Note: IP leaves IP header in first mbuf.
+ */
+ if (off0 > sizeof (struct ip)) {
+ ip_stripoptions(m, (struct mbuf *)0);
+ off0 = sizeof(struct ip);
+ }
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
+ tcpstat.tcps_rcvshort++;
+ return;
+ }
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ tlen = ip->ip_len;
+
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
+ ip->ip_len + IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = sizeof (struct ip) + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = (u_short)tlen;
+ ipov->ih_len = htons(ipov->ih_len);
+ th->th_sum = in_cksum(m, len);
+ }
+ if (th->th_sum) {
+ tcpstat.tcps_rcvbadsum++;
+ goto drop;
+ }
+#ifdef INET6
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+#endif
+ }
+
+ /*
+ * Check that TCP offset makes sense,
+ * pull out TCP options and adjust length. XXX
+ */
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ tcpstat.tcps_rcvbadoff++;
+ goto drop;
+ }
+ tlen -= off; /* tlen is used instead of ti->ti_len */
+ if (off > sizeof (struct tcphdr)) {
+#ifdef INET6
+ if (isipv6) {
+ IP6_EXTHDR_CHECK(m, off0, off, );
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
+ } else
+#endif /* INET6 */
+ {
+ if (m->m_len < sizeof(struct ip) + off) {
+ if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
+ tcpstat.tcps_rcvshort++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ ipov = (struct ipovly *)ip;
+ th = (struct tcphdr *)((caddr_t)ip + off0);
+ }
+ }
+ optlen = off - sizeof (struct tcphdr);
+ optp = (u_char *)(th + 1);
+ }
+ thflags = th->th_flags;
+
+#ifdef TCP_DROP_SYNFIN
+ /*
+ * If the drop_synfin option is enabled, drop all packets with
+ * both the SYN and FIN bits set. This prevents e.g. nmap from
+ * identifying the TCP/IP stack.
+ *
+ * This is a violation of the TCP specification.
+ */
+ if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN))
+ goto drop;
+#endif
+
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ th->th_seq = ntohl(th->th_seq);
+ th->th_ack = ntohl(th->th_ack);
+ th->th_win = ntohs(th->th_win);
+ th->th_urp = ntohs(th->th_urp);
+
+ /*
+ * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
+ * until after ip6_savecontrol() is called and before other functions
+ * which don't want those proto headers.
+ * Because ip6_savecontrol() is going to parse the mbuf to
+ * search for data to be passed up to user-land, it wants mbuf
+ * parameters to be unchanged.
+ * XXX: the call of ip6_savecontrol() has been obsoleted based on
+ * latest version of the advanced API (20020110).
+ */
+ drop_hdrlen = off0 + off;
+
+ /*
+ * Locate pcb for segment.
+ */
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+findpcb:
+ /* IPFIREWALL_FORWARD section */
+ if (next_hop != NULL
+#ifdef INET6
+ && isipv6 == 0 /* IPv6 support is not yet available */
+#endif /* INET6 */
+ ) {
+ /*
+ * Transparently forwarded. Pretend to be the destination.
+ * Have we already got one like this?
+ */
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif);
+ if (!inp) {
+ /*
+ * No, then it's new. Try to find the ambushing socket.
+ */
+ if (next_hop->sin_port == 0) {
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src,
+ th->th_sport, next_hop->sin_addr,
+ th->th_dport, 1, m->m_pkthdr.rcvif);
+ } else {
+ inp = in_pcblookup_hash(&tcbinfo,
+ ip->ip_src, th->th_sport,
+ next_hop->sin_addr,
+ ntohs(next_hop->sin_port), 1,
+ m->m_pkthdr.rcvif);
+ }
+ }
+ } else
+ {
+#ifdef INET6
+ if (isipv6)
+ inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport,
+ &ip6->ip6_dst, th->th_dport, 1,
+ m->m_pkthdr.rcvif);
+ else
+#endif /* INET6 */
+ inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
+ }
+
+#ifdef IPSEC
+#ifdef INET6
+ if (isipv6) {
+ if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) {
+ ipsec6stat.in_polvio++;
+ goto drop;
+ }
+ } else
+#endif /* INET6 */
+ if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) {
+ ipsecstat.in_polvio++;
+ goto drop;
+ }
+#endif /*IPSEC*/
+
+ /*
+ * If the state is CLOSED (i.e., TCB does not exist) then
+ * all data in the incoming segment is discarded.
+ * If the TCB exists but is in CLOSED state, it is embryonic,
+ * but should either do a listen or a connect soon.
+ */
+ if (inp == NULL) {
+ if (log_in_vain) {
+#ifdef INET6
+ char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN];
+#else /* INET6 */
+ char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"];
+#endif /* INET6 */
+
+#ifdef INET6
+ if (isipv6) {
+ strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst));
+ strcpy(sbuf, ip6_sprintf(&ip6->ip6_src));
+ } else
+#endif
+ {
+ strcpy(dbuf, inet_ntoa(ip->ip_dst));
+ strcpy(sbuf, inet_ntoa(ip->ip_src));
+ }
+ switch (log_in_vain) {
+ case 1:
+ if (thflags & TH_SYN)
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d\n",
+ dbuf, ntohs(th->th_dport),
+ sbuf,
+ ntohs(th->th_sport));
+ break;
+ case 2:
+ log(LOG_INFO,
+ "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
+ dbuf, ntohs(th->th_dport), sbuf,
+ ntohs(th->th_sport), thflags);
+ break;
+ default:
+ break;
+ }
+ }
+ if (blackhole) {
+ switch (blackhole) {
+ case 1:
+ if (thflags & TH_SYN)
+ goto drop;
+ break;
+ case 2:
+ goto drop;
+ default:
+ goto drop;
+ }
+ }
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ if (tp == 0) {
+ INP_UNLOCK(inp);
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+ if (tp->t_state == TCPS_CLOSED)
+ goto drop;
+
+ /* Unscale the window into a 32-bit value. */
+ if ((thflags & TH_SYN) == 0)
+ tiwin = th->th_win << tp->snd_scale;
+ else
+ tiwin = th->th_win;
+
+ so = inp->inp_socket;
+ if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
+ struct in_conninfo inc;
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG) {
+ ostate = tp->t_state;
+#ifdef INET6
+ if (isipv6)
+ bcopy((char *)ip6, (char *)tcp_saveipgen,
+ sizeof(*ip6));
+ else
+#endif /* INET6 */
+ bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
+ tcp_savetcp = *th;
+ }
+#endif
+ /* skip if this isn't a listen socket */
+ if ((so->so_options & SO_ACCEPTCONN) == 0)
+ goto after_listen;
+#ifdef INET6
+ inc.inc_isipv6 = isipv6;
+ if (isipv6) {
+ inc.inc6_faddr = ip6->ip6_src;
+ inc.inc6_laddr = ip6->ip6_dst;
+ inc.inc6_route.ro_rt = NULL; /* XXX */
+
+ } else
+#endif /* INET6 */
+ {
+ inc.inc_faddr = ip->ip_src;
+ inc.inc_laddr = ip->ip_dst;
+ inc.inc_route.ro_rt = NULL; /* XXX */
+ }
+ inc.inc_fport = th->th_sport;
+ inc.inc_lport = th->th_dport;
+
+ /*
+ * If the state is LISTEN then ignore segment if it contains
+ * a RST. If the segment contains an ACK then it is bad and
+ * send a RST. If it does not contain a SYN then it is not
+ * interesting; drop it.
+ *
+ * If the state is SYN_RECEIVED (syncache) and seg contains
+ * an ACK, but not for our SYN/ACK, send a RST. If the seg
+ * contains a RST, check the sequence number to see if it
+ * is a valid reset segment.
+ */
+ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
+ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+ if (!syncache_expand(&inc, th, &so, m)) {
+ /*
+ * No syncache entry, or ACK was not
+ * for our SYN/ACK. Send a RST.
+ */
+ tcpstat.tcps_badsyn++;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ if (so == NULL) {
+ /*
+ * Could not complete 3-way handshake,
+ * connection is being closed down, and
+ * syncache will free mbuf.
+ */
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+ }
+ /*
+ * Socket is created in state SYN_RECEIVED.
+ * Continue processing segment.
+ */
+ INP_UNLOCK(inp);
+ inp = sotoinpcb(so);
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ /*
+ * This is what would have happened in
+ * tcp_output() when the SYN,ACK was sent.
+ */
+ tp->snd_up = tp->snd_una;
+ tp->snd_max = tp->snd_nxt = tp->iss + 1;
+ tp->last_ack_sent = tp->rcv_nxt;
+/*
+ * XXX possible bug - it doesn't appear that tp->snd_wnd is unscaled
+ * until the _second_ ACK is received:
+ * rcv SYN (set wscale opts) --> send SYN/ACK, set snd_wnd = window.
+ * rcv ACK, calculate tiwin --> process SYN_RECEIVED, determine wscale,
+ * move to ESTAB, set snd_wnd to tiwin.
+ */
+ tp->snd_wnd = tiwin; /* unscaled */
+ goto after_listen;
+ }
+ if (thflags & TH_RST) {
+ syncache_chkrst(&inc, th);
+ goto drop;
+ }
+ if (thflags & TH_ACK) {
+ syncache_badack(&inc);
+ tcpstat.tcps_badsyn++;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ goto drop;
+ }
+
+ /*
+ * Segment's flags are (SYN) or (SYN|FIN).
+ */
+#ifdef INET6
+ /*
+ * If deprecated address is forbidden,
+ * we do not accept SYN to deprecated interface
+ * address to prevent any new inbound connection from
+ * getting established.
+ * When we do not accept SYN, we send a TCP RST,
+ * with deprecated source address (instead of dropping
+ * it). We accept this compromise, as it is much better
+ * for the peer to receive a RST, and the RST will be the
+ * final packet of the exchange.
+ *
+ * If we do not forbid deprecated addresses, we accept
+ * the SYN packet. RFC2462 does not suggest dropping
+ * SYN in this case.
+ * If we decipher RFC2462 5.5.4, it says like this:
+ * 1. use of deprecated addr with existing
+ * communication is okay - "SHOULD continue to be
+ * used"
+ * 2. use of it with new communication:
+ * (2a) "SHOULD NOT be used if alternate address
+ * with sufficient scope is available"
+ * (2b) nothing mentioned otherwise.
+ * Here we fall into (2b) case as we have no choice in
+ * our source address selection - we must obey the peer.
+ *
+ * The wording in RFC2462 is confusing, and there are
+ * multiple descriptions of deprecated address
+ * handling - worse, they are not exactly the same.
+ * I believe 5.5.4 is the best one, so we follow 5.5.4.
+ */
+ if (isipv6 && !ip6_use_deprecated) {
+ struct in6_ifaddr *ia6;
+
+ if ((ia6 = ip6_getdstifaddr(m)) &&
+ (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
+ INP_UNLOCK(inp);
+ tp = NULL;
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ }
+#endif
+ /*
+ * If it is from this socket, drop it, it must be forged.
+ * Don't bother responding if the destination was a broadcast.
+ */
+ if (th->th_dport == th->th_sport) {
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
+ &ip6->ip6_src))
+ goto drop;
+ } else
+#endif /* INET6 */
+ if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
+ goto drop;
+ }
+ /*
+ * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
+ *
+ * Note that it is quite possible to receive unicast
+ * link-layer packets with a broadcast IP address. Use
+ * in_broadcast() to find them.
+ */
+ if (m->m_flags & (M_BCAST|M_MCAST))
+ goto drop;
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
+ goto drop;
+ } else
+#endif
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
+ goto drop;
+ /*
+ * SYN appears to be valid; create compressed TCP state
+ * for syncache, or perform t/tcp connection.
+ */
+ if (so->so_qlen <= so->so_qlimit) {
+ tcp_dooptions(&to, optp, optlen, 1);
+ if (!syncache_add(&inc, &to, th, &so, m))
+ goto drop;
+ if (so == NULL) {
+ /*
+ * Entry added to syncache, mbuf used to
+ * send SYN,ACK packet.
+ */
+ KASSERT(headlocked, ("headlocked"));
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+ }
+ /*
+ * Segment passed TAO tests.
+ */
+ INP_UNLOCK(inp);
+ inp = sotoinpcb(so);
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ tp->snd_wnd = tiwin;
+ tp->t_starttime = ticks;
+ tp->t_state = TCPS_ESTABLISHED;
+
+ /*
+ * If there is a FIN, or if there is data and the
+ * connection is local, then delay SYN,ACK(SYN) in
+ * the hope of piggy-backing it on a response
+ * segment. Otherwise we must send an ACK now in case
+ * the other side is slow starting.
+ */
+ if (DELAY_ACK(tp) && ((thflags & TH_FIN) ||
+ (tlen != 0 &&
+#ifdef INET6
+ ((isipv6 && in6_localaddr(&inp->in6p_faddr))
+ ||
+ (!isipv6 &&
+#endif
+ in_localaddr(inp->inp_faddr)
+#ifdef INET6
+ ))
+#endif
+ ))) {
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ tp->t_flags |= TF_NEEDSYN;
+ } else
+ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ goto trimthenstep6;
+ }
+ goto drop;
+ }
+after_listen:
+
+/* XXX temp debugging */
+ /* should not happen - syncache should pick up these connections */
+ if (tp->t_state == TCPS_LISTEN)
+ panic("tcp_input: TCPS_LISTEN");
+
+ /*
+ * Segment received on connection.
+ * Reset idle time and keep-alive timer.
+ */
+ tp->t_rcvtime = ticks;
+ if (TCPS_HAVEESTABLISHED(tp->t_state))
+ callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+
+ /*
+ * Process options.
+ * XXX this is traditional behavior; may need to be cleaned up.
+ */
+ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+ if (thflags & TH_SYN) {
+ if (to.to_flags & TOF_SCALE) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->requested_s_scale = to.to_requested_s_scale;
+ }
+ if (to.to_flags & TOF_TS) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = ticks;
+ }
+ if (to.to_flags & (TOF_CC|TOF_CCNEW))
+ tp->t_flags |= TF_RCVD_CC;
+ if (to.to_flags & TOF_MSS)
+ tcp_mss(tp, to.to_mss);
+ }
+
+ /*
+ * Header prediction: check for the two common cases
+ * of a uni-directional data xfer. If the packet has
+ * no control flags, is in-sequence, the window didn't
+ * change and we're not retransmitting, it's a
+ * candidate. If the length is zero and the ack moved
+ * forward, we're the sender side of the xfer. Just
+ * free the data acked & wake any higher level process
+ * that was blocked waiting for space. If the length
+ * is non-zero and the ack didn't move, we're the
+ * receiver side. If we're getting packets in-order
+ * (the reassembly queue is empty), add the data to
+ * the socket buffer and note that we need a delayed ack.
+ * Make sure that the hidden state-flags are also off.
+ * Since we check for TCPS_ESTABLISHED above, it can only
+ * be TF_NEEDSYN.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
+ ((to.to_flags & TOF_TS) == 0 ||
+ TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
+ /*
+ * Using the CC option is compulsory once it has been started:
+ * the segment is OK if no T/TCP was negotiated or
+ * if the segment has a CC option equal to CCrecv
+ */
+ ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) ||
+ ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) &&
+ th->th_seq == tp->rcv_nxt &&
+ tiwin && tiwin == tp->snd_wnd &&
+ tp->snd_nxt == tp->snd_max) {
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record the timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ if (tlen == 0) {
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ SEQ_LEQ(th->th_ack, tp->snd_max) &&
+ tp->snd_cwnd >= tp->snd_wnd &&
+ tp->t_dupacks < tcprexmtthresh) {
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * this is a pure ack for outstanding data.
+ */
+ ++tcpstat.tcps_predack;
+ /*
+ * "bad retransmit" recovery
+ */
+ if (tp->t_rxtshift == 1 &&
+ ticks < tp->t_badrxtwin) {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh =
+ tp->snd_ssthresh_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0;
+ }
+ if ((to.to_flags & TOF_TS) != 0)
+ tcp_xmit_timer(tp,
+ ticks - to.to_tsecr + 1);
+ else if (tp->t_rtttime &&
+ SEQ_GT(th->th_ack, tp->t_rtseq))
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ acked = th->th_ack - tp->snd_una;
+ tcpstat.tcps_rcvackpack++;
+ tcpstat.tcps_rcvackbyte += acked;
+ sbdrop(&so->so_snd, acked);
+ tp->snd_una = th->th_ack;
+ m_freem(m);
+ ND6_HINT(tp); /* some progress has been made */
+
+ /*
+ * If all outstanding data are acked, stop
+ * retransmit timer, otherwise restart timer
+ * using current (possibly backed-off) value.
+ * If process is waiting for space,
+ * wakeup/selwakeup/signal. If data
+ * are ready to send, let tcp_output
+ * decide between more output or persist.
+ */
+ if (tp->snd_una == tp->snd_max)
+ callout_stop(tp->tt_rexmt);
+ else if (!callout_active(tp->tt_persist))
+ callout_reset(tp->tt_rexmt,
+ tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+
+ sowwakeup(so);
+ if (so->so_snd.sb_cc)
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+ }
+ } else if (th->th_ack == tp->snd_una &&
+ LIST_EMPTY(&tp->t_segq) &&
+ tlen <= sbspace(&so->so_rcv)) {
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * this is a pure, in-sequence data packet
+ * with nothing on the reassembly queue and
+ * we have enough buffer space to take it.
+ */
+ ++tcpstat.tcps_preddat;
+ tp->rcv_nxt += tlen;
+ tcpstat.tcps_rcvpack++;
+ tcpstat.tcps_rcvbyte += tlen;
+ ND6_HINT(tp); /* some progress has been done */
+ /*
+ * Add data to socket buffer.
+ */
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ sbappend(&so->so_rcv, m);
+ sorwakeup(so);
+ if (DELAY_ACK(tp)) {
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ } else {
+ tp->t_flags |= TF_ACKNOW;
+ tcp_output(tp);
+ }
+ INP_UNLOCK(inp);
+ return;
+ }
+ }
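
For illustration only, a minimal user-space sketch of the header-prediction test above, simplified to the in-sequence/ACK-only/window-unchanged conditions (the struct, field names and flag values below are hypothetical stand-ins, not taken from this file, and the timestamp/CC checks are omitted):

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical, pared-down stand-ins for the fields the fast path inspects. */
    struct mini_tcb {
    	uint32_t rcv_nxt;		/* next sequence number expected */
    	uint32_t snd_nxt, snd_max, snd_wnd;
    	int	 established;		/* state == ESTABLISHED */
    	int	 needsyn, needfin;	/* hidden T/TCP state flags */
    };

    #define MTH_ACK	0x10
    #define MTH_CTL	(0x01|0x02|0x04|0x20)	/* FIN|SYN|RST|URG */

    /*
     * Return nonzero when a segment qualifies for the fast path: established
     * connection, ACK is the only flag set, no hidden state flags, segment is
     * in sequence, window unchanged, and nothing currently being retransmitted.
     */
    static int
    fastpath_candidate(const struct mini_tcb *tp, uint32_t seq, uint32_t win,
        int flags)
    {
    	return (tp->established &&
    	    (flags & (MTH_ACK|MTH_CTL)) == MTH_ACK &&
    	    !tp->needsyn && !tp->needfin &&
    	    seq == tp->rcv_nxt &&
    	    win != 0 && win == tp->snd_wnd &&
    	    tp->snd_nxt == tp->snd_max);
    }

    int
    main(void)
    {
    	struct mini_tcb tp = { 1000, 500, 500, 8192, 1, 0, 0 };

    	/* In-sequence segment, ACK only, window unchanged: fast path. */
    	printf("fast path? %d\n", fastpath_candidate(&tp, 1000, 8192, MTH_ACK));
    	return (0);
    }
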
+
+ /*
+ * Calculate amount of space in receive window,
+ * and then do TCP input processing.
+ * Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ { int win;
+
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+ }
+
+ switch (tp->t_state) {
+
+ /*
+ * If the state is SYN_RECEIVED:
+ * if seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ case TCPS_SYN_RECEIVED:
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+ break;
+
+ /*
+ * If the state is SYN_SENT:
+ * if seg contains an ACK, but not for our SYN, drop the input.
+ * if seg contains a RST, then drop the connection.
+ * if seg does not contain SYN, then drop it.
+ * Otherwise this is an acceptable SYN segment
+ * initialize tp->rcv_nxt and tp->irs
+ * if seg contains ack then advance tp->snd_una
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+ * arrange for segment to be acked (eventually)
+ * continue processing rest of data/controls, beginning with URG
+ */
+ case TCPS_SYN_SENT:
+ if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
+ taop = &tao_noncached;
+ bzero(taop, sizeof(*taop));
+ }
+
+ if ((thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ /*
+ * If we have a cached CCsent for the remote host,
+ * hence we haven't just crashed and restarted,
+ * do not send a RST. This may be a retransmission
+ * from the other side after our earlier ACK was lost.
+ * Our new SYN, when it arrives, will serve as the
+ * needed ACK.
+ */
+ if (taop->tao_ccsent != 0)
+ goto drop;
+ else {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ }
+ if (thflags & TH_RST) {
+ if (thflags & TH_ACK)
+ tp = tcp_drop(tp, ECONNREFUSED);
+ goto drop;
+ }
+ if ((thflags & TH_SYN) == 0)
+ goto drop;
+ tp->snd_wnd = th->th_win; /* initial send window */
+ tp->cc_recv = to.to_cc; /* foreign CC */
+
+ tp->irs = th->th_seq;
+ tcp_rcvseqinit(tp);
+ if (thflags & TH_ACK) {
+ /*
+ * Our SYN was acked. If segment contains CC.ECHO
+ * option, check it to make sure this segment really
+ * matches our SYN. If not, just drop it as old
+ * duplicate, but send an RST if we're still playing
+ * by the old rules. If no CC.ECHO option, make sure
+ * we don't get fooled into using T/TCP.
+ */
+ if (to.to_flags & TOF_CCECHO) {
+ if (tp->cc_send != to.to_ccecho) {
+ if (taop->tao_ccsent != 0)
+ goto drop;
+ else {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ }
+ } else
+ tp->t_flags &= ~TF_RCVD_CC;
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ /* Do window scaling on this connection? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ /* Segment is acceptable, update cache if undefined. */
+ if (taop->tao_ccsent == 0)
+ taop->tao_ccsent = to.to_ccecho;
+
+ tp->rcv_adv += tp->rcv_wnd;
+ tp->snd_una++; /* SYN is acked */
+ /*
+ * If there's data, delay ACK; if there's also a FIN
+ * ACKNOW will be turned on later.
+ */
+ if (DELAY_ACK(tp) && tlen != 0)
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ /*
+ * Received <SYN,ACK> in SYN_SENT[*] state.
+ * Transitions:
+ * SYN_SENT --> ESTABLISHED
+ * SYN_SENT* --> FIN_WAIT_1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ thflags &= ~TH_SYN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep, tcp_keepidle,
+ tcp_timer_keep, tp);
+ }
+ } else {
+ /*
+ * Received initial SYN in SYN-SENT[*] state => simul-
+ * taneous open. If segment contains CC option and there is
+ * a cached CC, apply TAO test; if it succeeds, connection is
+ * half-synchronized. Otherwise, do 3-way handshake:
+ * SYN-SENT -> SYN-RECEIVED
+ * SYN-SENT* -> SYN-RECEIVED*
+ * If there was no CC option, clear cached CC value.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ callout_stop(tp->tt_rexmt);
+ if (to.to_flags & TOF_CC) {
+ if (taop->tao_cc != 0 &&
+ CC_GT(to.to_cc, taop->tao_cc)) {
+ /*
+ * update cache and make transition:
+ * SYN-SENT -> ESTABLISHED*
+ * SYN-SENT* -> FIN-WAIT-1*
+ */
+ taop->tao_cc = to.to_cc;
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep,
+ tcp_keepidle,
+ tcp_timer_keep,
+ tp);
+ }
+ tp->t_flags |= TF_NEEDSYN;
+ } else
+ tp->t_state = TCPS_SYN_RECEIVED;
+ } else {
+ /* CC.NEW or no option => invalidate cache */
+ taop->tao_cc = 0;
+ tp->t_state = TCPS_SYN_RECEIVED;
+ }
+ }
+
+trimthenstep6:
+ /*
+ * Advance th->th_seq to correspond to first data byte.
+ * If data, trim to stay within window,
+ * dropping FIN if necessary.
+ */
+ th->th_seq++;
+ if (tlen > tp->rcv_wnd) {
+ todrop = tlen - tp->rcv_wnd;
+ m_adj(m, -todrop);
+ tlen = tp->rcv_wnd;
+ thflags &= ~TH_FIN;
+ tcpstat.tcps_rcvpackafterwin++;
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ }
+ tp->snd_wl1 = th->th_seq - 1;
+ tp->rcv_up = th->th_seq;
+ /*
+ * Client side of transaction: already sent SYN and data.
+ * If the remote host used T/TCP to validate the SYN,
+ * our data will be ACK'd; if so, enter normal data segment
+ * processing in the middle of step 5, ack processing.
+ * Otherwise, goto step 6.
+ */
+ if (thflags & TH_ACK)
+ goto process_ACK;
+ goto step6;
+ /*
+ * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+ * if segment contains a SYN and CC [not CC.NEW] option:
+ * if state == TIME_WAIT and connection duration > MSL,
+ * drop packet and send RST;
+ *
+ * if SEG.CC > CCrecv then is new SYN, and can implicitly
+ * ack the FIN (and data) in retransmission queue.
+ * Complete close and delete TCPCB. Then reprocess
+ * segment, hoping to find new TCPCB in LISTEN state;
+ *
+ * else must be old SYN; drop it.
+ * else do normal processing.
+ */
+ case TCPS_LAST_ACK:
+ case TCPS_CLOSING:
+ case TCPS_TIME_WAIT:
+ if ((thflags & TH_SYN) &&
+ (to.to_flags & TOF_CC) && tp->cc_recv != 0) {
+ if (tp->t_state == TCPS_TIME_WAIT &&
+ (ticks - tp->t_starttime) > tcp_msl) {
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+ if (CC_GT(to.to_cc, tp->cc_recv)) {
+ tp = tcp_close(tp);
+ goto findpcb;
+ }
+ else
+ goto drop;
+ }
+ break; /* continue normal processing */
+ }
+
+ /*
+ * States other than LISTEN or SYN_SENT.
+ * First check the RST flag and sequence number since reset segments
+ * are exempt from the timestamp and connection count tests. This
+ * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
+ * below which allowed reset segments in half the sequence space
+ * to fall through and be processed (which gives forged reset
+ * segments with a random sequence number a 50 percent chance of
+ * killing a connection).
+ * Then check timestamp, if present.
+ * Then check the connection count, if present.
+ * Then check that at least some bytes of segment are within
+ * receive window. If segment begins before rcv_nxt,
+ * drop leading data (and SYN); if nothing left, just ack.
+ *
+ *
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ * Note: this does not take into account delayed ACKs, so
+ * we should test against last_ack_sent instead of rcv_nxt.
+ * The sequence number in the reset segment is normally an
+ * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+ * If we have multiple segments in flight, the initial reset
+ * segment sequence numbers will be to the left of last_ack_sent,
+ * but they will eventually catch up.
+ * In any case, it never made sense to trim reset segments to
+ * fit the receive window since RFC 1122 says:
+ * 4.2.2.12 RST Segment: RFC-793 Section 3.4
+ *
+ * A TCP SHOULD allow a received RST segment to include data.
+ *
+ * DISCUSSION
+ * It has been suggested that a RST segment could contain
+ * ASCII text that encoded and explained the cause of the
+ * RST. No standard has yet been established for such
+ * data.
+ *
+ * If the reset segment passes the sequence number test examine
+ * the state:
+ * SYN_RECEIVED STATE:
+ * If passive open, return to LISTEN state.
+ * If active open, inform user that connection was refused.
+ * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
+ * Inform user that connection was reset, and close tcb.
+ * CLOSING, LAST_ACK STATES:
+ * Close the tcb.
+ * TIME_WAIT STATE:
+ * Drop the segment - see Stevens, vol. 2, p. 964 and
+ * RFC 1337.
+ */
+ if (thflags & TH_RST) {
+ if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ switch (tp->t_state) {
+
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tp->t_state = TCPS_CLOSED;
+ tcpstat.tcps_drops++;
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_TIME_WAIT:
+ break;
+ }
+ }
+ goto drop;
+ }
+
+ /*
+ * RFC 1323 PAWS: If we have a timestamp reply on this segment
+ * and it's less than ts_recent, drop it.
+ */
+ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
+ TSTMP_LT(to.to_tsval, tp->ts_recent)) {
+
+ /* Check to see if ts_recent is over 24 days old. */
+ if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates
+ * ts_recent, the age will be reset later and ts_recent
+ * will get a valid value. If it does not, setting
+ * ts_recent to zero will at least satisfy the
+ * requirement that zero be placed in the timestamp
+ * echo reply when ts_recent isn't valid. The
+ * age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be
+ * dropped when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += tlen;
+ tcpstat.tcps_pawsdrop++;
+ goto dropafterack;
+ }
+ }
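
A small standalone sketch of the PAWS decision above, using the same modular timestamp comparison; the tick rate and the names below are assumptions of the example, not values taken from this file:

    #include <stdio.h>
    #include <stdint.h>

    #define MY_HZ		100				/* assumed ticks per second */
    #define MY_PAWS_IDLE	(24 * 24 * 60 * 60 * MY_HZ)	/* roughly 24 days of ticks */
    #define MY_TSTMP_LT(a, b)	((int32_t)((a) - (b)) < 0)

    /*
     * Return 1 if the segment should be dropped by PAWS: its timestamp is
     * older than ts_recent and ts_recent itself is not stale (> 24 days old).
     */
    static int
    paws_drop(uint32_t tsval, uint32_t ts_recent, uint32_t ts_recent_age,
        uint32_t now)
    {
    	if (ts_recent != 0 && MY_TSTMP_LT(tsval, ts_recent)) {
    		if ((int32_t)(now - ts_recent_age) > MY_PAWS_IDLE)
    			return (0);	/* ts_recent is stale; let it through */
    		return (1);		/* genuine old duplicate; drop */
    	}
    	return (0);
    }

    int
    main(void)
    {
    	/* Timestamp 5 behind ts_recent, ts_recent refreshed recently: drop. */
    	printf("drop=%d\n", paws_drop(995, 1000, 400, 500));
    	/* Same segment, but ts_recent not updated for > 24 days: accept. */
    	printf("drop=%d\n", paws_drop(995, 1000, 0, MY_PAWS_IDLE + 100u));
    	return (0);
    }
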
+
+ /*
+ * T/TCP mechanism
+ * If T/TCP was negotiated and the segment doesn't have CC,
+ * or if its CC is wrong then drop the segment.
+ * RST segments do not have to comply with this.
+ */
+ if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) &&
+ ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc))
+ goto dropafterack;
+
+ /*
+ * In the SYN-RECEIVED state, validate that the packet belongs to
+ * this connection before trimming the data to fit the receive
+ * window. Check the sequence number versus IRS since we know
+ * the sequence numbers haven't wrapped. This is a partial fix
+ * for the "LAND" DoS attack.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ tcpstat.tcps_rcvduppack++;
+ tcpstat.tcps_rcvdupbyte += todrop;
+ } else {
+ tcpstat.tcps_rcvpartduppack++;
+ tcpstat.tcps_rcvpartdupbyte += todrop;
+ }
+ drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
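
To make the leading-trim arithmetic above concrete, a tiny sketch reduced to sequence numbers and lengths only (the helper is hypothetical and leaves out the SYN/URG and duplicate-ACK handling):

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Trim data that precedes rcv_nxt from a segment described only by its
     * starting sequence number and length.  Return the number of leading
     * bytes dropped; advance *seq and shrink *len accordingly.
     */
    static uint32_t
    trim_leading(uint32_t rcv_nxt, uint32_t *seq, uint32_t *len)
    {
    	uint32_t todrop = rcv_nxt - *seq;

    	if ((int32_t)todrop <= 0)
    		return (0);		/* nothing overlaps old data */
    	if (todrop > *len)
    		todrop = *len;		/* complete duplicate */
    	*seq += todrop;
    	*len -= todrop;
    	return (todrop);
    }

    int
    main(void)
    {
    	uint32_t seq = 100, len = 50;

    	/* Receiver already has everything up to 120: 20 bytes are stale. */
    	printf("dropped %u, now seq=%u len=%u\n",
    	    trim_leading(120, &seq, &len), seq, len);	/* 20, 120, 30 */
    	return (0);
    }
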
+
+ /*
+ * If new data are received on a connection after the
+ * user processes are gone, then RST the other end.
+ */
+ if ((so->so_state & SS_NOFDREF) &&
+ tp->t_state > TCPS_CLOSE_WAIT && tlen) {
+ tp = tcp_close(tp);
+ tcpstat.tcps_rcvafterclose++;
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If segment ends after window, drop trailing data
+ * (and PUSH and FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd);
+ if (todrop > 0) {
+ tcpstat.tcps_rcvpackafterwin++;
+ if (todrop >= tlen) {
+ tcpstat.tcps_rcvbyteafterwin += tlen;
+ /*
+ * If a new connection request is received
+ * while in TIME_WAIT, drop the old connection
+ * and start over if the sequence numbers
+ * are above the previous ones.
+ */
+ if (thflags & TH_SYN &&
+ tp->t_state == TCPS_TIME_WAIT &&
+ SEQ_GT(th->th_seq, tp->rcv_nxt)) {
+ tp = tcp_close(tp);
+ goto findpcb;
+ }
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment
+ * and ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ tcpstat.tcps_rcvwinprobe++;
+ } else
+ goto dropafterack;
+ } else
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH|TH_FIN);
+ }
+
+ /*
+ * If last ACK falls within this segment's sequence numbers,
+ * record its timestamp.
+ * NOTE that the test is modified according to the latest
+ * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+ */
+ if ((to.to_flags & TOF_TS) != 0 &&
+ SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+ tp->ts_recent_age = ticks;
+ tp->ts_recent = to.to_tsval;
+ }
+
+ /*
+ * If a SYN is in the window, then this is an
+ * error and we send an RST and drop the connection.
+ */
+ if (thflags & TH_SYN) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ goto dropwithreset;
+ }
+
+ /*
+ * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
+ * flag is on (half-synchronized state), then queue data for
+ * later processing; else drop segment and return.
+ */
+ if ((thflags & TH_ACK) == 0) {
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ (tp->t_flags & TF_NEEDSYN))
+ goto step6;
+ else
+ goto drop;
+ }
+
+ /*
+ * Ack processing.
+ */
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
+ * ESTABLISHED state and continue processing.
+ * The ACK was checked above.
+ */
+ case TCPS_SYN_RECEIVED:
+
+ tcpstat.tcps_connects++;
+ soisconnected(so);
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ /*
+ * Upon successful completion of 3-way handshake,
+ * update cache.CC if it was undefined, pass any queued
+ * data to the user, and advance state appropriately.
+ */
+ if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
+ taop->tao_cc == 0)
+ taop->tao_cc = tp->cc_recv;
+
+ /*
+ * Make transitions:
+ * SYN-RECEIVED -> ESTABLISHED
+ * SYN-RECEIVED* -> FIN-WAIT-1
+ */
+ tp->t_starttime = ticks;
+ if (tp->t_flags & TF_NEEDFIN) {
+ tp->t_state = TCPS_FIN_WAIT_1;
+ tp->t_flags &= ~TF_NEEDFIN;
+ } else {
+ tp->t_state = TCPS_ESTABLISHED;
+ callout_reset(tp->tt_keep, tcp_keepidle,
+ tcp_timer_keep, tp);
+ }
+ /*
+ * If segment contains data or ACK, will call tcp_reass()
+ * later; if not, do so now to pass queued data to user.
+ */
+ if (tlen == 0 && (thflags & TH_FIN) == 0)
+ (void) tcp_reass(tp, (struct tcphdr *)0, 0,
+ (struct mbuf *)0);
+ tp->snd_wl1 = th->th_seq - 1;
+ /* fall into ... */
+
+ /*
+ * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
+ * ACKs. If the ack is in the range
+ * tp->snd_una < th->th_ack <= tp->snd_max
+ * then advance tp->snd_una to th->th_ack and drop
+ * data from the retransmission queue. If this ACK reflects
+ * more up to date window information we update our window information.
+ */
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ case TCPS_TIME_WAIT:
+
+ if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
+ if (tlen == 0 && tiwin == tp->snd_wnd) {
+ tcpstat.tcps_rcvdupack++;
+ /*
+ * If we have outstanding data (other than
+ * a window probe), this is a completely
+ * duplicate ack (ie, window info didn't
+ * change), the ack is the biggest we've
+ * seen and we've seen exactly our rexmt
+ * threshold of them, assume a packet
+ * has been dropped and retransmit it.
+ * Kludge snd_nxt & the congestion
+ * window so we send only this one
+ * packet.
+ *
+ * We know we're losing at the current
+ * window size so do congestion avoidance
+ * (set ssthresh to half the current window
+ * and pull our congestion window back to
+ * the new ssthresh).
+ *
+ * Dup acks mean that packets have left the
+ * network (they're now cached at the receiver)
+ * so bump cwnd by the amount in the receiver
+ * to keep a constant cwnd packets in the
+ * network.
+ */
+ if (!callout_active(tp->tt_rexmt) ||
+ th->th_ack != tp->snd_una)
+ tp->t_dupacks = 0;
+ else if (++tp->t_dupacks == tcprexmtthresh) {
+ tcp_seq onxt = tp->snd_nxt;
+ u_int win =
+ min(tp->snd_wnd, tp->snd_cwnd) / 2 /
+ tp->t_maxseg;
+ if (tcp_do_newreno && SEQ_LT(th->th_ack,
+ tp->snd_recover)) {
+ /* False retransmit, should not
+ * cut window
+ */
+ tp->snd_cwnd += tp->t_maxseg;
+ tp->t_dupacks = 0;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+ if (win < 2)
+ win = 2;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ tp->snd_recover = tp->snd_max;
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ tp->snd_cwnd = tp->t_maxseg;
+ (void) tcp_output(tp);
+ tp->snd_cwnd = tp->snd_ssthresh +
+ tp->t_maxseg * tp->t_dupacks;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ goto drop;
+ } else if (tp->t_dupacks > tcprexmtthresh) {
+ tp->snd_cwnd += tp->t_maxseg;
+ (void) tcp_output(tp);
+ goto drop;
+ }
+ } else
+ tp->t_dupacks = 0;
+ break;
+ }
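
The ssthresh and cwnd arithmetic used once the duplicate-ACK threshold is reached can be followed with plain numbers; the segment size and window values below are made-up example inputs:

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned int maxseg = 1460;
    	unsigned int snd_wnd = 65535, snd_cwnd = 32 * maxseg;
    	unsigned int dupacks = 3;		/* tcprexmtthresh reached */
    	unsigned int win, ssthresh, cwnd_for_rexmit, cwnd_after;

    	/* Halve the effective window, expressed in whole segments, min 2. */
    	win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / maxseg;
    	if (win < 2)
    		win = 2;
    	ssthresh = win * maxseg;

    	/* Retransmit one segment, then inflate cwnd by the dup ACKs seen. */
    	cwnd_for_rexmit = maxseg;
    	cwnd_after = ssthresh + maxseg * dupacks;

    	printf("ssthresh=%u rexmit cwnd=%u inflated cwnd=%u\n",
    	    ssthresh, cwnd_for_rexmit, cwnd_after);
    	return (0);
    }
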
+ /*
+ * If the congestion window was inflated to account
+ * for the other side's cached packets, retract it.
+ */
+ if (tcp_do_newreno == 0) {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = 0;
+ } else if (tp->t_dupacks >= tcprexmtthresh &&
+ !tcp_newreno(tp, th)) {
+ /*
+ * Window inflation should have left us with approx.
+ * snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do
+ * it via the slow start mechanism.
+ */
+ if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max))
+ tp->snd_cwnd =
+ tp->snd_max - th->th_ack + tp->t_maxseg;
+ else
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->t_dupacks = 0;
+ }
+ if (tp->t_dupacks < tcprexmtthresh)
+ tp->t_dupacks = 0;
+ if (SEQ_GT(th->th_ack, tp->snd_max)) {
+ tcpstat.tcps_rcvacktoomuch++;
+ goto dropafterack;
+ }
+ /*
+ * If we reach this point, ACK is not a duplicate,
+ * i.e., it ACKs something we sent.
+ */
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our
+ * SYN has been ACK'd (so connection is now fully
+ * synchronized). Go to non-starred state,
+ * increment snd_una for ACK of SYN, and check if
+ * we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ /* Do window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+ tp->snd_scale = tp->requested_s_scale;
+ tp->rcv_scale = tp->request_r_scale;
+ }
+ }
+
+process_ACK:
+ acked = th->th_ack - tp->snd_una;
+ tcpstat.tcps_rcvackpack++;
+ tcpstat.tcps_rcvackbyte += acked;
+
+ /*
+ * If we just performed our first retransmit, and the ACK
+ * arrives within our recovery window, then it was a mistake
+ * to do the retransmit in the first place. Recover our
+ * original cwnd and ssthresh, and proceed to transmit where
+ * we left off.
+ */
+ if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0; /* XXX probably not required */
+ }
+
+ /*
+ * If we have a timestamp reply, update smoothed
+ * round trip time. If no timestamp is present but
+ * transmit timer is running and timed sequence
+ * number was acked, update smoothed round trip time.
+ * Since we now have an rtt measurement, cancel the
+ * timer backoff (cf., Phil Karn's retransmit alg.).
+ * Recompute the initial retransmit timer.
+ */
+ if (to.to_flags & TOF_TS)
+ tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
+ else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
+ tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+
+ /*
+ * If all outstanding data is acked, stop retransmit
+ * timer and remember to restart (more output or persist).
+ * If there is more data to be acked, restart retransmit
+ * timer, using current (possibly backed-off) value.
+ */
+ if (th->th_ack == tp->snd_max) {
+ callout_stop(tp->tt_rexmt);
+ needoutput = 1;
+ } else if (!callout_active(tp->tt_persist))
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+
+ /*
+ * If no data (only SYN) was ACK'd,
+ * skip rest of ACK processing.
+ */
+ if (acked == 0)
+ goto step6;
+
+ /*
+ * When new data is acked, open the congestion window.
+ * If the window gives us less than ssthresh packets
+ * in flight, open exponentially (maxseg per packet).
+ * Otherwise open linearly: maxseg per window
+ * (maxseg^2 / cwnd per packet).
+ */
+ {
+ register u_int cw = tp->snd_cwnd;
+ register u_int incr = tp->t_maxseg;
+
+ if (cw > tp->snd_ssthresh)
+ incr = incr * incr / cw;
+ /*
+ * If t_dupacks != 0 here, it indicates that we are still
+ * in NewReno fast recovery mode, so we leave the congestion
+ * window alone.
+ */
+ if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
+ tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+ }
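
A worked sketch of the congestion-window increase above: below ssthresh the window grows by a full segment per ACK (slow start), above it by roughly maxseg^2/cwnd per ACK (about one segment per window). The numbers are arbitrary:

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned int maxseg = 1460, ssthresh = 8 * 1460;
    	unsigned int cwnd = 4 * 1460;		/* below ssthresh: slow start */
    	unsigned int incr;

    	incr = maxseg;
    	if (cwnd > ssthresh)
    		incr = incr * incr / cwnd;
    	printf("slow start: cwnd %u -> %u\n", cwnd, cwnd + incr);

    	cwnd = 16 * 1460;			/* above ssthresh */
    	incr = maxseg;
    	if (cwnd > ssthresh)
    		incr = incr * incr / cwnd;	/* ~maxseg/16 per ACK here */
    	printf("cong. avoidance: cwnd %u -> %u\n", cwnd, cwnd + incr);
    	return (0);
    }
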
+ if (acked > so->so_snd.sb_cc) {
+ tp->snd_wnd -= so->so_snd.sb_cc;
+ sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
+ ourfinisacked = 1;
+ } else {
+ sbdrop(&so->so_snd, acked);
+ tp->snd_wnd -= acked;
+ ourfinisacked = 0;
+ }
+ sowwakeup(so);
+ tp->snd_una = th->th_ack;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+
+ switch (tp->t_state) {
+
+ /*
+ * In FIN_WAIT_1 STATE in addition to the processing
+ * for the ESTABLISHED state if our FIN is now acknowledged
+ * then enter FIN_WAIT_2.
+ */
+ case TCPS_FIN_WAIT_1:
+ if (ourfinisacked) {
+ /*
+ * If we can't receive any more
+ * data, then closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ */
+ if (so->so_state & SS_CANTRCVMORE) {
+ soisdisconnected(so);
+ callout_reset(tp->tt_2msl, tcp_maxidle,
+ tcp_timer_2msl, tp);
+ }
+ tp->t_state = TCPS_FIN_WAIT_2;
+ }
+ break;
+
+ /*
+ * In CLOSING STATE in addition to the processing for
+ * the ESTABLISHED state if the ACK acknowledges our FIN
+ * then enter the TIME-WAIT state, otherwise ignore
+ * the segment.
+ */
+ case TCPS_CLOSING:
+ if (ourfinisacked) {
+ tp->t_state = TCPS_TIME_WAIT;
+ tcp_canceltimers(tp);
+ /* Shorten TIME_WAIT [RFC-1644, p.28] */
+ if (tp->cc_recv != 0 &&
+ (ticks - tp->t_starttime) < tcp_msl)
+ callout_reset(tp->tt_2msl,
+ tp->t_rxtcur *
+ TCPTV_TWTRUNC,
+ tcp_timer_2msl, tp);
+ else
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ soisdisconnected(so);
+ }
+ break;
+
+ /*
+ * In LAST_ACK, we may still be waiting for data to drain
+ * and/or to be acked, as well as for the ack of our FIN.
+ * If our FIN is now acknowledged, delete the TCB,
+ * enter the closed state and return.
+ */
+ case TCPS_LAST_ACK:
+ if (ourfinisacked) {
+ tp = tcp_close(tp);
+ goto drop;
+ }
+ break;
+
+ /*
+ * In TIME_WAIT state the only thing that should arrive
+ * is a retransmission of the remote FIN. Acknowledge
+ * it and restart the finack timer.
+ */
+ case TCPS_TIME_WAIT:
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ goto dropafterack;
+ }
+ }
+
+step6:
+ /*
+ * Update window information.
+ * Don't look at window if no ACK: TAC's send garbage on first SYN.
+ */
+ if ((thflags & TH_ACK) &&
+ (SEQ_LT(tp->snd_wl1, th->th_seq) ||
+ (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+ (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+ /* keep track of pure window updates */
+ if (tlen == 0 &&
+ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+ tcpstat.tcps_rcvwinupd++;
+ tp->snd_wnd = tiwin;
+ tp->snd_wl1 = th->th_seq;
+ tp->snd_wl2 = th->th_ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ needoutput = 1;
+ }
+
+ /*
+ * Process segments with URG.
+ */
+ if ((thflags & TH_URG) && th->th_urp &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ /*
+ * This is a kludge, but if we receive and accept
+ * random urgent pointers, we'll crash in
+ * soreceive. It's hard to imagine someone
+ * actually wanting to send this much urgent data.
+ */
+ if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ th->th_urp = 0; /* XXX */
+ thflags &= ~TH_URG; /* XXX */
+ goto dodata; /* XXX */
+ }
+ /*
+ * If this segment advances the known urgent pointer,
+ * then mark the data stream. This should not happen
+ * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
+ * a FIN has been received from the remote side.
+ * In these states we ignore the URG.
+ *
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section as the original
+ * spec states (in one of two places).
+ */
+ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
+ tp->rcv_up = th->th_seq + th->th_urp;
+ so->so_oobmark = so->so_rcv.sb_cc +
+ (tp->rcv_up - tp->rcv_nxt) - 1;
+ if (so->so_oobmark == 0)
+ so->so_state |= SS_RCVATMARK;
+ sohasoutofband(so);
+ tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ }
+ /*
+ * Remove out of band data so it doesn't get presented to user.
+ * This can happen independent of advancing the URG pointer,
+ * but if two URG's are pending at once, some out-of-band
+ * data may creep in... ick.
+ */
+ if (th->th_urp <= (u_long)tlen
+#ifdef SO_OOBINLINE
+ && (so->so_options & SO_OOBINLINE) == 0
+#endif
+ )
+ tcp_pulloutofband(so, th, m,
+ drop_hdrlen); /* hdr drop is delayed */
+ } else
+ /*
+ * If no out of band data is expected,
+ * pull receive urgent pointer along
+ * with the receive window.
+ */
+ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
+ tp->rcv_up = tp->rcv_nxt;
+dodata: /* XXX */
+ KASSERT(headlocked, ("headlocked"));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ /*
+ * Process the segment text, merging it into the TCP sequencing queue,
+ * and arranging for acknowledgment of receipt if necessary.
+ * This process logically involves adjusting tp->rcv_wnd as data
+ * is presented to the user (this happens in tcp_usrreq.c,
+ * case PRU_RCVD). If a FIN has already been received on this
+ * connection then we just ignore the text.
+ */
+ if ((tlen || (thflags&TH_FIN)) &&
+ TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ m_adj(m, drop_hdrlen); /* delayed header drop */
+ /*
+ * Insert segment which includes th into reassembly queue of tcp with
+ * control block tp. Return TH_FIN if reassembly now includes
+ * a segment with FIN. This handles the common case inline (segment
+ * is the next to be received on an established connection, and the
+ * queue is empty), avoiding linkage into and removal from the queue
+ * and repetition of various conversions.
+ * Set DELACK for segments received in order, but ack immediately
+ * when segments are out of order (so fast retransmit can work).
+ */
+ if (th->th_seq == tp->rcv_nxt &&
+ LIST_EMPTY(&tp->t_segq) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (DELAY_ACK(tp))
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt += tlen;
+ thflags = th->th_flags & TH_FIN;
+ tcpstat.tcps_rcvpack++;
+ tcpstat.tcps_rcvbyte += tlen;
+ ND6_HINT(tp);
+ sbappend(&so->so_rcv, m);
+ sorwakeup(so);
+ } else {
+ thflags = tcp_reass(tp, th, &tlen, m);
+ tp->t_flags |= TF_ACKNOW;
+ }
+
+ /*
+ * Note the amount of data that peer has sent into
+ * our window, in order to estimate the sender's
+ * buffer size.
+ */
+ len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
+ } else {
+ m_freem(m);
+ thflags &= ~TH_FIN;
+ }
+
+ /*
+ * If FIN is received ACK the FIN and let the user know
+ * that the connection is closing.
+ */
+ if (thflags & TH_FIN) {
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
+ socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN))
+ callout_reset(tp->tt_delack, tcp_delacktime,
+ tcp_timer_delack, tp);
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+ switch (tp->t_state) {
+
+ /*
+ * In SYN_RECEIVED and ESTABLISHED STATES
+ * enter the CLOSE_WAIT state.
+ */
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /*FALLTHROUGH*/
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+
+ /*
+ * If still in FIN_WAIT_1 STATE FIN has not been acked so
+ * enter the CLOSING state.
+ */
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+
+ /*
+ * In FIN_WAIT_2 state enter the TIME_WAIT state,
+ * starting the time-wait timer, turning off the other
+ * standard timers.
+ */
+ case TCPS_FIN_WAIT_2:
+ tp->t_state = TCPS_TIME_WAIT;
+ tcp_canceltimers(tp);
+ /* Shorten TIME_WAIT [RFC-1644, p.28] */
+ if (tp->cc_recv != 0 &&
+ (ticks - tp->t_starttime) < tcp_msl) {
+ callout_reset(tp->tt_2msl,
+ tp->t_rxtcur * TCPTV_TWTRUNC,
+ tcp_timer_2msl, tp);
+ /* For transaction client, force ACK now. */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ else
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ soisdisconnected(so);
+ break;
+
+ /*
+ * In TIME_WAIT state restart the 2 MSL time_wait timer.
+ */
+ case TCPS_TIME_WAIT:
+ callout_reset(tp->tt_2msl, 2 * tcp_msl,
+ tcp_timer_2msl, tp);
+ break;
+ }
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+
+ /*
+ * Return any desired output.
+ */
+ if (needoutput || (tp->t_flags & TF_ACKNOW))
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+
+dropafterack:
+ /*
+ * Generate an ACK dropping incoming segment if it occupies
+ * sequence space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all
+ * paths to this code happen after packets containing
+ * RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the
+ * segment we received passes the SYN-RECEIVED ACK test.
+ * If it fails send a RST. This breaks the loop in the
+ * "LAND" DoS attack, and also prevents an ACK storm
+ * between two listening ports that have been sent forged
+ * SYN segments, each with the source address of the other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max)) ) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ }
+#ifdef TCPDEBUG
+ if (so->so_options & SO_DEBUG)
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ m_freem(m);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ return;
+
+dropwithreset:
+ /*
+ * Generate a RST, dropping incoming segment.
+ * Make ACK acceptable to originator of segment.
+ * Don't bother to respond if destination was broadcast/multicast.
+ */
+ if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
+ goto drop;
+#ifdef INET6
+ if (isipv6) {
+ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
+ IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
+ goto drop;
+ } else
+#endif /* INET6 */
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
+ ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
+ goto drop;
+ /* IPv6 anycast check is done at tcp6_input() */
+
+ /*
+ * Perform bandwidth limiting.
+ */
+ if (badport_bandlim(rstreason) < 0)
+ goto drop;
+
+#ifdef TCPDEBUG
+ if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+
+ if (tp)
+ INP_UNLOCK(inp);
+
+ if (thflags & TH_ACK)
+ /* mtod() below is safe as long as hdr dropping is delayed */
+ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
+ TH_RST);
+ else {
+ if (thflags & TH_SYN)
+ tlen++;
+ /* mtod() below is safe as long as hdr dropping is delayed */
+ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
+ (tcp_seq)0, TH_RST|TH_ACK);
+ }
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+
+drop:
+ /*
+ * Drop space held by incoming segment and return.
+ */
+#ifdef TCPDEBUG
+ if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
+ &tcp_savetcp, 0);
+#endif
+ if (tp)
+ INP_UNLOCK(inp);
+ m_freem(m);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return;
+}
+
+/*
+ * Parse TCP options and place in tcpopt.
+ */
+static void
+tcp_dooptions(to, cp, cnt, is_syn)
+ struct tcpopt *to;
+ u_char *cp;
+ int cnt;
+ int is_syn;
+{
+ int opt, optlen;
+
+ to->to_flags = 0;
+ for (; cnt > 0; cnt -= optlen, cp += optlen) {
+ opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < 2)
+ break;
+ optlen = cp[1];
+ if (optlen < 2 || optlen > cnt)
+ break;
+ }
+ switch (opt) {
+ case TCPOPT_MAXSEG:
+ if (optlen != TCPOLEN_MAXSEG)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_MSS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_mss, sizeof(to->to_mss));
+ to->to_mss = ntohs(to->to_mss);
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen != TCPOLEN_WINDOW)
+ continue;
+ if (! is_syn)
+ continue;
+ to->to_flags |= TOF_SCALE;
+ to->to_requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen != TCPOLEN_TIMESTAMP)
+ continue;
+ to->to_flags |= TOF_TS;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_tsval, sizeof(to->to_tsval));
+ to->to_tsval = ntohl(to->to_tsval);
+ bcopy((char *)cp + 6,
+ (char *)&to->to_tsecr, sizeof(to->to_tsecr));
+ to->to_tsecr = ntohl(to->to_tsecr);
+ break;
+ case TCPOPT_CC:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ to->to_flags |= TOF_CC;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_cc, sizeof(to->to_cc));
+ to->to_cc = ntohl(to->to_cc);
+ break;
+ case TCPOPT_CCNEW:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_CCNEW;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_cc, sizeof(to->to_cc));
+ to->to_cc = ntohl(to->to_cc);
+ break;
+ case TCPOPT_CCECHO:
+ if (optlen != TCPOLEN_CC)
+ continue;
+ if (!is_syn)
+ continue;
+ to->to_flags |= TOF_CCECHO;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_ccecho, sizeof(to->to_ccecho));
+ to->to_ccecho = ntohl(to->to_ccecho);
+ break;
+ default:
+ continue;
+ }
+ }
+}
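
A standalone sketch of the same kind-length option walk, fed a hand-built buffer containing MSS, NOP and window-scale options; the option kinds and lengths are written out as literals rather than taken from the headers:

    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>		/* ntohs() */

    int
    main(void)
    {
    	/* MSS=1460 (kind 2, len 4), NOP (kind 1), wscale=5 (kind 3, len 3) */
    	unsigned char opts[] = { 2, 4, 0x05, 0xb4, 1, 3, 3, 5 };
    	int cnt = sizeof(opts);
    	unsigned char *cp = opts;
    	int opt, optlen;
    	unsigned short mss = 0;
    	unsigned char wscale = 0;

    	for (; cnt > 0; cnt -= optlen, cp += optlen) {
    		opt = cp[0];
    		if (opt == 0)			/* EOL */
    			break;
    		if (opt == 1)			/* NOP */
    			optlen = 1;
    		else {
    			if (cnt < 2)
    				break;
    			optlen = cp[1];
    			if (optlen < 2 || optlen > cnt)
    				break;
    		}
    		if (opt == 2 && optlen == 4) {
    			memcpy(&mss, cp + 2, sizeof(mss));
    			mss = ntohs(mss);
    		} else if (opt == 3 && optlen == 3)
    			wscale = cp[2];
    	}
    	printf("mss=%u wscale=%u\n", (unsigned)mss, (unsigned)wscale);
    	return (0);
    }
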
+
+/*
+ * Pull out of band byte out of a segment so
+ * it doesn't appear in the user's data queue.
+ * It is still reflected in the segment length for
+ * sequencing purposes.
+ */
+static void
+tcp_pulloutofband(so, th, m, off)
+ struct socket *so;
+ struct tcphdr *th;
+ register struct mbuf *m;
+ int off; /* delayed to be dropped hdrlen */
+{
+ int cnt = off + th->th_urp - 1;
+
+ while (cnt >= 0) {
+ if (m->m_len > cnt) {
+ char *cp = mtod(m, caddr_t) + cnt;
+ struct tcpcb *tp = sototcpcb(so);
+
+ tp->t_iobc = *cp;
+ tp->t_oobflags |= TCPOOB_HAVEDATA;
+ bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
+ m->m_len--;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len--;
+ return;
+ }
+ cnt -= m->m_len;
+ m = m->m_next;
+ if (m == 0)
+ break;
+ }
+ panic("tcp_pulloutofband");
+}
+
+/*
+ * Collect new round-trip time estimate
+ * and update averages and current timeout.
+ */
+static void
+tcp_xmit_timer(tp, rtt)
+ register struct tcpcb *tp;
+ int rtt;
+{
+ register int delta;
+
+ tcpstat.tcps_rttupdated++;
+ tp->t_rttupdated++;
+ if (tp->t_srtt != 0) {
+ /*
+ * srtt is stored as fixed point with 5 bits after the
+ * binary point (i.e., scaled by 32). The following magic
+ * is equivalent to the smoothing algorithm in rfc793 with
+ * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
+ * point). Adjust rtt to origin 0.
+ */
+ delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+
+ if ((tp->t_srtt += delta) <= 0)
+ tp->t_srtt = 1;
+
+ /*
+ * We accumulate a smoothed rtt variance (actually, a
+ * smoothed mean difference), then set the retransmit
+ * timer to smoothed rtt + 4 times the smoothed variance.
+ * rttvar is stored as fixed point with 4 bits after the
+ * binary point (scaled by 16). The following is
+ * equivalent to rfc793 smoothing with an alpha of .75
+ * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
+ * rfc793's wired-in beta.
+ */
+ if (delta < 0)
+ delta = -delta;
+ delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
+ if ((tp->t_rttvar += delta) <= 0)
+ tp->t_rttvar = 1;
+ } else {
+ /*
+ * No rtt measurement yet - use the unsmoothed rtt.
+ * Set the variance to half the rtt (so our first
+ * retransmit happens at 3*rtt).
+ */
+ tp->t_srtt = rtt << TCP_RTT_SHIFT;
+ tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ }
+ tp->t_rtttime = 0;
+ tp->t_rxtshift = 0;
+
+ /*
+ * the retransmit should happen at rtt + 4 * rttvar.
+ * Because of the way we do the smoothing, srtt and rttvar
+ * will each average +1/2 tick of bias. When we compute
+ * the retransmit timer, we want 1/2 tick of rounding and
+ * 1 extra tick because of +-1/2 tick uncertainty in the
+ * firing of the timer. The bias will give us exactly the
+ * 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below
+ * the minimum feasible timer (which is 2 ticks).
+ */
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
+ max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+
+ /*
+ * We received an ack for a packet that wasn't retransmitted;
+ * it is probably safe to discard any error indications we've
+ * received recently. This isn't quite right, but close enough
+ * for now (a route might have failed after we sent a segment,
+ * and the return path might not be symmetrical).
+ */
+ tp->t_softerror = 0;
+}
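
The fixed-point smoothing above can be followed with plain integers. The shift constants below mirror the scheme the comments describe (srtt keeps 5 fraction bits, rttvar keeps 4), but the tick values fed in are invented:

    #include <stdio.h>

    #define RTT_SHIFT	5	/* srtt keeps 5 fraction bits (scaled by 32) */
    #define RTTVAR_SHIFT	4	/* rttvar keeps 4 fraction bits (scaled by 16) */
    #define DELTA_SHIFT	2

    int
    main(void)
    {
    	int samples[] = { 10, 12, 9, 30 };	/* measured RTTs in ticks */
    	int srtt = 0, rttvar = 0, i;

    	for (i = 0; i < 4; i++) {
    		int rtt = samples[i], delta, rexmt;

    		if (srtt != 0) {
    			delta = ((rtt - 1) << DELTA_SHIFT)
    			    - (srtt >> (RTT_SHIFT - DELTA_SHIFT));
    			if ((srtt += delta) <= 0)
    				srtt = 1;
    			if (delta < 0)
    				delta = -delta;
    			delta -= rttvar >> (RTTVAR_SHIFT - DELTA_SHIFT);
    			if ((rttvar += delta) <= 0)
    				rttvar = 1;
    		} else {
    			/* First sample: srtt = rtt, rttvar = rtt/2. */
    			srtt = rtt << RTT_SHIFT;
    			rttvar = rtt << (RTTVAR_SHIFT - 1);
    		}
    		/* Retransmit target: smoothed rtt + 4 * rttvar, in ticks. */
    		rexmt = (srtt >> RTT_SHIFT) + (rttvar >> (RTTVAR_SHIFT - 2));
    		printf("rtt=%2d srtt=%4d rttvar=%3d rexmt=%3d ticks\n",
    		    rtt, srtt, rttvar, rexmt);
    	}
    	return (0);
    }
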
+
+/*
+ * Determine a reasonable value for maxseg size.
+ * If the route is known, check route for mtu.
+ * If none, use an mss that can be handled on the outgoing
+ * interface without forcing IP to fragment; if bigger than
+ * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
+ * to utilize large mbufs. If no route is found, route has no mtu,
+ * or the destination isn't local, use a default, hopefully conservative
+ * size (usually 512 or the default IP max size, but no more than the mtu
+ * of the interface), as we can't discover anything about intervening
+ * gateways or networks. We also initialize the congestion/slow start
+ * window to be a single segment if the destination isn't local.
+ * While looking at the routing entry, we also initialize other path-dependent
+ * parameters from pre-set or cached values in the routing entry.
+ *
+ * Also take into account the space needed for options that we
+ * send regularly. Make maxseg shorter by that amount to assure
+ * that we can send maxseg amount of data even when the options
+ * are present. Store the upper limit of the length of options plus
+ * data in maxopd.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment, for outgoing segments only tcp_mssopt is called.
+ *
+ * In case of T/TCP, we call this routine during implicit connection
+ * setup as well (offer = -1), to initialize maxseg from the cached
+ * MSS of our peer.
+ */
+void
+tcp_mss(tp, offer)
+ struct tcpcb *tp;
+ int offer;
+{
+ register struct rtentry *rt;
+ struct ifnet *ifp;
+ register int rtt, mss;
+ u_long bufsize;
+ struct inpcb *inp;
+ struct socket *so;
+ struct rmxp_tao *taop;
+ int origoffer = offer;
+#ifdef INET6
+ int isipv6;
+ int min_protoh;
+#endif
+
+ inp = tp->t_inpcb;
+#ifdef INET6
+ isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
+ : sizeof (struct tcpiphdr);
+#else
+#define min_protoh (sizeof (struct tcpiphdr))
+#endif
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&inp->inp_inc);
+ else
+#endif
+ rt = tcp_rtlookup(&inp->inp_inc);
+ if (rt == NULL) {
+ tp->t_maxopd = tp->t_maxseg =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ return;
+ }
+ ifp = rt->rt_ifp;
+ so = inp->inp_socket;
+
+ taop = rmx_taop(rt->rt_rmx);
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (offer == -1)
+ offer = taop->tao_mssopt;
+ /*
+ * Offer == 0 means that there was no MSS on the SYN segment,
+ * in this case we use tcp_mssdflt.
+ */
+ if (offer == 0)
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ else
+ /*
+ * Sanity check: make sure that maxopd will be large
+ * enough to allow some data on segments even if all
+ * the option space is used (40 bytes). Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ taop->tao_mssopt = offer;
+
+ /*
+ * While we're here, check if there's an initial rtt
+ * or rttvar. Convert from the route-table units
+ * to scaled multiples of the slow timeout timer.
+ */
+ if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
+ /*
+ * XXX the lock bit for RTT indicates that the value
+ * is also a minimum value; this is subject to time.
+ */
+ if (rt->rt_rmx.rmx_locks & RTV_RTT)
+ tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
+ tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+ tcpstat.tcps_usedrtt++;
+ if (rt->rt_rmx.rmx_rttvar) {
+ tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
+ (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ /*
+ * if there's an mtu associated with the route, use it
+ * else, use the link mtu.
+ */
+ if (rt->rt_rmx.rmx_mtu)
+ mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ else
+ {
+ mss =
+#ifdef INET6
+ (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu :
+#endif
+ ifp->if_mtu
+#ifdef INET6
+ )
+#endif
+ - min_protoh;
+#ifdef INET6
+ if (isipv6) {
+ if (!in6_localaddr(&inp->in6p_faddr))
+ mss = min(mss, tcp_v6mssdflt);
+ } else
+#endif
+ if (!in_localaddr(inp->inp_faddr))
+ mss = min(mss, tcp_mssdflt);
+ }
+ mss = min(mss, offer);
+ /*
+ * maxopd stores the maximum length of data AND options
+ * in a segment; maxseg is the amount of data in a normal
+ * segment. We need to store this value (maxopd) apart
+ * from maxseg, because now every segment carries options
+ * and thus we normally have somewhat less data in segments.
+ */
+ tp->t_maxopd = mss;
+
+ /*
+ * In case of T/TCP, origoffer==-1 indicates that no segments
+ * were received yet. In this case we just guess, otherwise
+ * we do the same as before T/TCP.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (origoffer == -1 ||
+ (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
+ mss -= TCPOLEN_TSTAMP_APPA;
+ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
+ (origoffer == -1 ||
+ (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
+ mss -= TCPOLEN_CC_APPA;
+
+#if (MCLBYTES & (MCLBYTES - 1)) == 0
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
+#else
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
+#endif
+ /*
+ * If there's a pipesize, change the socket buffer
+ * to that size. Make the socket buffers an integral
+ * number of mss units; if the mss is larger than
+ * the socket buffer, decrease the mss.
+ */
+#ifdef RTV_SPIPE
+ if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
+#endif
+ bufsize = so->so_snd.sb_hiwat;
+ if (bufsize < mss)
+ mss = bufsize;
+ else {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ (void)sbreserve(&so->so_snd, bufsize, so, NULL);
+ }
+ tp->t_maxseg = mss;
+
+#ifdef RTV_RPIPE
+ if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
+#endif
+ bufsize = so->so_rcv.sb_hiwat;
+ if (bufsize > mss) {
+ bufsize = roundup(bufsize, mss);
+ if (bufsize > sb_max)
+ bufsize = sb_max;
+ (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
+ }
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ */
+ if (
+#ifdef INET6
+ (isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 &&
+#endif
+ in_localaddr(inp->inp_faddr)
+#ifdef INET6
+ )
+#endif
+ )
+ tp->snd_cwnd = mss * ss_fltsz_local;
+ else
+ tp->snd_cwnd = mss * ss_fltsz;
+
+ if (rt->rt_rmx.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+}
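
A compact numeric sketch of the maxseg derivation above for a plain IPv4/Ethernet path; the header sizes, cluster size and option overhead are literals assumed for the example:

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned int mtu = 1500;	/* route or interface MTU */
    	unsigned int min_protoh = 40;	/* IPv4 + TCP headers, no options */
    	unsigned int mclbytes = 2048;	/* assumed mbuf cluster size */
    	unsigned int offer = 1460;	/* MSS option from the peer's SYN */
    	unsigned int tstamp_appa = 12;	/* timestamp option plus padding */
    	unsigned int mss, maxopd;

    	mss = mtu - min_protoh;		/* 1460 */
    	if (mss > offer)
    		mss = offer;
    	maxopd = mss;			/* data + options per segment */
    	mss -= tstamp_appa;		/* 1448 once timestamps are in use */
    	if (mss > mclbytes)
    		mss &= ~(mclbytes - 1);	/* round down to a cluster multiple */
    	printf("maxopd=%u maxseg=%u\n", maxopd, mss);
    	return (0);
    }
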
+
+/*
+ * Determine the MSS option to send on an outgoing SYN.
+ */
+int
+tcp_mssopt(tp)
+ struct tcpcb *tp;
+{
+ struct rtentry *rt;
+#ifdef INET6
+ int isipv6;
+ int min_protoh;
+#endif
+
+#ifdef INET6
+ isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr)
+ : sizeof (struct tcpiphdr);
+#else
+#define min_protoh (sizeof (struct tcpiphdr))
+#endif
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
+ if (rt == NULL)
+ return
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+
+ return rt->rt_ifp->if_mtu - min_protoh;
+}
+
+
+/*
+ * Checks for partial ack. If partial ack arrives, force the retransmission
+ * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
+ * 1. By setting snd_nxt to th_ack, this forces the retransmission timer to
+ * be started again. If the ack advances at least to tp->snd_recover, return 0.
+ */
+static int
+tcp_newreno(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ tcp_seq onxt = tp->snd_nxt;
+ u_long ocwnd = tp->snd_cwnd;
+
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+ tp->snd_nxt = th->th_ack;
+ /*
+ * Set snd_cwnd to one segment beyond acknowledged offset
+ * (tp->snd_una has not yet been updated when this function
+ * is called)
+ */
+ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+ (void) tcp_output(tp);
+ tp->snd_cwnd = ocwnd;
+ if (SEQ_GT(onxt, tp->snd_nxt))
+ tp->snd_nxt = onxt;
+ /*
+ * Partial window deflation. Relies on fact that tp->snd_una
+ * not updated yet.
+ */
+ tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
+ return (1);
+ }
+ return (0);
+}
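
To see what the partial-ACK handling above does to the congestion window, here is a small arithmetic sketch with invented numbers (it mimics only the cwnd bookkeeping, not the rest of the function):

    #include <stdio.h>

    int
    main(void)
    {
    	unsigned int maxseg = 1000;
    	unsigned int snd_una = 5000, snd_recover = 20000;
    	unsigned int ocwnd = 8000;	/* cwnd when recovery started */
    	unsigned int th_ack = 9000;	/* partial ACK: below snd_recover */

    	if (th_ack < snd_recover) {
    		/* Retransmit exactly one segment past the new ACK point... */
    		unsigned int cwnd_for_rexmit = maxseg + (th_ack - snd_una);
    		/* ...then deflate the window by the amount newly acked. */
    		unsigned int cwnd_after = ocwnd - (th_ack - snd_una - maxseg);

    		printf("rexmit cwnd=%u, cwnd after deflation=%u\n",
    		    cwnd_for_rexmit, cwnd_after);
    	}
    	return (0);
    }
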
diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h
new file mode 100644
index 0000000..5850ccc
--- /dev/null
+++ b/sys/netinet/tcp_seq.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 1982, 1986, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_SEQ_H_
+#define _NETINET_TCP_SEQ_H_
+/*
+ * TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers.
+ */
+#define SEQ_LT(a,b) ((int)((a)-(b)) < 0)
+#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
+#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+/* for modulo comparisons of timestamps */
+#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
+#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+/*
+ * TCP connection counts are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers.
+ */
+#define CC_LT(a,b) ((int)((a)-(b)) < 0)
+#define CC_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define CC_GT(a,b) ((int)((a)-(b)) > 0)
+#define CC_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+/* Macro to increment a CC: skip 0 which has a special meaning */
+#define CC_INC(c) (++(c) == 0 ? ++(c) : (c))
+
+/*
+ * Macros to initialize tcp sequence numbers for
+ * send and receive from initial send and receive
+ * sequence numbers.
+ */
+#define tcp_rcvseqinit(tp) \
+ (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1
+
+#define tcp_sendseqinit(tp) \
+ (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \
+ (tp)->snd_recover = (tp)->iss
+
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz)
+ /* timestamp wrap-around time */
+
+#ifdef _KERNEL
+extern tcp_cc tcp_ccgen; /* global connection count */
+#endif /* _KERNEL */
+#endif /* _NETINET_TCP_SEQ_H_ */
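
The signed-difference trick behind these macros is what makes them survive 32-bit wraparound; a tiny user-space demonstration, with stdint types standing in for tcp_seq:

    #include <stdio.h>
    #include <stdint.h>

    #define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
    #define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

    int
    main(void)
    {
    	uint32_t near_wrap = 0xfffffff0u;	/* just before wraparound */
    	uint32_t after_wrap = 0x00000010u;	/* 0x20 bytes later, post-wrap */

    	/* A plain unsigned compare gets this backwards; the modular compare does not. */
    	printf("unsigned: %d   SEQ_LT: %d\n",
    	    near_wrap < after_wrap, SEQ_LT(near_wrap, after_wrap));
    	printf("SEQ_GT(after, near): %d\n", SEQ_GT(after_wrap, near_wrap));
    	return (0);
    }
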
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
new file mode 100644
index 0000000..f7800d2
--- /dev/null
+++ b/sys/netinet/tcp_subr.c
@@ -0,0 +1,1510 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/random.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/if.h>
+
+#define _IP_VHL
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet6/ip6protosw.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#ifdef INET6
+#include <netinet6/ipsec6.h>
+#endif
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+#include <sys/md5.h>
+
+int tcp_mssdflt = TCP_MSS;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
+ &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
+
+#ifdef INET6
+int tcp_v6mssdflt = TCP6_MSS;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
+ CTLFLAG_RW, &tcp_v6mssdflt , 0,
+ "Default TCP Maximum Segment Size for IPv6");
+#endif
+
+#if 0
+static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
+ &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
+#endif
+
+int tcp_do_rfc1323 = 1;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
+ &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
+
+int tcp_do_rfc1644 = 0;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
+ &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
+
+static int tcp_tcbhashsize = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
+ &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
+
+static int do_tcpdrain = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
+ "Enable tcp_drain routine for extra help when low on mbufs");
+
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
+ &tcbinfo.ipi_count, 0, "Number of active PCBs");
+
+static int icmp_may_rst = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
+ "Certain ICMP unreachable messages may abort connections in SYN_SENT");
+
+static int tcp_isn_reseed_interval = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+ &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+
+static void tcp_cleartaocache(void);
+static struct inpcb *tcp_notify(struct inpcb *, int);
+
+/*
+ * Target size of TCP PCB hash tables. Must be a power of two.
+ *
+ * Note that this can be overridden by the kernel environment
+ * variable net.inet.tcp.tcbhashsize
+ */
+#ifndef TCBHASHSIZE
+#define TCBHASHSIZE 512
+#endif
+
+/*
+ * This is the actual shape of what we allocate using the zone
+ * allocator. Doing it this way allows us to protect both structures
+ * using the same generation count, and also eliminates the overhead
+ * of allocating tcpcbs separately. By hiding the structure here,
+ * we avoid changing most of the rest of the code (although it needs
+ * to be changed, eventually, for greater efficiency).
+ */
+#define ALIGNMENT 32
+#define ALIGNM1 (ALIGNMENT - 1)
+struct inp_tp {
+ union {
+ struct inpcb inp;
+ char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
+ } inp_tp_u;
+ struct tcpcb tcb;
+ struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
+ struct callout inp_tp_delack;
+};
+#undef ALIGNMENT
+#undef ALIGNM1
+
+/*
+ * Tcp initialization
+ */
+void
+tcp_init()
+{
+ int hashsize = TCBHASHSIZE;
+
+ tcp_ccgen = 1;
+ tcp_cleartaocache();
+
+ tcp_delacktime = TCPTV_DELACK;
+ tcp_keepinit = TCPTV_KEEP_INIT;
+ tcp_keepidle = TCPTV_KEEP_IDLE;
+ tcp_keepintvl = TCPTV_KEEPINTVL;
+ tcp_maxpersistidle = TCPTV_KEEP_IDLE;
+ tcp_msl = TCPTV_MSL;
+
+ INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
+ LIST_INIT(&tcb);
+ tcbinfo.listhead = &tcb;
+ TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
+ if (!powerof2(hashsize)) {
+ printf("WARNING: TCB hash size not a power of 2\n");
+ hashsize = 512; /* safe default */
+ }
+ tcp_tcbhashsize = hashsize;
+ tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
+ tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
+ &tcbinfo.porthashmask);
+ tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
+#ifdef INET6
+#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
+#else /* INET6 */
+#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
+#endif /* INET6 */
+ if (max_protohdr < TCP_MINPROTOHDR)
+ max_protohdr = TCP_MINPROTOHDR;
+ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
+ panic("tcp_init");
+#undef TCP_MINPROTOHDR
+
+ syncache_init();
+}
+
+/*
+ * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
+ * tcp_template used to store this data in mbufs, but we now recopy it out
+ * of the tcpcb each time to conserve mbufs.
+ */
+void
+tcp_fillheaders(tp, ip_ptr, tcp_ptr)
+ struct tcpcb *tp;
+ void *ip_ptr;
+ void *tcp_ptr;
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ struct ip6_hdr *ip6;
+
+ ip6 = (struct ip6_hdr *)ip_ptr;
+ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
+ (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
+ ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
+ (IPV6_VERSION & IPV6_VERSION_MASK);
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = sizeof(struct tcphdr);
+ ip6->ip6_src = inp->in6p_laddr;
+ ip6->ip6_dst = inp->in6p_faddr;
+ tcp_hdr->th_sum = 0;
+ } else
+#endif
+ {
+ struct ip *ip = (struct ip *) ip_ptr;
+
+ ip->ip_vhl = IP_VHL_BORING;
+ ip->ip_tos = 0;
+ ip->ip_len = 0;
+ ip->ip_id = 0;
+ ip->ip_off = 0;
+ ip->ip_ttl = 0;
+ ip->ip_sum = 0;
+ ip->ip_p = IPPROTO_TCP;
+ ip->ip_src = inp->inp_laddr;
+ ip->ip_dst = inp->inp_faddr;
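+		/*
+		 * Pre-load th_sum with the checksum of the IPv4 pseudo-header
+		 * (addresses, protocol and TCP length), so only the remainder
+		 * of the checksum has to be computed over the finished segment.
+		 */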
+ tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(sizeof(struct tcphdr) + IPPROTO_TCP));
+ }
+
+ tcp_hdr->th_sport = inp->inp_lport;
+ tcp_hdr->th_dport = inp->inp_fport;
+ tcp_hdr->th_seq = 0;
+ tcp_hdr->th_ack = 0;
+ tcp_hdr->th_x2 = 0;
+ tcp_hdr->th_off = 5;
+ tcp_hdr->th_flags = 0;
+ tcp_hdr->th_win = 0;
+ tcp_hdr->th_urp = 0;
+}
+
+/*
+ * Create template to be used to send tcp packets on a connection.
+ * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
+ * use for this function is in keepalives, which use tcp_respond.
+ */
+struct tcptemp *
+tcp_maketemplate(tp)
+ struct tcpcb *tp;
+{
+ struct mbuf *m;
+ struct tcptemp *n;
+
+ m = m_get(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return (0);
+ m->m_len = sizeof(struct tcptemp);
+ n = mtod(m, struct tcptemp *);
+
+ tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
+ return (n);
+}
+
+/*
+ * Send a single message to the TCP at address specified by
+ * the given TCP/IP header. If m == 0, then we make a copy
+ * of the tcpiphdr at ti and send directly to the addressed host.
+ * This is used to force keep alive messages out using the TCP
+ * template for a connection. If flags are given then we send
+ * a message back to the TCP which originated the segment ti,
+ * and discard the mbuf containing it and any other attached mbufs.
+ *
+ * In any case the ack and sequence number of the transmitted
+ * segment are as specified by the parameters.
+ *
+ * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
+ */
+void
+tcp_respond(tp, ipgen, th, m, ack, seq, flags)
+ struct tcpcb *tp;
+ void *ipgen;
+ register struct tcphdr *th;
+ register struct mbuf *m;
+ tcp_seq ack, seq;
+ int flags;
+{
+ register int tlen;
+ int win = 0;
+ struct route *ro = 0;
+ struct route sro;
+ struct ip *ip;
+ struct tcphdr *nth;
+#ifdef INET6
+ struct route_in6 *ro6 = 0;
+ struct route_in6 sro6;
+ struct ip6_hdr *ip6;
+ int isipv6;
+#endif /* INET6 */
+ int ipflags = 0;
+
+#ifdef INET6
+ isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
+ ip6 = ipgen;
+#endif /* INET6 */
+ ip = ipgen;
+
+ if (tp) {
+ if (!(flags & TH_RST)) {
+ win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
+ if (win > (long)TCP_MAXWIN << tp->rcv_scale)
+ win = (long)TCP_MAXWIN << tp->rcv_scale;
+ }
+#ifdef INET6
+ if (isipv6)
+ ro6 = &tp->t_inpcb->in6p_route;
+ else
+#endif /* INET6 */
+ ro = &tp->t_inpcb->inp_route;
+ } else {
+#ifdef INET6
+ if (isipv6) {
+ ro6 = &sro6;
+ bzero(ro6, sizeof *ro6);
+ } else
+#endif /* INET6 */
+ {
+ ro = &sro;
+ bzero(ro, sizeof *ro);
+ }
+ }
+ if (m == 0) {
+ m = m_gethdr(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return;
+ tlen = 0;
+ m->m_data += max_linkhdr;
+#ifdef INET6
+ if (isipv6) {
+ bcopy((caddr_t)ip6, mtod(m, caddr_t),
+ sizeof(struct ip6_hdr));
+ ip6 = mtod(m, struct ip6_hdr *);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
+ ip = mtod(m, struct ip *);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
+ flags = TH_ACK;
+ } else {
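+		/*
+		 * Reuse the received packet in place: drop any chained
+		 * mbufs and swap the source and destination addresses and
+		 * ports below so the segment is returned to its originator.
+		 */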
+ m_freem(m->m_next);
+ m->m_next = 0;
+ m->m_data = (caddr_t)ipgen;
+ /* m_len is set later */
+ tlen = 0;
+#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
+#ifdef INET6
+ if (isipv6) {
+ xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ if (th != nth) {
+ /*
+ * this is usually a case when an extension header
+ * exists between the IPv6 header and the
+ * TCP header.
+ */
+ nth->th_sport = th->th_sport;
+ nth->th_dport = th->th_dport;
+ }
+ xchg(nth->th_dport, nth->th_sport, n_short);
+#undef xchg
+ }
+#ifdef INET6
+ if (isipv6) {
+ ip6->ip6_flow = 0;
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
+ tlen));
+ tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ } else
+#endif
+ {
+ tlen += sizeof (struct tcpiphdr);
+ ip->ip_len = tlen;
+ ip->ip_ttl = ip_defttl;
+ }
+ m->m_len = tlen;
+ m->m_pkthdr.len = tlen;
+ m->m_pkthdr.rcvif = (struct ifnet *) 0;
+ nth->th_seq = htonl(seq);
+ nth->th_ack = htonl(ack);
+ nth->th_x2 = 0;
+ nth->th_off = sizeof (struct tcphdr) >> 2;
+ nth->th_flags = flags;
+ if (tp)
+ nth->th_win = htons((u_short) (win >> tp->rcv_scale));
+ else
+ nth->th_win = htons((u_short)win);
+ nth->th_urp = 0;
+#ifdef INET6
+ if (isipv6) {
+ nth->th_sum = 0;
+ nth->th_sum = in6_cksum(m, IPPROTO_TCP,
+ sizeof(struct ip6_hdr),
+ tlen - sizeof(struct ip6_hdr));
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
+ ro6 && ro6->ro_rt ?
+ ro6->ro_rt->rt_ifp :
+ NULL);
+ } else
+#endif /* INET6 */
+ {
+ nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ }
+#ifdef TCPDEBUG
+ if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
+#endif
+#ifdef IPSEC
+ if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
+ m_freem(m);
+ return;
+ }
+#endif
+#ifdef INET6
+ if (isipv6) {
+ (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
+ if (ro6 == &sro6 && ro6->ro_rt) {
+ RTFREE(ro6->ro_rt);
+ ro6->ro_rt = NULL;
+ }
+ } else
+#endif /* INET6 */
+ {
+ (void) ip_output(m, NULL, ro, ipflags, NULL);
+ if (ro == &sro && ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ }
+}
+
+/*
+ * Create a new TCP control block, making an
+ * empty reassembly queue and hooking it to the argument
+ * protocol control block. The `inp' parameter must have
+ * come from the zone allocator set up in tcp_init().
+ */
+struct tcpcb *
+tcp_newtcpcb(inp)
+ struct inpcb *inp;
+{
+ struct inp_tp *it;
+ register struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ it = (struct inp_tp *)inp;
+ tp = &it->tcb;
+ bzero((char *) tp, sizeof(struct tcpcb));
+ LIST_INIT(&tp->t_segq);
+ tp->t_maxseg = tp->t_maxopd =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+
+ /* Set up our timeouts. */
+ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
+ callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
+ callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
+ callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
+ callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
+
+ if (tcp_do_rfc1323)
+ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
+ if (tcp_do_rfc1644)
+ tp->t_flags |= TF_REQ_CC;
+ tp->t_inpcb = inp; /* XXX */
+ /*
+ * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
+ * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
+ * reasonable initial retransmit time.
+ */
+ tp->t_srtt = TCPTV_SRTTBASE;
+ tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
+ tp->t_rttmin = TCPTV_MIN;
+ tp->t_rxtcur = TCPTV_RTOBASE;
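+	/*
+	 * Start the congestion window and slow start threshold effectively
+	 * unlimited; they are scaled back to sensible values once the route
+	 * and MSS for the connection are known.
+	 */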
+ tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->t_rcvtime = ticks;
+ /*
+ * IPv4 TTL initialization is necessary for an IPv6 socket as well,
+ * because the socket may be bound to an IPv6 wildcard address,
+ * which may match an IPv4-mapped IPv6 address.
+ */
+ inp->inp_ip_ttl = ip_defttl;
+ inp->inp_ppcb = (caddr_t)tp;
+ return (tp); /* XXX */
+}
+
+/*
+ * Drop a TCP connection, reporting
+ * the specified error. If connection is synchronized,
+ * then send a RST to peer.
+ */
+struct tcpcb *
+tcp_drop(tp, errno)
+ register struct tcpcb *tp;
+ int errno;
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_state = TCPS_CLOSED;
+ (void) tcp_output(tp);
+ tcpstat.tcps_drops++;
+ } else
+ tcpstat.tcps_conndrops++;
+ if (errno == ETIMEDOUT && tp->t_softerror)
+ errno = tp->t_softerror;
+ so->so_error = errno;
+ return (tcp_close(tp));
+}
+
+/*
+ * Close a TCP control block:
+ * discard all space held by the tcp
+ * discard internet protocol block
+ * wake up any sleepers
+ */
+struct tcpcb *
+tcp_close(tp)
+ register struct tcpcb *tp;
+{
+ register struct tseg_qent *q;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+ register struct rtentry *rt;
+ int dosavessthresh;
+
+ /*
+ * Make sure that all of our timers are stopped before we
+ * delete the PCB.
+ */
+ callout_stop(tp->tt_rexmt);
+ callout_stop(tp->tt_persist);
+ callout_stop(tp->tt_keep);
+ callout_stop(tp->tt_2msl);
+ callout_stop(tp->tt_delack);
+
+ /*
+ * If we got enough samples through the srtt filter,
+ * save the rtt and rttvar in the routing entry.
+	 * 'Enough' is arbitrarily defined as 16 samples.
+ * 16 samples is enough for the srtt filter to converge
+ * to within 5% of the correct value; fewer samples and
+ * we could save a very bogus rtt.
+ *
+ * Don't update the default route's characteristics and don't
+ * update anything that the user "locked".
+ */
+ if (tp->t_rttupdated >= 16) {
+ register u_long i = 0;
+#ifdef INET6
+ if (isipv6) {
+ struct sockaddr_in6 *sin6;
+
+ if ((rt = inp->in6p_route.ro_rt) == NULL)
+ goto no_valid_rt;
+ sin6 = (struct sockaddr_in6 *)rt_key(rt);
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ goto no_valid_rt;
+ }
+ else
+#endif /* INET6 */
+ if ((rt = inp->inp_route.ro_rt) == NULL ||
+ ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
+ == INADDR_ANY)
+ goto no_valid_rt;
+
+ if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
+ i = tp->t_srtt *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+ if (rt->rt_rmx.rmx_rtt && i)
+ /*
+ * filter this update to half the old & half
+ * the new values, converting scale.
+ * See route.h and tcp_var.h for a
+ * description of the scaling constants.
+ */
+ rt->rt_rmx.rmx_rtt =
+ (rt->rt_rmx.rmx_rtt + i) / 2;
+ else
+ rt->rt_rmx.rmx_rtt = i;
+ tcpstat.tcps_cachedrtt++;
+ }
+ if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
+ i = tp->t_rttvar *
+ (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
+ if (rt->rt_rmx.rmx_rttvar && i)
+ rt->rt_rmx.rmx_rttvar =
+ (rt->rt_rmx.rmx_rttvar + i) / 2;
+ else
+ rt->rt_rmx.rmx_rttvar = i;
+ tcpstat.tcps_cachedrttvar++;
+ }
+ /*
+ * The old comment here said:
+ * update the pipelimit (ssthresh) if it has been updated
+		 * already or if a pipesize was specified & the threshold
+ * got below half the pipesize. I.e., wait for bad news
+ * before we start updating, then update on both good
+ * and bad news.
+ *
+ * But we want to save the ssthresh even if no pipesize is
+ * specified explicitly in the route, because such
+ * connections still have an implicit pipesize specified
+ * by the global tcp_sendspace. In the absence of a reliable
+ * way to calculate the pipesize, it will have to do.
+ */
+ i = tp->snd_ssthresh;
+ if (rt->rt_rmx.rmx_sendpipe != 0)
+ dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
+ else
+ dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
+ if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
+ i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
+ || dosavessthresh) {
+ /*
+ * convert the limit from user data bytes to
+ * packets then to packet data bytes.
+ */
+ i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (i < 2)
+ i = 2;
+ i *= (u_long)(tp->t_maxseg +
+#ifdef INET6
+ (isipv6 ? sizeof (struct ip6_hdr) +
+ sizeof (struct tcphdr) :
+#endif
+ sizeof (struct tcpiphdr)
+#ifdef INET6
+ )
+#endif
+ );
+ if (rt->rt_rmx.rmx_ssthresh)
+ rt->rt_rmx.rmx_ssthresh =
+ (rt->rt_rmx.rmx_ssthresh + i) / 2;
+ else
+ rt->rt_rmx.rmx_ssthresh = i;
+ tcpstat.tcps_cachedssthresh++;
+ }
+ }
+ no_valid_rt:
+ /* free the reassembly queue, if any */
+ while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ FREE(q, M_TSEGQ);
+ }
+ inp->inp_ppcb = NULL;
+ soisdisconnected(so);
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ in6_pcbdetach(inp);
+ else
+#endif /* INET6 */
+ in_pcbdetach(inp);
+ tcpstat.tcps_closed++;
+ return ((struct tcpcb *)0);
+}
+
+void
+tcp_drain()
+{
+ if (do_tcpdrain)
+ {
+ struct inpcb *inpb;
+ struct tcpcb *tcpb;
+ struct tseg_qent *te;
+
+ /*
+ * Walk the tcpbs, if existing, and flush the reassembly queue,
+ * if there is one...
+ * XXX: The "Net/3" implementation doesn't imply that the TCP
+ * reassembly queue should be flushed, but in a situation
+ * where we're really low on mbufs, this is potentially
+		 * useful.
+ */
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
+ INP_LOCK(inpb);
+ if ((tcpb = intotcpcb(inpb))) {
+ while ((te = LIST_FIRST(&tcpb->t_segq))
+ != NULL) {
+ LIST_REMOVE(te, tqe_q);
+ m_freem(te->tqe_m);
+ FREE(te, M_TSEGQ);
+ }
+ }
+ INP_UNLOCK(inpb);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ }
+}
+
+/*
+ * Notify a tcp user of an asynchronous error;
+ * store error as soft error, but wake up user
+ * (for now, won't do anything until can select for soft error).
+ *
+ * Do not wake up user since there currently is no mechanism for
+ * reporting soft errors (yet - a kqueue filter may be added).
+ */
+static struct inpcb *
+tcp_notify(inp, error)
+ struct inpcb *inp;
+ int error;
+{
+ struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
+
+ /*
+ * Ignore some errors if we are hooked up.
+ * If connection hasn't completed, has retransmitted several times,
+ * and receives a second error, give up now. This is better
+ * than waiting a long time to establish a connection that
+ * can never complete.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN)) {
+ return inp;
+ } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
+ tp->t_softerror) {
+ tcp_drop(tp, error);
+ return (struct inpcb *)0;
+ } else {
+ tp->t_softerror = error;
+ return inp;
+ }
+#if 0
+ wakeup((caddr_t) &so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+#endif
+}
+
+static int
+tcp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n, s;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
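+		/*
+		 * Estimate the space needed, with some slack (n/8, roughly
+		 * 12%) for PCBs that may be created while the caller
+		 * allocates its buffer and repeats the request.
+		 */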
+ n = tcbinfo.ipi_count;
+ req->oldidx = 2 * (sizeof xig)
+ + (n + n/8) * sizeof(struct xtcpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ gencnt = tcbinfo.ipi_gencnt;
+ n = tcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
+ inp_list[i++] = inp;
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xtcpcb xt;
+ caddr_t inp_ppcb;
+ xt.xt_len = sizeof xt;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xt.xt_inp, sizeof *inp);
+ inp_ppcb = inp->inp_ppcb;
+ if (inp_ppcb != NULL)
+ bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
+ else
+ bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xt.xt_socket);
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ }
+ INP_UNLOCK(inp);
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ xig.xig_gen = tcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = tcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
+ tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
+
+static int
+tcp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct inpcb *inp;
+ int error, s;
+
+ error = suser_cred(req->td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
+ if (inp == NULL) {
+ error = ENOENT;
+ goto outunlocked;
+ } else {
+ INP_LOCK(inp);
+ if (inp->inp_socket == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+
+ error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
+ if (error)
+ goto out;
+ cru2x(inp->inp_socket->so_cred, &xuc);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ INP_UNLOCK(inp);
+outunlocked:
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
+
+#ifdef INET6
+static int
+tcp6_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in6 addrs[2];
+ struct inpcb *inp;
+ int error, s, mapped = 0;
+
+ error = suser_cred(req->td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
+ mapped = 1;
+ else
+ return (EINVAL);
+ }
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ if (mapped == 1)
+ inp = in_pcblookup_hash(&tcbinfo,
+ *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
+ addrs[1].sin6_port,
+ *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
+ addrs[0].sin6_port,
+ 0, NULL);
+ else
+ inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
+ addrs[1].sin6_port,
+ &addrs[0].sin6_addr, addrs[0].sin6_port,
+ 0, NULL);
+ if (inp == NULL) {
+ error = ENOENT;
+ goto outunlocked;
+ } else {
+ INP_LOCK(inp);
+ if (inp->inp_socket == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+ error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
+ if (error)
+ goto out;
+ cru2x(inp->inp_socket->so_cred, &xuc);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ INP_UNLOCK(inp);
+outunlocked:
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
+#endif
+
+
+void
+tcp_ctlinput(cmd, sa, vip)
+ int cmd;
+ struct sockaddr *sa;
+ void *vip;
+{
+ struct ip *ip = vip;
+ struct tcphdr *th;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ tcp_seq icmp_seq;
+ int s;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (cmd == PRC_QUENCH)
+ notify = tcp_quench;
+ else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT) && ip)
+ notify = tcp_drop_syn_sent;
+ else if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (PRC_IS_REDIRECT(cmd)) {
+ ip = 0;
+ notify = in_rtchange;
+ } else if (cmd == PRC_HOSTDEAD)
+ ip = 0;
+ else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip) {
+ s = splnet();
+ th = (struct tcphdr *)((caddr_t)ip
+ + (IP_VHL_HL(ip->ip_vhl) << 2));
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (inp->inp_socket != NULL) {
+ icmp_seq = htonl(th->th_seq);
+ tp = intotcpcb(inp);
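+				/*
+				 * Only act on the ICMP message if the quoted
+				 * sequence number falls within the range of
+				 * unacknowledged data; this makes it much
+				 * harder for forged ICMP packets to affect
+				 * the connection.
+				 */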
+ if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_seq, tp->snd_max))
+ inp = (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ if (inp)
+ INP_UNLOCK(inp);
+ } else {
+ struct in_conninfo inc;
+
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+#ifdef INET6
+ inc.inc_isipv6 = 0;
+#endif
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ } else
+ in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
+}
+
+#ifdef INET6
+void
+tcp6_ctlinput(cmd, sa, d)
+ int cmd;
+ struct sockaddr *sa;
+ void *d;
+{
+ struct tcphdr th;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+ struct ip6ctlparam *ip6cp = NULL;
+ const struct sockaddr_in6 *sa6_src = NULL;
+ int off;
+ struct tcp_portonly {
+ u_int16_t th_sport;
+ u_int16_t th_dport;
+ } *thp;
+
+ if (sa->sa_family != AF_INET6 ||
+ sa->sa_len != sizeof(struct sockaddr_in6))
+ return;
+
+ if (cmd == PRC_QUENCH)
+ notify = tcp_quench;
+ else if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (!PRC_IS_REDIRECT(cmd) &&
+ ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
+ return;
+
+ /* if the parameter is from icmp6, decode it. */
+ if (d != NULL) {
+ ip6cp = (struct ip6ctlparam *)d;
+ m = ip6cp->ip6c_m;
+ ip6 = ip6cp->ip6c_ip6;
+ off = ip6cp->ip6c_off;
+ sa6_src = ip6cp->ip6c_src;
+ } else {
+ m = NULL;
+ ip6 = NULL;
+ off = 0; /* fool gcc */
+ sa6_src = &sa6_any;
+ }
+
+ if (ip6) {
+ struct in_conninfo inc;
+ /*
+		 * XXX: We assume that when IPV6 is non-NULL,
+ * M and OFF are valid.
+ */
+
+ /* check if we can safely examine src and dst ports */
+ if (m->m_pkthdr.len < off + sizeof(*thp))
+ return;
+
+ bzero(&th, sizeof(th));
+ m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+
+ in6_pcbnotify(&tcb, sa, th.th_dport,
+ (struct sockaddr *)ip6cp->ip6c_src,
+ th.th_sport, cmd, notify);
+
+ inc.inc_fport = th.th_dport;
+ inc.inc_lport = th.th_sport;
+ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_isipv6 = 1;
+ syncache_unreach(&inc, &th);
+ } else
+ in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
+ 0, cmd, notify);
+}
+#endif /* INET6 */
+
+
+/*
+ * Following is where TCP initial sequence number generation occurs.
+ *
+ * There are two places where we must use initial sequence numbers:
+ * 1. In SYN-ACK packets.
+ * 2. In SYN packets.
+ *
+ * All ISNs for SYN-ACK packets are generated by the syncache. See
+ * tcp_syncache.c for details.
+ *
+ * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
+ * depends on this property. In addition, these ISNs should be
+ * unguessable so as to prevent connection hijacking. To satisfy
+ * the requirements of this situation, the algorithm outlined in
+ * RFC 1948 is used to generate sequence numbers.
+ *
+ * Implementation details:
+ *
+ * Time is based off the system timer, and is corrected so that it
+ * increases by one megabyte per second. This allows for proper
+ * recycling on high speed LANs while still leaving over an hour
+ * before rollover.
+ *
+ * net.inet.tcp.isn_reseed_interval controls the number of seconds
+ * between seeding of isn_secret. This is normally set to zero,
+ * as reseeding should not be necessary.
+ *
+ */
+
+#define ISN_BYTES_PER_SECOND 1048576
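+/*
+ * At 2^20 bytes per second the 32-bit sequence space wraps after
+ * 2^32 / 2^20 = 4096 seconds, a little over 68 minutes, which is the
+ * "over an hour before rollover" mentioned above.
+ */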
+
+u_char isn_secret[32];
+int isn_last_reseed;
+MD5_CTX isn_ctx;
+
+tcp_seq
+tcp_new_isn(tp)
+ struct tcpcb *tp;
+{
+ u_int32_t md5_buffer[4];
+ tcp_seq new_isn;
+
+ /* Seed if this is the first use, reseed if requested. */
+ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
+ (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
+ < (u_int)ticks))) {
+ read_random(&isn_secret, sizeof(isn_secret));
+ isn_last_reseed = ticks;
+ }
+
+ /* Compute the md5 hash and return the ISN. */
+ MD5Init(&isn_ctx);
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
+ sizeof(struct in6_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
+ sizeof(struct in6_addr));
+ } else
+#endif
+ {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
+ sizeof(struct in_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
+ sizeof(struct in_addr));
+ }
+ MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
+ MD5Final((u_char *) &md5_buffer, &isn_ctx);
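+	/*
+	 * The hash provides the connection-specific, hard-to-guess offset
+	 * called for by RFC 1948; the ticks term below adds the
+	 * monotonically increasing component.
+	 */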
+ new_isn = (tcp_seq) md5_buffer[0];
+ new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
+ return new_isn;
+}
+
+/*
+ * When a source quench is received, close congestion window
+ * to one segment. We will gradually open it again as we proceed.
+ */
+struct inpcb *
+tcp_quench(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+
+ if (tp)
+ tp->snd_cwnd = tp->t_maxseg;
+ return (inp);
+}
+
+/*
+ * When a specific ICMP unreachable message is received and the
+ * connection state is SYN-SENT, drop the connection. This behavior
+ * is controlled by the icmp_may_rst sysctl.
+ */
+struct inpcb *
+tcp_drop_syn_sent(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+
+ if (tp && tp->t_state == TCPS_SYN_SENT) {
+ tcp_drop(tp, errno);
+ return (struct inpcb *)0;
+ }
+ return inp;
+}
+
+/*
+ * When `need fragmentation' ICMP is received, update our idea of the MSS
+ * based on the new value in the route. Also nudge TCP to send something,
+ * since we know the packet we just sent was dropped.
+ * This duplicates some code in the tcp_mss() function in tcp_input.c.
+ */
+struct inpcb *
+tcp_mtudisc(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+ struct rtentry *rt;
+ struct rmxp_tao *taop;
+ struct socket *so = inp->inp_socket;
+ int offered;
+ int mss;
+#ifdef INET6
+ int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ if (tp) {
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&inp->inp_inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(&inp->inp_inc);
+ if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tp->t_maxopd = tp->t_maxseg =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ return inp;
+ }
+ taop = rmx_taop(rt->rt_rmx);
+ offered = taop->tao_mssopt;
+ mss = rt->rt_rmx.rmx_mtu -
+#ifdef INET6
+ (isipv6 ?
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
+#endif /* INET6 */
+ sizeof(struct tcpiphdr)
+#ifdef INET6
+ )
+#endif /* INET6 */
+ ;
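+		/*
+		 * For example, an IPv4 path MTU of 1500 bytes leaves an MSS
+		 * of 1460 after the 40 bytes of IP and TCP headers; the IPv6
+		 * case subtracts 60 bytes instead.
+		 */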
+
+ if (offered)
+ mss = min(mss, offered);
+ /*
+ * XXX - The above conditional probably violates the TCP
+ * spec. The problem is that, since we don't know the
+ * other end's MSS, we are supposed to use a conservative
+ * default. But, if we do that, then MTU discovery will
+ * never actually take place, because the conservative
+ * default is much less than the MTUs typically seen
+ * on the Internet today. For the moment, we'll sweep
+ * this under the carpet.
+ *
+ * The conservative default might not actually be a problem
+ * if the only case this occurs is when sending an initial
+ * SYN with options and data to a host we've never talked
+ * to before. Then, they will reply with an MSS value which
+ * will get recorded and the new parameters should get
+ * recomputed. For Further Study.
+ */
+ if (tp->t_maxopd <= mss)
+ return inp;
+ tp->t_maxopd = mss;
+
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
+ mss -= TCPOLEN_TSTAMP_APPA;
+ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
+ (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
+ mss -= TCPOLEN_CC_APPA;
+#if (MCLBYTES & (MCLBYTES - 1)) == 0
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
+#else
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
+#endif
+ if (so->so_snd.sb_hiwat < mss)
+ mss = so->so_snd.sb_hiwat;
+
+ tp->t_maxseg = mss;
+
+ tcpstat.tcps_mturesent++;
+ tp->t_rtttime = 0;
+ tp->snd_nxt = tp->snd_una;
+ tcp_output(tp);
+ }
+ return inp;
+}
+
+/*
+ * Look up the routing entry to the peer of this inpcb. If no route
+ * is found and one cannot be allocated, NULL is returned. This routine
+ * is called by TCP routines that access the rmx structure and by tcp_mss
+ * to get the interface MTU.
+ */
+struct rtentry *
+tcp_rtlookup(inc)
+ struct in_conninfo *inc;
+{
+ struct route *ro;
+ struct rtentry *rt;
+
+ ro = &inc->inc_route;
+ rt = ro->ro_rt;
+ if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
+ /* No route yet, so try to acquire one */
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ ro->ro_dst.sa_family = AF_INET;
+ ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
+ ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
+ inc->inc_faddr;
+ rtalloc(ro);
+ rt = ro->ro_rt;
+ }
+ }
+ return rt;
+}
+
+#ifdef INET6
+struct rtentry *
+tcp_rtlookup6(inc)
+ struct in_conninfo *inc;
+{
+ struct route_in6 *ro6;
+ struct rtentry *rt;
+
+ ro6 = &inc->inc6_route;
+ rt = ro6->ro_rt;
+ if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
+ /* No route yet, so try to acquire one */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ ro6->ro_dst.sin6_family = AF_INET6;
+ ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ ro6->ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc((struct route *)ro6);
+ rt = ro6->ro_rt;
+ }
+ }
+ return rt;
+}
+#endif /* INET6 */
+
+#ifdef IPSEC
+/* compute ESP/AH header size for TCP, including outer IP header. */
+size_t
+ipsec_hdrsiz_tcp(tp)
+ struct tcpcb *tp;
+{
+ struct inpcb *inp;
+ struct mbuf *m;
+ size_t hdrsiz;
+ struct ip *ip;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+#endif /* INET6 */
+ struct tcphdr *th;
+
+ if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
+ return 0;
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (!m)
+ return 0;
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ m->m_pkthdr.len = m->m_len =
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ tcp_fillheaders(tp, ip6, th);
+ hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
+ tcp_fillheaders(tp, ip, th);
+ hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ }
+
+ m_free(m);
+ return hdrsiz;
+}
+#endif /*IPSEC*/
+
+/*
+ * Return a pointer to the cached information about the remote host.
+ * The cached information is stored in the protocol specific part of
+ * the route metrics.
+ */
+struct rmxp_tao *
+tcp_gettaocache(inc)
+ struct in_conninfo *inc;
+{
+ struct rtentry *rt;
+
+#ifdef INET6
+ if (inc->inc_isipv6)
+ rt = tcp_rtlookup6(inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(inc);
+
+ /* Make sure this is a host route and is up. */
+ if (rt == NULL ||
+ (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
+ return NULL;
+
+ return rmx_taop(rt->rt_rmx);
+}
+
+/*
+ * Clear all the TAO cache entries, called from tcp_init.
+ *
+ * XXX
+ * This routine is just an empty one, because we assume that the routing
+ * tables are initialized at the same time as TCP, so there is nothing
+ * left over in the cache.
+ */
+static void
+tcp_cleartaocache()
+{
+}
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
new file mode 100644
index 0000000..0ef2f3d
--- /dev/null
+++ b/sys/netinet/tcp_syncache.c
@@ -0,0 +1,1371 @@
+/*-
+ * Copyright (c) 2001 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Jonathan Lemon
+ * and NAI Labs, the Security Research Division of Network Associates, Inc.
+ * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
+ * DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/md5.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/random.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#ifdef INET6
+#include <netinet6/ipsec6.h>
+#endif
+#include <netkey/key.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+#include <vm/uma.h>
+
+static int tcp_syncookies = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
+ &tcp_syncookies, 0,
+ "Use TCP SYN cookies if the syncache overflows");
+
+static void syncache_drop(struct syncache *, struct syncache_head *);
+static void syncache_free(struct syncache *);
+static void syncache_insert(struct syncache *, struct syncache_head *);
+struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
+static int syncache_respond(struct syncache *, struct mbuf *);
+static struct socket *syncache_socket(struct syncache *, struct socket *,
+ struct mbuf *m);
+static void syncache_timer(void *);
+static u_int32_t syncookie_generate(struct syncache *);
+static struct syncache *syncookie_lookup(struct in_conninfo *,
+ struct tcphdr *, struct socket *);
+
+/*
+ * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
+ * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds,
+ * the odds are that the user has given up attempting to connect by then.
+ */
+#define SYNCACHE_MAXREXMTS 3
+
+/* Arbitrary values */
+#define TCP_SYNCACHE_HASHSIZE 512
+#define TCP_SYNCACHE_BUCKETLIMIT 30
+
+struct tcp_syncache {
+ struct syncache_head *hashbase;
+ uma_zone_t zone;
+ u_int hashsize;
+ u_int hashmask;
+ u_int bucket_limit;
+ u_int cache_count;
+ u_int cache_limit;
+ u_int rexmt_limit;
+ u_int hash_secret;
+ u_int next_reseed;
+ TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
+ struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
+};
+static struct tcp_syncache tcp_syncache;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
+
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RD,
+ &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
+
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RD,
+ &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
+
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
+ &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
+
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RD,
+ &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
+
+SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
+ &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
+
+static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
+
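+/*
+ * Bucket hash for the syncache: mix the randomly generated hash_secret
+ * with the foreign address and the port pair, then mask the result down
+ * to the (power of 2) table size.  The secret keeps remote senders from
+ * deliberately targeting a single bucket.
+ */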
+#define SYNCACHE_HASH(inc, mask) \
+ ((tcp_syncache.hash_secret ^ \
+ (inc)->inc_faddr.s_addr ^ \
+ ((inc)->inc_faddr.s_addr >> 16) ^ \
+ (inc)->inc_fport ^ (inc)->inc_lport) & mask)
+
+#define SYNCACHE_HASH6(inc, mask) \
+ ((tcp_syncache.hash_secret ^ \
+ (inc)->inc6_faddr.s6_addr32[0] ^ \
+ (inc)->inc6_faddr.s6_addr32[3] ^ \
+ (inc)->inc_fport ^ (inc)->inc_lport) & mask)
+
+#define ENDPTS_EQ(a, b) ( \
+ (a)->ie_fport == (b)->ie_fport && \
+ (a)->ie_lport == (b)->ie_lport && \
+ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
+ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
+)
+
+#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
+
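+/*
+ * (Re)arm the retransmit timer for a syncache entry: record the backoff
+ * slot, compute the absolute expiry time from the tcp_backoff[] table,
+ * append the entry to that slot's timer queue, and start the slot's
+ * callout if it is not already running.
+ */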
+#define SYNCACHE_TIMEOUT(sc, slot) do { \
+ sc->sc_rxtslot = slot; \
+ sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[slot]; \
+ TAILQ_INSERT_TAIL(&tcp_syncache.timerq[slot], sc, sc_timerq); \
+ if (!callout_active(&tcp_syncache.tt_timerq[slot])) \
+ callout_reset(&tcp_syncache.tt_timerq[slot], \
+ TCPTV_RTOBASE * tcp_backoff[slot], \
+ syncache_timer, (void *)((intptr_t)slot)); \
+} while (0)
+
+static void
+syncache_free(struct syncache *sc)
+{
+ struct rtentry *rt;
+
+ if (sc->sc_ipopts)
+ (void) m_free(sc->sc_ipopts);
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6)
+ rt = sc->sc_route6.ro_rt;
+ else
+#endif
+ rt = sc->sc_route.ro_rt;
+ if (rt != NULL) {
+ /*
+ * If this is the only reference to a protocol cloned
+ * route, remove it immediately.
+ */
+ if (rt->rt_flags & RTF_WASCLONED &&
+ (sc->sc_flags & SCF_KEEPROUTE) == 0 &&
+ rt->rt_refcnt == 1)
+ rtrequest(RTM_DELETE, rt_key(rt),
+ rt->rt_gateway, rt_mask(rt),
+ rt->rt_flags, NULL);
+ RTFREE(rt);
+ }
+ uma_zfree(tcp_syncache.zone, sc);
+}
+
+void
+syncache_init(void)
+{
+ int i;
+
+ tcp_syncache.cache_count = 0;
+ tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
+ tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
+ tcp_syncache.cache_limit =
+ tcp_syncache.hashsize * tcp_syncache.bucket_limit;
+ tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
+ tcp_syncache.next_reseed = 0;
+ tcp_syncache.hash_secret = arc4random();
+
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
+ &tcp_syncache.hashsize);
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
+ &tcp_syncache.cache_limit);
+ TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
+ &tcp_syncache.bucket_limit);
+ if (!powerof2(tcp_syncache.hashsize)) {
+ printf("WARNING: syncache hash size is not a power of 2.\n");
+ tcp_syncache.hashsize = 512; /* safe default */
+ }
+ tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
+
+ /* Allocate the hash table. */
+ MALLOC(tcp_syncache.hashbase, struct syncache_head *,
+ tcp_syncache.hashsize * sizeof(struct syncache_head),
+ M_SYNCACHE, M_WAITOK);
+
+ /* Initialize the hash buckets. */
+ for (i = 0; i < tcp_syncache.hashsize; i++) {
+ TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
+ tcp_syncache.hashbase[i].sch_length = 0;
+ }
+
+ /* Initialize the timer queues. */
+ for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
+ TAILQ_INIT(&tcp_syncache.timerq[i]);
+ callout_init(&tcp_syncache.tt_timerq[i], 0);
+ }
+
+ /*
+ * Allocate the syncache entries. Allow the zone to allocate one
+ * more entry than cache limit, so a new entry can bump out an
+ * older one.
+ */
+ tcp_syncache.cache_limit -= 1;
+ tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
+}
+
+static void
+syncache_insert(sc, sch)
+ struct syncache *sc;
+ struct syncache_head *sch;
+{
+ struct syncache *sc2;
+ int s, i;
+
+ /*
+ * Make sure that we don't overflow the per-bucket
+ * limit or the total cache size limit.
+ */
+ s = splnet();
+ if (sch->sch_length >= tcp_syncache.bucket_limit) {
+ /*
+ * The bucket is full, toss the oldest element.
+ */
+ sc2 = TAILQ_FIRST(&sch->sch_bucket);
+ sc2->sc_tp->ts_recent = ticks;
+ syncache_drop(sc2, sch);
+ tcpstat.tcps_sc_bucketoverflow++;
+ } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
+ /*
+ * The cache is full. Toss the oldest entry in the
+ * entire cache. This is the front entry in the
+ * first non-empty timer queue with the largest
+ * timeout value.
+ */
+ for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
+ sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
+ if (sc2 != NULL)
+ break;
+ }
+ sc2->sc_tp->ts_recent = ticks;
+ syncache_drop(sc2, NULL);
+ tcpstat.tcps_sc_cacheoverflow++;
+ }
+
+ /* Initialize the entry's timer. */
+ SYNCACHE_TIMEOUT(sc, 0);
+
+ /* Put it into the bucket. */
+ TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length++;
+ tcp_syncache.cache_count++;
+ tcpstat.tcps_sc_added++;
+ splx(s);
+}
+
+static void
+syncache_drop(sc, sch)
+ struct syncache *sc;
+ struct syncache_head *sch;
+{
+ int s;
+
+ if (sch == NULL) {
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ sch = &tcp_syncache.hashbase[
+ SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
+ } else
+#endif
+ {
+ sch = &tcp_syncache.hashbase[
+ SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
+ }
+ }
+
+ s = splnet();
+
+ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
+ sch->sch_length--;
+ tcp_syncache.cache_count--;
+
+ TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
+ if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
+ callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
+ splx(s);
+
+ syncache_free(sc);
+}
+
+/*
+ * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
+ * If we have retransmitted an entry the maximum number of times, expire it.
+ */
+static void
+syncache_timer(xslot)
+ void *xslot;
+{
+ intptr_t slot = (intptr_t)xslot;
+ struct syncache *sc, *nsc;
+ struct inpcb *inp;
+ int s;
+
+ s = splnet();
+ if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
+ !callout_active(&tcp_syncache.tt_timerq[slot])) {
+ splx(s);
+ return;
+ }
+ callout_deactivate(&tcp_syncache.tt_timerq[slot]);
+
+ nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
+ INP_INFO_RLOCK(&tcbinfo);
+ while (nsc != NULL) {
+ if (ticks < nsc->sc_rxttime)
+ break;
+ sc = nsc;
+ nsc = TAILQ_NEXT(sc, sc_timerq);
+ inp = sc->sc_tp->t_inpcb;
+ INP_LOCK(inp);
+ if (slot == SYNCACHE_MAXREXMTS ||
+ slot >= tcp_syncache.rexmt_limit ||
+ inp->inp_gencnt != sc->sc_inp_gencnt) {
+ syncache_drop(sc, NULL);
+ tcpstat.tcps_sc_stale++;
+ INP_UNLOCK(inp);
+ continue;
+ }
+ (void) syncache_respond(sc, NULL);
+ INP_UNLOCK(inp);
+ tcpstat.tcps_sc_retransmitted++;
+ TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
+ SYNCACHE_TIMEOUT(sc, slot + 1);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ if (nsc != NULL)
+ callout_reset(&tcp_syncache.tt_timerq[slot],
+ nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
+ splx(s);
+}
+
+/*
+ * Find an entry in the syncache.
+ */
+struct syncache *
+syncache_lookup(inc, schp)
+ struct in_conninfo *inc;
+ struct syncache_head **schp;
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+ int s;
+
+#ifdef INET6
+ if (inc->inc_isipv6) {
+ sch = &tcp_syncache.hashbase[
+ SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
+ *schp = sch;
+ s = splnet();
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
+ if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
+ splx(s);
+ return (sc);
+ }
+ }
+ splx(s);
+ } else
+#endif
+ {
+ sch = &tcp_syncache.hashbase[
+ SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
+ *schp = sch;
+ s = splnet();
+ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6)
+ continue;
+#endif
+ if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
+ splx(s);
+ return (sc);
+ }
+ }
+ splx(s);
+ }
+ return (NULL);
+}
+
+/*
+ * This function is called when we get a RST for a
+ * non-existent connection, so that we can see if the
+ * connection is in the syn cache. If it is, zap it.
+ */
+void
+syncache_chkrst(inc, th)
+ struct in_conninfo *inc;
+ struct tcphdr *th;
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+ sc = syncache_lookup(inc, &sch);
+ if (sc == NULL)
+ return;
+ /*
+ * If the RST bit is set, check the sequence number to see
+ * if this is a valid reset segment.
+ * RFC 793 page 37:
+ * In all states except SYN-SENT, all reset (RST) segments
+ * are validated by checking their SEQ-fields. A reset is
+ * valid if its sequence number is in the window.
+ *
+ * The sequence number in the reset segment is normally an
+	 * echo of our outgoing acknowledgement numbers, but some hosts
+ * send a reset with the sequence number at the rightmost edge
+ * of our receive window, and we have to handle this case.
+ */
+ if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
+ SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
+ syncache_drop(sc, sch);
+ tcpstat.tcps_sc_reset++;
+ }
+}
+
+void
+syncache_badack(inc)
+ struct in_conninfo *inc;
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+ sc = syncache_lookup(inc, &sch);
+ if (sc != NULL) {
+ syncache_drop(sc, sch);
+ tcpstat.tcps_sc_badack++;
+ }
+}
+
+void
+syncache_unreach(inc, th)
+ struct in_conninfo *inc;
+ struct tcphdr *th;
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+
+ /* we are called at splnet() here */
+ sc = syncache_lookup(inc, &sch);
+ if (sc == NULL)
+ return;
+
+ /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
+ if (ntohl(th->th_seq) != sc->sc_iss)
+ return;
+
+ /*
+	 * If we've retransmitted 3 times and this is our second error,
+ * we remove the entry. Otherwise, we allow it to continue on.
+ * This prevents us from incorrectly nuking an entry during a
+ * spurious network outage.
+ *
+ * See tcp_notify().
+ */
+ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
+ sc->sc_flags |= SCF_UNREACH;
+ return;
+ }
+ syncache_drop(sc, sch);
+ tcpstat.tcps_sc_unreach++;
+}
+
+/*
+ * Build a new TCP socket structure from a syncache entry.
+ */
+static struct socket *
+syncache_socket(sc, lso, m)
+ struct syncache *sc;
+ struct socket *lso;
+ struct mbuf *m;
+{
+ struct inpcb *inp = NULL;
+ struct socket *so;
+ struct tcpcb *tp;
+
+ /*
+ * Ok, create the full blown connection, and set things up
+ * as they would have been set up if we had created the
+ * connection when the SYN arrived. If we can't create
+ * the connection, abort it.
+ */
+ so = sonewconn(lso, SS_ISCONNECTED);
+ if (so == NULL) {
+ /*
+ * Drop the connection; we will send a RST if the peer
+ * retransmits the ACK,
+		 * retransmits the ACK.
+ tcpstat.tcps_listendrop++;
+ goto abort;
+ }
+
+ inp = sotoinpcb(so);
+
+ /*
+ * Insert new socket into hash list.
+ */
+ inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ inp->in6p_laddr = sc->sc_inc.inc6_laddr;
+ } else {
+ inp->inp_vflag &= ~INP_IPV6;
+ inp->inp_vflag |= INP_IPV4;
+#endif
+ inp->inp_laddr = sc->sc_inc.inc_laddr;
+#ifdef INET6
+ }
+#endif
+ inp->inp_lport = sc->sc_inc.inc_lport;
+ if (in_pcbinshash(inp) != 0) {
+ /*
+ * Undo the assignments above if we failed to
+ * put the PCB on the hash lists.
+ */
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6)
+ inp->in6p_laddr = in6addr_any;
+ else
+#endif
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ inp->inp_lport = 0;
+ goto abort;
+ }
+#ifdef IPSEC
+ /* copy old policy into new socket's */
+ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
+ printf("syncache_expand: could not copy policy\n");
+#endif
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ struct inpcb *oinp = sotoinpcb(lso);
+ struct in6_addr laddr6;
+ struct sockaddr_in6 *sin6;
+ /*
+ * Inherit socket options from the listening socket.
+ * Note that in6p_inputopts are not (and should not be)
+ * copied, since it stores previously received options and is
+ * used to detect if each new option is different than the
+ * previous one and hence should be passed to a user.
+ * If we copied in6p_inputopts, a user would not be able to
+ * receive options just after calling the accept system call.
+ */
+ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
+ if (oinp->in6p_outputopts)
+ inp->in6p_outputopts =
+ ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
+ inp->in6p_route = sc->sc_route6;
+ sc->sc_route6.ro_rt = NULL;
+
+ MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
+ M_SONAME, M_NOWAIT | M_ZERO);
+ if (sin6 == NULL)
+ goto abort;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_addr = sc->sc_inc.inc6_faddr;
+ sin6->sin6_port = sc->sc_inc.inc_fport;
+ laddr6 = inp->in6p_laddr;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = sc->sc_inc.inc6_laddr;
+ if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) {
+ inp->in6p_laddr = laddr6;
+ FREE(sin6, M_SONAME);
+ goto abort;
+ }
+ FREE(sin6, M_SONAME);
+ } else
+#endif
+ {
+ struct in_addr laddr;
+ struct sockaddr_in *sin;
+
+ inp->inp_options = ip_srcroute();
+ if (inp->inp_options == NULL) {
+ inp->inp_options = sc->sc_ipopts;
+ sc->sc_ipopts = NULL;
+ }
+ inp->inp_route = sc->sc_route;
+ sc->sc_route.ro_rt = NULL;
+
+ MALLOC(sin, struct sockaddr_in *, sizeof *sin,
+ M_SONAME, M_NOWAIT | M_ZERO);
+ if (sin == NULL)
+ goto abort;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = sc->sc_inc.inc_faddr;
+ sin->sin_port = sc->sc_inc.inc_fport;
+ bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
+ laddr = inp->inp_laddr;
+ if (inp->inp_laddr.s_addr == INADDR_ANY)
+ inp->inp_laddr = sc->sc_inc.inc_laddr;
+ if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) {
+ inp->inp_laddr = laddr;
+ FREE(sin, M_SONAME);
+ goto abort;
+ }
+ FREE(sin, M_SONAME);
+ }
+
+ tp = intotcpcb(inp);
+ tp->t_state = TCPS_SYN_RECEIVED;
+ tp->iss = sc->sc_iss;
+ tp->irs = sc->sc_irs;
+ tcp_rcvseqinit(tp);
+ tcp_sendseqinit(tp);
+ tp->snd_wl1 = sc->sc_irs;
+ tp->rcv_up = sc->sc_irs + 1;
+ tp->rcv_wnd = sc->sc_wnd;
+ tp->rcv_adv += tp->rcv_wnd;
+
+ tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
+ if (sc->sc_flags & SCF_NOOPT)
+ tp->t_flags |= TF_NOOPT;
+ if (sc->sc_flags & SCF_WINSCALE) {
+ tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
+ tp->requested_s_scale = sc->sc_requested_s_scale;
+ tp->request_r_scale = sc->sc_request_r_scale;
+ }
+ if (sc->sc_flags & SCF_TIMESTAMP) {
+ tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
+ tp->ts_recent = sc->sc_tsrecent;
+ tp->ts_recent_age = ticks;
+ }
+ if (sc->sc_flags & SCF_CC) {
+ /*
+ * Initialization of the tcpcb for transaction;
+ * set SND.WND = SEG.WND,
+ * initialize CCsend and CCrecv.
+ */
+ tp->t_flags |= TF_REQ_CC|TF_RCVD_CC;
+ tp->cc_send = sc->sc_cc_send;
+ tp->cc_recv = sc->sc_cc_recv;
+ }
+
+ tcp_mss(tp, sc->sc_peer_mss);
+
+ /*
+ * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
+ */
+ if (sc->sc_rxtslot != 0)
+ tp->snd_cwnd = tp->t_maxseg;
+ callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
+
+ tcpstat.tcps_accepts++;
+ return (so);
+
+abort:
+ if (so != NULL)
+ (void) soabort(so);
+ return (NULL);
+}
+
+/*
+ * This function gets called when we receive an ACK for a
+ * socket in the LISTEN state. We look up the connection
+ * in the syncache, and if it's there, we pull it out of
+ * the cache and turn it into a full-blown connection in
+ * the SYN-RECEIVED state.
+ */
+int
+syncache_expand(inc, th, sop, m)
+ struct in_conninfo *inc;
+ struct tcphdr *th;
+ struct socket **sop;
+ struct mbuf *m;
+{
+ struct syncache *sc;
+ struct syncache_head *sch;
+ struct socket *so;
+
+ sc = syncache_lookup(inc, &sch);
+ if (sc == NULL) {
+ /*
+ * There is no syncache entry, so see if this ACK is
+ * a returning syncookie. To do this, first:
+ * A. See if this socket has had a syncache entry dropped in
+ * the past. We don't want to accept a bogus syncookie
+ * if we've never received a SYN.
+ * B. check that the syncookie is valid. If it is, then
+ * cobble up a fake syncache entry, and return.
+ */
+ if (!tcp_syncookies)
+ return (0);
+ sc = syncookie_lookup(inc, th, *sop);
+ if (sc == NULL)
+ return (0);
+ sch = NULL;
+ tcpstat.tcps_sc_recvcookie++;
+ }
+
+ /*
+ * If seg contains an ACK, but not for our SYN/ACK, send a RST.
+ */
+ if (th->th_ack != sc->sc_iss + 1)
+ return (0);
+
+ so = syncache_socket(sc, *sop, m);
+ if (so == NULL) {
+#if 0
+resetandabort:
+ /* XXXjlemon check this - is this correct? */
+ (void) tcp_respond(NULL, m, m, th,
+ th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
+#endif
+ m_freem(m); /* XXX only needed for above */
+ tcpstat.tcps_sc_aborted++;
+ } else {
+ sc->sc_flags |= SCF_KEEPROUTE;
+ tcpstat.tcps_sc_completed++;
+ }
+ if (sch == NULL)
+ syncache_free(sc);
+ else
+ syncache_drop(sc, sch);
+ *sop = so;
+ return (1);
+}
+
+/*
+ * Given a LISTEN socket and an inbound SYN request, add
+ * this to the syn cache, and send back a segment:
+ * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
+ * to the source.
+ *
+ * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
+ * Doing so would require that we hold onto the data and deliver it
+ * to the application. However, if we are the target of a SYN-flood
+ * DoS attack, an attacker could send data which would eventually
+ * consume all available buffer space if it were ACKed. By not ACKing
+ * the data, we avoid this DoS scenario.
+ */
+int
+syncache_add(inc, to, th, sop, m)
+ struct in_conninfo *inc;
+ struct tcpopt *to;
+ struct tcphdr *th;
+ struct socket **sop;
+ struct mbuf *m;
+{
+ struct tcpcb *tp;
+ struct socket *so;
+ struct syncache *sc = NULL;
+ struct syncache_head *sch;
+ struct mbuf *ipopts = NULL;
+ struct rmxp_tao *taop;
+ int i, s, win;
+
+ so = *sop;
+ tp = sototcpcb(so);
+
+ /*
+ * Remember the IP options, if any.
+ */
+#ifdef INET6
+ if (!inc->inc_isipv6)
+#endif
+ ipopts = ip_srcroute();
+
+ /*
+ * See if we already have an entry for this connection.
+ * If we do, resend the SYN,ACK, and reset the retransmit timer.
+ *
+ * XXX
+ * should the syncache be re-initialized with the contents
+ * of the new SYN here (which may have different options?)
+ */
+ sc = syncache_lookup(inc, &sch);
+ if (sc != NULL) {
+ tcpstat.tcps_sc_dupsyn++;
+ if (ipopts) {
+ /*
+ * If we were remembering a previous source route,
+ * forget it and use the new one we've been given.
+ */
+ if (sc->sc_ipopts)
+ (void) m_free(sc->sc_ipopts);
+ sc->sc_ipopts = ipopts;
+ }
+ /*
+ * Update timestamp if present.
+ */
+ if (sc->sc_flags & SCF_TIMESTAMP)
+ sc->sc_tsrecent = to->to_tsval;
+ /*
+ * PCB may have changed, pick up new values.
+ */
+ sc->sc_tp = tp;
+ sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
+ if (syncache_respond(sc, m) == 0) {
+ s = splnet();
+ TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
+ sc, sc_timerq);
+ SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
+ splx(s);
+ tcpstat.tcps_sndacks++;
+ tcpstat.tcps_sndtotal++;
+ }
+ *sop = NULL;
+ return (1);
+ }
+
+ sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
+ if (sc == NULL) {
+ /*
+ * The zone allocator couldn't provide more entries.
+ * Treat this as if the cache was full; drop the oldest
+ * entry and insert the new one.
+ */
+ s = splnet();
+ for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
+ sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
+ if (sc != NULL)
+ break;
+ }
+ sc->sc_tp->ts_recent = ticks;
+ syncache_drop(sc, NULL);
+ splx(s);
+ tcpstat.tcps_sc_zonefail++;
+ sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
+ if (sc == NULL) {
+ if (ipopts)
+ (void) m_free(ipopts);
+ return (0);
+ }
+ }
+
+ /*
+ * Fill in the syncache values.
+ */
+ bzero(sc, sizeof(*sc));
+ sc->sc_tp = tp;
+ sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
+ sc->sc_ipopts = ipopts;
+ sc->sc_inc.inc_fport = inc->inc_fport;
+ sc->sc_inc.inc_lport = inc->inc_lport;
+#ifdef INET6
+ sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
+ if (inc->inc_isipv6) {
+ sc->sc_inc.inc6_faddr = inc->inc6_faddr;
+ sc->sc_inc.inc6_laddr = inc->inc6_laddr;
+ sc->sc_route6.ro_rt = NULL;
+ } else
+#endif
+ {
+ sc->sc_inc.inc_faddr = inc->inc_faddr;
+ sc->sc_inc.inc_laddr = inc->inc_laddr;
+ sc->sc_route.ro_rt = NULL;
+ }
+ sc->sc_irs = th->th_seq;
+ if (tcp_syncookies)
+ sc->sc_iss = syncookie_generate(sc);
+ else
+ sc->sc_iss = arc4random();
+
+ /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
+ win = sbspace(&so->so_rcv);
+ win = imax(win, 0);
+ win = imin(win, TCP_MAXWIN);
+ sc->sc_wnd = win;
+
+ sc->sc_flags = 0;
+ sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
+ if (tcp_do_rfc1323) {
+ /*
+ * A timestamp received in a SYN makes
+ * it ok to send timestamp requests and replies.
+ */
+ if (to->to_flags & TOF_TS) {
+ sc->sc_tsrecent = to->to_tsval;
+ sc->sc_flags |= SCF_TIMESTAMP;
+ }
+ if (to->to_flags & TOF_SCALE) {
+ int wscale = 0;
+
+ /* Compute proper scaling value from buffer space */
+ while (wscale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
+ wscale++;
+ sc->sc_request_r_scale = wscale;
+ sc->sc_requested_s_scale = to->to_requested_s_scale;
+ sc->sc_flags |= SCF_WINSCALE;
+ }
+ }
+ if (tcp_do_rfc1644) {
+ /*
+ * A CC or CC.new option received in a SYN makes
+ * it ok to send CC in subsequent segments.
+ */
+ if (to->to_flags & (TOF_CC|TOF_CCNEW)) {
+ sc->sc_cc_recv = to->to_cc;
+ sc->sc_cc_send = CC_INC(tcp_ccgen);
+ sc->sc_flags |= SCF_CC;
+ }
+ }
+ if (tp->t_flags & TF_NOOPT)
+ sc->sc_flags = SCF_NOOPT;
+
+ /*
+ * XXX
+ * We have the option here of not doing TAO (even if the segment
+ * qualifies) and instead fall back to a normal 3WHS via the syncache.
+ * This allows us to apply synflood protection to TAO-qualifying SYNs
+ * also. However, there should be a heuristic to determine when to
+ * do this, and one is not present at the moment.
+ */
+
+ /*
+ * Perform TAO test on incoming CC (SEG.CC) option, if any.
+ * - compare SEG.CC against cached CC from the same host, if any.
+ * - if SEG.CC > cached value, SYN must be new and is accepted
+ * immediately: save new CC in the cache, mark the socket
+ * connected, enter ESTABLISHED state, turn on flag to
+ * send a SYN in the next segment.
+ * A virtual advertised window is set in rcv_adv to
+ * initialize SWS prevention. Then enter normal segment
+ * processing: drop SYN, process data and FIN.
+ * - otherwise do a normal 3-way handshake.
+ */
+ taop = tcp_gettaocache(&sc->sc_inc);
+ if ((to->to_flags & TOF_CC) != 0) {
+ if (((tp->t_flags & TF_NOPUSH) != 0) &&
+ sc->sc_flags & SCF_CC &&
+ taop != NULL && taop->tao_cc != 0 &&
+ CC_GT(to->to_cc, taop->tao_cc)) {
+ sc->sc_rxtslot = 0;
+ so = syncache_socket(sc, *sop, m);
+ if (so != NULL) {
+ sc->sc_flags |= SCF_KEEPROUTE;
+ taop->tao_cc = to->to_cc;
+ *sop = so;
+ }
+ syncache_free(sc);
+ return (so != NULL);
+ }
+ } else {
+ /*
+ * No CC option, but maybe CC.NEW: invalidate cached value.
+ */
+ if (taop != NULL)
+ taop->tao_cc = 0;
+ }
+ /*
+ * TAO test failed or there was no CC option,
+ * do a standard 3-way handshake.
+ */
+ if (syncache_respond(sc, m) == 0) {
+ syncache_insert(sc, sch);
+ tcpstat.tcps_sndacks++;
+ tcpstat.tcps_sndtotal++;
+ } else {
+ syncache_free(sc);
+ tcpstat.tcps_sc_dropped++;
+ }
+ *sop = NULL;
+ return (1);
+}
+
+static int
+syncache_respond(sc, m)
+ struct syncache *sc;
+ struct mbuf *m;
+{
+ u_int8_t *optp;
+ int optlen, error;
+ u_int16_t tlen, hlen, mssopt;
+ struct ip *ip = NULL;
+ struct rtentry *rt;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+#endif
+
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ rt = tcp_rtlookup6(&sc->sc_inc);
+ if (rt != NULL)
+ mssopt = rt->rt_ifp->if_mtu -
+ (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
+ else
+ mssopt = tcp_v6mssdflt;
+ hlen = sizeof(struct ip6_hdr);
+ } else
+#endif
+ {
+ rt = tcp_rtlookup(&sc->sc_inc);
+ if (rt != NULL)
+ mssopt = rt->rt_ifp->if_mtu -
+ (sizeof(struct ip) + sizeof(struct tcphdr));
+ else
+ mssopt = tcp_mssdflt;
+ hlen = sizeof(struct ip);
+ }
+
+ /* Compute the size of the TCP options. */
+ if (sc->sc_flags & SCF_NOOPT) {
+ optlen = 0;
+ } else {
+ optlen = TCPOLEN_MAXSEG +
+ ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
+ ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) +
+ ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0);
+ }
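+ /*
+ * With all options present this is at most 4 (MSS) + 4 (NOP plus
+ * window scale) + 12 (timestamps in RFC 1323 appendix A form) +
+ * 16 (CC and CC.echo) = 36 bytes, within the 40 bytes of TCP option
+ * space. The 12 and 16 assume the conventional TCPOLEN_TSTAMP_APPA (12)
+ * and TCPOLEN_CC_APPA (8) definitions.
+ */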
+ tlen = hlen + sizeof(struct tcphdr) + optlen;
+
+ /*
+ * XXX
+ * assume that the entire packet will fit in a header mbuf
+ */
+ KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
+
+ /*
+ * XXX shouldn't this reuse the mbuf if possible ?
+ * Create the IP+TCP header from scratch.
+ */
+ if (m)
+ m_freem(m);
+
+ m = m_gethdr(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return (ENOBUFS);
+ m->m_data += max_linkhdr;
+ m->m_len = tlen;
+ m->m_pkthdr.len = tlen;
+ m->m_pkthdr.rcvif = NULL;
+
+#ifdef IPSEC
+ /* use IPsec policy on listening socket to send SYN,ACK */
+ if (ipsec_setsocket(m, sc->sc_tp->t_inpcb->inp_socket) != 0) {
+ m_freem(m);
+ return (ENOBUFS);
+ }
+#endif
+
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_src = sc->sc_inc.inc6_laddr;
+ ip6->ip6_dst = sc->sc_inc.inc6_faddr;
+ ip6->ip6_plen = htons(tlen - hlen);
+ /* ip6_hlim is set after checksum */
+ /* ip6_flow = ??? */
+
+ th = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif
+ {
+ ip = mtod(m, struct ip *);
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(struct ip) >> 2;
+ ip->ip_len = tlen;
+ ip->ip_id = 0;
+ ip->ip_off = 0;
+ ip->ip_sum = 0;
+ ip->ip_p = IPPROTO_TCP;
+ ip->ip_src = sc->sc_inc.inc_laddr;
+ ip->ip_dst = sc->sc_inc.inc_faddr;
+ ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl; /* XXX */
+ ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos; /* XXX */
+
+ /*
+ * See if we should do MTU discovery. We do it only if the following
+ * are true:
+ * 1) we have a valid route to the destination
+ * 2) the MTU is not locked (if it is, then discovery has been
+ * disabled)
+ */
+ if (path_mtu_discovery
+ && (rt != NULL)
+ && rt->rt_flags & RTF_UP
+ && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
+ ip->ip_off |= IP_DF;
+ }
+
+ th = (struct tcphdr *)(ip + 1);
+ }
+ th->th_sport = sc->sc_inc.inc_lport;
+ th->th_dport = sc->sc_inc.inc_fport;
+
+ th->th_seq = htonl(sc->sc_iss);
+ th->th_ack = htonl(sc->sc_irs + 1);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ th->th_x2 = 0;
+ th->th_flags = TH_SYN|TH_ACK;
+ th->th_win = htons(sc->sc_wnd);
+ th->th_urp = 0;
+
+ /* Tack on the TCP options. */
+ if (optlen == 0)
+ goto no_options;
+ optp = (u_int8_t *)(th + 1);
+ *optp++ = TCPOPT_MAXSEG;
+ *optp++ = TCPOLEN_MAXSEG;
+ *optp++ = (mssopt >> 8) & 0xff;
+ *optp++ = mssopt & 0xff;
+
+ if (sc->sc_flags & SCF_WINSCALE) {
+ *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
+ TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
+ sc->sc_request_r_scale);
+ optp += 4;
+ }
+
+ if (sc->sc_flags & SCF_TIMESTAMP) {
+ u_int32_t *lp = (u_int32_t *)(optp);
+
+ /* Form timestamp option as shown in appendix A of RFC 1323. */
+ *lp++ = htonl(TCPOPT_TSTAMP_HDR);
+ *lp++ = htonl(ticks);
+ *lp = htonl(sc->sc_tsrecent);
+ optp += TCPOLEN_TSTAMP_APPA;
+ }
+
+ /*
+ * Send CC and CC.echo if we received CC from our peer.
+ */
+ if (sc->sc_flags & SCF_CC) {
+ u_int32_t *lp = (u_int32_t *)(optp);
+
+ *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
+ *lp++ = htonl(sc->sc_cc_send);
+ *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO));
+ *lp = htonl(sc->sc_cc_recv);
+ optp += TCPOLEN_CC_APPA * 2;
+ }
+no_options:
+
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ struct route_in6 *ro6 = &sc->sc_route6;
+
+ th->th_sum = 0;
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
+ ip6->ip6_hlim = in6_selecthlim(NULL,
+ ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
+ error = ip6_output(m, NULL, ro6, 0, NULL, NULL);
+ } else
+#endif
+ {
+ th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(tlen - hlen + IPPROTO_TCP));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL);
+ }
+ return (error);
+}
+
+/*
+ * cookie layers:
+ *
+ * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
+ * | peer iss |
+ * | MD5(laddr,faddr,lport,fport,secret) |. . . . . . .|
+ * | 0 |(A)| |
+ * (A): peer mss index
+ */
+
+/*
+ * The values below are chosen to minimize the size of the tcp_secret
+ * table, as well as providing roughly a 4 second lifetime for the cookie.
+ */
+
+#define SYNCOOKIE_HASHSHIFT 2 /* log2(# of 32bit words from hash) */
+#define SYNCOOKIE_WNDBITS 7 /* exposed bits for window indexing */
+#define SYNCOOKIE_TIMESHIFT 5 /* scale ticks to window time units */
+
+#define SYNCOOKIE_HASHMASK ((1 << SYNCOOKIE_HASHSHIFT) - 1)
+#define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1)
+#define SYNCOOKIE_NSECRETS (1 << (SYNCOOKIE_WNDBITS - SYNCOOKIE_HASHSHIFT))
+#define SYNCOOKIE_TIMEOUT \
+ (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
+#define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
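+
+/*
+ * For reference, with the values above: SYNCOOKIE_WNDMASK is 0x7f,
+ * SYNCOOKIE_NSECRETS is 32, and SYNCOOKIE_TIMEOUT works out to
+ * hz * 128 / 32 = 4 * hz, i.e. the roughly four second lifetime noted
+ * above.  SYNCOOKIE_DATAMASK covers bits 0-8: seven window-index bits
+ * plus two bits that select an entry in tcp_msstab[] below.
+ */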
+
+static struct {
+ u_int32_t ts_secbits;
+ u_int ts_expire;
+} tcp_secret[SYNCOOKIE_NSECRETS];
+
+static int tcp_msstab[] = { 0, 536, 1460, 8960 };
+
+static MD5_CTX syn_ctx;
+
+#define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v))
+
+/*
+ * Consider the problem of a recreated (and retransmitted) cookie. If the
+ * original SYN was accepted, the connection is established. The second
+ * SYN is inflight, and if it arrives with an ISN that falls within the
+ * receive window, the connection is killed.
+ *
+ * However, since cookies have other problems, this may not be worth
+ * worrying about.
+ */
+
+static u_int32_t
+syncookie_generate(struct syncache *sc)
+{
+ u_int32_t md5_buffer[4];
+ u_int32_t data;
+ int wnd, idx;
+
+ wnd = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
+ idx = wnd >> SYNCOOKIE_HASHSHIFT;
+ if (tcp_secret[idx].ts_expire < ticks) {
+ tcp_secret[idx].ts_secbits = arc4random();
+ tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
+ }
+ for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
+ if (tcp_msstab[data] <= sc->sc_peer_mss)
+ break;
+ data = (data << SYNCOOKIE_WNDBITS) | wnd;
+ data ^= sc->sc_irs; /* peer's iss */
+ MD5Init(&syn_ctx);
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6) {
+ MD5Add(sc->sc_inc.inc6_laddr);
+ MD5Add(sc->sc_inc.inc6_faddr);
+ } else
+#endif
+ {
+ MD5Add(sc->sc_inc.inc_laddr);
+ MD5Add(sc->sc_inc.inc_faddr);
+ }
+ MD5Add(sc->sc_inc.inc_lport);
+ MD5Add(sc->sc_inc.inc_fport);
+ MD5Add(tcp_secret[idx].ts_secbits);
+ MD5Final((u_char *)&md5_buffer, &syn_ctx);
+ data ^= (md5_buffer[wnd & SYNCOOKIE_HASHMASK] & ~SYNCOOKIE_WNDMASK);
+ return (data);
+}
+
+static struct syncache *
+syncookie_lookup(inc, th, so)
+ struct in_conninfo *inc;
+ struct tcphdr *th;
+ struct socket *so;
+{
+ u_int32_t md5_buffer[4];
+ struct syncache *sc;
+ u_int32_t data;
+ int wnd, idx;
+
+ data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */
+ wnd = data & SYNCOOKIE_WNDMASK;
+ idx = wnd >> SYNCOOKIE_HASHSHIFT;
+ if (tcp_secret[idx].ts_expire < ticks ||
+ sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
+ return (NULL);
+ MD5Init(&syn_ctx);
+#ifdef INET6
+ if (inc->inc_isipv6) {
+ MD5Add(inc->inc6_laddr);
+ MD5Add(inc->inc6_faddr);
+ } else
+#endif
+ {
+ MD5Add(inc->inc_laddr);
+ MD5Add(inc->inc_faddr);
+ }
+ MD5Add(inc->inc_lport);
+ MD5Add(inc->inc_fport);
+ MD5Add(tcp_secret[idx].ts_secbits);
+ MD5Final((u_char *)&md5_buffer, &syn_ctx);
+ data ^= md5_buffer[wnd & SYNCOOKIE_HASHMASK];
+ if ((data & ~SYNCOOKIE_DATAMASK) != 0)
+ return (NULL);
+ data = data >> SYNCOOKIE_WNDBITS;
+
+ sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
+ if (sc == NULL)
+ return (NULL);
+ /*
+ * Fill in the syncache values.
+ * XXX duplicate code from syncache_add
+ */
+ sc->sc_ipopts = NULL;
+ sc->sc_inc.inc_fport = inc->inc_fport;
+ sc->sc_inc.inc_lport = inc->inc_lport;
+#ifdef INET6
+ sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
+ if (inc->inc_isipv6) {
+ sc->sc_inc.inc6_faddr = inc->inc6_faddr;
+ sc->sc_inc.inc6_laddr = inc->inc6_laddr;
+ sc->sc_route6.ro_rt = NULL;
+ } else
+#endif
+ {
+ sc->sc_inc.inc_faddr = inc->inc_faddr;
+ sc->sc_inc.inc_laddr = inc->inc_laddr;
+ sc->sc_route.ro_rt = NULL;
+ }
+ sc->sc_irs = th->th_seq - 1;
+ sc->sc_iss = th->th_ack - 1;
+ wnd = sbspace(&so->so_rcv);
+ wnd = imax(wnd, 0);
+ wnd = imin(wnd, TCP_MAXWIN);
+ sc->sc_wnd = wnd;
+ sc->sc_flags = 0;
+ sc->sc_rxtslot = 0;
+ sc->sc_peer_mss = tcp_msstab[data];
+ return (sc);
+}
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
new file mode 100644
index 0000000..82cf3c5
--- /dev/null
+++ b/sys/netinet/tcp_timer.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_inet6.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+
+#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
+
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+
+static int
+sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
+{
+ int error, s, tt;
+
+ tt = *(int *)oidp->oid_arg1;
+ s = tt * 1000 / hz;
+
+ error = sysctl_handle_int(oidp, &s, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ tt = s * hz / 1000;
+ if (tt < 1)
+ return (EINVAL);
+
+ *(int *)oidp->oid_arg1 = tt;
+ return (0);
+}
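+
+/*
+ * For example (a sketch, assuming hz = 100): writing 30000 (milliseconds)
+ * through one of the sysctls below stores 30000 * hz / 1000 = 3000 ticks,
+ * and reading the value back converts 3000 * 1000 / hz to 30000 ms again.
+ */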
+
+int tcp_keepinit;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
+
+int tcp_keepidle;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
+
+int tcp_keepintvl;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
+
+int tcp_delacktime;
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
+ CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
+ "Time before a delayed ACK is sent");
+
+int tcp_msl;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
+
+static int always_keepalive = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
+ &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
+
+static int tcp_keepcnt = TCPTV_KEEPCNT;
+ /* max idle probes */
+int tcp_maxpersistidle;
+ /* max idle time in persist */
+int tcp_maxidle;
+
+/*
+ * Tcp protocol timeout routine called every 500 ms.
+ * Updates timestamps used for TCP and causes finite state machine
+ * actions if timers expire.
+ */
+void
+tcp_slowtimo()
+{
+ int s;
+
+ s = splnet();
+
+ tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
+
+ splx(s);
+}
+
+/*
+ * Cancel all timers for TCP tp.
+ */
+void
+tcp_canceltimers(tp)
+ struct tcpcb *tp;
+{
+ callout_stop(tp->tt_2msl);
+ callout_stop(tp->tt_persist);
+ callout_stop(tp->tt_keep);
+ callout_stop(tp->tt_rexmt);
+}
+
+int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
+
+int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
+
+static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
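+/* 511 = (1 + 2 + 4 + 8 + 16 + 32) + 7 * 64, the sum of tcp_backoff[] above */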
+
+/*
+ * TCP timer processing.
+ */
+
+void
+tcp_timer_delack(xtp)
+ void *xtp;
+{
+ struct tcpcb *tp = xtp;
+ int s;
+ struct inpcb *inp;
+
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = tp->t_inpcb;
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) {
+ INP_UNLOCK(inp);
+ splx(s);
+ return;
+ }
+ callout_deactivate(tp->tt_delack);
+
+ tp->t_flags |= TF_ACKNOW;
+ tcpstat.tcps_delack++;
+ (void) tcp_output(tp);
+ INP_UNLOCK(inp);
+ splx(s);
+}
+
+void
+tcp_timer_2msl(xtp)
+ void *xtp;
+{
+ struct tcpcb *tp = xtp;
+ int s;
+ struct inpcb *inp;
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ s = splnet();
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = tp->t_inpcb;
+ INP_LOCK(inp);
+ if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
+ INP_UNLOCK(tp->t_inpcb);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return;
+ }
+ callout_deactivate(tp->tt_2msl);
+ /*
+ * 2 MSL timeout in shutdown went off. If we're closed but
+ * still waiting for peer to close and connection has been idle
+ * too long, or if 2MSL time is up from TIME_WAIT, delete connection
+ * control block. Otherwise, check again in a bit.
+ */
+ if (tp->t_state != TCPS_TIME_WAIT &&
+ (ticks - tp->t_rcvtime) <= tcp_maxidle)
+ callout_reset(tp->tt_2msl, tcp_keepintvl,
+ tcp_timer_2msl, tp);
+ else
+ tp = tcp_close(tp);
+
+#ifdef TCPDEBUG
+ if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp)
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+}
+
+void
+tcp_timer_keep(xtp)
+ void *xtp;
+{
+ struct tcpcb *tp = xtp;
+ struct tcptemp *t_template;
+ int s;
+ struct inpcb *inp;
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ s = splnet();
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = tp->t_inpcb;
+ INP_LOCK(inp);
+ if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) {
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return;
+ }
+ callout_deactivate(tp->tt_keep);
+ /*
+ * Keep-alive timer went off; send something
+ * or drop connection if idle for too long.
+ */
+ tcpstat.tcps_keeptimeo++;
+ if (tp->t_state < TCPS_ESTABLISHED)
+ goto dropit;
+ if ((always_keepalive ||
+ tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
+ tp->t_state <= TCPS_CLOSING) {
+ if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle)
+ goto dropit;
+ /*
+ * Send a packet designed to force a response
+ * if the peer is up and reachable:
+ * either an ACK if the connection is still alive,
+ * or an RST if the peer has closed the connection
+ * due to timeout or reboot.
+ * Using sequence number tp->snd_una-1
+ * causes the transmitted zero-length segment
+ * to lie outside the receive window;
+ * by the protocol spec, this requires the
+ * correspondent TCP to respond.
+ */
+ tcpstat.tcps_keepprobe++;
+ t_template = tcp_maketemplate(tp);
+ if (t_template) {
+ tcp_respond(tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ tp->rcv_nxt, tp->snd_una - 1, 0);
+ (void) m_free(dtom(t_template));
+ }
+ callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp);
+ } else
+ callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+
+#ifdef TCPDEBUG
+ if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return;
+
+dropit:
+ tcpstat.tcps_keepdrops++;
+ tp = tcp_drop(tp, ETIMEDOUT);
+
+#ifdef TCPDEBUG
+ if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+}
+
+void
+tcp_timer_persist(xtp)
+ void *xtp;
+{
+ struct tcpcb *tp = xtp;
+ int s;
+ struct inpcb *inp;
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ s = splnet();
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = tp->t_inpcb;
+ INP_LOCK(inp);
+ if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return;
+ }
+ callout_deactivate(tp->tt_persist);
+ /*
+ * Persistence timer into zero window.
+ * Force a byte to be output, if possible.
+ */
+ tcpstat.tcps_persisttimeo++;
+ /*
+ * Hack: if the peer is dead/unreachable, we do not
+ * time out if the window is closed. After a full
+ * backoff, drop the connection if the idle time
+ * (no responses to probes) reaches the maximum
+ * backoff that we would use if retransmitting.
+ */
+ if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+ ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle ||
+ (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ tcpstat.tcps_persistdrop++;
+ tp = tcp_drop(tp, ETIMEDOUT);
+ goto out;
+ }
+ tcp_setpersist(tp);
+ tp->t_force = 1;
+ (void) tcp_output(tp);
+ tp->t_force = 0;
+
+out:
+#ifdef TCPDEBUG
+ if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp)
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+}
+
+void
+tcp_timer_rexmt(xtp)
+ void *xtp;
+{
+ struct tcpcb *tp = xtp;
+ int s;
+ int rexmt;
+ int headlocked;
+ struct inpcb *inp;
+#ifdef TCPDEBUG
+ int ostate;
+
+ ostate = tp->t_state;
+#endif
+ s = splnet();
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+ inp = tp->t_inpcb;
+ INP_LOCK(inp);
+ if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) {
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return;
+ }
+ callout_deactivate(tp->tt_rexmt);
+ /*
+ * Retransmission timer went off. Message has not
+ * been acked within retransmit interval. Back off
+ * to a longer retransmit interval and retransmit one segment.
+ */
+ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ tp->t_rxtshift = TCP_MAXRXTSHIFT;
+ tcpstat.tcps_timeoutdrop++;
+ tp = tcp_drop(tp, tp->t_softerror ?
+ tp->t_softerror : ETIMEDOUT);
+ goto out;
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ if (tp->t_rxtshift == 1) {
+ /*
+ * first retransmit; record ssthresh and cwnd so they can
+ * be recovered if this turns out to be a "bad" retransmit.
+ * A retransmit is considered "bad" if an ACK for this
+ * segment is received within RTT/2 interval; the assumption
+ * here is that the ACK was already in flight. See
+ * "On Estimating End-to-End Network Path Properties" by
+ * Allman and Paxson for more details.
+ */
+ tp->snd_cwnd_prev = tp->snd_cwnd;
+ tp->snd_ssthresh_prev = tp->snd_ssthresh;
+ tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ }
+ tcpstat.tcps_rexmttimeo++;
+ if (tp->t_state == TCPS_SYN_SENT)
+ rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ TCPT_RANGESET(tp->t_rxtcur, rexmt,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ /*
+ * Disable rfc1323 and rfc1644 if we haven't got any response to
+ * our third SYN, to work around some broken terminal servers
+ * (most of which have hopefully been retired) that have bad VJ
+ * header compression code which trashes TCP segments containing
+ * unknown-to-them TCP options.
+ */
+ if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
+ tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
+ /*
+ * If losing, let the lower level know and try for
+ * a better route. Also, if we backed off this far,
+ * our srtt estimate is probably bogus. Clobber it
+ * so we'll take the next rtt measurement as our srtt;
+ * move the current srtt into rttvar to keep the current
+ * retransmit times until then.
+ */
+ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ in6_losing(tp->t_inpcb);
+ else
+#endif
+ in_losing(tp->t_inpcb);
+ tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
+ tp->t_srtt = 0;
+ }
+ tp->snd_nxt = tp->snd_una;
+ /*
+ * Note: We overload snd_recover to function also as the
+ * snd_last variable described in RFC 2582
+ */
+ tp->snd_recover = tp->snd_max;
+ /*
+ * Force a segment to be sent.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ /*
+ * If timing a segment in this window, stop the timer.
+ */
+ tp->t_rtttime = 0;
+ /*
+ * Close the congestion window down to one segment
+ * (we'll open it by one segment for each ack we get).
+ * Since we probably have a window's worth of unacked
+ * data accumulated, this "slow start" keeps us from
+ * dumping all that data as back-to-back packets (which
+ * might overwhelm an intermediate gateway).
+ *
+ * There are two phases to the opening: Initially we
+ * open by one mss on each ack. This makes the window
+ * size increase exponentially with time. If the
+ * window is larger than the path can handle, this
+ * exponential growth results in dropped packet(s)
+ * almost immediately. To get more time between
+ * drops but still "push" the network to take advantage
+ * of improving conditions, we switch from exponential
+ * to linear window opening at some threshold size.
+ * For a threshold, we use half the current window
+ * size, truncated to a multiple of the mss.
+ *
+ * (the minimum cwnd that will give us exponential
+ * growth is 2 mss. We don't allow the threshold
+ * to go below this.)
+ */
+ {
+ u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_cwnd = tp->t_maxseg;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ tp->t_dupacks = 0;
+ }
+ (void) tcp_output(tp);
+
+out:
+#ifdef TCPDEBUG
+ if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
+ PRU_SLOWTIMO);
+#endif
+ if (tp)
+ INP_UNLOCK(inp);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+}
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
new file mode 100644
index 0000000..ff86d2a
--- /dev/null
+++ b/sys/netinet/tcp_timer.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_TIMER_H_
+#define _NETINET_TCP_TIMER_H_
+
+/*
+ * The TCPT_REXMT timer is used to force retransmissions.
+ * The TCP has the TCPT_REXMT timer set whenever segments
+ * have been sent for which ACKs are expected but not yet
+ * received. If an ACK is received which advances tp->snd_una,
+ * then the retransmit timer is cleared (if there are no more
+ * outstanding segments) or reset to the base value (if there
+ * are more ACKs expected). Whenever the retransmit timer goes off,
+ * we retransmit one unacknowledged segment, and do a backoff
+ * on the retransmit timer.
+ *
+ * The TCPT_PERSIST timer is used to keep window size information
+ * flowing even if the window goes shut. If all previous transmissions
+ * have been acknowledged (so that there are no retransmissions in progress),
+ * and the window is too small to bother sending anything, then we start
+ * the TCPT_PERSIST timer. When it expires, if the window is nonzero,
+ * we go to transmit state. Otherwise, at intervals send a single byte
+ * into the peer's window to force him to update our window information.
+ * We do this at most as often as TCPT_PERSMIN time intervals,
+ * but no more frequently than the current estimate of round-trip
+ * packet time. The TCPT_PERSIST timer is cleared whenever we receive
+ * a window update from the peer.
+ *
+ * The TCPT_KEEP timer is used to keep connections alive.  If a
+ * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time,
+ * but not yet established, then we drop the connection. Once the connection
+ * is established, if the connection is idle for TCPTV_KEEP_IDLE time
+ * (and keepalives have been enabled on the socket), we begin to probe
+ * the connection. We force the peer to send us a segment by sending:
+ * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK>
+ * This segment is (deliberately) outside the window, and should elicit
+ * an ack segment in response from the peer. If, despite the TCPT_KEEP
+ * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE
+ * amount of time probing, then we drop the connection.
+ */
+
+/*
+ * Time constants.
+ */
+#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */
+#define TCPTV_SRTTBASE 0 /* base roundtrip time;
+ if 0, no idea yet */
+#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */
+#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */
+
+#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */
+#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */
+
+#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */
+#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */
+#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */
+#define TCPTV_KEEPCNT 8 /* max probes before drop */
+
+#define TCPTV_MIN ( 1*hz) /* minimum allowable value */
+#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */
+
+#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */
+
+#define TCP_LINGERTIME 120 /* linger at most 2 minutes */
+
+#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */
+
+#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */
+
+#ifdef TCPTIMERS
+static char *tcptimers[] =
+ { "REXMT", "PERSIST", "KEEP", "2MSL" };
+#endif
+
+/*
+ * Force a time value to be in a certain range.
+ */
+#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
+ (tv) = (value); \
+ if ((u_long)(tv) < (u_long)(tvmin)) \
+ (tv) = (tvmin); \
+ else if ((u_long)(tv) > (u_long)(tvmax)) \
+ (tv) = (tvmax); \
+} while(0)
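+
+/*
+ * Example use (see tcp_timer.c): the retransmit timer clamps its backed-off
+ * timeout with
+ *     TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+ * so t_rxtcur never drops below tp->t_rttmin nor exceeds TCPTV_REXMTMAX.
+ */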
+
+#ifdef _KERNEL
+extern int tcp_keepinit; /* time to establish connection */
+extern int tcp_keepidle; /* time before keepalive probes begin */
+extern int tcp_keepintvl; /* time between keepalive probes */
+extern int tcp_maxidle; /* time to drop after starting probes */
+extern int tcp_delacktime; /* time before sending a delayed ACK */
+extern int tcp_maxpersistidle;
+extern int tcp_msl;
+extern int tcp_ttl; /* time to live for TCP segs */
+extern int tcp_backoff[];
+
+void tcp_timer_2msl(void *xtp);
+void tcp_timer_keep(void *xtp);
+void tcp_timer_persist(void *xtp);
+void tcp_timer_rexmt(void *xtp);
+void tcp_timer_delack(void *xtp);
+
+#endif /* _KERNEL */
+
+#endif /* !_NETINET_TCP_TIMER_H_ */
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
new file mode 100644
index 0000000..f7800d2
--- /dev/null
+++ b/sys/netinet/tcp_timewait.c
@@ -0,0 +1,1510 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#include "opt_compat.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/random.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/if.h>
+
+#define _IP_VHL
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet6/ip6protosw.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#ifdef INET6
+#include <netinet6/ipsec6.h>
+#endif
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+#include <sys/md5.h>
+
+int tcp_mssdflt = TCP_MSS;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
+ &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
+
+#ifdef INET6
+int tcp_v6mssdflt = TCP6_MSS;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
+ CTLFLAG_RW, &tcp_v6mssdflt , 0,
+ "Default TCP Maximum Segment Size for IPv6");
+#endif
+
+#if 0
+static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
+ &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
+#endif
+
+int tcp_do_rfc1323 = 1;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
+ &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
+
+int tcp_do_rfc1644 = 0;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
+ &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
+
+static int tcp_tcbhashsize = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
+ &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
+
+static int do_tcpdrain = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
+ "Enable tcp_drain routine for extra help when low on mbufs");
+
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
+ &tcbinfo.ipi_count, 0, "Number of active PCBs");
+
+static int icmp_may_rst = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
+ "Certain ICMP unreachable messages may abort connections in SYN_SENT");
+
+static int tcp_isn_reseed_interval = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+ &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+
+static void tcp_cleartaocache(void);
+static struct inpcb *tcp_notify(struct inpcb *, int);
+
+/*
+ * Target size of TCP PCB hash tables. Must be a power of two.
+ *
+ * Note that this can be overridden by the kernel environment
+ * variable net.inet.tcp.tcbhashsize
+ */
+#ifndef TCBHASHSIZE
+#define TCBHASHSIZE 512
+#endif
+
+/*
+ * This is the actual shape of what we allocate using the zone
+ * allocator. Doing it this way allows us to protect both structures
+ * using the same generation count, and also eliminates the overhead
+ * of allocating tcpcbs separately. By hiding the structure here,
+ * we avoid changing most of the rest of the code (although it needs
+ * to be changed, eventually, for greater efficiency).
+ */
+#define ALIGNMENT 32
+#define ALIGNM1 (ALIGNMENT - 1)
+struct inp_tp {
+ union {
+ struct inpcb inp;
+ char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
+ } inp_tp_u;
+ struct tcpcb tcb;
+ struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
+ struct callout inp_tp_delack;
+};
+#undef ALIGNMENT
+#undef ALIGNM1
+
+/*
+ * Tcp initialization
+ */
+void
+tcp_init()
+{
+ int hashsize = TCBHASHSIZE;
+
+ tcp_ccgen = 1;
+ tcp_cleartaocache();
+
+ tcp_delacktime = TCPTV_DELACK;
+ tcp_keepinit = TCPTV_KEEP_INIT;
+ tcp_keepidle = TCPTV_KEEP_IDLE;
+ tcp_keepintvl = TCPTV_KEEPINTVL;
+ tcp_maxpersistidle = TCPTV_KEEP_IDLE;
+ tcp_msl = TCPTV_MSL;
+
+ INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
+ LIST_INIT(&tcb);
+ tcbinfo.listhead = &tcb;
+ TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
+ if (!powerof2(hashsize)) {
+ printf("WARNING: TCB hash size not a power of 2\n");
+ hashsize = 512; /* safe default */
+ }
+ tcp_tcbhashsize = hashsize;
+ tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
+ tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
+ &tcbinfo.porthashmask);
+ tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
+#ifdef INET6
+#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
+#else /* INET6 */
+#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
+#endif /* INET6 */
+ if (max_protohdr < TCP_MINPROTOHDR)
+ max_protohdr = TCP_MINPROTOHDR;
+ if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
+ panic("tcp_init");
+#undef TCP_MINPROTOHDR
+
+ syncache_init();
+}
+
+/*
+ * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
+ * tcp_template used to store this data in mbufs, but we now recopy it out
+ * of the tcpcb each time to conserve mbufs.
+ */
+void
+tcp_fillheaders(tp, ip_ptr, tcp_ptr)
+ struct tcpcb *tp;
+ void *ip_ptr;
+ void *tcp_ptr;
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ struct ip6_hdr *ip6;
+
+ ip6 = (struct ip6_hdr *)ip_ptr;
+ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
+ (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
+ ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
+ (IPV6_VERSION & IPV6_VERSION_MASK);
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = sizeof(struct tcphdr);
+ ip6->ip6_src = inp->in6p_laddr;
+ ip6->ip6_dst = inp->in6p_faddr;
+ tcp_hdr->th_sum = 0;
+ } else
+#endif
+ {
+ struct ip *ip = (struct ip *) ip_ptr;
+
+ ip->ip_vhl = IP_VHL_BORING;
+ ip->ip_tos = 0;
+ ip->ip_len = 0;
+ ip->ip_id = 0;
+ ip->ip_off = 0;
+ ip->ip_ttl = 0;
+ ip->ip_sum = 0;
+ ip->ip_p = IPPROTO_TCP;
+ ip->ip_src = inp->inp_laddr;
+ ip->ip_dst = inp->inp_faddr;
+ tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(sizeof(struct tcphdr) + IPPROTO_TCP));
+ }
+
+ tcp_hdr->th_sport = inp->inp_lport;
+ tcp_hdr->th_dport = inp->inp_fport;
+ tcp_hdr->th_seq = 0;
+ tcp_hdr->th_ack = 0;
+ tcp_hdr->th_x2 = 0;
+ tcp_hdr->th_off = 5;
+ tcp_hdr->th_flags = 0;
+ tcp_hdr->th_win = 0;
+ tcp_hdr->th_urp = 0;
+}
+
+/*
+ * Create template to be used to send tcp packets on a connection.
+ * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
+ * use for this function is in keepalives, which use tcp_respond.
+ */
+struct tcptemp *
+tcp_maketemplate(tp)
+ struct tcpcb *tp;
+{
+ struct mbuf *m;
+ struct tcptemp *n;
+
+ m = m_get(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return (0);
+ m->m_len = sizeof(struct tcptemp);
+ n = mtod(m, struct tcptemp *);
+
+ tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
+ return (n);
+}
+
+/*
+ * Send a single message to the TCP at address specified by
+ * the given TCP/IP header. If m == 0, then we make a copy
+ * of the tcpiphdr at ti and send directly to the addressed host.
+ * This is used to force keep alive messages out using the TCP
+ * template for a connection. If flags are given then we send
+ * a message back to the TCP which originated the segment ti,
+ * and discard the mbuf containing it and any other attached mbufs.
+ *
+ * In any case the ack and sequence number of the transmitted
+ * segment are as specified by the parameters.
+ *
+ * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
+ */
+void
+tcp_respond(tp, ipgen, th, m, ack, seq, flags)
+ struct tcpcb *tp;
+ void *ipgen;
+ register struct tcphdr *th;
+ register struct mbuf *m;
+ tcp_seq ack, seq;
+ int flags;
+{
+ register int tlen;
+ int win = 0;
+ struct route *ro = 0;
+ struct route sro;
+ struct ip *ip;
+ struct tcphdr *nth;
+#ifdef INET6
+ struct route_in6 *ro6 = 0;
+ struct route_in6 sro6;
+ struct ip6_hdr *ip6;
+ int isipv6;
+#endif /* INET6 */
+ int ipflags = 0;
+
+#ifdef INET6
+ isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
+ ip6 = ipgen;
+#endif /* INET6 */
+ ip = ipgen;
+
+ if (tp) {
+ if (!(flags & TH_RST)) {
+ win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
+ if (win > (long)TCP_MAXWIN << tp->rcv_scale)
+ win = (long)TCP_MAXWIN << tp->rcv_scale;
+ }
+#ifdef INET6
+ if (isipv6)
+ ro6 = &tp->t_inpcb->in6p_route;
+ else
+#endif /* INET6 */
+ ro = &tp->t_inpcb->inp_route;
+ } else {
+#ifdef INET6
+ if (isipv6) {
+ ro6 = &sro6;
+ bzero(ro6, sizeof *ro6);
+ } else
+#endif /* INET6 */
+ {
+ ro = &sro;
+ bzero(ro, sizeof *ro);
+ }
+ }
+ if (m == 0) {
+ m = m_gethdr(M_DONTWAIT, MT_HEADER);
+ if (m == NULL)
+ return;
+ tlen = 0;
+ m->m_data += max_linkhdr;
+#ifdef INET6
+ if (isipv6) {
+ bcopy((caddr_t)ip6, mtod(m, caddr_t),
+ sizeof(struct ip6_hdr));
+ ip6 = mtod(m, struct ip6_hdr *);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
+ ip = mtod(m, struct ip *);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
+ flags = TH_ACK;
+ } else {
+ m_freem(m->m_next);
+ m->m_next = 0;
+ m->m_data = (caddr_t)ipgen;
+ /* m_len is set later */
+ tlen = 0;
+#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
+#ifdef INET6
+ if (isipv6) {
+ xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
+ nth = (struct tcphdr *)(ip6 + 1);
+ } else
+#endif /* INET6 */
+ {
+ xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
+ nth = (struct tcphdr *)(ip + 1);
+ }
+ if (th != nth) {
+ /*
+ * this is usually a case when an extension header
+ * exists between the IPv6 header and the
+ * TCP header.
+ */
+ nth->th_sport = th->th_sport;
+ nth->th_dport = th->th_dport;
+ }
+ xchg(nth->th_dport, nth->th_sport, n_short);
+#undef xchg
+ }
+#ifdef INET6
+ if (isipv6) {
+ ip6->ip6_flow = 0;
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_TCP;
+ ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
+ tlen));
+ tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
+ } else
+#endif
+ {
+ tlen += sizeof (struct tcpiphdr);
+ ip->ip_len = tlen;
+ ip->ip_ttl = ip_defttl;
+ }
+ m->m_len = tlen;
+ m->m_pkthdr.len = tlen;
+ m->m_pkthdr.rcvif = (struct ifnet *) 0;
+ nth->th_seq = htonl(seq);
+ nth->th_ack = htonl(ack);
+ nth->th_x2 = 0;
+ nth->th_off = sizeof (struct tcphdr) >> 2;
+ nth->th_flags = flags;
+ if (tp)
+ nth->th_win = htons((u_short) (win >> tp->rcv_scale));
+ else
+ nth->th_win = htons((u_short)win);
+ nth->th_urp = 0;
+#ifdef INET6
+ if (isipv6) {
+ nth->th_sum = 0;
+ nth->th_sum = in6_cksum(m, IPPROTO_TCP,
+ sizeof(struct ip6_hdr),
+ tlen - sizeof(struct ip6_hdr));
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
+ ro6 && ro6->ro_rt ?
+ ro6->ro_rt->rt_ifp :
+ NULL);
+ } else
+#endif /* INET6 */
+ {
+ nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ }
+#ifdef TCPDEBUG
+ if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
+ tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
+#endif
+#ifdef IPSEC
+ if (ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
+ m_freem(m);
+ return;
+ }
+#endif
+#ifdef INET6
+ if (isipv6) {
+ (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL);
+ if (ro6 == &sro6 && ro6->ro_rt) {
+ RTFREE(ro6->ro_rt);
+ ro6->ro_rt = NULL;
+ }
+ } else
+#endif /* INET6 */
+ {
+ (void) ip_output(m, NULL, ro, ipflags, NULL);
+ if (ro == &sro && ro->ro_rt) {
+ RTFREE(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ }
+}
+
+/*
+ * Create a new TCP control block, making an
+ * empty reassembly queue and hooking it to the argument
+ * protocol control block. The `inp' parameter must have
+ * come from the zone allocator set up in tcp_init().
+ */
+struct tcpcb *
+tcp_newtcpcb(inp)
+ struct inpcb *inp;
+{
+ struct inp_tp *it;
+ register struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ it = (struct inp_tp *)inp;
+ tp = &it->tcb;
+ bzero((char *) tp, sizeof(struct tcpcb));
+ LIST_INIT(&tp->t_segq);
+ tp->t_maxseg = tp->t_maxopd =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+
+ /* Set up our timeouts. */
+ callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
+ callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
+ callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
+ callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
+ callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
+
+ if (tcp_do_rfc1323)
+ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
+ if (tcp_do_rfc1644)
+ tp->t_flags |= TF_REQ_CC;
+ tp->t_inpcb = inp; /* XXX */
+ /*
+ * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
+ * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
+ * a reasonable initial retransmit time.
+ */
+ tp->t_srtt = TCPTV_SRTTBASE;
+ tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
+ tp->t_rttmin = TCPTV_MIN;
+ tp->t_rxtcur = TCPTV_RTOBASE;
+ tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->t_rcvtime = ticks;
+ /*
+ * IPv4 TTL initialization is necessary for an IPv6 socket as well,
+ * because the socket may be bound to an IPv6 wildcard address,
+ * which may match an IPv4-mapped IPv6 address.
+ */
+ inp->inp_ip_ttl = ip_defttl;
+ inp->inp_ppcb = (caddr_t)tp;
+ return (tp); /* XXX */
+}
+
+/*
+ * Drop a TCP connection, reporting
+ * the specified error. If connection is synchronized,
+ * then send a RST to peer.
+ */
+struct tcpcb *
+tcp_drop(tp, errno)
+ register struct tcpcb *tp;
+ int errno;
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_state = TCPS_CLOSED;
+ (void) tcp_output(tp);
+ tcpstat.tcps_drops++;
+ } else
+ tcpstat.tcps_conndrops++;
+ if (errno == ETIMEDOUT && tp->t_softerror)
+ errno = tp->t_softerror;
+ so->so_error = errno;
+ return (tcp_close(tp));
+}
+
+/*
+ * Close a TCP control block:
+ * discard all space held by the tcp
+ * discard internet protocol block
+ * wake up any sleepers
+ */
+struct tcpcb *
+tcp_close(tp)
+ register struct tcpcb *tp;
+{
+ register struct tseg_qent *q;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+#ifdef INET6
+ int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+ register struct rtentry *rt;
+ int dosavessthresh;
+
+ /*
+ * Make sure that all of our timers are stopped before we
+ * delete the PCB.
+ */
+ callout_stop(tp->tt_rexmt);
+ callout_stop(tp->tt_persist);
+ callout_stop(tp->tt_keep);
+ callout_stop(tp->tt_2msl);
+ callout_stop(tp->tt_delack);
+
+ /*
+ * If we got enough samples through the srtt filter,
+ * save the rtt and rttvar in the routing entry.
+ * 'Enough' is arbitrarily defined as 16 samples.
+ * 16 samples is enough for the srtt filter to converge
+ * to within 5% of the correct value; fewer samples and
+ * we could save a very bogus rtt.
+ *
+ * Don't update the default route's characteristics and don't
+ * update anything that the user "locked".
+ */
+ if (tp->t_rttupdated >= 16) {
+ register u_long i = 0;
+#ifdef INET6
+ if (isipv6) {
+ struct sockaddr_in6 *sin6;
+
+ if ((rt = inp->in6p_route.ro_rt) == NULL)
+ goto no_valid_rt;
+ sin6 = (struct sockaddr_in6 *)rt_key(rt);
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ goto no_valid_rt;
+ }
+ else
+#endif /* INET6 */
+ if ((rt = inp->inp_route.ro_rt) == NULL ||
+ ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
+ == INADDR_ANY)
+ goto no_valid_rt;
+
+ if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
+ i = tp->t_srtt *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+ if (rt->rt_rmx.rmx_rtt && i)
+ /*
+ * filter this update to half the old & half
+ * the new values, converting scale.
+ * See route.h and tcp_var.h for a
+ * description of the scaling constants.
+ */
+ rt->rt_rmx.rmx_rtt =
+ (rt->rt_rmx.rmx_rtt + i) / 2;
+ else
+ rt->rt_rmx.rmx_rtt = i;
+ tcpstat.tcps_cachedrtt++;
+ }
+ if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
+ i = tp->t_rttvar *
+ (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
+ if (rt->rt_rmx.rmx_rttvar && i)
+ rt->rt_rmx.rmx_rttvar =
+ (rt->rt_rmx.rmx_rttvar + i) / 2;
+ else
+ rt->rt_rmx.rmx_rttvar = i;
+ tcpstat.tcps_cachedrttvar++;
+ }
+ /*
+ * The old comment here said:
+ * update the pipelimit (ssthresh) if it has been updated
+ * already or if a pipesize was specified & the threshold
+ * got below half the pipesize. I.e., wait for bad news
+ * before we start updating, then update on both good
+ * and bad news.
+ *
+ * But we want to save the ssthresh even if no pipesize is
+ * specified explicitly in the route, because such
+ * connections still have an implicit pipesize specified
+ * by the global tcp_sendspace. In the absence of a reliable
+ * way to calculate the pipesize, it will have to do.
+ */
+ i = tp->snd_ssthresh;
+ if (rt->rt_rmx.rmx_sendpipe != 0)
+ dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
+ else
+ dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
+ if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
+ i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
+ || dosavessthresh) {
+ /*
+ * convert the limit from user data bytes to
+ * packets then to packet data bytes.
+ */
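+ /*
+ * Example of this conversion (editor's sketch, assuming an IPv4
+ * connection): with snd_ssthresh == 8760 bytes and t_maxseg == 1460,
+ * the rounding gives (8760 + 730) / 1460 = 6 segments, which is then
+ * scaled back up to 6 * (1460 + 40) = 9000 bytes of packet data for
+ * rmx_ssthresh (the 40 being the tcpiphdr overhead).
+ */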
+ i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (i < 2)
+ i = 2;
+ i *= (u_long)(tp->t_maxseg +
+#ifdef INET6
+ (isipv6 ? sizeof (struct ip6_hdr) +
+ sizeof (struct tcphdr) :
+#endif
+ sizeof (struct tcpiphdr)
+#ifdef INET6
+ )
+#endif
+ );
+ if (rt->rt_rmx.rmx_ssthresh)
+ rt->rt_rmx.rmx_ssthresh =
+ (rt->rt_rmx.rmx_ssthresh + i) / 2;
+ else
+ rt->rt_rmx.rmx_ssthresh = i;
+ tcpstat.tcps_cachedssthresh++;
+ }
+ }
+ no_valid_rt:
+ /* free the reassembly queue, if any */
+ while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
+ LIST_REMOVE(q, tqe_q);
+ m_freem(q->tqe_m);
+ FREE(q, M_TSEGQ);
+ }
+ inp->inp_ppcb = NULL;
+ soisdisconnected(so);
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ in6_pcbdetach(inp);
+ else
+#endif /* INET6 */
+ in_pcbdetach(inp);
+ tcpstat.tcps_closed++;
+ return ((struct tcpcb *)0);
+}
+
+void
+tcp_drain()
+{
+ if (do_tcpdrain)
+ {
+ struct inpcb *inpb;
+ struct tcpcb *tcpb;
+ struct tseg_qent *te;
+
+ /*
+ * Walk the tcpbs, if existing, and flush the reassembly queue,
+ * if there is one...
+ * XXX: The "Net/3" implementation doesn't imply that the TCP
+ * reassembly queue should be flushed, but in a situation
+ * where we're really low on mbufs, this is potentially
+ * useful.
+ */
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
+ INP_LOCK(inpb);
+ if ((tcpb = intotcpcb(inpb))) {
+ while ((te = LIST_FIRST(&tcpb->t_segq))
+ != NULL) {
+ LIST_REMOVE(te, tqe_q);
+ m_freem(te->tqe_m);
+ FREE(te, M_TSEGQ);
+ }
+ }
+ INP_UNLOCK(inpb);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ }
+}
+
+/*
+ * Notify a tcp user of an asynchronous error;
+ * store error as soft error, but wake up user
+ * (for now, won't do anything until can select for soft error).
+ *
+ * Do not wake up user since there currently is no mechanism for
+ * reporting soft errors (yet - a kqueue filter may be added).
+ */
+static struct inpcb *
+tcp_notify(inp, error)
+ struct inpcb *inp;
+ int error;
+{
+ struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
+
+ /*
+ * Ignore some errors if we are hooked up.
+ * If connection hasn't completed, has retransmitted several times,
+ * and receives a second error, give up now. This is better
+ * than waiting a long time to establish a connection that
+ * can never complete.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN)) {
+ return inp;
+ } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
+ tp->t_softerror) {
+ tcp_drop(tp, error);
+ return (struct inpcb *)0;
+ } else {
+ tp->t_softerror = error;
+ return inp;
+ }
+#if 0
+ wakeup((caddr_t) &so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+#endif
+}
+
+static int
+tcp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n, s;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = tcbinfo.ipi_count;
+ req->oldidx = 2 * (sizeof xig)
+ + (n + n/8) * sizeof(struct xtcpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ gencnt = tcbinfo.ipi_gencnt;
+ n = tcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
+ inp_list[i++] = inp;
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xtcpcb xt;
+ caddr_t inp_ppcb;
+ xt.xt_len = sizeof xt;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xt.xt_inp, sizeof *inp);
+ inp_ppcb = inp->inp_ppcb;
+ if (inp_ppcb != NULL)
+ bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
+ else
+ bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xt.xt_socket);
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ }
+ INP_UNLOCK(inp);
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ xig.xig_gen = tcbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = tcbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
+ tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
+
+static int
+tcp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct inpcb *inp;
+ int error, s;
+
+ error = suser_cred(req->td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
+ if (inp == NULL) {
+ error = ENOENT;
+ goto outunlocked;
+ } else {
+ INP_LOCK(inp);
+ if (inp->inp_socket == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+
+ error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
+ if (error)
+ goto out;
+ cru2x(inp->inp_socket->so_cred, &xuc);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ INP_UNLOCK(inp);
+outunlocked:
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
+
+#ifdef INET6
+static int
+tcp6_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in6 addrs[2];
+ struct inpcb *inp;
+ int error, s, mapped = 0;
+
+ error = suser_cred(req->td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
+ if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
+ mapped = 1;
+ else
+ return (EINVAL);
+ }
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ if (mapped == 1)
+ inp = in_pcblookup_hash(&tcbinfo,
+ *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
+ addrs[1].sin6_port,
+ *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
+ addrs[0].sin6_port,
+ 0, NULL);
+ else
+ inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
+ addrs[1].sin6_port,
+ &addrs[0].sin6_addr, addrs[0].sin6_port,
+ 0, NULL);
+ if (inp == NULL) {
+ error = ENOENT;
+ goto outunlocked;
+ } else {
+ INP_LOCK(inp);
+ if (inp->inp_socket == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+ error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
+ if (error)
+ goto out;
+ cru2x(inp->inp_socket->so_cred, &xuc);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ INP_UNLOCK(inp);
+outunlocked:
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
+#endif
+
+
+void
+tcp_ctlinput(cmd, sa, vip)
+ int cmd;
+ struct sockaddr *sa;
+ void *vip;
+{
+ struct ip *ip = vip;
+ struct tcphdr *th;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ tcp_seq icmp_seq;
+ int s;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (cmd == PRC_QUENCH)
+ notify = tcp_quench;
+ else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT) && ip)
+ notify = tcp_drop_syn_sent;
+ else if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (PRC_IS_REDIRECT(cmd)) {
+ ip = 0;
+ notify = in_rtchange;
+ } else if (cmd == PRC_HOSTDEAD)
+ ip = 0;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip) {
+ s = splnet();
+ th = (struct tcphdr *)((caddr_t)ip
+ + (IP_VHL_HL(ip->ip_vhl) << 2));
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (inp->inp_socket != NULL) {
+ icmp_seq = htonl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_seq, tp->snd_max))
+ inp = (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ if (inp)
+ INP_UNLOCK(inp);
+ } else {
+ struct in_conninfo inc;
+
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+#ifdef INET6
+ inc.inc_isipv6 = 0;
+#endif
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ } else
+ in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
+}
+
+#ifdef INET6
+void
+tcp6_ctlinput(cmd, sa, d)
+ int cmd;
+ struct sockaddr *sa;
+ void *d;
+{
+ struct tcphdr th;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+ struct ip6ctlparam *ip6cp = NULL;
+ const struct sockaddr_in6 *sa6_src = NULL;
+ int off;
+ struct tcp_portonly {
+ u_int16_t th_sport;
+ u_int16_t th_dport;
+ } *thp;
+
+ if (sa->sa_family != AF_INET6 ||
+ sa->sa_len != sizeof(struct sockaddr_in6))
+ return;
+
+ if (cmd == PRC_QUENCH)
+ notify = tcp_quench;
+ else if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (!PRC_IS_REDIRECT(cmd) &&
+ ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
+ return;
+
+ /* if the parameter is from icmp6, decode it. */
+ if (d != NULL) {
+ ip6cp = (struct ip6ctlparam *)d;
+ m = ip6cp->ip6c_m;
+ ip6 = ip6cp->ip6c_ip6;
+ off = ip6cp->ip6c_off;
+ sa6_src = ip6cp->ip6c_src;
+ } else {
+ m = NULL;
+ ip6 = NULL;
+ off = 0; /* fool gcc */
+ sa6_src = &sa6_any;
+ }
+
+ if (ip6) {
+ struct in_conninfo inc;
+ /*
+ * XXX: We assume that when IP6 is non-NULL,
+ * M and OFF are valid.
+ */
+
+ /* check if we can safely examine src and dst ports */
+ if (m->m_pkthdr.len < off + sizeof(*thp))
+ return;
+
+ bzero(&th, sizeof(th));
+ m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+
+ in6_pcbnotify(&tcb, sa, th.th_dport,
+ (struct sockaddr *)ip6cp->ip6c_src,
+ th.th_sport, cmd, notify);
+
+ inc.inc_fport = th.th_dport;
+ inc.inc_lport = th.th_sport;
+ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_isipv6 = 1;
+ syncache_unreach(&inc, &th);
+ } else
+ in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
+ 0, cmd, notify);
+}
+#endif /* INET6 */
+
+
+/*
+ * Following is where TCP initial sequence number generation occurs.
+ *
+ * There are two places where we must use initial sequence numbers:
+ * 1. In SYN-ACK packets.
+ * 2. In SYN packets.
+ *
+ * All ISNs for SYN-ACK packets are generated by the syncache. See
+ * tcp_syncache.c for details.
+ *
+ * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
+ * depends on this property. In addition, these ISNs should be
+ * unguessable so as to prevent connection hijacking. To satisfy
+ * the requirements of this situation, the algorithm outlined in
+ * RFC 1948 is used to generate sequence numbers.
+ *
+ * Implementation details:
+ *
+ * Time is based on the system timer (ticks), and is scaled so that
+ * the ISN advances by one megabyte of sequence space per second.
+ * This allows for proper recycling on high speed LANs while still
+ * leaving over an hour before rollover.
+ *
+ * net.inet.tcp.isn_reseed_interval controls the number of seconds
+ * between seeding of isn_secret. This is normally set to zero,
+ * as reseeding should not be necessary.
+ *
+ */
+
+#define ISN_BYTES_PER_SECOND 1048576
+
+u_char isn_secret[32];
+int isn_last_reseed;
+MD5_CTX isn_ctx;
+
+tcp_seq
+tcp_new_isn(tp)
+ struct tcpcb *tp;
+{
+ u_int32_t md5_buffer[4];
+ tcp_seq new_isn;
+
+ /* Seed if this is the first use, reseed if requested. */
+ if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
+ (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
+ < (u_int)ticks))) {
+ read_random(&isn_secret, sizeof(isn_secret));
+ isn_last_reseed = ticks;
+ }
+
+ /* Compute the md5 hash and return the ISN. */
+ MD5Init(&isn_ctx);
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
+#ifdef INET6
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
+ sizeof(struct in6_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
+ sizeof(struct in6_addr));
+ } else
+#endif
+ {
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
+ sizeof(struct in_addr));
+ MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
+ sizeof(struct in_addr));
+ }
+ MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
+ MD5Final((u_char *) &md5_buffer, &isn_ctx);
+ new_isn = (tcp_seq) md5_buffer[0];
+ new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
+ return new_isn;
+}
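+
+/*
+ * Arithmetic sketch of the offset above (editor's note, assuming
+ * hz == 100): each tick adds ISN_BYTES_PER_SECOND / hz == 10485 to
+ * the hashed base, i.e. roughly 1 MB of sequence space per second,
+ * so the 32-bit sequence space wraps in about 2^32 / 2^20 == 4096
+ * seconds (a little over an hour), matching the comment above.
+ */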
+
+/*
+ * When a source quench is received, close congestion window
+ * to one segment. We will gradually open it again as we proceed.
+ */
+struct inpcb *
+tcp_quench(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+
+ if (tp)
+ tp->snd_cwnd = tp->t_maxseg;
+ return (inp);
+}
+
+/*
+ * When a specific ICMP unreachable message is received and the
+ * connection state is SYN-SENT, drop the connection. This behavior
+ * is controlled by the icmp_may_rst sysctl.
+ */
+struct inpcb *
+tcp_drop_syn_sent(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+
+ if (tp && tp->t_state == TCPS_SYN_SENT) {
+ tcp_drop(tp, errno);
+ return (struct inpcb *)0;
+ }
+ return inp;
+}
+
+/*
+ * When `need fragmentation' ICMP is received, update our idea of the MSS
+ * based on the new value in the route. Also nudge TCP to send something,
+ * since we know the packet we just sent was dropped.
+ * This duplicates some code in the tcp_mss() function in tcp_input.c.
+ */
+struct inpcb *
+tcp_mtudisc(inp, errno)
+ struct inpcb *inp;
+ int errno;
+{
+ struct tcpcb *tp = intotcpcb(inp);
+ struct rtentry *rt;
+ struct rmxp_tao *taop;
+ struct socket *so = inp->inp_socket;
+ int offered;
+ int mss;
+#ifdef INET6
+ int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
+#endif /* INET6 */
+
+ if (tp) {
+#ifdef INET6
+ if (isipv6)
+ rt = tcp_rtlookup6(&inp->inp_inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(&inp->inp_inc);
+ if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tp->t_maxopd = tp->t_maxseg =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif /* INET6 */
+ tcp_mssdflt;
+ return inp;
+ }
+ taop = rmx_taop(rt->rt_rmx);
+ offered = taop->tao_mssopt;
+ mss = rt->rt_rmx.rmx_mtu -
+#ifdef INET6
+ (isipv6 ?
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
+#endif /* INET6 */
+ sizeof(struct tcpiphdr)
+#ifdef INET6
+ )
+#endif /* INET6 */
+ ;
+
+ if (offered)
+ mss = min(mss, offered);
+ /*
+ * XXX - The above conditional probably violates the TCP
+ * spec. The problem is that, since we don't know the
+ * other end's MSS, we are supposed to use a conservative
+ * default. But, if we do that, then MTU discovery will
+ * never actually take place, because the conservative
+ * default is much less than the MTUs typically seen
+ * on the Internet today. For the moment, we'll sweep
+ * this under the carpet.
+ *
+ * The conservative default might not actually be a problem
+ * if the only case this occurs is when sending an initial
+ * SYN with options and data to a host we've never talked
+ * to before. Then, they will reply with an MSS value which
+ * will get recorded and the new parameters should get
+ * recomputed. For Further Study.
+ */
+ if (tp->t_maxopd <= mss)
+ return inp;
+ tp->t_maxopd = mss;
+
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
+ (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
+ mss -= TCPOLEN_TSTAMP_APPA;
+ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
+ (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
+ mss -= TCPOLEN_CC_APPA;
+#if (MCLBYTES & (MCLBYTES - 1)) == 0
+ if (mss > MCLBYTES)
+ mss &= ~(MCLBYTES-1);
+#else
+ if (mss > MCLBYTES)
+ mss = mss / MCLBYTES * MCLBYTES;
+#endif
+ if (so->so_snd.sb_hiwat < mss)
+ mss = so->so_snd.sb_hiwat;
+
+ tp->t_maxseg = mss;
+
+ tcpstat.tcps_mturesent++;
+ tp->t_rtttime = 0;
+ tp->snd_nxt = tp->snd_una;
+ tcp_output(tp);
+ }
+ return inp;
+}
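+
+/*
+ * Worked example for the MSS computation above (editor's sketch):
+ * with a cached route MTU of 1500 on an IPv4 connection, mss starts
+ * at 1500 - sizeof(struct tcpiphdr) = 1460; if both sides negotiated
+ * timestamps, TCPOLEN_TSTAMP_APPA (12) is subtracted, giving 1448,
+ * which is below MCLBYTES and therefore not rounded further.
+ */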
+
+/*
+ * Look-up the routing entry to the peer of this inpcb. If no route
+ * is found and it cannot be allocated, then return NULL. This routine
+ * is called by TCP routines that access the rmx structure and by tcp_mss
+ * to get the interface MTU.
+ */
+struct rtentry *
+tcp_rtlookup(inc)
+ struct in_conninfo *inc;
+{
+ struct route *ro;
+ struct rtentry *rt;
+
+ ro = &inc->inc_route;
+ rt = ro->ro_rt;
+ if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
+ /* No route yet, so try to acquire one */
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ ro->ro_dst.sa_family = AF_INET;
+ ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
+ ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
+ inc->inc_faddr;
+ rtalloc(ro);
+ rt = ro->ro_rt;
+ }
+ }
+ return rt;
+}
+
+#ifdef INET6
+struct rtentry *
+tcp_rtlookup6(inc)
+ struct in_conninfo *inc;
+{
+ struct route_in6 *ro6;
+ struct rtentry *rt;
+
+ ro6 = &inc->inc6_route;
+ rt = ro6->ro_rt;
+ if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
+ /* No route yet, so try to acquire one */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ ro6->ro_dst.sin6_family = AF_INET6;
+ ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ ro6->ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc((struct route *)ro6);
+ rt = ro6->ro_rt;
+ }
+ }
+ return rt;
+}
+#endif /* INET6 */
+
+#ifdef IPSEC
+/* compute ESP/AH header size for TCP, including outer IP header. */
+size_t
+ipsec_hdrsiz_tcp(tp)
+ struct tcpcb *tp;
+{
+ struct inpcb *inp;
+ struct mbuf *m;
+ size_t hdrsiz;
+ struct ip *ip;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+#endif /* INET6 */
+ struct tcphdr *th;
+
+ if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
+ return 0;
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (!m)
+ return 0;
+
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV6) != 0) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ m->m_pkthdr.len = m->m_len =
+ sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ tcp_fillheaders(tp, ip6, th);
+ hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ } else
+#endif /* INET6 */
+ {
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
+ tcp_fillheaders(tp, ip, th);
+ hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
+ }
+
+ m_free(m);
+ return hdrsiz;
+}
+#endif /*IPSEC*/
+
+/*
+ * Return a pointer to the cached information about the remote host.
+ * The cached information is stored in the protocol specific part of
+ * the route metrics.
+ */
+struct rmxp_tao *
+tcp_gettaocache(inc)
+ struct in_conninfo *inc;
+{
+ struct rtentry *rt;
+
+#ifdef INET6
+ if (inc->inc_isipv6)
+ rt = tcp_rtlookup6(inc);
+ else
+#endif /* INET6 */
+ rt = tcp_rtlookup(inc);
+
+ /* Make sure this is a host route and is up. */
+ if (rt == NULL ||
+ (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
+ return NULL;
+
+ return rmx_taop(rt->rt_rmx);
+}
+
+/*
+ * Clear all the TAO cache entries, called from tcp_init.
+ *
+ * XXX
+ * This routine is just an empty one, because we assume that the
+ * routing tables are initialized at the same time as TCP, so there is
+ * nothing left over in the cache.
+ */
+static void
+tcp_cleartaocache()
+{
+}
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
new file mode 100644
index 0000000..9c4b547
--- /dev/null
+++ b/sys/netinet/tcp_usrreq.c
@@ -0,0 +1,1252 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
+ * $FreeBSD$
+ */
+
+#include "opt_ipsec.h"
+#include "opt_inet6.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif /* INET6 */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#endif /*IPSEC*/
+
+/*
+ * TCP protocol interface to socket abstraction.
+ */
+extern char *tcpstates[]; /* XXX ??? */
+
+static int tcp_attach(struct socket *, struct thread *td);
+static int tcp_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#ifdef INET6
+static int tcp6_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#endif /* INET6 */
+static struct tcpcb *
+ tcp_disconnect(struct tcpcb *);
+static struct tcpcb *
+ tcp_usrclosed(struct tcpcb *);
+
+#ifdef TCPDEBUG
+#define TCPDEBUG0 int ostate = 0
+#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
+#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
+ tcp_trace(TA_USER, ostate, tp, 0, 0, req)
+#else
+#define TCPDEBUG0
+#define TCPDEBUG1()
+#define TCPDEBUG2(req)
+#endif
+
+/*
+ * TCP attaches to socket via pru_attach(), reserving buffer space
+ * and allocating an internet control block.
+ */
+static int
+tcp_usr_attach(struct socket *so, int proto, struct thread *td)
+{
+ int s = splnet();
+ int error;
+ struct inpcb *inp;
+ struct tcpcb *tp = 0;
+ TCPDEBUG0;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ TCPDEBUG1();
+ inp = sotoinpcb(so);
+ if (inp) {
+ error = EISCONN;
+ goto out;
+ }
+
+ error = tcp_attach(so, td);
+ if (error)
+ goto out;
+
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ so->so_linger = TCP_LINGERTIME;
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+out:
+ TCPDEBUG2(PRU_ATTACH);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return error;
+}
+
+/*
+ * pru_detach() detaches the TCP protocol from the socket.
+ * If the protocol state is non-embryonic, then can't
+ * do this directly: have to initiate a pru_disconnect(),
+ * which may finish later; embryonic TCB's can just
+ * be discarded here.
+ */
+static int
+tcp_usr_detach(struct socket *so)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ TCPDEBUG0;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return EINVAL; /* XXX */
+ }
+ INP_LOCK(inp);
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tp = tcp_disconnect(tp);
+
+ TCPDEBUG2(PRU_DETACH);
+ if (tp)
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ splx(s);
+ return error;
+}
+
+#define INI_NOLOCK 0
+#define INI_READ 1
+#define INI_WRITE 2
+
+#define COMMON_START() \
+ TCPDEBUG0; \
+ do { \
+ if (inirw == INI_READ) \
+ INP_INFO_RLOCK(&tcbinfo); \
+ else if (inirw == INI_WRITE) \
+ INP_INFO_WLOCK(&tcbinfo); \
+ inp = sotoinpcb(so); \
+ if (inp == 0) { \
+ if (inirw == INI_READ) \
+ INP_INFO_RUNLOCK(&tcbinfo); \
+ else if (inirw == INI_WRITE) \
+ INP_INFO_WUNLOCK(&tcbinfo); \
+ splx(s); \
+ return EINVAL; \
+ } \
+ INP_LOCK(inp); \
+ if (inirw == INI_READ) \
+ INP_INFO_RUNLOCK(&tcbinfo); \
+ tp = intotcpcb(inp); \
+ TCPDEBUG1(); \
+} while(0)
+
+#define COMMON_END(req) \
+out: TCPDEBUG2(req); \
+ do { \
+ if (tp) \
+ INP_UNLOCK(inp); \
+ if (inirw == INI_WRITE) \
+ INP_INFO_WUNLOCK(&tcbinfo); \
+ splx(s); \
+ return error; \
+ goto out; \
+} while(0)
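+
+/*
+ * Editor's note: the unreachable "goto out" at the end of COMMON_END
+ * is presumably there so that the "out:" label is always referenced,
+ * silencing unused-label warnings in callers that never branch to it
+ * (e.g. tcp_usr_disconnect below, which has no explicit goto out).
+ */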
+
+/*
+ * Give the socket an address.
+ */
+static int
+tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct sockaddr_in *sinp;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ sinp = (struct sockaddr_in *)nam;
+ if (sinp->sin_family == AF_INET &&
+ IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
+ error = EAFNOSUPPORT;
+ goto out;
+ }
+ error = in_pcbbind(inp, nam, td);
+ if (error)
+ goto out;
+ COMMON_END(PRU_BIND);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct sockaddr_in6 *sin6p;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (sin6p->sin6_family == AF_INET6 &&
+ IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
+ error = EAFNOSUPPORT;
+ goto out;
+ }
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ if (ip6_mapped_addr_on && (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
+ inp->inp_vflag |= INP_IPV4;
+ else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ error = in_pcbbind(inp, (struct sockaddr *)&sin, td);
+ goto out;
+ }
+ }
+ error = in6_pcbbind(inp, nam, td);
+ if (error)
+ goto out;
+ COMMON_END(PRU_BIND);
+}
+#endif /* INET6 */
+
+/*
+ * Prepare to accept connections.
+ */
+static int
+tcp_usr_listen(struct socket *so, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+ if (inp->inp_lport == 0)
+ error = in_pcbbind(inp, (struct sockaddr *)0, td);
+ if (error == 0)
+ tp->t_state = TCPS_LISTEN;
+ COMMON_END(PRU_LISTEN);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_listen(struct socket *so, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+ if (inp->inp_lport == 0) {
+ inp->inp_vflag &= ~INP_IPV4;
+ if (ip6_mapped_addr_on &&
+ (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
+ inp->inp_vflag |= INP_IPV4;
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td);
+ }
+ if (error == 0)
+ tp->t_state = TCPS_LISTEN;
+ COMMON_END(PRU_LISTEN);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate connection to peer.
+ * Create a template for use in transmissions on this connection.
+ * Enter SYN_SENT state, and mark socket as connecting.
+ * Start keep-alive timer, and seed output sequence space.
+ * Send initial segment on connection.
+ */
+static int
+tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct sockaddr_in *sinp;
+ const int inirw = INI_WRITE;
+
+ COMMON_START();
+
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ sinp = (struct sockaddr_in *)nam;
+ if (sinp->sin_family == AF_INET
+ && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
+ error = EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (td && jailed(td->td_ucred))
+ prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
+
+ if ((error = tcp_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_output(tp);
+ COMMON_END(PRU_CONNECT);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct sockaddr_in6 *sin6p;
+ const int inirw = INI_WRITE;
+
+ COMMON_START();
+
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (sin6p->sin6_family == AF_INET6
+ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) {
+ error = EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ if (!ip6_mapped_addr_on ||
+ (inp->inp_flags & IN6P_IPV6_V6ONLY)) {
+ /* Don't leak the pcb/info locks or splnet on this error path. */
+ error = EINVAL;
+ goto out;
+ }
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
+ goto out;
+ error = tcp_output(tp);
+ goto out;
+ }
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ inp->inp_inc.inc_isipv6 = 1;
+ if ((error = tcp6_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_output(tp);
+ COMMON_END(PRU_CONNECT);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate disconnect from peer.
+ * If connection never passed embryonic stage, just drop;
+ * else if we don't need to let data drain, then we can just drop anyway;
+ * else have to begin TCP shutdown process: mark socket disconnecting,
+ * drain unread data, state switch to reflect user close, and
+ * send segment (e.g. FIN) to peer. Socket will be really disconnected
+ * when peer sends FIN and acks ours.
+ *
+ * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
+ */
+static int
+tcp_usr_disconnect(struct socket *so)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_WRITE;
+
+ COMMON_START();
+ tp = tcp_disconnect(tp);
+ COMMON_END(PRU_DISCONNECT);
+}
+
+/*
+ * Accept a connection. Essentially all the work is
+ * done at higher levels; just return the address
+ * of the peer, storing through addr.
+ */
+static int
+tcp_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ int s;
+ int error = 0;
+ struct inpcb *inp = NULL;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sin;
+ const int inirw = INI_READ;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED) {
+ error = ECONNABORTED;
+ goto out;
+ }
+
+ /*
+ * Do the malloc first in case it blocks.
+ */
+ MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
+ M_WAITOK | M_ZERO);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ if (!inp) {
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ free(sin, M_SONAME);
+ return (EINVAL);
+ }
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in_setpeeraddr here, because we have already done
+ * the locking and the malloc.
+ */
+ sin->sin_port = inp->inp_fport;
+ sin->sin_addr = inp->inp_faddr;
+ *nam = (struct sockaddr *)sin;
+
+ COMMON_END(PRU_ACCEPT);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ int s;
+ struct inpcb *inp = NULL;
+ int error = 0;
+ struct tcpcb *tp = NULL;
+ const int inirw = INI_READ;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED) {
+ error = ECONNABORTED;
+ goto out;
+ }
+
+ s = splnet();
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (EINVAL);
+ }
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ in6_mapped_peeraddr(so, nam);
+ COMMON_END(PRU_ACCEPT);
+}
+#endif /* INET6 */
+
+/*
+ * This is the wrapper function for in_setsockaddr. We just pass down
+ * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
+ * here because in_setsockaddr will call malloc and can block.
+ */
+static int
+tcp_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setsockaddr(so, nam, &tcbinfo));
+}
+
+/*
+ * This is the wrapper function for in_setpeeraddr. We just pass down
+ * the pcbinfo for in_setpeeraddr to lock.
+ */
+static int
+tcp_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setpeeraddr(so, nam, &tcbinfo));
+}
+
+/*
+ * Mark the connection as being incapable of further output.
+ */
+static int
+tcp_usr_shutdown(struct socket *so)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_WRITE;
+
+ COMMON_START();
+ socantsendmore(so);
+ tp = tcp_usrclosed(tp);
+ if (tp)
+ error = tcp_output(tp);
+ COMMON_END(PRU_SHUTDOWN);
+}
+
+/*
+ * After a receive, possibly send window update to peer.
+ */
+static int
+tcp_usr_rcvd(struct socket *so, int flags)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+ tcp_output(tp);
+ COMMON_END(PRU_RCVD);
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set. Possibly send more data. Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pru_* routines
+ * generally are caller-frees.
+ */
+static int
+tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_WRITE;
+#ifdef INET6
+ int isipv6;
+#endif
+ TCPDEBUG0;
+
+ /*
+ * Need write lock here because this function might call
+ * tcp_connect or tcp_usrclosed.
+ * We really want to have this function upgrade from a read lock
+ * to a write lock. XXX
+ */
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ if (inp == NULL) {
+ /*
+ * OOPS! we lost a race: the TCP session got reset after
+ * we checked SS_CANTSENDMORE, e.g. while doing uiomove or a
+ * network interrupt in the non-splnet() section of sosend().
+ */
+ if (m)
+ m_freem(m);
+ if (control)
+ m_freem(control);
+ error = ECONNRESET; /* XXX EPIPE? */
+ tp = NULL;
+ TCPDEBUG1();
+ goto out;
+ }
+ INP_LOCK(inp);
+#ifdef INET6
+ isipv6 = nam && nam->sa_family == AF_INET6;
+#endif /* INET6 */
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (control) {
+ /* TCP doesn't do control messages (rights, creds, etc) */
+ if (control->m_len) {
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = EINVAL;
+ goto out;
+ }
+ m_freem(control); /* empty control, just free it */
+ }
+ if (!(flags & PRUS_OOB)) {
+ sbappend(&so->so_snd, m);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ }
+
+ if (flags & PRUS_EOF) {
+ /*
+ * Close the send side of the connection after
+ * the data is sent.
+ */
+ socantsendmore(so);
+ tp = tcp_usrclosed(tp);
+ }
+ if (tp != NULL) {
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags |= TF_MORETOCOME;
+ error = tcp_output(tp);
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags &= ~TF_MORETOCOME;
+ }
+ } else {
+ if (sbspace(&so->so_snd) < -512) {
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section.
+ * Otherwise, snd_up should be one lower.
+ */
+ sbappend(&so->so_snd, m);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ }
+ tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
+ tp->t_force = 1;
+ error = tcp_output(tp);
+ tp->t_force = 0;
+ }
+ COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+}
+
+/*
+ * Abort the TCP.
+ */
+static int
+tcp_usr_abort(struct socket *so)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_WRITE;
+
+ COMMON_START();
+ tp = tcp_drop(tp, ECONNABORTED);
+ COMMON_END(PRU_ABORT);
+}
+
+/*
+ * Receive out-of-band data.
+ */
+static int
+tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ int s = splnet();
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ const int inirw = INI_READ;
+
+ COMMON_START();
+ if ((so->so_oobmark == 0 &&
+ (so->so_state & SS_RCVATMARK) == 0) ||
+ so->so_options & SO_OOBINLINE ||
+ tp->t_oobflags & TCPOOB_HADDATA) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ m->m_len = 1;
+ *mtod(m, caddr_t) = tp->t_iobc;
+ if ((flags & MSG_PEEK) == 0)
+ tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+ COMMON_END(PRU_RCVOOB);
+}
+
+/* xxx - should be const */
+struct pr_usrreqs tcp_usrreqs = {
+ tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind,
+ tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
+ tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd,
+ tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
+ tcp_sockaddr, sosend, soreceive, sopoll
+};
+
+#ifdef INET6
+struct pr_usrreqs tcp6_usrreqs = {
+ tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind,
+ tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach,
+ tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd,
+ tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
+ in6_mapped_sockaddr, sosend, soreceive, sopoll
+};
+#endif /* INET6 */
+
+/*
+ * Common subroutine to open a TCP connection to remote host specified
+ * by the struct sockaddr_in pointed to by nam. Call in_pcbbind to assign a local
+ * port number if needed. Call in_pcbladdr to do the routing and to choose
+ * a local host address (interface). If there is an existing incarnation
+ * of the same connection in TIME-WAIT state and if the remote host was
+ * sending CC options and if the connection duration was < MSL, then
+ * truncate the previous TIME-WAIT state and proceed.
+ * Initialize connection parameters and enter SYN-SENT state.
+ */
+static int
+tcp_connect(tp, nam, td)
+ register struct tcpcb *tp;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct tcpcb *otp;
+ struct sockaddr_in *sin = (struct sockaddr_in *)nam;
+ struct sockaddr_in *ifaddr;
+ struct rmxp_tao *taop;
+ struct rmxp_tao tao_noncached;
+ int error;
+
+ if (inp->inp_lport == 0) {
+ error = in_pcbbind(inp, (struct sockaddr *)0, td);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ */
+ error = in_pcbladdr(inp, nam, &ifaddr);
+ if (error)
+ return error;
+ oinp = in_pcblookup_hash(inp->inp_pcbinfo,
+ sin->sin_addr, sin->sin_port,
+ inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr
+ : ifaddr->sin_addr,
+ inp->inp_lport, 0, NULL);
+ if (oinp) {
+ if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
+ otp->t_state == TCPS_TIME_WAIT &&
+ (ticks - otp->t_starttime) < tcp_msl &&
+ (otp->t_flags & TF_RCVD_CC))
+ otp = tcp_close(otp);
+ else
+ return EADDRINUSE;
+ }
+ if (inp->inp_laddr.s_addr == INADDR_ANY)
+ inp->inp_laddr = ifaddr->sin_addr;
+ inp->inp_faddr = sin->sin_addr;
+ inp->inp_fport = sin->sin_port;
+ in_pcbrehash(inp);
+
+ /* Compute window scaling to request. */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+ tp->request_r_scale++;
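+ /*
+ * E.g. (editor's sketch): with the default tcp_recvspace of 64 KB,
+ * TCP_MAXWIN (65535) is just below sb_hiwat (65536), so the loop
+ * ends up requesting a window shift of 1.
+ */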
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
+ tp->iss = tcp_new_isn(tp);
+ tcp_sendseqinit(tp);
+
+ /*
+ * Generate a CC value for this connection and
+ * check whether CC or CCnew should be used.
+ */
+ if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
+ taop = &tao_noncached;
+ bzero(taop, sizeof(*taop));
+ }
+
+ tp->cc_send = CC_INC(tcp_ccgen);
+ if (taop->tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
+ taop->tao_ccsent = tp->cc_send;
+ } else {
+ taop->tao_ccsent = 0;
+ tp->t_flags |= TF_SENDCCNEW;
+ }
+
+ return 0;
+}
+
+#ifdef INET6
+static int
+tcp6_connect(tp, nam, td)
+ register struct tcpcb *tp;
+ struct sockaddr *nam;
+ struct thread *td;
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct tcpcb *otp;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
+ struct in6_addr *addr6;
+ struct rmxp_tao *taop;
+ struct rmxp_tao tao_noncached;
+ int error;
+
+ if (inp->inp_lport == 0) {
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ */
+ error = in6_pcbladdr(inp, nam, &addr6);
+ if (error)
+ return error;
+ oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
+ &sin6->sin6_addr, sin6->sin6_port,
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
+ ? addr6
+ : &inp->in6p_laddr,
+ inp->inp_lport, 0, NULL);
+ if (oinp) {
+ if (oinp != inp && (otp = intotcpcb(oinp)) != NULL &&
+ otp->t_state == TCPS_TIME_WAIT &&
+ (ticks - otp->t_starttime) < tcp_msl &&
+ (otp->t_flags & TF_RCVD_CC))
+ otp = tcp_close(otp);
+ else
+ return EADDRINUSE;
+ }
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = *addr6;
+ inp->in6p_faddr = sin6->sin6_addr;
+ inp->inp_fport = sin6->sin6_port;
+ if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
+ inp->in6p_flowinfo = sin6->sin6_flowinfo;
+ in_pcbrehash(inp);
+
+ /* Compute window scaling to request. */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
+ tp->iss = tcp_new_isn(tp);
+ tcp_sendseqinit(tp);
+
+ /*
+ * Generate a CC value for this connection and
+ * check whether CC or CCnew should be used.
+ */
+ if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
+ taop = &tao_noncached;
+ bzero(taop, sizeof(*taop));
+ }
+
+ tp->cc_send = CC_INC(tcp_ccgen);
+ if (taop->tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
+ taop->tao_ccsent = tp->cc_send;
+ } else {
+ taop->tao_ccsent = 0;
+ tp->t_flags |= TF_SENDCCNEW;
+ }
+
+ return 0;
+}
+#endif /* INET6 */
+
+/*
+ * The new sockopt interface makes it possible for us to block in the
+ * copyin/out step (if we take a page fault). Taking a page fault at
+ * splnet() is probably a Bad Thing. (Since sockets and pcbs both now
+ * use TSM, there probably isn't any need for this function to run at
+ * splnet() any more. This needs more examination.)
+ */
+int
+tcp_ctloutput(so, sopt)
+ struct socket *so;
+ struct sockopt *sopt;
+{
+ int error, opt, optval, s;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ error = 0;
+ s = splnet(); /* XXX */
+ INP_INFO_RLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ if (inp == NULL) {
+ INP_INFO_RUNLOCK(&tcbinfo);
+ splx(s);
+ return (ECONNRESET);
+ }
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&tcbinfo);
+ if (sopt->sopt_level != IPPROTO_TCP) {
+#ifdef INET6
+ if (INP_CHECK_SOCKAF(so, AF_INET6))
+ error = ip6_ctloutput(so, sopt);
+ else
+#endif /* INET6 */
+ error = ip_ctloutput(so, sopt);
+ INP_UNLOCK(inp);
+ splx(s);
+ return (error);
+ }
+ tp = intotcpcb(inp);
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ case TCP_NOOPT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ opt = TF_NODELAY;
+ break;
+ case TCP_NOOPT:
+ opt = TF_NOOPT;
+ break;
+ default:
+ opt = 0; /* dead code to fool gcc */
+ break;
+ }
+
+ if (optval)
+ tp->t_flags |= opt;
+ else
+ tp->t_flags &= ~opt;
+ break;
+
+ case TCP_NOPUSH:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ if (optval)
+ tp->t_flags |= TF_NOPUSH;
+ else {
+ tp->t_flags &= ~TF_NOPUSH;
+ error = tcp_output(tp);
+ }
+ break;
+
+ case TCP_MAXSEG:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ break;
+
+ if (optval > 0 && optval <= tp->t_maxseg)
+ tp->t_maxseg = optval;
+ else
+ error = EINVAL;
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ optval = tp->t_flags & TF_NODELAY;
+ break;
+ case TCP_MAXSEG:
+ optval = tp->t_maxseg;
+ break;
+ case TCP_NOOPT:
+ optval = tp->t_flags & TF_NOOPT;
+ break;
+ case TCP_NOPUSH:
+ optval = tp->t_flags & TF_NOPUSH;
+ break;
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error == 0)
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ }
+ INP_UNLOCK(inp);
+ splx(s);
+ return (error);
+}
+
+/*
+ * tcp_sendspace and tcp_recvspace are the default send and receive window
+ * sizes, respectively. These are obsolescent (this information should
+ * be set by the route).
+ */
+u_long tcp_sendspace = 1024*32;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+ &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
+u_long tcp_recvspace = 1024*64;
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
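+
+/*
+ * Editor's note: these defaults (32 KB send, 64 KB receive) are what
+ * tcp_attach() below hands to soreserve() when the socket has not
+ * already reserved buffer space of its own.
+ */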
+
+/*
+ * Attach TCP protocol to socket, allocating
+ * internet protocol control block, tcp control block,
+ * buffer space, and entering LISTEN state if it is to accept connections.
+ */
+static int
+tcp_attach(so, td)
+ struct socket *so;
+ struct thread *td;
+{
+ register struct tcpcb *tp;
+ struct inpcb *inp;
+ int error;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = soreserve(so, tcp_sendspace, tcp_recvspace);
+ if (error)
+ return (error);
+ }
+ error = in_pcballoc(so, &tcbinfo, td);
+ if (error)
+ return (error);
+ inp = sotoinpcb(so);
+#ifdef INET6
+ if (isipv6) {
+ inp->inp_vflag |= INP_IPV6;
+ inp->in6p_hops = -1; /* use kernel default */
+ }
+ else
+#endif
+ inp->inp_vflag |= INP_IPV4;
+ tp = tcp_newtcpcb(inp);
+ if (tp == 0) {
+ int nofd = so->so_state & SS_NOFDREF; /* XXX */
+
+ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ so->so_state |= nofd;
+ return (ENOBUFS);
+ }
+ tp->t_state = TCPS_CLOSED;
+ return (0);
+}
+
+/*
+ * Initiate (or continue) disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static struct tcpcb *
+tcp_disconnect(tp)
+ register struct tcpcb *tp;
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ if (tp->t_state < TCPS_ESTABLISHED)
+ tp = tcp_close(tp);
+ else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ tp = tcp_drop(tp, 0);
+ else {
+ soisdisconnecting(so);
+ sbflush(&so->so_rcv);
+ tp = tcp_usrclosed(tp);
+ if (tp)
+ (void) tcp_output(tp);
+ }
+ return (tp);
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it. If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state. In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static struct tcpcb *
+tcp_usrclosed(tp)
+ register struct tcpcb *tp;
+{
+
+ switch (tp->t_state) {
+
+ case TCPS_CLOSED:
+ case TCPS_LISTEN:
+ tp->t_state = TCPS_CLOSED;
+ tp = tcp_close(tp);
+ break;
+
+ case TCPS_SYN_SENT:
+ case TCPS_SYN_RECEIVED:
+ tp->t_flags |= TF_NEEDFIN;
+ break;
+
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_FIN_WAIT_1;
+ break;
+
+ case TCPS_CLOSE_WAIT:
+ tp->t_state = TCPS_LAST_ACK;
+ break;
+ }
+ if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
+ soisdisconnected(tp->t_inpcb->inp_socket);
+ /* To prevent the connection hanging in FIN_WAIT_2 forever. */
+ if (tp->t_state == TCPS_FIN_WAIT_2)
+ callout_reset(tp->tt_2msl, tcp_maxidle,
+ tcp_timer_2msl, tp);
+ }
+ return (tp);
+}
+
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
new file mode 100644
index 0000000..a58bdf5
--- /dev/null
+++ b/sys/netinet/tcp_var.h
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 1982, 1986, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_VAR_H_
+#define _NETINET_TCP_VAR_H_
+
+#include <netinet/in_pcb.h> /* needed for in_conninfo, inp_gen_t */
+#include <netinet/tcp.h>
+
+/*
+ * Kernel variables for tcp.
+ */
+extern int tcp_do_rfc1323;
+extern int tcp_do_rfc1644;
+
+/* TCP segment queue entry */
+struct tseg_qent {
+ LIST_ENTRY(tseg_qent) tqe_q;
+ int tqe_len; /* TCP segment data length */
+ struct tcphdr *tqe_th; /* a pointer to tcp header */
+ struct mbuf *tqe_m; /* mbuf contains packet */
+};
+LIST_HEAD(tsegqe_head, tseg_qent);
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_TSEGQ);
+#endif
+
+struct tcptemp {
+ u_char tt_ipgen[40]; /* must be the size of the largest IP header, currently IPv6 (40 bytes) */
+ struct tcphdr tt_t;
+};
+
+#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
+
+/*
+ * Tcp control block, one per tcp; fields:
+ * Organized for 16 byte cacheline efficiency.
+ */
+struct tcpcb {
+ struct tsegqe_head t_segq;
+ int t_dupacks; /* consecutive dup acks recd */
+ struct tcptemp *unused; /* unused */
+
+ struct callout *tt_rexmt; /* retransmit timer */
+ struct callout *tt_persist; /* retransmit persistence */
+ struct callout *tt_keep; /* keepalive */
+ struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */
+ struct callout *tt_delack; /* delayed ACK timer */
+
+ struct inpcb *t_inpcb; /* back pointer to internet pcb */
+ int t_state; /* state of this connection */
+ u_int t_flags;
+#define TF_ACKNOW 0x00001 /* ack peer immediately */
+#define TF_DELACK 0x00002 /* ack, but try to delay it */
+#define TF_NODELAY 0x00004 /* don't delay packets to coalesce */
+#define TF_NOOPT 0x00008 /* don't use tcp options */
+#define TF_SENTFIN 0x00010 /* have sent FIN */
+#define TF_REQ_SCALE 0x00020 /* have/will request window scaling */
+#define TF_RCVD_SCALE 0x00040 /* other side has requested scaling */
+#define TF_REQ_TSTMP 0x00080 /* have/will request timestamps */
+#define TF_RCVD_TSTMP 0x00100 /* a timestamp was received in SYN */
+#define TF_SACK_PERMIT 0x00200 /* other side said I could SACK */
+#define TF_NEEDSYN 0x00400 /* send SYN (implicit state) */
+#define TF_NEEDFIN 0x00800 /* send FIN (implicit state) */
+#define TF_NOPUSH 0x01000 /* don't push */
+#define TF_REQ_CC 0x02000 /* have/will request CC */
+#define TF_RCVD_CC 0x04000 /* a CC was received in SYN */
+#define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */
+#define TF_MORETOCOME 0x10000 /* More data to be appended to sock */
+#define TF_LQ_OVERFLOW 0x20000 /* listen queue overflow */
+#define TF_LASTIDLE 0x40000 /* connection was previously idle */
+#define TF_RXWIN0SENT 0x80000 /* sent a receiver win 0 in response */
+ int t_force; /* 1 if forcing out a byte */
+
+ tcp_seq snd_una; /* send unacknowledged */
+ tcp_seq snd_max; /* highest sequence number sent;
+ * used to recognize retransmits
+ */
+ tcp_seq snd_nxt; /* send next */
+ tcp_seq snd_up; /* send urgent pointer */
+
+ tcp_seq snd_wl1; /* window update seg seq number */
+ tcp_seq snd_wl2; /* window update seg ack number */
+ tcp_seq iss; /* initial send sequence number */
+ tcp_seq irs; /* initial receive sequence number */
+
+ tcp_seq rcv_nxt; /* receive next */
+ tcp_seq rcv_adv; /* advertised window */
+ u_long rcv_wnd; /* receive window */
+ tcp_seq rcv_up; /* receive urgent pointer */
+
+ u_long snd_wnd; /* send window */
+ u_long snd_cwnd; /* congestion-controlled window */
+ u_long snd_ssthresh; /* snd_cwnd size threshold for
+ * slow start exponential to
+ * linear switch
+ */
+ tcp_seq snd_recover; /* for use in fast recovery */
+
+ u_int t_maxopd; /* mss plus options */
+
+ u_long t_rcvtime; /* inactivity time */
+ u_long t_starttime; /* time connection was established */
+	int	t_rtttime;		/* time (ticks) when RTT measurement started */
+ tcp_seq t_rtseq; /* sequence number being timed */
+
+ int t_rxtcur; /* current retransmit value (ticks) */
+ u_int t_maxseg; /* maximum segment size */
+ int t_srtt; /* smoothed round-trip time */
+ int t_rttvar; /* variance in round-trip time */
+
+ int t_rxtshift; /* log(2) of rexmt exp. backoff */
+ u_int t_rttmin; /* minimum rtt allowed */
+ u_long t_rttupdated; /* number of times rtt sampled */
+ u_long max_sndwnd; /* largest window peer has offered */
+
+ int t_softerror; /* possible error not yet reported */
+/* out-of-band data */
+ char t_oobflags; /* have some */
+ char t_iobc; /* input character */
+#define TCPOOB_HAVEDATA 0x01
+#define TCPOOB_HADDATA 0x02
+/* RFC 1323 variables */
+ u_char snd_scale; /* window scaling for send window */
+ u_char rcv_scale; /* window scaling for recv window */
+ u_char request_r_scale; /* pending window scaling */
+ u_char requested_s_scale;
+ u_long ts_recent; /* timestamp echo data */
+
+ u_long ts_recent_age; /* when last updated */
+ tcp_seq last_ack_sent;
+/* RFC 1644 variables */
+ tcp_cc cc_send; /* send connection count */
+ tcp_cc cc_recv; /* receive connection count */
+/* experimental */
+ u_long snd_cwnd_prev; /* cwnd prior to retransmit */
+ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */
+ u_long t_badrxtwin; /* window for retransmit recovery */
+};
+
+/*
+ * Structure to hold TCP options that are only used during segment
+ * processing (in tcp_input), but not held in the tcpcb.
+ * It's basically used to reduce the number of parameters
+ * to tcp_dooptions.
+ */
+struct tcpopt {
+ u_long to_flags; /* which options are present */
+#define TOF_TS 0x0001 /* timestamp */
+#define TOF_CC 0x0002 /* CC and CCnew are exclusive */
+#define TOF_CCNEW 0x0004
+#define TOF_CCECHO 0x0008
+#define TOF_MSS 0x0010
+#define TOF_SCALE 0x0020
+ u_int32_t to_tsval;
+ u_int32_t to_tsecr;
+ tcp_cc to_cc; /* holds CC or CCnew */
+ tcp_cc to_ccecho;
+ u_int16_t to_mss;
+ u_int8_t to_requested_s_scale;
+ u_int8_t to_pad;
+};
+
+struct syncache {
+ inp_gen_t sc_inp_gencnt; /* pointer check */
+ struct tcpcb *sc_tp; /* tcb for listening socket */
+ struct mbuf *sc_ipopts; /* source route */
+ struct in_conninfo sc_inc; /* addresses */
+#define sc_route sc_inc.inc_route
+#define sc_route6 sc_inc.inc6_route
+ u_int32_t sc_tsrecent;
+ tcp_cc sc_cc_send; /* holds CC or CCnew */
+ tcp_cc sc_cc_recv;
+ tcp_seq sc_irs; /* seq from peer */
+ tcp_seq sc_iss; /* our ISS */
+ u_long sc_rxttime; /* retransmit time */
+ u_int16_t sc_rxtslot; /* retransmit counter */
+ u_int16_t sc_peer_mss; /* peer's MSS */
+ u_int16_t sc_wnd; /* advertised window */
+ u_int8_t sc_requested_s_scale:4,
+ sc_request_r_scale:4;
+ u_int8_t sc_flags;
+#define SCF_NOOPT 0x01 /* no TCP options */
+#define SCF_WINSCALE 0x02 /* negotiated window scaling */
+#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
+#define SCF_CC 0x08 /* negotiated CC */
+#define SCF_UNREACH 0x10 /* icmp unreachable received */
+#define SCF_KEEPROUTE 0x20 /* keep cloned route */
+ TAILQ_ENTRY(syncache) sc_hash;
+ TAILQ_ENTRY(syncache) sc_timerq;
+};
+
+struct syncache_head {
+ TAILQ_HEAD(, syncache) sch_bucket;
+ u_int sch_length;
+};
+
+/*
+ * The TAO cache entry which is stored in the protocol family specific
+ * portion of the route metrics.
+ */
+struct rmxp_tao {
+ tcp_cc tao_cc; /* latest CC in valid SYN */
+ tcp_cc tao_ccsent; /* latest CC sent to peer */
+ u_short tao_mssopt; /* peer's cached MSS */
+#ifdef notyet
+ u_short tao_flags; /* cache status flags */
+#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */
+#define TAOF_OK 0x0002 /* peer does understand rfc1644 */
+#define TAOF_UNDEF 0 /* we don't know yet */
+#endif /* notyet */
+};
+#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler)
+
+#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
+#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
+
+/*
+ * The smoothed round-trip time and estimated variance
+ * are stored as fixed point numbers scaled by the values below.
+ * With these scales, srtt has 5 bits to the right of the binary point
+ * and rttvar has 4.  The smoothing in tcp_xmit_timer() applies an
+ * effective gain of 1/8 to srtt (smoothed = (1/8)sample + (7/8)smoothed,
+ * i.e. an "ALPHA" of 0.875) and a gain of 1/4 to rttvar (an ALPHA of 0.75).
+ */
+#define	TCP_RTT_SCALE		32	/* multiplier for srtt; 5 bits frac. */
+#define	TCP_RTT_SHIFT		5	/* shift for srtt; 5 bits frac. */
+#define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 4 bits */
+#define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 4 bits */
+#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */
+
+/*
+ * The initial retransmission should happen at rtt + 4 * rttvar.
+ * Because of the way we do the smoothing, srtt and rttvar
+ * will each average +1/2 tick of bias. When we compute
+ * the retransmit timer, we want 1/2 tick of rounding and
+ * 1 extra tick because of +-1/2 tick uncertainty in the
+ * firing of the timer. The bias will give us exactly the
+ * 1.5 tick we need. But, because the bias is
+ * statistical, we have to test that we don't drop below
+ * the minimum feasible timer (which is 2 ticks).
+ * This version of the macro adapted from a paper by Lawrence
+ * Brakmo and Larry Peterson which outlines a problem caused
+ * by insufficient precision in the original implementation,
+ * which results in inappropriately large RTO values for very
+ * fast networks.
+ */
+#define TCP_REXMTVAL(tp) \
+ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
+ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
+
+/*
+ * TCP statistics.
+ * Many of these should be kept per connection,
+ * but that's inconvenient at the moment.
+ */
+struct tcpstat {
+ u_long tcps_connattempt; /* connections initiated */
+ u_long tcps_accepts; /* connections accepted */
+ u_long tcps_connects; /* connections established */
+ u_long tcps_drops; /* connections dropped */
+ u_long tcps_conndrops; /* embryonic connections dropped */
+ u_long tcps_closed; /* conn. closed (includes drops) */
+ u_long tcps_segstimed; /* segs where we tried to get rtt */
+ u_long tcps_rttupdated; /* times we succeeded */
+ u_long tcps_delack; /* delayed acks sent */
+ u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */
+ u_long tcps_rexmttimeo; /* retransmit timeouts */
+ u_long tcps_persisttimeo; /* persist timeouts */
+ u_long tcps_keeptimeo; /* keepalive timeouts */
+ u_long tcps_keepprobe; /* keepalive probes sent */
+ u_long tcps_keepdrops; /* connections dropped in keepalive */
+
+ u_long tcps_sndtotal; /* total packets sent */
+ u_long tcps_sndpack; /* data packets sent */
+ u_long tcps_sndbyte; /* data bytes sent */
+ u_long tcps_sndrexmitpack; /* data packets retransmitted */
+ u_long tcps_sndrexmitbyte; /* data bytes retransmitted */
+ u_long tcps_sndacks; /* ack-only packets sent */
+ u_long tcps_sndprobe; /* window probes sent */
+ u_long tcps_sndurg; /* packets sent with URG only */
+ u_long tcps_sndwinup; /* window update-only packets sent */
+ u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */
+
+ u_long tcps_rcvtotal; /* total packets received */
+ u_long tcps_rcvpack; /* packets received in sequence */
+ u_long tcps_rcvbyte; /* bytes received in sequence */
+	u_long	tcps_rcvbadsum;		/* packets received with cksum errs */
+ u_long tcps_rcvbadoff; /* packets received with bad offset */
+ u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */
+ u_long tcps_rcvshort; /* packets received too short */
+ u_long tcps_rcvduppack; /* duplicate-only packets received */
+ u_long tcps_rcvdupbyte; /* duplicate-only bytes received */
+ u_long tcps_rcvpartduppack; /* packets with some duplicate data */
+ u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */
+ u_long tcps_rcvoopack; /* out-of-order packets received */
+ u_long tcps_rcvoobyte; /* out-of-order bytes received */
+ u_long tcps_rcvpackafterwin; /* packets with data after window */
+ u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */
+ u_long tcps_rcvafterclose; /* packets rcvd after "close" */
+ u_long tcps_rcvwinprobe; /* rcvd window probe packets */
+ u_long tcps_rcvdupack; /* rcvd duplicate acks */
+ u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */
+ u_long tcps_rcvackpack; /* rcvd ack packets */
+ u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */
+ u_long tcps_rcvwinupd; /* rcvd window update packets */
+ u_long tcps_pawsdrop; /* segments dropped due to PAWS */
+ u_long tcps_predack; /* times hdr predict ok for acks */
+ u_long tcps_preddat; /* times hdr predict ok for data pkts */
+ u_long tcps_pcbcachemiss;
+ u_long tcps_cachedrtt; /* times cached RTT in route updated */
+ u_long tcps_cachedrttvar; /* times cached rttvar updated */
+ u_long tcps_cachedssthresh; /* times cached ssthresh updated */
+ u_long tcps_usedrtt; /* times RTT initialized from route */
+ u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */
+ u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/
+ u_long tcps_persistdrop; /* timeout in persist state */
+ u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */
+ u_long tcps_mturesent; /* resends due to MTU discovery */
+ u_long tcps_listendrop; /* listen queue overflows */
+
+ u_long tcps_sc_added; /* entry added to syncache */
+ u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */
+ u_long tcps_sc_dupsyn; /* duplicate SYN packet */
+ u_long tcps_sc_dropped; /* could not reply to packet */
+ u_long tcps_sc_completed; /* successful extraction of entry */
+ u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */
+ u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */
+ u_long tcps_sc_reset; /* RST removed entry from syncache */
+ u_long tcps_sc_stale; /* timed out or listen socket gone */
+ u_long tcps_sc_aborted; /* syncache entry aborted */
+ u_long tcps_sc_badack; /* removed due to bad ACK */
+ u_long tcps_sc_unreach; /* ICMP unreachable received */
+ u_long tcps_sc_zonefail; /* zalloc() failed */
+ u_long tcps_sc_sendcookie; /* SYN cookie sent */
+ u_long tcps_sc_recvcookie; /* SYN cookie received */
+};
+
+/*
+ * TCB structure exported to user-land via sysctl(3).
+ * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
+ * included. Not all of our clients do.
+ */
+#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
+struct xtcpcb {
+ size_t xt_len;
+ struct inpcb xt_inp;
+ struct tcpcb xt_tp;
+ struct xsocket xt_socket;
+ u_quad_t xt_alignment_hack;
+};
+#endif
+
+/*
+ * Names for TCP sysctl objects
+ */
+#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */
+#define TCPCTL_DO_RFC1644 2 /* use RFC-1644 extensions */
+#define TCPCTL_MSSDFLT 3 /* MSS default */
+#define TCPCTL_STATS 4 /* statistics (read-only) */
+#define TCPCTL_RTTDFLT 5 /* default RTT estimate */
+#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */
+#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */
+#define TCPCTL_SENDSPACE 8 /* send buffer space */
+#define TCPCTL_RECVSPACE 9 /* receive buffer space */
+#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */
+#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
+#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
+#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
+#define TCPCTL_MAXID 14
+
+#define TCPCTL_NAMES { \
+ { 0, 0 }, \
+ { "rfc1323", CTLTYPE_INT }, \
+ { "rfc1644", CTLTYPE_INT }, \
+ { "mssdflt", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "rttdflt", CTLTYPE_INT }, \
+ { "keepidle", CTLTYPE_INT }, \
+ { "keepintvl", CTLTYPE_INT }, \
+ { "sendspace", CTLTYPE_INT }, \
+ { "recvspace", CTLTYPE_INT }, \
+ { "keepinit", CTLTYPE_INT }, \
+ { "pcblist", CTLTYPE_STRUCT }, \
+ { "delacktime", CTLTYPE_INT }, \
+ { "v6mssdflt", CTLTYPE_INT }, \
+}
+
+
+#ifdef _KERNEL
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_inet_tcp);
+#endif
+
+extern struct inpcbhead tcb; /* head of queue of active tcpcb's */
+extern struct inpcbinfo tcbinfo;
+extern struct tcpstat tcpstat; /* tcp statistics */
+extern int tcp_mssdflt; /* XXX */
+extern int tcp_delack_enabled;
+extern int tcp_do_newreno;
+extern int path_mtu_discovery;
+extern int ss_fltsz;
+extern int ss_fltsz_local;
+
+void tcp_canceltimers(struct tcpcb *);
+struct tcpcb *
+ tcp_close(struct tcpcb *);
+void tcp_ctlinput(int, struct sockaddr *, void *);
+int tcp_ctloutput(struct socket *, struct sockopt *);
+struct tcpcb *
+ tcp_drop(struct tcpcb *, int);
+void tcp_drain(void);
+void tcp_fasttimo(void);
+struct rmxp_tao *
+ tcp_gettaocache(struct in_conninfo *);
+void tcp_init(void);
+void tcp_input(struct mbuf *, int);
+void tcp_mss(struct tcpcb *, int);
+int tcp_mssopt(struct tcpcb *);
+struct inpcb *
+ tcp_drop_syn_sent(struct inpcb *, int);
+struct inpcb *
+ tcp_mtudisc(struct inpcb *, int);
+struct tcpcb *
+ tcp_newtcpcb(struct inpcb *);
+int tcp_output(struct tcpcb *);
+struct inpcb *
+ tcp_quench(struct inpcb *, int);
+void tcp_respond(struct tcpcb *, void *,
+ struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
+struct rtentry *
+ tcp_rtlookup(struct in_conninfo *);
+void tcp_setpersist(struct tcpcb *);
+void tcp_slowtimo(void);
+struct tcptemp *
+ tcp_maketemplate(struct tcpcb *);
+void tcp_fillheaders(struct tcpcb *, void *, void *);
+struct tcpcb *
+ tcp_timers(struct tcpcb *, int);
+void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void syncache_init(void);
+void syncache_unreach(struct in_conninfo *, struct tcphdr *);
+int syncache_expand(struct in_conninfo *, struct tcphdr *,
+ struct socket **, struct mbuf *);
+int syncache_add(struct in_conninfo *, struct tcpopt *,
+ struct tcphdr *, struct socket **, struct mbuf *);
+void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
+void syncache_badack(struct in_conninfo *);
+
+extern struct pr_usrreqs tcp_usrreqs;
+extern u_long tcp_sendspace;
+extern u_long tcp_recvspace;
+tcp_seq tcp_new_isn(struct tcpcb *);
+
+#endif /* _KERNEL */
+
+#endif /* _NETINET_TCP_VAR_H_ */
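To make the fixed-point arithmetic above concrete, the following is a minimal user-space sketch, not the kernel code, of how srtt and rttvar are kept scaled by TCP_RTT_SHIFT and TCP_RTTVAR_SHIFT and how TCP_REXMTVAL turns them into a retransmit timeout. The update rule mirrors the smoothing done by tcp_xmit_timer() in tcp_input.c; RTT_MIN and the sample values are made-up stand-ins for t_rttmin and real measurements.

#include <stdio.h>

#define TCP_RTT_SHIFT		5	/* srtt scaled by 32, as above */
#define TCP_RTTVAR_SHIFT	4	/* rttvar scaled by 16 */
#define TCP_DELTA_SHIFT		2
#define RTT_MIN			2	/* stand-in for tp->t_rttmin, in ticks */

static int srtt, rttvar;		/* scaled estimators, as in struct tcpcb */

/* Feed one measured round-trip time (in ticks) into the estimators. */
static void
rtt_sample(int rtt)
{
	int delta;

	if (srtt == 0) {
		/* first sample seeds both estimators */
		srtt = rtt << TCP_RTT_SHIFT;
		rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		return;
	}
	/* effective gain: 1/8 on srtt, 1/4 on rttvar */
	delta = ((rtt - 1) << TCP_DELTA_SHIFT)
	    - (srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
	if ((srtt += delta) <= 0)
		srtt = 1;
	if (delta < 0)
		delta = -delta;
	delta -= rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
	if ((rttvar += delta) <= 0)
		rttvar = 1;
}

/* Same arithmetic as TCP_REXMTVAL(tp): unscaled srtt + 4*rttvar, clamped. */
static int
rexmtval(void)
{
	int v = ((srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) + rttvar)
	    >> TCP_DELTA_SHIFT;

	return (v > RTT_MIN ? v : RTT_MIN);
}

int
main(void)
{
	int samples[] = { 8, 10, 7, 30, 9 }, i;

	for (i = 0; i < 5; i++) {
		rtt_sample(samples[i]);
		printf("rtt=%2d  srtt=%3d  rttvar=%3d  RTO=%d ticks\n",
		    samples[i], srtt, rttvar, rexmtval());
	}
	return (0);
}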
diff --git a/sys/netinet/tcpip.h b/sys/netinet/tcpip.h
new file mode 100644
index 0000000..92189b9
--- /dev/null
+++ b/sys/netinet/tcpip.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcpip.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCPIP_H_
+#define _NETINET_TCPIP_H_
+
+/*
+ * Tcp+ip header, after ip options removed.
+ */
+struct tcpiphdr {
+ struct ipovly ti_i; /* overlaid ip structure */
+ struct tcphdr ti_t; /* tcp header */
+};
+#define ti_x1 ti_i.ih_x1
+#define ti_pr ti_i.ih_pr
+#define ti_len ti_i.ih_len
+#define ti_src ti_i.ih_src
+#define ti_dst ti_i.ih_dst
+#define ti_sport ti_t.th_sport
+#define ti_dport ti_t.th_dport
+#define ti_seq ti_t.th_seq
+#define ti_ack ti_t.th_ack
+#define ti_x2 ti_t.th_x2
+#define ti_off ti_t.th_off
+#define ti_flags ti_t.th_flags
+#define ti_win ti_t.th_win
+#define ti_sum ti_t.th_sum
+#define ti_urp ti_t.th_urp
+
+#endif
diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h
new file mode 100644
index 0000000..635267f
--- /dev/null
+++ b/sys/netinet/udp.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)udp.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_UDP_H_
+#define _NETINET_UDP_H_
+
+/*
+ * Udp protocol header.
+ * Per RFC 768, August, 1980.
+ */
+struct udphdr {
+ u_short uh_sport; /* source port */
+ u_short uh_dport; /* destination port */
+ u_short uh_ulen; /* udp length */
+ u_short uh_sum; /* udp checksum */
+};
+
+#endif
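A small host-side sketch of filling in the header above; the struct here is a stand-in with the same four 16-bit fields (uint16_t rather than u_short), and the ports and payload length are arbitrary examples. Note that uh_ulen counts the 8-byte header plus the data, and a zero uh_sum means "no checksum" for UDP over IPv4.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* htons(), ntohs() */

struct udp_header {		/* mirrors struct udphdr above */
	uint16_t uh_sport;	/* source port */
	uint16_t uh_dport;	/* destination port */
	uint16_t uh_ulen;	/* header + payload length, in bytes */
	uint16_t uh_sum;	/* checksum; 0 means "none" for IPv4 */
};

int
main(void)
{
	const char payload[] = "hello";		/* 6 bytes with the NUL */
	struct udp_header uh;

	uh.uh_sport = htons(12345);
	uh.uh_dport = htons(53);
	uh.uh_ulen = htons(sizeof(uh) + sizeof(payload));	/* 8 + 6 */
	uh.uh_sum = 0;		/* real stacks fill in the pseudo-header sum */

	printf("uh_ulen on the wire: %u bytes\n", ntohs(uh.uh_ulen));
	return (0);
}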
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
new file mode 100644
index 0000000..31fb251
--- /dev/null
+++ b/sys/netinet/udp_usrreq.c
@@ -0,0 +1,1048 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
+ * $FreeBSD$
+ */
+
+#include "opt_ipsec.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#ifdef IPSEC
+#include <netinet6/ipsec.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+/*
+ * UDP protocol implementation.
+ * Per RFC 768, August, 1980.
+ */
+#ifndef COMPAT_42
+static int udpcksum = 1;
+#else
+static int udpcksum = 0; /* XXX */
+#endif
+SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW,
+ &udpcksum, 0, "");
+
+int log_in_vain = 0;
+SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+ &log_in_vain, 0, "Log all incoming UDP packets");
+
+static int blackhole = 0;
+SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW,
+ &blackhole, 0, "Do not send port unreachables for refused connects");
+
+struct inpcbhead udb; /* from udp_var.h */
+#define udb6 udb /* for KAME src sync over BSD*'s */
+struct inpcbinfo udbinfo;
+
+#ifndef UDBHASHSIZE
+#define UDBHASHSIZE 16
+#endif
+
+struct udpstat udpstat; /* from udp_var.h */
+SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
+ &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
+
+static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET };
+#ifdef INET6
+struct udp_in6 {
+ struct sockaddr_in6 uin6_sin;
+ u_char uin6_init_done : 1;
+} udp_in6 = {
+ { sizeof(udp_in6.uin6_sin), AF_INET6 },
+ 0
+};
+struct udp_ip6 {
+ struct ip6_hdr uip6_ip6;
+ u_char uip6_init_done : 1;
+} udp_ip6;
+#endif /* INET6 */
+
+static void udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
+ int off);
+#ifdef INET6
+static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip);
+#endif
+
+static int udp_detach(struct socket *so);
+static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
+ struct mbuf *, struct thread *);
+
+void
+udp_init()
+{
+ INP_INFO_LOCK_INIT(&udbinfo, "udp");
+ LIST_INIT(&udb);
+ udbinfo.listhead = &udb;
+ udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask);
+ udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB,
+ &udbinfo.porthashmask);
+ udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(udbinfo.ipi_zone, maxsockets);
+}
+
+void
+udp_input(m, off)
+ register struct mbuf *m;
+ int off;
+{
+ int iphlen = off;
+ register struct ip *ip;
+ register struct udphdr *uh;
+ register struct inpcb *inp;
+ struct mbuf *opts = 0;
+ int len;
+ struct ip save_ip;
+ struct sockaddr *append_sa;
+
+ udpstat.udps_ipackets++;
+
+ /*
+ * Strip IP options, if any; should skip this,
+ * make available to user, and use on returned packets,
+ * but we don't yet have a way to check the checksum
+ * with options still present.
+ */
+ if (iphlen > sizeof (struct ip)) {
+ ip_stripoptions(m, (struct mbuf *)0);
+ iphlen = sizeof(struct ip);
+ }
+
+ /*
+ * Get IP and UDP header together in first mbuf.
+ */
+ ip = mtod(m, struct ip *);
+ if (m->m_len < iphlen + sizeof(struct udphdr)) {
+ if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
+ udpstat.udps_hdrops++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+ uh = (struct udphdr *)((caddr_t)ip + iphlen);
+
+	/* destination port of 0 is illegal, based on RFC 768. */
+ if (uh->uh_dport == 0)
+ goto badunlocked;
+
+ /*
+ * Make mbuf data length reflect UDP length.
+ * If not enough data to reflect UDP length, drop.
+ */
+ len = ntohs((u_short)uh->uh_ulen);
+ if (ip->ip_len != len) {
+ if (len > ip->ip_len || len < sizeof(struct udphdr)) {
+ udpstat.udps_badlen++;
+ goto badunlocked;
+ }
+ m_adj(m, len - ip->ip_len);
+ /* ip->ip_len = len; */
+ }
+ /*
+	 * Save a copy of the IP header in case we want to restore it
+ * for sending an ICMP error message in response.
+ */
+ if (!blackhole)
+ save_ip = *ip;
+
+ /*
+ * Checksum extended UDP header and data.
+ */
+ if (uh->uh_sum) {
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ uh->uh_sum = m->m_pkthdr.csum_data;
+ else
+ uh->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_UDP));
+ uh->uh_sum ^= 0xffff;
+ } else {
+ char b[9];
+ bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
+ bzero(((struct ipovly *)ip)->ih_x1, 9);
+ ((struct ipovly *)ip)->ih_len = uh->uh_ulen;
+ uh->uh_sum = in_cksum(m, len + sizeof (struct ip));
+ bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
+ }
+ if (uh->uh_sum) {
+ udpstat.udps_badsum++;
+ m_freem(m);
+ return;
+ }
+ } else
+ udpstat.udps_nosum++;
+
+ INP_INFO_RLOCK(&udbinfo);
+
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
+ in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
+ struct inpcb *last;
+ /*
+ * Deliver a multicast or broadcast datagram to *all* sockets
+ * for which the local and remote addresses and ports match
+ * those of the incoming datagram. This allows more than
+ * one process to receive multi/broadcasts on the same port.
+ * (This really ought to be done for unicast datagrams as
+ * well, but that would cause problems with existing
+ * applications that open both address-specific sockets and
+ * a wildcard socket listening to the same port -- they would
+ * end up receiving duplicates of every unicast datagram.
+ * Those applications open the multiple sockets to overcome an
+ * inadequacy of the UDP socket interface, but for backwards
+ * compatibility we avoid the problem here rather than
+ * fixing the interface. Maybe 4.5BSD will remedy this?)
+ */
+
+ /*
+ * Construct sockaddr format source address.
+ */
+ udp_in.sin_port = uh->uh_sport;
+ udp_in.sin_addr = ip->ip_src;
+ /*
+ * Locate pcb(s) for datagram.
+ * (Algorithm copied from raw_intr().)
+ */
+ last = NULL;
+#ifdef INET6
+ udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0;
+#endif
+ LIST_FOREACH(inp, &udb, inp_list) {
+ INP_LOCK(inp);
+ if (inp->inp_lport != uh->uh_dport) {
+ docontinue:
+ INP_UNLOCK(inp);
+ continue;
+ }
+#ifdef INET6
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ goto docontinue;
+#endif
+ if (inp->inp_laddr.s_addr != INADDR_ANY) {
+ if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
+ goto docontinue;
+ }
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ if (inp->inp_faddr.s_addr !=
+ ip->ip_src.s_addr ||
+ inp->inp_fport != uh->uh_sport)
+ goto docontinue;
+ }
+
+ if (last != NULL) {
+ struct mbuf *n;
+
+#ifdef IPSEC
+ /* check AH/ESP integrity. */
+ if (ipsec4_in_reject_so(m, last->inp_socket))
+ ipsecstat.in_polvio++;
+ /* do not inject data to pcb */
+ else
+#endif /*IPSEC*/
+ if ((n = m_copy(m, 0, M_COPYALL)) != NULL)
+ udp_append(last, ip, n,
+ iphlen +
+ sizeof(struct udphdr));
+ INP_UNLOCK(last);
+ }
+ last = inp;
+ /*
+ * Don't look for additional matches if this one does
+ * not have either the SO_REUSEPORT or SO_REUSEADDR
+ * socket options set. This heuristic avoids searching
+ * through all pcbs in the common case of a non-shared
+			 * port.  It assumes that an application will never
+ * clear these options after setting them.
+ */
+ if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0)
+ break;
+ }
+
+ if (last == NULL) {
+ /*
+ * No matching pcb found; discard datagram.
+ * (No need to send an ICMP Port Unreachable
+			 * for a broadcast or multicast datagram.)
+ */
+ udpstat.udps_noportbcast++;
+ goto badheadlocked;
+ }
+#ifdef IPSEC
+ /* check AH/ESP integrity. */
+ if (ipsec4_in_reject_so(m, last->inp_socket)) {
+ ipsecstat.in_polvio++;
+ goto badheadlocked;
+ }
+#endif /*IPSEC*/
+ INP_UNLOCK(last);
+ INP_INFO_RUNLOCK(&udbinfo);
+ udp_append(last, ip, m, iphlen + sizeof(struct udphdr));
+ return;
+ }
+ /*
+ * Locate pcb for datagram.
+ */
+ inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport,
+ ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);
+ if (inp == NULL) {
+ if (log_in_vain) {
+ char buf[4*sizeof "123"];
+
+ strcpy(buf, inet_ntoa(ip->ip_dst));
+ log(LOG_INFO,
+ "Connection attempt to UDP %s:%d from %s:%d\n",
+ buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
+ ntohs(uh->uh_sport));
+ }
+ udpstat.udps_noport++;
+ if (m->m_flags & (M_BCAST | M_MCAST)) {
+ udpstat.udps_noportbcast++;
+ goto badheadlocked;
+ }
+ if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
+ goto badheadlocked;
+ if (blackhole)
+ goto badheadlocked;
+ *ip = save_ip;
+ ip->ip_len += iphlen;
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
+ INP_INFO_RUNLOCK(&udbinfo);
+ return;
+ }
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&udbinfo);
+#ifdef IPSEC
+ if (ipsec4_in_reject_so(m, inp->inp_socket)) {
+ ipsecstat.in_polvio++;
+ goto bad;
+ }
+#endif /*IPSEC*/
+
+ /*
+ * Construct sockaddr format source address.
+ * Stuff source address and datagram in user buffer.
+ */
+ udp_in.sin_port = uh->uh_sport;
+ udp_in.sin_addr = ip->ip_src;
+ if (inp->inp_flags & INP_CONTROLOPTS
+ || inp->inp_socket->so_options & SO_TIMESTAMP) {
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6) {
+ int savedflags;
+
+ ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip);
+ savedflags = inp->inp_flags;
+ inp->inp_flags &= ~INP_UNMAPPABLEOPTS;
+ ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m);
+ inp->inp_flags = savedflags;
+ } else
+#endif
+ ip_savecontrol(inp, &opts, ip, m);
+ }
+ m_adj(m, iphlen + sizeof(struct udphdr));
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6) {
+ in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin);
+ append_sa = (struct sockaddr *)&udp_in6;
+ } else
+#endif
+ append_sa = (struct sockaddr *)&udp_in;
+ if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) {
+ udpstat.udps_fullsock++;
+ goto bad;
+ }
+ sorwakeup(inp->inp_socket);
+ INP_UNLOCK(inp);
+ return;
+
+badheadlocked:
+ INP_INFO_RUNLOCK(&udbinfo);
+bad:
+ if (inp)
+ INP_UNLOCK(inp);
+badunlocked:
+ m_freem(m);
+ if (opts)
+ m_freem(opts);
+ return;
+}
+
+#ifdef INET6
+static void
+ip_2_ip6_hdr(ip6, ip)
+ struct ip6_hdr *ip6;
+ struct ip *ip;
+{
+ bzero(ip6, sizeof(*ip6));
+
+ ip6->ip6_vfc = IPV6_VERSION;
+ ip6->ip6_plen = ip->ip_len;
+ ip6->ip6_nxt = ip->ip_p;
+ ip6->ip6_hlim = ip->ip_ttl;
+ ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] =
+ IPV6_ADDR_INT32_SMP;
+ ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr;
+ ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr;
+}
+#endif
+
+/*
+ * subroutine of udp_input(), mainly for source code readability.
+ * caller must properly init udp_ip6 and udp_in6 beforehand.
+ */
+static void
+udp_append(last, ip, n, off)
+ struct inpcb *last;
+ struct ip *ip;
+ struct mbuf *n;
+ int off;
+{
+ struct sockaddr *append_sa;
+ struct mbuf *opts = 0;
+
+ if (last->inp_flags & INP_CONTROLOPTS ||
+ last->inp_socket->so_options & SO_TIMESTAMP) {
+#ifdef INET6
+ if (last->inp_vflag & INP_IPV6) {
+ int savedflags;
+
+ if (udp_ip6.uip6_init_done == 0) {
+ ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip);
+ udp_ip6.uip6_init_done = 1;
+ }
+ savedflags = last->inp_flags;
+ last->inp_flags &= ~INP_UNMAPPABLEOPTS;
+ ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n);
+ last->inp_flags = savedflags;
+ } else
+#endif
+ ip_savecontrol(last, &opts, ip, n);
+ }
+#ifdef INET6
+ if (last->inp_vflag & INP_IPV6) {
+ if (udp_in6.uin6_init_done == 0) {
+ in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin);
+ udp_in6.uin6_init_done = 1;
+ }
+ append_sa = (struct sockaddr *)&udp_in6.uin6_sin;
+ } else
+#endif
+ append_sa = (struct sockaddr *)&udp_in;
+ m_adj(n, off);
+ if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) {
+ m_freem(n);
+ if (opts)
+ m_freem(opts);
+ udpstat.udps_fullsock++;
+ } else
+ sorwakeup(last->inp_socket);
+}
+
+/*
+ * Notify a udp user of an asynchronous error;
+ * just wake up so that he can collect error status.
+ */
+struct inpcb *
+udp_notify(inp, errno)
+ register struct inpcb *inp;
+ int errno;
+{
+ inp->inp_socket->so_error = errno;
+ sorwakeup(inp->inp_socket);
+ sowwakeup(inp->inp_socket);
+ return inp;
+}
+
+void
+udp_ctlinput(cmd, sa, vip)
+ int cmd;
+ struct sockaddr *sa;
+ void *vip;
+{
+ struct ip *ip = vip;
+ struct udphdr *uh;
+ struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ int s;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (PRC_IS_REDIRECT(cmd)) {
+ ip = 0;
+ notify = in_rtchange;
+ } else if (cmd == PRC_HOSTDEAD)
+ ip = 0;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip) {
+ s = splnet();
+ uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
+ INP_INFO_RLOCK(&udbinfo);
+ inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport,
+ ip->ip_src, uh->uh_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if(inp->inp_socket != NULL) {
+ (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&udbinfo);
+ splx(s);
+ } else
+ in_pcbnotifyall(&udbinfo, faddr, inetctlerrmap[cmd], notify);
+}
+
+static int
+udp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n, s;
+ struct inpcb *inp, **inp_list;
+ inp_gen_t gencnt;
+ struct xinpgen xig;
+
+ /*
+	 * The process of preparing the PCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == 0) {
+ n = udbinfo.ipi_count;
+ req->oldidx = 2 * (sizeof xig)
+ + (n + n/8) * sizeof(struct xinpcb);
+ return 0;
+ }
+
+ if (req->newptr != 0)
+ return EPERM;
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ s = splnet();
+ gencnt = udbinfo.ipi_gencnt;
+ n = udbinfo.ipi_count;
+ splx(s);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = gencnt;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return error;
+
+ inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ if (inp_list == 0)
+ return ENOMEM;
+
+ s = splnet();
+ INP_INFO_RLOCK(&udbinfo);
+ for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n;
+ inp = LIST_NEXT(inp, inp_list)) {
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt &&
+ cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
+ inp_list[i++] = inp;
+ INP_UNLOCK(inp);
+ }
+ INP_INFO_RUNLOCK(&udbinfo);
+ splx(s);
+ n = i;
+
+ error = 0;
+ for (i = 0; i < n; i++) {
+ inp = inp_list[i];
+ INP_LOCK(inp);
+ if (inp->inp_gencnt <= gencnt) {
+ struct xinpcb xi;
+ xi.xi_len = sizeof xi;
+ /* XXX should avoid extra copy */
+ bcopy(inp, &xi.xi_inp, sizeof *inp);
+ if (inp->inp_socket)
+ sotoxsocket(inp->inp_socket, &xi.xi_socket);
+ error = SYSCTL_OUT(req, &xi, sizeof xi);
+ }
+ INP_UNLOCK(inp);
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ s = splnet();
+ INP_INFO_RLOCK(&udbinfo);
+ xig.xig_gen = udbinfo.ipi_gencnt;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = udbinfo.ipi_count;
+ INP_INFO_RUNLOCK(&udbinfo);
+ splx(s);
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ free(inp_list, M_TEMP);
+ return error;
+}
+
+SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
+ udp_pcblist, "S,xinpcb", "List of active UDP sockets");
+
+static int
+udp_getcred(SYSCTL_HANDLER_ARGS)
+{
+ struct xucred xuc;
+ struct sockaddr_in addrs[2];
+ struct inpcb *inp;
+ int error, s;
+
+ error = suser_cred(req->td->td_ucred, PRISON_ROOT);
+ if (error)
+ return (error);
+ error = SYSCTL_IN(req, addrs, sizeof(addrs));
+ if (error)
+ return (error);
+ s = splnet();
+ INP_INFO_RLOCK(&udbinfo);
+ inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
+ if (inp == NULL || inp->inp_socket == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
+ if (error)
+ goto out;
+ cru2x(inp->inp_socket->so_cred, &xuc);
+ error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
+out:
+ INP_INFO_RUNLOCK(&udbinfo);
+ splx(s);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
+ CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
+ udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
+
+static int
+udp_output(inp, m, addr, control, td)
+ register struct inpcb *inp;
+ struct mbuf *m;
+ struct sockaddr *addr;
+ struct mbuf *control;
+ struct thread *td;
+{
+ register struct udpiphdr *ui;
+ register int len = m->m_pkthdr.len;
+ struct in_addr laddr;
+ struct sockaddr_in *sin;
+ int s = 0, error = 0;
+
+ if (control)
+ m_freem(control); /* XXX */
+
+ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
+ error = EMSGSIZE;
+ goto release;
+ }
+
+ if (addr) {
+ sin = (struct sockaddr_in *)addr;
+ if (td && jailed(td->td_ucred))
+ prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
+ laddr = inp->inp_laddr;
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ error = EISCONN;
+ goto release;
+ }
+ /*
+ * Must block input while temporarily connected.
+ */
+ s = splnet();
+ error = in_pcbconnect(inp, addr, td);
+ if (error) {
+ splx(s);
+ goto release;
+ }
+ } else {
+ if (inp->inp_faddr.s_addr == INADDR_ANY) {
+ error = ENOTCONN;
+ goto release;
+ }
+ }
+ /*
+ * Calculate data length and get a mbuf
+ * for UDP and IP headers.
+ */
+ M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT);
+ if (m == 0) {
+ error = ENOBUFS;
+ if (addr)
+ splx(s);
+ goto release;
+ }
+
+ /*
+ * Fill in mbuf with extended UDP header
+ * and addresses and length put into network format.
+ */
+ ui = mtod(m, struct udpiphdr *);
+ bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */
+ ui->ui_pr = IPPROTO_UDP;
+ ui->ui_src = inp->inp_laddr;
+ ui->ui_dst = inp->inp_faddr;
+ ui->ui_sport = inp->inp_lport;
+ ui->ui_dport = inp->inp_fport;
+ ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
+
+ /*
+ * Set up checksum and output datagram.
+ */
+ if (udpcksum) {
+ ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr,
+ htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ } else {
+ ui->ui_sum = 0;
+ }
+ ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
+ ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */
+ ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
+ udpstat.udps_opackets++;
+
+#ifdef IPSEC
+ if (ipsec_setsocket(m, inp->inp_socket) != 0) {
+ error = ENOBUFS;
+ goto release;
+ }
+#endif /*IPSEC*/
+ error = ip_output(m, inp->inp_options, &inp->inp_route,
+ (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)),
+ inp->inp_moptions);
+
+ if (addr) {
+ in_pcbdisconnect(inp);
+ inp->inp_laddr = laddr; /* XXX rehash? */
+ splx(s);
+ }
+ return (error);
+
+release:
+ m_freem(m);
+ return (error);
+}
+
+u_long	udp_sendspace = 9216;		/* really max datagram size */
+SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
+	&udp_sendspace, 0, "Maximum outgoing UDP datagram size");
+
+u_long	udp_recvspace = 40 * (1024 +	/* 40 1K datagrams */
+#ifdef INET6
+ sizeof(struct sockaddr_in6)
+#else
+ sizeof(struct sockaddr_in)
+#endif
+ );
+SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &udp_recvspace, 0, "Maximum incoming UDP datagram size");
+
+static int
+udp_abort(struct socket *so)
+{
+ struct inpcb *inp;
+ int s;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL; /* ??? possible? panic instead? */
+ }
+ INP_LOCK(inp);
+ soisdisconnected(so);
+ s = splnet();
+ in_pcbdetach(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ splx(s);
+ return 0;
+}
+
+static int
+udp_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ int s, error;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp != 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ error = soreserve(so, udp_sendspace, udp_recvspace);
+ if (error) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return error;
+ }
+ s = splnet();
+ error = in_pcballoc(so, &udbinfo, td);
+ splx(s);
+ if (error)
+ return error;
+
+ inp = (struct inpcb *)so->so_pcb;
+ INP_LOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_ip_ttl = ip_defttl;
+ INP_UNLOCK(inp);
+ return 0;
+}
+
+static int
+udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int s, error;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ s = splnet();
+ error = in_pcbbind(inp, nam, td);
+ splx(s);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ return error;
+}
+
+static int
+udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp;
+ int s, error;
+ struct sockaddr_in *sin;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EISCONN;
+ }
+ s = splnet();
+ sin = (struct sockaddr_in *)nam;
+ if (td && jailed(td->td_ucred))
+ prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
+ error = in_pcbconnect(inp, nam, td);
+ splx(s);
+ if (error == 0)
+ soisconnected(so);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ return error;
+}
+
+static int
+udp_detach(struct socket *so)
+{
+ struct inpcb *inp;
+ int s;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ s = splnet();
+ in_pcbdetach(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ splx(s);
+ return 0;
+}
+
+static int
+udp_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+ int s;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ if (inp->inp_faddr.s_addr == INADDR_ANY) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ INP_UNLOCK(inp);
+ return ENOTCONN;
+ }
+
+ s = splnet();
+ in_pcbdisconnect(inp);
+ inp->inp_laddr.s_addr = INADDR_ANY;
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ splx(s);
+ so->so_state &= ~SS_ISCONNECTED; /* XXX */
+ return 0;
+}
+
+static int
+udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
+ struct mbuf *control, struct thread *td)
+{
+ struct inpcb *inp;
+ int ret;
+
+ INP_INFO_WLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_WUNLOCK(&udbinfo);
+ m_freem(m);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ ret = udp_output(inp, m, addr, control, td);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&udbinfo);
+ return ret;
+}
+
+int
+udp_shutdown(struct socket *so)
+{
+ struct inpcb *inp;
+
+ INP_INFO_RLOCK(&udbinfo);
+ inp = sotoinpcb(so);
+ if (inp == 0) {
+ INP_INFO_RUNLOCK(&udbinfo);
+ return EINVAL;
+ }
+ INP_LOCK(inp);
+ INP_INFO_RUNLOCK(&udbinfo);
+ socantsendmore(so);
+ INP_UNLOCK(inp);
+ return 0;
+}
+
+/*
+ * This is the wrapper function for in_setsockaddr. We just pass down
+ * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking
+ * here because in_setsockaddr will call malloc and might block.
+ */
+static int
+udp_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setsockaddr(so, nam, &udbinfo));
+}
+
+/*
+ * This is the wrapper function for in_setpeeraddr. We just pass down
+ * the pcbinfo for in_setpeeraddr to lock.
+ */
+static int
+udp_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ return (in_setpeeraddr(so, nam, &udbinfo));
+}
+
+struct pr_usrreqs udp_usrreqs = {
+ udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect,
+ pru_connect2_notsupp, in_control, udp_detach, udp_disconnect,
+ pru_listen_notsupp, udp_peeraddr, pru_rcvd_notsupp,
+ pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown,
+ udp_sockaddr, sosend, soreceive, sopoll
+};
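A user-space sketch of the checksum arithmetic that udp_input() verifies and udp_output() sets up through in_pseudo() and CSUM_UDP: a 16-bit ones'-complement sum taken over a pseudo-header (source address, destination address, a zero byte, protocol 17, UDP length) followed by the UDP header and payload. The addresses, ports and payload below are made-up examples; a computed sum of zero is transmitted as 0xffff, since zero is reserved to mean "no checksum".

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>		/* htons(), htonl() */

/* Standard Internet ones'-complement sum over a byte buffer. */
static uint16_t
in_cksum_sketch(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte is zero-padded */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int
main(void)
{
	const char payload[] = "hello";		/* 6 bytes with the NUL */
	uint16_t ulen = 8 + sizeof(payload);	/* UDP header + data */
	uint8_t buf[12 + 8 + sizeof(payload)];	/* pseudo-header + datagram */
	uint32_t src = htonl(0xc0000201);	/* 192.0.2.1 (example range) */
	uint32_t dst = htonl(0xc0000202);	/* 192.0.2.2 */
	uint16_t sport = htons(12345), dport = htons(53), nulen = htons(ulen);
	uint16_t sum;

	memset(buf, 0, sizeof(buf));
	/* pseudo-header: src, dst, zero byte, protocol 17, UDP length */
	memcpy(buf + 0, &src, 4);
	memcpy(buf + 4, &dst, 4);
	buf[9] = 17;				/* IPPROTO_UDP */
	memcpy(buf + 10, &nulen, 2);
	/* UDP header with uh_sum left at 0, then the payload */
	memcpy(buf + 12, &sport, 2);
	memcpy(buf + 14, &dport, 2);
	memcpy(buf + 16, &nulen, 2);
	memcpy(buf + 20, payload, sizeof(payload));

	sum = in_cksum_sketch(buf, sizeof(buf));
	if (sum == 0)				/* 0 is reserved for "no checksum" */
		sum = 0xffff;
	printf("uh_sum to transmit (store big-endian): 0x%04x\n", sum);
	return (0);
}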
diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h
new file mode 100644
index 0000000..66db23f
--- /dev/null
+++ b/sys/netinet/udp_var.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)udp_var.h 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_UDP_VAR_H_
+#define _NETINET_UDP_VAR_H_
+
+/*
+ * UDP kernel structures and variables.
+ */
+struct udpiphdr {
+ struct ipovly ui_i; /* overlaid ip structure */
+ struct udphdr ui_u; /* udp header */
+};
+#define ui_x1 ui_i.ih_x1
+#define ui_pr ui_i.ih_pr
+#define ui_len ui_i.ih_len
+#define ui_src ui_i.ih_src
+#define ui_dst ui_i.ih_dst
+#define ui_sport ui_u.uh_sport
+#define ui_dport ui_u.uh_dport
+#define ui_ulen ui_u.uh_ulen
+#define ui_sum ui_u.uh_sum
+
+struct udpstat {
+ /* input statistics: */
+ u_long udps_ipackets; /* total input packets */
+ u_long udps_hdrops; /* packet shorter than header */
+ u_long udps_badsum; /* checksum error */
+ u_long udps_nosum; /* no checksum */
+ u_long udps_badlen; /* data length larger than packet */
+ u_long udps_noport; /* no socket on port */
+ u_long udps_noportbcast; /* of above, arrived as broadcast */
+ u_long udps_fullsock; /* not delivered, input socket full */
+ u_long udpps_pcbcachemiss; /* input packets missing pcb cache */
+ u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */
+ /* output statistics: */
+ u_long udps_opackets; /* total output packets */
+ u_long udps_fastout; /* output packets on fast path */
+ /* of no socket on port, arrived as multicast */
+ u_long udps_noportmcast;
+};
+
+/*
+ * Names for UDP sysctl objects
+ */
+#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */
+#define UDPCTL_STATS 2 /* statistics (read-only) */
+#define UDPCTL_MAXDGRAM 3 /* max datagram size */
+#define UDPCTL_RECVSPACE 4 /* default receive buffer space */
+#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */
+#define UDPCTL_MAXID 6
+
+#define UDPCTL_NAMES { \
+ { 0, 0 }, \
+ { "checksum", CTLTYPE_INT }, \
+ { "stats", CTLTYPE_STRUCT }, \
+ { "maxdgram", CTLTYPE_INT }, \
+ { "recvspace", CTLTYPE_INT }, \
+ { "pcblist", CTLTYPE_STRUCT }, \
+}
+
+#ifdef _KERNEL
+SYSCTL_DECL(_net_inet_udp);
+
+extern struct pr_usrreqs udp_usrreqs;
+extern struct inpcbhead udb;
+extern struct inpcbinfo udbinfo;
+extern u_long udp_sendspace;
+extern u_long udp_recvspace;
+extern struct udpstat udpstat;
+extern int log_in_vain;
+
+void udp_ctlinput(int, struct sockaddr *, void *);
+void udp_init(void);
+void udp_input(struct mbuf *, int);
+
+struct inpcb *
+ udp_notify(struct inpcb *inp, int errno);
+int udp_shutdown(struct socket *so);
+#endif
+
+#endif
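The UDPCTL_* names above surface under net.inet.udp; below is a short read-only sketch, assuming a FreeBSD host with sysctlbyname(3), that queries a few of the integer nodes registered in udp_usrreq.c.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	const char *names[] = {
		"net.inet.udp.checksum",	/* UDPCTL_CHECKSUM */
		"net.inet.udp.maxdgram",	/* UDPCTL_MAXDGRAM */
		"net.inet.udp.recvspace",	/* UDPCTL_RECVSPACE */
	};
	unsigned i;
	int value;
	size_t len;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		len = sizeof(value);
		if (sysctlbyname(names[i], &value, &len, NULL, 0) == -1) {
			perror(names[i]);
			continue;
		}
		printf("%s = %d\n", names[i], value);
	}
	return (0);
}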