author     glebius <glebius@FreeBSD.org>  2012-09-14 11:51:49 +0000
committer  glebius <glebius@FreeBSD.org>  2012-09-14 11:51:49 +0000
commit     0ccf4838d7a8b4da2c3beaac7ea1fd977aa0ed11 (patch)
tree       ec60da6e90cde2e87aa91ac9450c84ce3446233a  /sys/netpfil/pf
parent     f99fc207edf21e7c05c1147864077ce3fe1f3e2c (diff)
o Create directory sys/netpfil, where all packet filters should
reside, and move ipfw(4) and pf(4) there.
o Move most modified parts of pf out of contrib.
Actual movements:
sys/contrib/pf/net/*.c -> sys/netpfil/pf/
sys/contrib/pf/net/*.h -> sys/net/
contrib/pf/pfctl/*.c -> sbin/pfctl
contrib/pf/pfctl/*.h -> sbin/pfctl
contrib/pf/pfctl/pfctl.8 -> sbin/pfctl
contrib/pf/pfctl/*.4 -> share/man/man4
contrib/pf/pfctl/*.5 -> share/man/man5
sys/netinet/ipfw -> sys/netpfil/ipfw
The debatable move is pf/net/*.h -> sys/net. There are plans to
refactor the pf includes in the future, so I decided not to
break things twice.
Unmodified bits of pf are left in contrib: authpf, ftp-proxy,
tftp-proxy, pflogd.
The ipfw(4) move is planned to be merged to stable/9, so that
head and stable match.
Discussed with: bz, luigi
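For consumers the header relocation is transparent: the pf headers keep their <net/...> spelling, only their home in the source tree changes. A minimal sketch of the include block a pf-aware kernel source file uses after this change (the two pf headers shown are the ones named in the mapping above):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/module.h>

	#include <net/if.h>
	#include <net/pfvar.h>		/* was sys/contrib/pf/net/pfvar.h */
	#include <net/if_pflog.h>	/* was sys/contrib/pf/net/if_pflog.h */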
Diffstat (limited to 'sys/netpfil/pf')
 -rw-r--r--  sys/netpfil/pf/if_pflog.c   |  290
 -rw-r--r--  sys/netpfil/pf/if_pfsync.c  | 2397
 -rw-r--r--  sys/netpfil/pf/in4_cksum.c  |  120
 -rw-r--r--  sys/netpfil/pf/pf.c         | 6271
 -rw-r--r--  sys/netpfil/pf/pf_if.c      |  859
 -rw-r--r--  sys/netpfil/pf/pf_ioctl.c   | 3774
 -rw-r--r--  sys/netpfil/pf/pf_lb.c      |  663
 -rw-r--r--  sys/netpfil/pf/pf_norm.c    | 1999
 -rw-r--r--  sys/netpfil/pf/pf_osfp.c    |  526
 -rw-r--r--  sys/netpfil/pf/pf_ruleset.c |  424
 -rw-r--r--  sys/netpfil/pf/pf_table.c   | 2191
11 files changed, 19514 insertions(+), 0 deletions(-)
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
new file mode 100644
index 0000000..20feea2
--- /dev/null
+++ b/sys/netpfil/pf/if_pflog.c
@@ -0,0 +1,290 @@
+/*	$OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $	*/
+/*
+ * The authors of this code are John Ioannidis (ji@tla.org),
+ * Angelos D. Keromytis (kermit@csd.uch.gr) and
+ * Niels Provos (provos@physnet.uni-hamburg.de).
+ *
+ * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
+ * in November 1995.
+ *
+ * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
+ * by Angelos D. Keromytis.
+ *
+ * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
+ * and Niels Provos.
+ *
+ * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis
+ * and Niels Provos.
+ * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos.
+ *
+ * Permission to use, copy, and modify this software with or without fee
+ * is hereby granted, provided that this entire notice is included in
+ * all copies of any software which is or includes a copy or
+ * modification of this software.
+ * You may use this code under the GNU public license if you so wish. Please
+ * contribute changes back to the authors under this freer than GPL license
+ * so that we may further the use of strong encryption without limitations to
+ * all.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
+ * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
+ * PURPOSE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_bpf.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/if_pflog.h>
+#include <net/if_types.h>
+#include <net/pfvar.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#endif
+#ifdef INET
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#endif
+
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+#endif /* INET6 */
+
+#ifdef INET
+#include <machine/in_cksum.h>
+#endif /* INET */
+
+#define	PFLOGMTU	(32768 + MHLEN + MLEN)
+
+#ifdef PFLOGDEBUG
+#define	DPRINTF(x)	do { if (pflogdebug) printf x ; } while (0)
+#else
+#define	DPRINTF(x)
+#endif
+
+static int	pflogoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
+		    struct route *);
+static void	pflogattach(int);
+static int	pflogioctl(struct ifnet *, u_long, caddr_t);
+static void	pflogstart(struct ifnet *);
+static int	pflog_clone_create(struct if_clone *, int, caddr_t);
+static void	pflog_clone_destroy(struct ifnet *);
+
+IFC_SIMPLE_DECLARE(pflog, 1);
+
+struct ifnet	*pflogifs[PFLOGIFS_MAX];	/* for fast access */
+
+static void
+pflogattach(int npflog)
+{
+	int	i;
+	for (i = 0; i < PFLOGIFS_MAX; i++)
+		pflogifs[i] = NULL;
+	if_clone_attach(&pflog_cloner);
+}
+
+static int
+pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+	struct ifnet *ifp;
+
+	if (unit >= PFLOGIFS_MAX)
+		return (EINVAL);
+
+	ifp = if_alloc(IFT_PFLOG);
+	if (ifp == NULL) {
+		return (ENOSPC);
+	}
+	if_initname(ifp, ifc->ifc_name, unit);
+	ifp->if_mtu = PFLOGMTU;
+	ifp->if_ioctl = pflogioctl;
+	ifp->if_output = pflogoutput;
+	ifp->if_start = pflogstart;
+	ifp->if_snd.ifq_maxlen = ifqmaxlen;
+	ifp->if_hdrlen = PFLOG_HDRLEN;
+	if_attach(ifp);
+
+	bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);
+
+	pflogifs[unit] = ifp;
+
+	return (0);
+}
+
+static void
+pflog_clone_destroy(struct ifnet *ifp)
+{
+	int i;
+
+	for (i = 0; i < PFLOGIFS_MAX; i++)
+		if (pflogifs[i] == ifp)
+			pflogifs[i] = NULL;
+
+	bpfdetach(ifp);
+	if_detach(ifp);
+	if_free(ifp);
+}
+
+/*
+ * Start output on the pflog interface.
+ */
+static void
+pflogstart(struct ifnet *ifp)
+{
+	struct mbuf *m;
+
+	for (;;) {
+		IF_LOCK(&ifp->if_snd);
+		_IF_DROP(&ifp->if_snd);
+		_IF_DEQUEUE(&ifp->if_snd, m);
+		IF_UNLOCK(&ifp->if_snd);
+
+		if (m == NULL)
+			return;
+		else
+			m_freem(m);
+	}
+}
+
+static int
+pflogoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+	struct route *rt)
+{
+	m_freem(m);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	switch (cmd) {
+	case SIOCSIFFLAGS:
+		if (ifp->if_flags & IFF_UP)
+			ifp->if_drv_flags |= IFF_DRV_RUNNING;
+		else
+			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+		break;
+	default:
+		return (ENOTTY);
+	}
+
+	return (0);
+}
+
+static int
+pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
+    u_int8_t reason, struct pf_rule *rm, struct pf_rule *am,
+    struct pf_ruleset *ruleset, struct pf_pdesc *pd, int lookupsafe)
+{
+	struct ifnet *ifn;
+	struct pfloghdr hdr;
+
+	if (kif == NULL || m == NULL || rm == NULL || pd == NULL)
+		return (1);
+
+	if ((ifn = pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
+		return (0);
+
+	bzero(&hdr, sizeof(hdr));
+	hdr.length = PFLOG_REAL_HDRLEN;
+	hdr.af = af;
+	hdr.action = rm->action;
+	hdr.reason = reason;
+	memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname));
+
+	if (am == NULL) {
+		hdr.rulenr = htonl(rm->nr);
+		hdr.subrulenr = -1;
+	} else {
+		hdr.rulenr = htonl(am->nr);
+		hdr.subrulenr = htonl(rm->nr);
+		if (ruleset != NULL && ruleset->anchor != NULL)
+			strlcpy(hdr.ruleset, ruleset->anchor->name,
+			    sizeof(hdr.ruleset));
+	}
+	/*
+	 * XXXGL: we avoid pf_socket_lookup() when we are holding
+	 * state lock, since this leads to unsafe LOR.
+	 * These conditions are very very rare, however.
+	 */
+	if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+		pd->lookup.done = pf_socket_lookup(dir, pd, m);
+	if (pd->lookup.done > 0)
+		hdr.uid = pd->lookup.uid;
+	else
+		hdr.uid = UID_MAX;
+	hdr.pid = NO_PID;
+	hdr.rule_uid = rm->cuid;
+	hdr.rule_pid = rm->cpid;
+	hdr.dir = dir;
+
+#ifdef INET
+	if (af == AF_INET && dir == PF_OUT) {
+		struct ip *ip;
+
+		ip = mtod(m, struct ip *);
+		ip->ip_sum = 0;
+		ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
+	}
+#endif /* INET */
+
+	ifn->if_opackets++;
+	ifn->if_obytes += m->m_pkthdr.len;
+	BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m);
+
+	return (0);
+}
+
+static int
+pflog_modevent(module_t mod, int type, void *data)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		pflogattach(1);
+		PF_RULES_WLOCK();
+		pflog_packet_ptr = pflog_packet;
+		PF_RULES_WUNLOCK();
+		break;
+	case MOD_UNLOAD:
+		PF_RULES_WLOCK();
+		pflog_packet_ptr = NULL;
+		PF_RULES_WUNLOCK();
+		if_clone_detach(&pflog_cloner);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+static moduledata_t pflog_mod = { "pflog", pflog_modevent, 0 };
+
+#define PFLOG_MODVER 1
+
+DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(pflog, PFLOG_MODVER);
+MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER);
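if_pflog.c never transmits anything itself: pflog_packet() only prepends a struct pfloghdr and hands the result to BPF via BPF_MTAP2(), so the log is consumed from userland through the pflog0 BPF tap (the pflogd daemon left in contrib does essentially this). A minimal sketch of such a reader using libpcap's DLT_PFLOG link type; error handling is pared down and the interface name is an assumption:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/if_pflog.h>	/* struct pfloghdr */

	#include <arpa/inet.h>
	#include <pcap.h>
	#include <stdio.h>

	static void
	handler(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
	{
		const struct pfloghdr *hdr = (const struct pfloghdr *)bytes;

		(void)user; (void)h;
		/* The logged packet itself follows the pflog header. */
		printf("rule %u on %s: action %d dir %d\n",
		    (unsigned)ntohl(hdr->rulenr), hdr->ifname,
		    hdr->action, hdr->dir);
	}

	int
	main(void)
	{
		char errbuf[PCAP_ERRBUF_SIZE];
		pcap_t *p;

		if ((p = pcap_open_live("pflog0", 65535, 1, 100, errbuf)) == NULL)
			return (1);
		if (pcap_datalink(p) != DLT_PFLOG)	/* expect pflog framing */
			return (1);
		return (pcap_loop(p, -1, handler, NULL) == 0 ? 0 : 1);
	}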
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
new file mode 100644
index 0000000..28af641
--- /dev/null
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -0,0 +1,2397 @@
+/*	$OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $	*/
+
+/*
+ * Copyright (c) 2002 Michael Shalayeff
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Revisions picked from OpenBSD after revision 1.110 import:
+ * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
+ * 1.120, 1.175 - use monotonic time_uptime
+ * 1.122 - reduce number of updates for non-TCP sessions
+ * 1.128 - cleanups
+ * 1.146 - bzero() mbuf before sparsely filling it with data
+ * 1.170 - SIOCSIFMTU checks
+ * 1.126, 1.142 - deferred packets processing
+ * 1.173 - correct expire time processing
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/endian.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/if_types.h>
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_carp.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+
+#define PFSYNC_MINPKT ( \
+	sizeof(struct ip) + \
+	sizeof(struct pfsync_header) + \
+	sizeof(struct pfsync_subheader) + \
+	sizeof(struct pfsync_eof))
+
+struct pfsync_pkt {
+	struct ip *ip;
+	struct in_addr src;
+	u_int8_t flags;
+};
+
+static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
+		    struct pfsync_state_peer *);
+static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
+static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
+
+static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
+	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
+	pfsync_in_ins,			/* PFSYNC_ACT_INS */
+	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
+	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
+	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
+	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
+	pfsync_in_del,			/* PFSYNC_ACT_DEL */
+	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
+	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
+	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
+	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
+	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
+	pfsync_in_eof			/* PFSYNC_ACT_EOF */
+};
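pfsync_input() below walks the packet one subheader at a time and dispatches through this action-indexed table; each handler returns the number of payload bytes it consumed, or -1 once it has taken ownership of the mbuf. A tiny standalone model of that contract, with made-up handlers and message sizes:

	#include <stdio.h>

	/* Handlers return bytes consumed, or -1 to end the walk. */
	static int h_data(int len) { return (len); }
	static int h_eof(int len)  { (void)len; return (-1); }

	static int (*acts[])(int) = { h_data, h_eof };

	int
	main(void)
	{
		/* (action, length) records, as in the subheader walk */
		int msgs[][2] = { {0, 8}, {0, 4}, {1, 0} };
		int off = 0;

		for (size_t i = 0; i < sizeof(msgs) / sizeof(msgs[0]); i++) {
			int rv = (*acts[msgs[i][0]])(msgs[i][1]);
			if (rv == -1)
				break;		/* handler owns the packet now */
			off += rv;
		}
		printf("consumed %d payload bytes\n", off);	/* prints 12 */
		return (0);
	}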
+
+struct pfsync_q {
+	int		(*write)(struct pf_state *, struct mbuf *, int);
+	size_t		len;
+	u_int8_t	action;
+};
+
+/* we have one of these for every PFSYNC_S_ */
+static int	pfsync_out_state(struct pf_state *, struct mbuf *, int);
+static int	pfsync_out_iack(struct pf_state *, struct mbuf *, int);
+static int	pfsync_out_upd_c(struct pf_state *, struct mbuf *, int);
+static int	pfsync_out_del(struct pf_state *, struct mbuf *, int);
+
+static struct pfsync_q pfsync_qs[] = {
+	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
+	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
+	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
+	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
+	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
+};
+
+static void	pfsync_q_ins(struct pf_state *, int);
+static void	pfsync_q_del(struct pf_state *);
+
+static void	pfsync_update_state(struct pf_state *);
+
+struct pfsync_upd_req_item {
+	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
+	struct pfsync_upd_req			ur_msg;
+};
+
+struct pfsync_deferral {
+	struct pfsync_softc		*pd_sc;
+	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
+	u_int				pd_refs;
+	struct callout			pd_tmo;
+
+	struct pf_state			*pd_st;
+	struct mbuf			*pd_m;
+};
+
+struct pfsync_softc {
+	/* Configuration */
+	struct ifnet		*sc_ifp;
+	struct ifnet		*sc_sync_if;
+	struct ip_moptions	sc_imo;
+	struct in_addr		sc_sync_peer;
+	uint32_t		sc_flags;
+#define	PFSYNCF_OK		0x00000001
+#define	PFSYNCF_DEFER		0x00000002
+#define	PFSYNCF_PUSH		0x00000004
+	uint8_t			sc_maxupdates;
+	struct ip		sc_template;
+	struct callout		sc_tmo;
+	struct mtx		sc_mtx;
+
+	/* Queued data */
+	size_t			sc_len;
+	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
+	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
+	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
+	u_int			sc_deferred;
+	void			*sc_plus;
+	size_t			sc_pluslen;
+
+	/* Bulk update info */
+	struct mtx		sc_bulk_mtx;
+	uint32_t		sc_ureq_sent;
+	int			sc_bulk_tries;
+	uint32_t		sc_ureq_received;
+	int			sc_bulk_hashid;
+	uint64_t		sc_bulk_stateid;
+	uint32_t		sc_bulk_creatorid;
+	struct callout		sc_bulk_tmo;
+	struct callout		sc_bulkfail_tmo;
+};
+
+#define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
+#define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
+#define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
+
+#define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
+#define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
+#define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
+
+static MALLOC_DEFINE(M_PFSYNC, "pfsync", "pfsync(4) data");
+static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL;
+#define	V_pfsyncif		VNET(pfsyncif)
+static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
+#define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
+static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
+#define	V_pfsyncstats		VNET(pfsyncstats)
+static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
+#define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
+
+static void	pfsync_timeout(void *);
+static void	pfsync_push(struct pfsync_softc *);
+static void	pfsyncintr(void *);
+static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
+		    void *);
+static void	pfsync_multicast_cleanup(struct pfsync_softc *);
+static int	pfsync_init(void);
+static void	pfsync_uninit(void);
+
+SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
+SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
+    &VNET_NAME(pfsyncstats), pfsyncstats,
+    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
+SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
+    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
+
+static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
+static void	pfsync_clone_destroy(struct ifnet *);
+static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
+		    struct pf_state_peer *);
+static int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
+		    struct route *);
+static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
+
+static int	pfsync_defer(struct pf_state *, struct mbuf *);
+static void	pfsync_undefer(struct pfsync_deferral *, int);
+static void	pfsync_undefer_state(struct pf_state *, int);
+static void	pfsync_defer_tmo(void *);
+
+static void	pfsync_request_update(u_int32_t, u_int64_t);
+static void	pfsync_update_state_req(struct pf_state *);
+
+static void	pfsync_drop(struct pfsync_softc *);
+static void	pfsync_sendout(int);
+static void	pfsync_send_plus(void *, size_t);
+
+static void	pfsync_bulk_start(void);
+static void	pfsync_bulk_status(u_int8_t);
+static void	pfsync_bulk_update(void *);
+static void	pfsync_bulk_fail(void *);
+
+#ifdef IPSEC
+static void	pfsync_update_net_tdb(struct pfsync_tdb *);
+#endif
+
+#define PFSYNC_MAX_BULKTRIES	12
+
+VNET_DEFINE(struct ifc_simple_data, pfsync_cloner_data);
+VNET_DEFINE(struct if_clone, pfsync_cloner);
+#define	V_pfsync_cloner_data	VNET(pfsync_cloner_data)
+#define	V_pfsync_cloner		VNET(pfsync_cloner)
+IFC_SIMPLE_DECLARE(pfsync, 1);
+
+static int
+pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+	struct pfsync_softc *sc;
+	struct ifnet *ifp;
+	int q;
+
+	if (unit != 0)
+		return (EINVAL);
+
+	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
+	sc->sc_flags |= PFSYNCF_OK;
+
+	for (q = 0; q < PFSYNC_S_COUNT; q++)
+		TAILQ_INIT(&sc->sc_qs[q]);
+
+	TAILQ_INIT(&sc->sc_upd_req_list);
+	TAILQ_INIT(&sc->sc_deferrals);
+
+	sc->sc_len = PFSYNC_MINPKT;
+	sc->sc_maxupdates = 128;
+
+	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
+	if (ifp == NULL) {
+		free(sc, M_PFSYNC);
+		return (ENOSPC);
+	}
+	if_initname(ifp, ifc->ifc_name, unit);
+	ifp->if_softc = sc;
+	ifp->if_ioctl = pfsyncioctl;
+	ifp->if_output = pfsyncoutput;
+	ifp->if_type = IFT_PFSYNC;
+	ifp->if_snd.ifq_maxlen = ifqmaxlen;
+	ifp->if_hdrlen = sizeof(struct pfsync_header);
+	ifp->if_mtu = ETHERMTU;
+	mtx_init(&sc->sc_mtx, "pfsync", NULL, MTX_DEF);
+	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
+	callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
+	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
+	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
+
+	if_attach(ifp);
+
+	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
+
+	V_pfsyncif = sc;
+
+	return (0);
+}
+
+static void
+pfsync_clone_destroy(struct ifnet *ifp)
+{
+	struct pfsync_softc *sc = ifp->if_softc;
+
+	/*
+	 * At this stage, everything should have already been
+	 * cleared by pfsync_uninit(), and we have only to
+	 * drain callouts.
+	 */
+	while (sc->sc_deferred > 0) {
+		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
+
+		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+		sc->sc_deferred--;
+		if (callout_stop(&pd->pd_tmo)) {
+			pf_release_state(pd->pd_st);
+			m_freem(pd->pd_m);
+			free(pd, M_PFSYNC);
+		} else {
+			pd->pd_refs++;
+			callout_drain(&pd->pd_tmo);
+			free(pd, M_PFSYNC);
+		}
+	}
+
+	callout_drain(&sc->sc_tmo);
+	callout_drain(&sc->sc_bulkfail_tmo);
+	callout_drain(&sc->sc_bulk_tmo);
+
+	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
+	bpfdetach(ifp);
+	if_detach(ifp);
+
+	pfsync_drop(sc);
+
+	if_free(ifp);
+	if (sc->sc_imo.imo_membership)
+		pfsync_multicast_cleanup(sc);
+	mtx_destroy(&sc->sc_mtx);
+	mtx_destroy(&sc->sc_bulk_mtx);
+	free(sc, M_PFSYNC);
+
+	V_pfsyncif = NULL;
+}
+
+static int
+pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
+    struct pf_state_peer *d)
+{
+	if (s->scrub.scrub_flag && d->scrub == NULL) {
+		d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
+		if (d->scrub == NULL)
+			return (ENOMEM);
+	}
+
+	return (0);
+}
+
+
+static int
+pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pf_state	*st = NULL;
+	struct pf_state_key *skw = NULL, *sks = NULL;
+	struct pf_rule *r = NULL;
+	struct pfi_kif	*kif;
+	int error;
+
+	PF_RULES_RASSERT();
+
+	if (sp->creatorid == 0 && V_pf_status.debug >= PF_DEBUG_MISC) {
+		printf("%s: invalid creator id: %08x\n", __func__,
+		    ntohl(sp->creatorid));
+		return (EINVAL);
+	}
+
+	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
+		if (V_pf_status.debug >= PF_DEBUG_MISC)
+			printf("%s: unknown interface: %s\n", __func__,
+			    sp->ifname);
+		if (flags & PFSYNC_SI_IOCTL)
+			return (EINVAL);
+		return (0);	/* skip this state */
+	}
+
+	/*
+	 * If the ruleset checksums match or the state is coming from the ioctl,
+	 * it's safe to associate the state with the rule of that number.
+	 */
+	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
+	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
+	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
+		r = pf_main_ruleset.rules[
+		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
+	else
+		r = &V_pf_default_rule;
+
+	if ((r->max_states && r->states_cur >= r->max_states))
+		goto cleanup;
+
+	/*
+	 * XXXGL: consider M_WAITOK in ioctl path after.
+	 */
+	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
+		goto cleanup;
+
+	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
+		goto cleanup;
+
+	if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
+	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
+	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
+	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
+	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
+	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
+		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+		if (sks == NULL)
+			goto cleanup;
+	} else
+		sks = skw;
+
+	/* allocate memory for scrub info */
+	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
+	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
+		goto cleanup;
+
+	/* copy to state key(s) */
+	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
+	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
+	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
+	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
+	skw->proto = sp->proto;
+	skw->af = sp->af;
+	if (sks != skw) {
+		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
+		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
+		sks->port[0] = sp->key[PF_SK_STACK].port[0];
+		sks->port[1] = sp->key[PF_SK_STACK].port[1];
+		sks->proto = sp->proto;
+		sks->af = sp->af;
+	}
+
+	/* copy to state */
+	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
+	st->creation = time_uptime - ntohl(sp->creation);
+	st->expire = time_uptime;
+	if (sp->expire) {
+		uint32_t timeout;
+
+		timeout = r->timeout[sp->timeout];
+		if (!timeout)
+			timeout = V_pf_default_rule.timeout[sp->timeout];
+
+		/* sp->expire may have been adaptively scaled by export. */
+		st->expire -= timeout - ntohl(sp->expire);
+	}
+
+	st->direction = sp->direction;
+	st->log = sp->log;
+	st->timeout = sp->timeout;
+	st->state_flags = sp->state_flags;
+
+	st->id = sp->id;
+	st->creatorid = sp->creatorid;
+	pf_state_peer_ntoh(&sp->src, &st->src);
+	pf_state_peer_ntoh(&sp->dst, &st->dst);
+
+	st->rule.ptr = r;
+	st->nat_rule.ptr = NULL;
+	st->anchor.ptr = NULL;
+	st->rt_kif = NULL;
+
+	st->pfsync_time = time_uptime;
+	st->sync_state = PFSYNC_S_NONE;
+
+	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
+	r->states_cur++;
+	r->states_tot++;
+
+	if (!(flags & PFSYNC_SI_IOCTL))
+		st->state_flags |= PFSTATE_NOSYNC;
+
+	if ((error = pf_state_insert(kif, skw, sks, st)) != 0) {
+		/* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
+		r->states_cur--;
+		goto cleanup_state;
+	}
+
+	if (!(flags & PFSYNC_SI_IOCTL)) {
+		st->state_flags &= ~PFSTATE_NOSYNC;
+		if (st->state_flags & PFSTATE_ACK) {
+			pfsync_q_ins(st, PFSYNC_S_IACK);
+			pfsync_push(sc);
+		}
+	}
+	st->state_flags &= ~PFSTATE_ACK;
+	PF_STATE_UNLOCK(st);
+
+	return (0);
+
+cleanup:
+	error = ENOMEM;
+	if (skw == sks)
+		sks = NULL;
+	if (skw != NULL)
+		uma_zfree(V_pf_state_key_z, skw);
+	if (sks != NULL)
+		uma_zfree(V_pf_state_key_z, sks);
+
+cleanup_state:	/* pf_state_insert() frees the state keys. */
+	if (st) {
+		if (st->dst.scrub)
+			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
+		if (st->src.scrub)
+			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
+		uma_zfree(V_pf_state_z, st);
+	}
+	return (error);
+}
+
+static void
+pfsync_input(struct mbuf *m, __unused int off)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_pkt pkt;
+	struct ip *ip = mtod(m, struct ip *);
+	struct pfsync_header *ph;
+	struct pfsync_subheader subh;
+
+	int offset;
+	int rv;
+	uint16_t count;
+
+	V_pfsyncstats.pfsyncs_ipackets++;
+
+	/* Verify that we have a sync interface configured. */
+	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
+	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+		goto done;
+
+	/* verify that the packet came in on the right interface */
+	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
+		V_pfsyncstats.pfsyncs_badif++;
+		goto done;
+	}
+
+	sc->sc_ifp->if_ipackets++;
+	sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
+	/* verify that the IP TTL is 255. */
+	if (ip->ip_ttl != PFSYNC_DFLTTL) {
+		V_pfsyncstats.pfsyncs_badttl++;
+		goto done;
+	}
+
+	offset = ip->ip_hl << 2;
+	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
+		V_pfsyncstats.pfsyncs_hdrops++;
+		goto done;
+	}
+
+	if (offset + sizeof(*ph) > m->m_len) {
+		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
+			V_pfsyncstats.pfsyncs_hdrops++;
+			return;
+		}
+		ip = mtod(m, struct ip *);
+	}
+	ph = (struct pfsync_header *)((char *)ip + offset);
+
+	/* verify the version */
+	if (ph->version != PFSYNC_VERSION) {
+		V_pfsyncstats.pfsyncs_badver++;
+		goto done;
+	}
+
+	/* Cheaper to grab this now than having to mess with mbufs later */
+	pkt.ip = ip;
+	pkt.src = ip->ip_src;
+	pkt.flags = 0;
+
+	/*
+	 * Trusting pf_chksum during packet processing, as well as seeking
+	 * in interface name tree, require holding PF_RULES_RLOCK().
+	 */
+	PF_RULES_RLOCK();
+	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
+		pkt.flags |= PFSYNC_SI_CKSUM;
+
+	offset += sizeof(*ph);
+	for (;;) {
+		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
+		offset += sizeof(subh);
+
+		if (subh.action >= PFSYNC_ACT_MAX) {
+			V_pfsyncstats.pfsyncs_badact++;
+			PF_RULES_RUNLOCK();
+			goto done;
+		}
+
+		count = ntohs(subh.count);
+		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
+		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
+		if (rv == -1) {
+			PF_RULES_RUNLOCK();
+			return;
+		}
+
+		offset += rv;
+	}
+	PF_RULES_RUNLOCK();
+
+done:
+	m_freem(m);
+}
+
+static int
+pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_clr *clr;
+	struct mbuf *mp;
+	int len = sizeof(*clr) * count;
+	int i, offp;
+	u_int32_t creatorid;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	clr = (struct pfsync_clr *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		creatorid = clr[i].creatorid;
+
+		if (clr[i].ifname[0] != '\0' &&
+		    pfi_kif_find(clr[i].ifname) == NULL)
+			continue;
+
+		for (int i = 0; i <= V_pf_hashmask; i++) {
+			struct pf_idhash *ih = &V_pf_idhash[i];
+			struct pf_state *s;
+relock:
+			PF_HASHROW_LOCK(ih);
+			LIST_FOREACH(s, &ih->states, entry) {
+				if (s->creatorid == creatorid) {
+					s->state_flags |= PFSTATE_NOSYNC;
+					pf_unlink_state(s, PF_ENTER_LOCKED);
+					goto relock;
+				}
+			}
+			PF_HASHROW_UNLOCK(ih);
+		}
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct pfsync_state *sa, *sp;
+	int len = sizeof(*sp) * count;
+	int i, offp;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		/* Check for invalid values. */
+		if (sp->timeout >= PFTM_MAX ||
+		    sp->src.state > PF_TCPS_PROXY_DST ||
+		    sp->dst.state > PF_TCPS_PROXY_DST ||
+		    sp->direction > PF_OUT ||
+		    (sp->af != AF_INET && sp->af != AF_INET6)) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC)
+				printf("%s: invalid value\n", __func__);
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
+			/* Drop out, but process the rest of the actions. */
+			break;
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_ins_ack *ia, *iaa;
+	struct pf_state *st;
+
+	struct mbuf *mp;
+	int len = count * sizeof(*ia);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		ia = &iaa[i];
+
+		st = pf_find_state_byid(ia->id, ia->creatorid);
+		if (st == NULL)
+			continue;
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(V_pfsyncif);
+			pfsync_undefer_state(st, 0);
+			PFSYNC_UNLOCK(V_pfsyncif);
+		}
+		PF_STATE_UNLOCK(st);
+	}
+	/*
+	 * XXX this is not yet implemented, but we know the size of the
+	 * message so we can skip it.
+	 */
+
+	return (count * sizeof(struct pfsync_ins_ack));
+}
+
+static int
+pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
+    struct pfsync_state_peer *dst)
+{
+	int sfail = 0;
+
+	PF_STATE_LOCK_ASSERT(st);
+
+	/*
+	 * The state should never go backwards except
+	 * for syn-proxy states.  Neither should the
+	 * sequence window slide backwards.
+	 */
+	if (st->src.state > src->state &&
+	    (st->src.state < PF_TCPS_PROXY_SRC ||
+	    src->state >= PF_TCPS_PROXY_SRC))
+		sfail = 1;
+	else if (SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))
+		sfail = 3;
+	else if (st->dst.state > dst->state) {
+		/* There might still be useful
+		 * information about the src state here,
+		 * so import that part of the update,
+		 * then "fail" so we send the updated
+		 * state back to the peer who is missing
+		 * what we know. */
+		pf_state_peer_ntoh(src, &st->src);
+		/* XXX do anything with timeouts? */
+		sfail = 7;
+	} else if (st->dst.state >= TCPS_SYN_SENT &&
+	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))
+		sfail = 4;
+
+	return (sfail);
+}
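pfsync_upd_tcp() relies on SEQ_GT() for its "never slide backwards" checks; the macro compares 32-bit sequence numbers in modular arithmetic, so it stays correct across wraparound. A standalone illustration (the macro body here matches the SEQ_GT() definition in <netinet/tcp_seq.h>):

	#include <stdint.h>
	#include <stdio.h>

	#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

	int
	main(void)
	{
		uint32_t old = 0xfffffff0U;	/* just before wraparound */
		uint32_t new = 0x00000010U;	/* 0x20 bytes later, wrapped */

		/* Plain '>' gets this wrong; the modular compare does not. */
		printf("new > old: %d, SEQ_GT(new, old): %d\n",
		    new > old, SEQ_GT(new, old));	/* prints 0, 1 */
		return (0);
	}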
+
+static int
+pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_state *sa, *sp;
+	struct pf_state_key *sk;
+	struct pf_state *st;
+	int sfail;
+
+	struct mbuf *mp;
+	int len = count * sizeof(*sp);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		/* check for invalid values */
+		if (sp->timeout >= PFTM_MAX ||
+		    sp->src.state > PF_TCPS_PROXY_DST ||
+		    sp->dst.state > PF_TCPS_PROXY_DST) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync_input: PFSYNC_ACT_UPD: "
+				    "invalid value\n");
+			}
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		st = pf_find_state_byid(sp->id, sp->creatorid);
+		if (st == NULL) {
+			/* insert the update */
+			if (pfsync_state_import(sp, 0))
+				V_pfsyncstats.pfsyncs_badstate++;
+			continue;
+		}
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(sc);
+			pfsync_undefer_state(st, 1);
+			PFSYNC_UNLOCK(sc);
+		}
+
+		sk = st->key[PF_SK_WIRE];	/* XXX right one? */
+		sfail = 0;
+		if (sk->proto == IPPROTO_TCP)
+			sfail = pfsync_upd_tcp(st, &sp->src, &sp->dst);
+		else {
+			/*
+			 * Non-TCP protocol state machines always go
+			 * forwards
+			 */
+			if (st->src.state > sp->src.state)
+				sfail = 5;
+			else if (st->dst.state > sp->dst.state)
+				sfail = 6;
+		}
+
+		if (sfail) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync: %s stale update (%d)"
+				    " id: %016llx creatorid: %08x\n",
+				    (sfail < 7 ? "ignoring" : "partial"),
+				    sfail, (unsigned long long)be64toh(st->id),
+				    ntohl(st->creatorid));
+			}
+			V_pfsyncstats.pfsyncs_stale++;
+
+			pfsync_update_state(st);
+			PF_STATE_UNLOCK(st);
+			PFSYNC_LOCK(sc);
+			pfsync_push(sc);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+		pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
+		pf_state_peer_ntoh(&sp->src, &st->src);
+		pf_state_peer_ntoh(&sp->dst, &st->dst);
+		st->expire = time_uptime;
+		st->timeout = sp->timeout;
+		st->pfsync_time = time_uptime;
+		PF_STATE_UNLOCK(st);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_upd_c *ua, *up;
+	struct pf_state_key *sk;
+	struct pf_state *st;
+
+	int len = count * sizeof(*up);
+	int sfail;
+
+	struct mbuf *mp;
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		up = &ua[i];
+
+		/* check for invalid values */
+		if (up->timeout >= PFTM_MAX ||
+		    up->src.state > PF_TCPS_PROXY_DST ||
+		    up->dst.state > PF_TCPS_PROXY_DST) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync_input: "
+				    "PFSYNC_ACT_UPD_C: "
+				    "invalid value\n");
+			}
+			V_pfsyncstats.pfsyncs_badval++;
+			continue;
+		}
+
+		st = pf_find_state_byid(up->id, up->creatorid);
+		if (st == NULL) {
+			/* We don't have this state. Ask for it. */
+			PFSYNC_LOCK(sc);
+			pfsync_request_update(up->creatorid, up->id);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+
+		if (st->state_flags & PFSTATE_ACK) {
+			PFSYNC_LOCK(sc);
+			pfsync_undefer_state(st, 1);
+			PFSYNC_UNLOCK(sc);
+		}
+
+		sk = st->key[PF_SK_WIRE];	/* XXX right one? */
+		sfail = 0;
+		if (sk->proto == IPPROTO_TCP)
+			sfail = pfsync_upd_tcp(st, &up->src, &up->dst);
+		else {
+			/*
+			 * Non-TCP protocol state machines always go forwards
+			 */
+			if (st->src.state > up->src.state)
+				sfail = 5;
+			else if (st->dst.state > up->dst.state)
+				sfail = 6;
+		}
+
+		if (sfail) {
+			if (V_pf_status.debug >= PF_DEBUG_MISC) {
+				printf("pfsync: ignoring stale update "
+				    "(%d) id: %016llx "
+				    "creatorid: %08x\n", sfail,
+				    (unsigned long long)be64toh(st->id),
+				    ntohl(st->creatorid));
+			}
+			V_pfsyncstats.pfsyncs_stale++;
+
+			pfsync_update_state(st);
+			PF_STATE_UNLOCK(st);
+			PFSYNC_LOCK(sc);
+			pfsync_push(sc);
+			PFSYNC_UNLOCK(sc);
+			continue;
+		}
+		pfsync_alloc_scrub_memory(&up->dst, &st->dst);
+		pf_state_peer_ntoh(&up->src, &st->src);
+		pf_state_peer_ntoh(&up->dst, &st->dst);
+		st->expire = time_uptime;
+		st->timeout = up->timeout;
+		st->pfsync_time = time_uptime;
+		PF_STATE_UNLOCK(st);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_upd_req *ur, *ura;
+	struct mbuf *mp;
+	int len = count * sizeof(*ur);
+	int i, offp;
+
+	struct pf_state *st;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	ura = (struct pfsync_upd_req *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		ur = &ura[i];
+
+		if (ur->id == 0 && ur->creatorid == 0)
+			pfsync_bulk_start();
+		else {
+			st = pf_find_state_byid(ur->id, ur->creatorid);
+			if (st == NULL) {
+				V_pfsyncstats.pfsyncs_badstate++;
+				continue;
+			}
+			if (st->state_flags & PFSTATE_NOSYNC) {
+				PF_STATE_UNLOCK(st);
+				continue;
+			}
+
+			pfsync_update_state_req(st);
+			PF_STATE_UNLOCK(st);
+		}
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct pfsync_state *sa, *sp;
+	struct pf_state *st;
+	int len = count * sizeof(*sp);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_state *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		st = pf_find_state_byid(sp->id, sp->creatorid);
+		if (st == NULL) {
+			V_pfsyncstats.pfsyncs_badstate++;
+			continue;
+		}
+		st->state_flags |= PFSTATE_NOSYNC;
+		pf_unlink_state(st, PF_ENTER_LOCKED);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct mbuf *mp;
+	struct pfsync_del_c *sa, *sp;
+	struct pf_state *st;
+	int len = count * sizeof(*sp);
+	int offp, i;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	sa = (struct pfsync_del_c *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++) {
+		sp = &sa[i];
+
+		st = pf_find_state_byid(sp->id, sp->creatorid);
+		if (st == NULL) {
+			V_pfsyncstats.pfsyncs_badstate++;
+			continue;
+		}
+
+		st->state_flags |= PFSTATE_NOSYNC;
+		pf_unlink_state(st, PF_ENTER_LOCKED);
+	}
+
+	return (len);
+}
+
+static int
+pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bus *bus;
+	struct mbuf *mp;
+	int len = count * sizeof(*bus);
+	int offp;
+
+	PFSYNC_BLOCK(sc);
+
+	/* If we're not waiting for a bulk update, who cares. */
+	if (sc->sc_ureq_sent == 0) {
+		PFSYNC_BUNLOCK(sc);
+		return (len);
+	}
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		PFSYNC_BUNLOCK(sc);
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	bus = (struct pfsync_bus *)(mp->m_data + offp);
+
+	switch (bus->status) {
+	case PFSYNC_BUS_START:
+		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
+		    V_pf_limits[PF_LIMIT_STATES].limit /
+		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
+		    sizeof(struct pfsync_state)),
+		    pfsync_bulk_fail, sc);
+		if (V_pf_status.debug >= PF_DEBUG_MISC)
+			printf("pfsync: received bulk update start\n");
+		break;
+
+	case PFSYNC_BUS_END:
+		if (time_uptime - ntohl(bus->endtime) >=
+		    sc->sc_ureq_sent) {
+			/* that's it, we're happy */
+			sc->sc_ureq_sent = 0;
+			sc->sc_bulk_tries = 0;
+			callout_stop(&sc->sc_bulkfail_tmo);
+			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
+				    "pfsync bulk done");
+			sc->sc_flags |= PFSYNCF_OK;
+			if (V_pf_status.debug >= PF_DEBUG_MISC)
+				printf("pfsync: received valid "
+				    "bulk update end\n");
+		} else {
+			if (V_pf_status.debug >= PF_DEBUG_MISC)
+				printf("pfsync: received invalid "
+				    "bulk update end: bad timestamp\n");
+		}
+		break;
+	}
+	PFSYNC_BUNLOCK(sc);
+
+	return (len);
+}
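The PFSYNC_BUS_START timeout above scales with how many packets a full state-table dump needs: 4*hz of fixed slack plus one tick per expected packet, where one packet carries roughly (MTU - PFSYNC_MINPKT) / sizeof(struct pfsync_state) states. A worked computation with assumed numbers (hz, the state limit, and both sizes below are illustrative, not authoritative):

	#include <stdio.h>

	int
	main(void)
	{
		int hz = 1000;		/* kern.hz; assumption */
		int limit = 10000;	/* PF_LIMIT_STATES limit; assumption */
		int mtu = 1500;
		int minpkt = 32;	/* stand-in for PFSYNC_MINPKT */
		int state_sz = 136;	/* stand-in for sizeof(struct pfsync_state) */

		int per_pkt = (mtu - minpkt) / state_sz;	/* states per packet */
		int ticks = 4 * hz + limit / per_pkt;		/* callout timeout */

		/* ~1 tick of slack per expected packet, on top of 4 seconds */
		printf("%d states/packet -> %d ticks (%.1f s at hz=%d)\n",
		    per_pkt, ticks, (double)ticks / hz, hz);
		return (0);
	}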
+
+static int
+pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	int len = count * sizeof(struct pfsync_tdb);
+
+#if defined(IPSEC)
+	struct pfsync_tdb *tp;
+	struct mbuf *mp;
+	int offp;
+	int i;
+	int s;
+
+	mp = m_pulldown(m, offset, len, &offp);
+	if (mp == NULL) {
+		V_pfsyncstats.pfsyncs_badlen++;
+		return (-1);
+	}
+	tp = (struct pfsync_tdb *)(mp->m_data + offp);
+
+	for (i = 0; i < count; i++)
+		pfsync_update_net_tdb(&tp[i]);
+#endif
+
+	return (len);
+}
+
+#if defined(IPSEC)
+/* Update an in-kernel tdb. Silently fail if no tdb is found. */
+static void
+pfsync_update_net_tdb(struct pfsync_tdb *pt)
+{
+	struct tdb *tdb;
+	int s;
+
+	/* check for invalid values */
+	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+	    (pt->dst.sa.sa_family != AF_INET &&
+	    pt->dst.sa.sa_family != AF_INET6))
+		goto bad;
+
+	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
+	if (tdb) {
+		pt->rpl = ntohl(pt->rpl);
+		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
+
+		/* Neither replay nor byte counter should ever decrease. */
+		if (pt->rpl < tdb->tdb_rpl ||
+		    pt->cur_bytes < tdb->tdb_cur_bytes) {
+			goto bad;
+		}
+
+		tdb->tdb_rpl = pt->rpl;
+		tdb->tdb_cur_bytes = pt->cur_bytes;
+	}
+	return;
+
+bad:
+	if (V_pf_status.debug >= PF_DEBUG_MISC)
+		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+		    "invalid value\n");
+	V_pfsyncstats.pfsyncs_badstate++;
+	return;
+}
+#endif
+
+
+static int
+pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	/* check if we are at the right place in the packet */
+	if (offset != m->m_pkthdr.len - sizeof(struct pfsync_eof))
+		V_pfsyncstats.pfsyncs_badact++;
+
+	/* we're done. free and let the caller return */
+	m_freem(m);
+	return (-1);
+}
+
+static int
+pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+	V_pfsyncstats.pfsyncs_badact++;
+
+	m_freem(m);
+	return (-1);
+}
+
+static int
+pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+	struct route *rt)
+{
+	m_freem(m);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	struct pfsync_softc *sc = ifp->if_softc;
+	struct ifreq *ifr = (struct ifreq *)data;
+	struct pfsyncreq pfsyncr;
+	int error;
+
+	switch (cmd) {
+	case SIOCSIFFLAGS:
+		PFSYNC_LOCK(sc);
+		if (ifp->if_flags & IFF_UP)
+			ifp->if_drv_flags |= IFF_DRV_RUNNING;
+		else
+			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+		PFSYNC_UNLOCK(sc);
+		break;
+	case SIOCSIFMTU:
+		if (!sc->sc_sync_if ||
+		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
+		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
+			return (EINVAL);
+		if (ifr->ifr_mtu < ifp->if_mtu) {
+			PFSYNC_LOCK(sc);
+			if (sc->sc_len > PFSYNC_MINPKT)
+				pfsync_sendout(1);
+			PFSYNC_UNLOCK(sc);
+		}
+		ifp->if_mtu = ifr->ifr_mtu;
+		break;
+	case SIOCGETPFSYNC:
+		bzero(&pfsyncr, sizeof(pfsyncr));
+		PFSYNC_LOCK(sc);
+		if (sc->sc_sync_if) {
+			strlcpy(pfsyncr.pfsyncr_syncdev,
+			    sc->sc_sync_if->if_xname, IFNAMSIZ);
+		}
+		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
+		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
+		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
+		    (sc->sc_flags & PFSYNCF_DEFER));
+		PFSYNC_UNLOCK(sc);
+		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
+
+	case SIOCSETPFSYNC:
+	    {
+		struct ip_moptions *imo = &sc->sc_imo;
+		struct ifnet *sifp;
+		struct ip *ip;
+		void *mship = NULL;
+
+		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
+			return (error);
+		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
+			return (error);
+
+		if (pfsyncr.pfsyncr_maxupdates > 255)
+			return (EINVAL);
+
+		if (pfsyncr.pfsyncr_syncdev[0] == 0)
+			sifp = NULL;
+		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
+			return (EINVAL);
+
+		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0 && sifp != NULL)
+			mship = malloc((sizeof(struct in_multi *) *
+			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
+
+		PFSYNC_LOCK(sc);
+		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
+			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
+		else
+			sc->sc_sync_peer.s_addr =
+			    pfsyncr.pfsyncr_syncpeer.s_addr;
+
+		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
+		if (pfsyncr.pfsyncr_defer) {
+			sc->sc_flags |= PFSYNCF_DEFER;
+			pfsync_defer_ptr = pfsync_defer;
+		} else {
+			sc->sc_flags &= ~PFSYNCF_DEFER;
+			pfsync_defer_ptr = NULL;
+		}
+
+		if (sifp == NULL) {
+			if (sc->sc_sync_if)
+				if_rele(sc->sc_sync_if);
+			sc->sc_sync_if = NULL;
+			if (imo->imo_membership)
+				pfsync_multicast_cleanup(sc);
+			PFSYNC_UNLOCK(sc);
+			break;
+		}
+
+		if (sc->sc_len > PFSYNC_MINPKT &&
+		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
+		    (sc->sc_sync_if != NULL &&
+		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
+		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
+			pfsync_sendout(1);
+
+		if (imo->imo_membership)
+			pfsync_multicast_cleanup(sc);
+
+		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
+			error = pfsync_multicast_setup(sc, sifp, mship);
+			if (error) {
+				if_rele(sifp);
+				free(mship, M_PFSYNC);
+				return (error);
+			}
+		}
+		if (sc->sc_sync_if)
+			if_rele(sc->sc_sync_if);
+		sc->sc_sync_if = sifp;
+
+		ip = &sc->sc_template;
+		bzero(ip, sizeof(*ip));
+		ip->ip_v = IPVERSION;
+		ip->ip_hl = sizeof(sc->sc_template) >> 2;
+		ip->ip_tos = IPTOS_LOWDELAY;
+		/* len and id are set later. */
+		ip->ip_off = IP_DF;
+		ip->ip_ttl = PFSYNC_DFLTTL;
+		ip->ip_p = IPPROTO_PFSYNC;
+		ip->ip_src.s_addr = INADDR_ANY;
+		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
+
+		/* Request a full state table update. */
+		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+			(*carp_demote_adj_p)(V_pfsync_carp_adj,
+			    "pfsync bulk start");
+		sc->sc_flags &= ~PFSYNCF_OK;
+		if (V_pf_status.debug >= PF_DEBUG_MISC)
+			printf("pfsync: requesting bulk update\n");
+		pfsync_request_update(0, 0);
+		PFSYNC_UNLOCK(sc);
+		PFSYNC_BLOCK(sc);
+		sc->sc_ureq_sent = time_uptime;
+		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
+		    sc);
+		PFSYNC_BUNLOCK(sc);
+
+		break;
+	    }
+	default:
+		return (ENOTTY);
+	}
+
+	return (0);
+}
+
+static int
+pfsync_out_state(struct pf_state *st, struct mbuf *m, int offset)
+{
+	struct pfsync_state *sp = (struct pfsync_state *)(m->m_data + offset);
+
+	pfsync_state_export(sp, st);
+
+	return (sizeof(*sp));
+}
+
+static int
+pfsync_out_iack(struct pf_state *st, struct mbuf *m, int offset)
+{
+	struct pfsync_ins_ack *iack =
+	    (struct pfsync_ins_ack *)(m->m_data + offset);
+
+	iack->id = st->id;
+	iack->creatorid = st->creatorid;
+
+	return (sizeof(*iack));
+}
+
+static int
+pfsync_out_upd_c(struct pf_state *st, struct mbuf *m, int offset)
+{
+	struct pfsync_upd_c *up = (struct pfsync_upd_c *)(m->m_data + offset);
+
+	bzero(up, sizeof(*up));
+	up->id = st->id;
+	pf_state_peer_hton(&st->src, &up->src);
+	pf_state_peer_hton(&st->dst, &up->dst);
+	up->creatorid = st->creatorid;
+	up->timeout = st->timeout;
+
+	return (sizeof(*up));
+}
+
+static int
+pfsync_out_del(struct pf_state *st, struct mbuf *m, int offset)
+{
+	struct pfsync_del_c *dp = (struct pfsync_del_c *)(m->m_data + offset);
+
+	dp->id = st->id;
+	dp->creatorid = st->creatorid;
+
+	st->state_flags |= PFSTATE_NOSYNC;
+
+	return (sizeof(*dp));
+}
+
+static void
+pfsync_drop(struct pfsync_softc *sc)
+{
+	struct pf_state *st, *next;
+	struct pfsync_upd_req_item *ur;
+	int q;
+
+	for (q = 0; q < PFSYNC_S_COUNT; q++) {
+		if (TAILQ_EMPTY(&sc->sc_qs[q]))
+			continue;
+
+		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
+			KASSERT(st->sync_state == q,
+			    ("%s: st->sync_state == q", __func__));
+			st->sync_state = PFSYNC_S_NONE;
+			pf_release_state(st);
+		}
+		TAILQ_INIT(&sc->sc_qs[q]);
+	}
+
+	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+		free(ur, M_PFSYNC);
+	}
+
+	sc->sc_plus = NULL;
+	sc->sc_len = PFSYNC_MINPKT;
+}
+
+static void
+pfsync_sendout(int schedswi)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct ifnet *ifp = sc->sc_ifp;
+	struct mbuf *m;
+	struct ip *ip;
+	struct pfsync_header *ph;
+	struct pfsync_subheader *subh;
+	struct pf_state *st, *next;
+	struct pfsync_upd_req_item *ur;
+	int offset;
+	int q, count = 0;
+
+	KASSERT(sc != NULL, ("%s: null sc", __func__));
+	KASSERT(sc->sc_len > PFSYNC_MINPKT,
+	    ("%s: sc_len %zu", __func__, sc->sc_len));
+	PFSYNC_LOCK_ASSERT(sc);
+
+	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
+		pfsync_drop(sc);
+		return;
+	}
+
+	m = m_get2(M_NOWAIT, MT_DATA, M_PKTHDR, max_linkhdr + sc->sc_len);
+	if (m == NULL) {
+		sc->sc_ifp->if_oerrors++;
+		V_pfsyncstats.pfsyncs_onomem++;
+		return;
+	}
+	m->m_data += max_linkhdr;
+	m->m_len = m->m_pkthdr.len = sc->sc_len;
+
+	/* build the ip header */
+	ip = (struct ip *)m->m_data;
+	bcopy(&sc->sc_template, ip, sizeof(*ip));
+	offset = sizeof(*ip);
+
+	ip->ip_len = m->m_pkthdr.len;
+	ip->ip_id = htons(ip_randomid());
+
+	/* build the pfsync header */
+	ph = (struct pfsync_header *)(m->m_data + offset);
+	bzero(ph, sizeof(*ph));
+	offset += sizeof(*ph);
+
+	ph->version = PFSYNC_VERSION;
+	ph->len = htons(sc->sc_len - sizeof(*ip));
+	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
+
+	/* walk the queues */
+	for (q = 0; q < PFSYNC_S_COUNT; q++) {
+		if (TAILQ_EMPTY(&sc->sc_qs[q]))
+			continue;
+
+		subh = (struct pfsync_subheader *)(m->m_data + offset);
+		offset += sizeof(*subh);
+
+		count = 0;
+		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
+			KASSERT(st->sync_state == q,
+			    ("%s: st->sync_state == q", __func__));
+			/*
+			 * XXXGL: some of write methods do unlocked reads
+			 * of state data :(
+			 */
+			offset += pfsync_qs[q].write(st, m, offset);
+			st->sync_state = PFSYNC_S_NONE;
+			pf_release_state(st);
+			count++;
+		}
+		TAILQ_INIT(&sc->sc_qs[q]);
+
+		bzero(subh, sizeof(*subh));
+		subh->action = pfsync_qs[q].action;
+		subh->count = htons(count);
+		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
+	}
+
+	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+		subh = (struct pfsync_subheader *)(m->m_data + offset);
+		offset += sizeof(*subh);
+
+		count = 0;
+		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+
+			bcopy(&ur->ur_msg, m->m_data + offset,
+			    sizeof(ur->ur_msg));
+			offset += sizeof(ur->ur_msg);
+			free(ur, M_PFSYNC);
+			count++;
+		}
+
+		bzero(subh, sizeof(*subh));
+		subh->action = PFSYNC_ACT_UPD_REQ;
+		subh->count = htons(count);
+		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
+	}
+
+	/* has someone built a custom region for us to add? */
+	if (sc->sc_plus != NULL) {
+		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
+		offset += sc->sc_pluslen;
+
+		sc->sc_plus = NULL;
+	}
+
+	subh = (struct pfsync_subheader *)(m->m_data + offset);
+	offset += sizeof(*subh);
+
+	bzero(subh, sizeof(*subh));
+	subh->action = PFSYNC_ACT_EOF;
+	subh->count = htons(1);
+	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
+
+	/* XXX write checksum in EOF here */
+
+	/* we're done, let's put it on the wire */
+	if (ifp->if_bpf) {
+		m->m_data += sizeof(*ip);
+		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+		BPF_MTAP(ifp, m);
+		m->m_data -= sizeof(*ip);
+		m->m_len = m->m_pkthdr.len = sc->sc_len;
+	}
+
+	if (sc->sc_sync_if == NULL) {
+		sc->sc_len = PFSYNC_MINPKT;
+		m_freem(m);
+		return;
+	}
+
+	sc->sc_ifp->if_opackets++;
+	sc->sc_ifp->if_obytes += m->m_pkthdr.len;
+	sc->sc_len = PFSYNC_MINPKT;
+
+	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
+		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+	else {
+		m_freem(m);
+		sc->sc_ifp->if_snd.ifq_drops++;
+	}
+	if (schedswi)
+		swi_sched(V_pfsync_swi_cookie, 0);
+}
+
+static void
+pfsync_insert_state(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+
+	if (st->state_flags & PFSTATE_NOSYNC)
+		return;
+
+	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
+	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
+		st->state_flags |= PFSTATE_NOSYNC;
+		return;
+	}
+
+	KASSERT(st->sync_state == PFSYNC_S_NONE,
+	    ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
+
+	PFSYNC_LOCK(sc);
+	if (sc->sc_len == PFSYNC_MINPKT)
+		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+	pfsync_q_ins(st, PFSYNC_S_INS);
+	PFSYNC_UNLOCK(sc);
+
+	st->sync_updates = 0;
+}
+
+static int
+pfsync_defer(struct pf_state *st, struct mbuf *m)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_deferral *pd;
+
+	if (m->m_flags & (M_BCAST|M_MCAST))
+		return (0);
+
+	PFSYNC_LOCK(sc);
+
+	if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
+	    !(sc->sc_flags & PFSYNCF_DEFER)) {
+		PFSYNC_UNLOCK(sc);
+		return (0);
+	}
+
+	if (sc->sc_deferred >= 128)
+		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
+
+	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
+	if (pd == NULL)
+		return (0);
+	sc->sc_deferred++;
+
+	m->m_flags |= M_SKIP_FIREWALL;
+	st->state_flags |= PFSTATE_ACK;
+
+	pd->pd_sc = sc;
+	pd->pd_refs = 0;
+	pd->pd_st = st;
+	pf_ref_state(st);
+	pd->pd_m = m;
+
+	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
+	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
+
+	pfsync_push(sc);
+
+	return (1);
+}
+
+static void
+pfsync_undefer(struct pfsync_deferral *pd, int drop)
+{
+	struct pfsync_softc *sc = pd->pd_sc;
+	struct mbuf *m = pd->pd_m;
+	struct pf_state *st = pd->pd_st;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+	sc->sc_deferred--;
+	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
+	free(pd, M_PFSYNC);
+	pf_release_state(st);
+
+	if (drop)
+		m_freem(m);
+	else {
+		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+		pfsync_push(sc);
+	}
+}
+
+static void
+pfsync_defer_tmo(void *arg)
+{
+	struct pfsync_deferral *pd = arg;
+	struct pfsync_softc *sc = pd->pd_sc;
+	struct mbuf *m = pd->pd_m;
+	struct pf_state *st = pd->pd_st;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
+
+	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+	sc->sc_deferred--;
+	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
+	if (pd->pd_refs == 0)
+		free(pd, M_PFSYNC);
+	PFSYNC_UNLOCK(sc);
+
+	ip_output(m, NULL, NULL, 0, NULL, NULL);
+
+	pf_release_state(st);
+
+	CURVNET_RESTORE();
+}
+
+static void
+pfsync_undefer_state(struct pf_state *st, int drop)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_deferral *pd;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+		if (pd->pd_st == st) {
+			if (callout_stop(&pd->pd_tmo))
+				pfsync_undefer(pd, drop);
+			return;
+		}
+	}
+
+	panic("%s: unable to find deferred state", __func__);
+}
+
+static void
+pfsync_update_state(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	int sync = 0;
+
+	PF_STATE_LOCK_ASSERT(st);
+	PFSYNC_LOCK(sc);
+
+	if (st->state_flags & PFSTATE_ACK)
+		pfsync_undefer_state(st, 0);
+	if (st->state_flags & PFSTATE_NOSYNC) {
+		if (st->sync_state != PFSYNC_S_NONE)
+			pfsync_q_del(st);
+		PFSYNC_UNLOCK(sc);
+		return;
+	}
+
+	if (sc->sc_len == PFSYNC_MINPKT)
+		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+	switch (st->sync_state) {
+	case PFSYNC_S_UPD_C:
+	case PFSYNC_S_UPD:
+	case PFSYNC_S_INS:
+		/* we're already handling it */
+
+		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
+			st->sync_updates++;
+			if (st->sync_updates >= sc->sc_maxupdates)
+				sync = 1;
+		}
+		break;
+
+	case PFSYNC_S_IACK:
+		pfsync_q_del(st);
+	case PFSYNC_S_NONE:
+		pfsync_q_ins(st, PFSYNC_S_UPD_C);
+		st->sync_updates = 0;
+		break;
+
+	default:
+		panic("%s: unexpected sync state %d", __func__, st->sync_state);
+	}
+
+	if (sync || (time_uptime - st->pfsync_time) < 2)
+		pfsync_push(sc);
+
+	PFSYNC_UNLOCK(sc);
+}
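All of the queueing paths share one length budget: sc_len starts at PFSYNC_MINPKT, each queued record adds its own size plus a subheader when its queue was empty, and pfsync_sendout(1) is forced as soon as the next record would overflow the sync interface's MTU (see pfsync_request_update() and pfsync_q_ins() below). A toy userland model of that accounting, with made-up sizes:

	#include <stddef.h>
	#include <stdio.h>

	#define MTU	1500
	#define MINPKT	(20 + 4 + 4 + 4)	/* ip + header + subheader + eof; toy */

	static size_t pkt_len = MINPKT;

	/* Mirrors pfsync_q_ins() accounting: flush before overflowing the MTU. */
	static void
	q_ins(size_t rec_len, int queue_was_empty, int *flushes)
	{
		size_t nlen = rec_len + (queue_was_empty ? 4 : 0);	/* + subheader */

		if (pkt_len + nlen > MTU) {
			(*flushes)++;			/* pfsync_sendout(1) */
			pkt_len = MINPKT;
			nlen = 4 + rec_len;		/* fresh packet: subheader + record */
		}
		pkt_len += nlen;
	}

	int
	main(void)
	{
		int flushes = 0;

		for (int i = 0; i < 100; i++)
			q_ins(136, i == 0, &flushes);	/* 136: toy record size */
		/* prints: len 1396 after 100 inserts, 9 flushes */
		printf("len %zu after 100 inserts, %d flushes\n", pkt_len, flushes);
		return (0);
	}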
+
+static void
+pfsync_request_update(u_int32_t creatorid, u_int64_t id)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_upd_req_item *item;
+	size_t nlen = sizeof(struct pfsync_upd_req);
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	/*
+	 * This code does nothing to prevent multiple update requests for the
+	 * same state being generated.
+	 */
+	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
+	if (item == NULL)
+		return; /* XXX stats */
+
+	item->ur_msg.id = id;
+	item->ur_msg.creatorid = creatorid;
+
+	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+		nlen += sizeof(struct pfsync_subheader);
+
+	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1);
+
+		nlen = sizeof(struct pfsync_subheader) +
+		    sizeof(struct pfsync_upd_req);
+	}
+
+	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+	sc->sc_len += nlen;
+
+	pfsync_push(sc);
+}
+
+static void
+pfsync_update_state_req(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+
+	PF_STATE_LOCK_ASSERT(st);
+	PFSYNC_LOCK(sc);
+
+	if (st->state_flags & PFSTATE_NOSYNC) {
+		if (st->sync_state != PFSYNC_S_NONE)
+			pfsync_q_del(st);
+		PFSYNC_UNLOCK(sc);
+		return;
+	}
+
+	switch (st->sync_state) {
+	case PFSYNC_S_UPD_C:
+	case PFSYNC_S_IACK:
+		pfsync_q_del(st);
+	case PFSYNC_S_NONE:
+		pfsync_q_ins(st, PFSYNC_S_UPD);
+		pfsync_push(sc);
+		break;
+
+	case PFSYNC_S_INS:
+	case PFSYNC_S_UPD:
+	case PFSYNC_S_DEL:
+		/* we're already handling it */
+		break;
+
+	default:
+		panic("%s: unexpected sync state %d", __func__, st->sync_state);
+	}
+
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_delete_state(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+
+	PFSYNC_LOCK(sc);
+	if (st->state_flags & PFSTATE_ACK)
+		pfsync_undefer_state(st, 1);
+	if (st->state_flags & PFSTATE_NOSYNC) {
+		if (st->sync_state != PFSYNC_S_NONE)
+			pfsync_q_del(st);
+		PFSYNC_UNLOCK(sc);
+		return;
+	}
+
+	if (sc->sc_len == PFSYNC_MINPKT)
+		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+	switch (st->sync_state) {
+	case PFSYNC_S_INS:
+		/* We never got to tell the world so just forget about it. */
+		pfsync_q_del(st);
+		break;
+
+	case PFSYNC_S_UPD_C:
+	case PFSYNC_S_UPD:
+	case PFSYNC_S_IACK:
+		pfsync_q_del(st);
+		/* FALLTHROUGH to putting it on the del list */
+
+	case PFSYNC_S_NONE:
+		pfsync_q_ins(st, PFSYNC_S_DEL);
+		break;
+
+	default:
+		panic("%s: unexpected sync state %d", __func__, st->sync_state);
+	}
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_clear_states(u_int32_t creatorid, const char *ifname)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	struct {
+		struct pfsync_subheader subh;
+		struct pfsync_clr clr;
+	} __packed r;
+
+	bzero(&r, sizeof(r));
+
+	r.subh.action = PFSYNC_ACT_CLR;
+	r.subh.count = htons(1);
+	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
+
+	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
+	r.clr.creatorid = creatorid;
+
+	PFSYNC_LOCK(sc);
+	pfsync_send_plus(&r, sizeof(r));
+	PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_q_ins(struct pf_state *st, int q)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	size_t nlen = pfsync_qs[q].len;
+
+	PFSYNC_LOCK_ASSERT(sc);
+
+	KASSERT(st->sync_state == PFSYNC_S_NONE,
+	    ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
+	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+	    sc->sc_len));
+
+	if (TAILQ_EMPTY(&sc->sc_qs[q]))
+		nlen += sizeof(struct pfsync_subheader);
+
+	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1);
+
+		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
+	}
+
+	sc->sc_len += nlen;
+	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+	st->sync_state = q;
+	pf_ref_state(st);
+}
+
+static void
+pfsync_q_del(struct pf_state *st)
+{
+	struct pfsync_softc *sc = V_pfsyncif;
+	int q = st->sync_state;
+
+	PFSYNC_LOCK_ASSERT(sc);
+	KASSERT(st->sync_state != PFSYNC_S_NONE,
+	    ("%s: st->sync_state != PFSYNC_S_NONE", __func__));
+
+	sc->sc_len -= pfsync_qs[q].len;
pfsync_qs[q].len; + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + st->sync_state = PFSYNC_S_NONE; + pf_release_state(st); + + if (TAILQ_EMPTY(&sc->sc_qs[q])) + sc->sc_len -= sizeof(struct pfsync_subheader); +} + +static void +pfsync_bulk_start(void) +{ + struct pfsync_softc *sc = V_pfsyncif; + + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: received bulk update request\n"); + + PFSYNC_BLOCK(sc); + + sc->sc_ureq_received = time_uptime; + sc->sc_bulk_hashid = 0; + sc->sc_bulk_stateid = 0; + pfsync_bulk_status(PFSYNC_BUS_START); + callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); + PFSYNC_BUNLOCK(sc); +} + +static void +pfsync_bulk_update(void *arg) +{ + struct pfsync_softc *sc = arg; + struct pf_state *s; + int i, sent = 0; + + PFSYNC_BLOCK_ASSERT(sc); + CURVNET_SET(sc->sc_ifp->if_vnet); + + /* + * Start with last state from previous invocation. + * It may have gone; in that case start from the + * hash slot. + */ + s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid); + + if (s != NULL) + i = PF_IDHASH(s); + else + i = sc->sc_bulk_hashid; + + for (; i <= V_pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + + if (s != NULL) + PF_HASHROW_ASSERT(ih); + else { + PF_HASHROW_LOCK(ih); + s = LIST_FIRST(&ih->states); + } + + for (; s; s = LIST_NEXT(s, entry)) { + + if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) < + sizeof(struct pfsync_state)) { + /* We've filled a packet. */ + sc->sc_bulk_hashid = i; + sc->sc_bulk_stateid = s->id; + sc->sc_bulk_creatorid = s->creatorid; + PF_HASHROW_UNLOCK(ih); + callout_reset(&sc->sc_bulk_tmo, 1, + pfsync_bulk_update, sc); + goto full; + } + + if (s->sync_state == PFSYNC_S_NONE && + s->timeout < PFTM_MAX && + s->pfsync_time <= sc->sc_ureq_received) { + PFSYNC_LOCK(sc); + pfsync_update_state_req(s); + PFSYNC_UNLOCK(sc); + sent++; + } + } + PF_HASHROW_UNLOCK(ih); + } + + /* We're done. */ + pfsync_bulk_status(PFSYNC_BUS_END); + +full: + CURVNET_RESTORE(); +} + +static void +pfsync_bulk_status(u_int8_t status) +{ + struct { + struct pfsync_subheader subh; + struct pfsync_bus bus; + } __packed r; + + struct pfsync_softc *sc = V_pfsyncif; + + bzero(&r, sizeof(r)); + + r.subh.action = PFSYNC_ACT_BUS; + r.subh.count = htons(1); + V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++; + + r.bus.creatorid = V_pf_status.hostid; + r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received); + r.bus.status = status; + + PFSYNC_LOCK(sc); + pfsync_send_plus(&r, sizeof(r)); + PFSYNC_UNLOCK(sc); +} + +static void +pfsync_bulk_fail(void *arg) +{ + struct pfsync_softc *sc = arg; + + CURVNET_SET(sc->sc_ifp->if_vnet); + + PFSYNC_BLOCK_ASSERT(sc); + + if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { + /* Try again */ + callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, + pfsync_bulk_fail, V_pfsyncif); + PFSYNC_LOCK(sc); + pfsync_request_update(0, 0); + PFSYNC_UNLOCK(sc); + } else { + /* Pretend the transfer was ok. 
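pfsync_bulk_update() above cannot hold a hash row lock across the whole state table, so each callout tick sends roughly a packet's worth of states and checkpoints its position for the next tick. A user-space sketch of such a resumable walk, not part of the patch: the buckets and the BUDGET cutoff are hypothetical stand-ins for the pf id hash and the MTU test, and where the kernel records the state id (re-finding it with pf_find_state_byid(), since a chain may change between ticks) the sketch keeps a plain bucket/index pair.

/* bulkwalk_sketch.c: resumable walk over a bucketed table. */
#include <stdio.h>

#define NBUCKETS 4
#define BUDGET   3                      /* "states per packet" */

static int buckets[NBUCKETS][5] = {     /* 0 terminates a bucket */
    { 1, 2, 3, 0 }, { 4, 0 }, { 5, 6, 7, 8, 0 }, { 9, 0 },
};
static int bulk_bucket, bulk_index;     /* resume checkpoint */

/* Send up to BUDGET states; return 1 once the table is done. */
static int
bulk_update(void)
{
    int sent = 0;

    for (int i = bulk_bucket; i < NBUCKETS; i++) {
        for (int j = (i == bulk_bucket) ? bulk_index : 0;
            buckets[i][j] != 0; j++) {
            if (sent == BUDGET) {
                bulk_bucket = i;        /* packet full: checkpoint */
                bulk_index = j;
                return (0);             /* and reschedule the callout */
            }
            printf("send state %d\n", buckets[i][j]);
            sent++;
        }
    }
    return (1);                         /* like PFSYNC_BUS_END */
}

int
main(void)
{
    while (!bulk_update())
        printf("next callout tick\n");
    return (0);
}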
*/ + sc->sc_ureq_sent = 0; + sc->sc_bulk_tries = 0; + PFSYNC_LOCK(sc); + if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) + (*carp_demote_adj_p)(-V_pfsync_carp_adj, + "pfsync bulk fail"); + sc->sc_flags |= PFSYNCF_OK; + PFSYNC_UNLOCK(sc); + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("pfsync: failed to receive bulk update\n"); + } + + CURVNET_RESTORE(); +} + +static void +pfsync_send_plus(void *plus, size_t pluslen) +{ + struct pfsync_softc *sc = V_pfsyncif; + + PFSYNC_LOCK_ASSERT(sc); + + if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu) + pfsync_sendout(1); + + sc->sc_plus = plus; + sc->sc_len += (sc->sc_pluslen = pluslen); + + pfsync_sendout(1); +} + +static void +pfsync_timeout(void *arg) +{ + struct pfsync_softc *sc = arg; + + CURVNET_SET(sc->sc_ifp->if_vnet); + PFSYNC_LOCK(sc); + pfsync_push(sc); + PFSYNC_UNLOCK(sc); + CURVNET_RESTORE(); +} + +static void +pfsync_push(struct pfsync_softc *sc) +{ + + PFSYNC_LOCK_ASSERT(sc); + + sc->sc_flags |= PFSYNCF_PUSH; + swi_sched(V_pfsync_swi_cookie, 0); +} + +static void +pfsyncintr(void *arg) +{ + struct pfsync_softc *sc = arg; + struct mbuf *m, *n; + + CURVNET_SET(sc->sc_ifp->if_vnet); + + PFSYNC_LOCK(sc); + if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) { + pfsync_sendout(0); + sc->sc_flags &= ~PFSYNCF_PUSH; + } + _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m); + PFSYNC_UNLOCK(sc); + + for (; m != NULL; m = n) { + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* + * We distinguish between a deferral packet and our + * own pfsync packet based on M_SKIP_FIREWALL + * flag. This is XXX. + */ + if (m->m_flags & M_SKIP_FIREWALL) + ip_output(m, NULL, NULL, 0, NULL, NULL); + else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, + NULL) == 0) + V_pfsyncstats.pfsyncs_opackets++; + else + V_pfsyncstats.pfsyncs_oerrors++; + } + CURVNET_RESTORE(); +} + +static int +pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) +{ + struct ip_moptions *imo = &sc->sc_imo; + int error; + + if (!(ifp->if_flags & IFF_MULTICAST)) + return (EADDRNOTAVAIL); + + imo->imo_membership = (struct in_multi **)mship; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_multicast_vif = -1; + + if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, + &imo->imo_membership[0])) != 0) { + imo->imo_membership = NULL; + return (error); + } + imo->imo_num_memberships++; + imo->imo_multicast_ifp = ifp; + imo->imo_multicast_ttl = PFSYNC_DFLTTL; + imo->imo_multicast_loop = 0; + + return (0); +} + +static void +pfsync_multicast_cleanup(struct pfsync_softc *sc) +{ + struct ip_moptions *imo = &sc->sc_imo; + + in_leavegroup(imo->imo_membership[0], NULL); + free(imo->imo_membership, M_PFSYNC); + imo->imo_membership = NULL; + imo->imo_multicast_ifp = NULL; +} + +#ifdef INET +extern struct domain inetdomain; +static struct protosw in_pfsync_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_PFSYNC, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = pfsync_input, + .pr_output = (pr_output_t *)rip_output, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +#endif + +static int +pfsync_init() +{ + VNET_ITERATOR_DECL(vnet_iter); + int error = 0; + + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_pfsync_cloner = pfsync_cloner; + V_pfsync_cloner_data = pfsync_cloner_data; + V_pfsync_cloner.ifc_data = &V_pfsync_cloner_data; + if_clone_attach(&V_pfsync_cloner); + error = swi_add(NULL, "pfsync", pfsyncintr, V_pfsyncif, + SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie); + 
CURVNET_RESTORE(); + if (error) + goto fail_locked; + } + VNET_LIST_RUNLOCK(); +#ifdef INET + error = pf_proto_register(PF_INET, &in_pfsync_protosw); + if (error) + goto fail; + error = ipproto_register(IPPROTO_PFSYNC); + if (error) { + pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); + goto fail; + } +#endif + PF_RULES_WLOCK(); + pfsync_state_import_ptr = pfsync_state_import; + pfsync_insert_state_ptr = pfsync_insert_state; + pfsync_update_state_ptr = pfsync_update_state; + pfsync_delete_state_ptr = pfsync_delete_state; + pfsync_clear_states_ptr = pfsync_clear_states; + pfsync_defer_ptr = pfsync_defer; + PF_RULES_WUNLOCK(); + + return (0); + +fail: + VNET_LIST_RLOCK(); +fail_locked: + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + if (V_pfsync_swi_cookie) { + swi_remove(V_pfsync_swi_cookie); + if_clone_detach(&V_pfsync_cloner); + } + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + return (error); +} + +static void +pfsync_uninit() +{ + VNET_ITERATOR_DECL(vnet_iter); + + PF_RULES_WLOCK(); + pfsync_state_import_ptr = NULL; + pfsync_insert_state_ptr = NULL; + pfsync_update_state_ptr = NULL; + pfsync_delete_state_ptr = NULL; + pfsync_clear_states_ptr = NULL; + pfsync_defer_ptr = NULL; + PF_RULES_WUNLOCK(); + + ipproto_unregister(IPPROTO_PFSYNC); + pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + if_clone_detach(&V_pfsync_cloner); + swi_remove(V_pfsync_swi_cookie); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); +} + +static int +pfsync_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch (type) { + case MOD_LOAD: + error = pfsync_init(); + break; + case MOD_QUIESCE: + /* + * Module should not be unloaded due to race conditions. + */ + error = EPERM; + break; + case MOD_UNLOAD: + pfsync_uninit(); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static moduledata_t pfsync_mod = { + "pfsync", + pfsync_modevent, + 0 +}; + +#define PFSYNC_MODVER 1 + +DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); +MODULE_VERSION(pfsync, PFSYNC_MODVER); +MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER); diff --git a/sys/netpfil/pf/in4_cksum.c b/sys/netpfil/pf/in4_cksum.c new file mode 100644 index 0000000..bf25baf --- /dev/null +++ b/sys/netpfil/pf/in4_cksum.c @@ -0,0 +1,120 @@ +/* $FreeBSD$ */ +/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */ +/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */ +/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> + +#include <machine/in_cksum.h> + +#define ADDCARRY(x) (x > 65535 ? 
x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; (void)ADDCARRY(sum);} + +int in4_cksum(struct mbuf *, u_int8_t, int, int); + +int +in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len) +{ + union { + struct ipovly ipov; + u_int16_t w[10]; + } u; + union { + u_int16_t s[2]; + u_int32_t l; + } l_util; + + u_int16_t *w; + int psum; + int sum = 0; + + if (nxt != 0) { + /* pseudo header */ + if (off < sizeof(struct ipovly)) + panic("in4_cksum: offset too short"); + if (m->m_len < sizeof(struct ip)) + panic("in4_cksum: bad mbuf chain"); + bzero(&u.ipov, sizeof(u.ipov)); + u.ipov.ih_len = htons(len); + u.ipov.ih_pr = nxt; + u.ipov.ih_src = mtod(m, struct ip *)->ip_src; + u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst; + w = u.w; + /* assumes sizeof(ipov) == 20 */ + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; + sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; + } + + psum = in_cksum_skip(m, len + off, off); + psum = ~psum & 0xffff; + sum += psum; + REDUCE; + return (~sum & 0xffff); +} diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c new file mode 100644 index 0000000..a61b87b --- /dev/null +++ b/sys/netpfil/pf/pf.c @@ -0,0 +1,6271 @@ +/* $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2008 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. 
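in4_cksum() above seeds a ones-complement sum with the ten 16-bit words of the IPv4 pseudo-header and folds carries exactly as its REDUCE macro does. The same arithmetic in a standalone sketch, not part of the patch; the payload bytes are arbitrary, and a real transport checksum would sum the pseudo-header words into the seed first.

/* cksum_sketch.c: ones-complement checksum with carry folding. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t
cksum(const void *data, size_t len, uint32_t sum)
{
    const uint16_t *w = data;

    while (len > 1) {                   /* sum 16-bit words */
        sum += *w++;
        len -= 2;
    }
    if (len == 1)                       /* odd trailing byte */
        sum += *(const uint8_t *)w;
    while (sum >> 16)                   /* fold carries, like REDUCE */
        sum = (sum & 0xffff) + (sum >> 16);
    return (~sum & 0xffff);
}

int
main(void)
{
    uint8_t buf[8] = { 0x45, 0x00, 0x00, 0x54, 0xbe, 0xef, 0x40, 0x00 };

    /* Seed 0 here; in4_cksum() seeds with the pseudo-header words. */
    printf("checksum: 0x%04x\n", cksum(buf, sizeof(buf), 0));
    return (0);
}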
+ * + */ + +#include <sys/cdefs.h> + +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_bpf.h" +#include "opt_pf.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/endian.h> +#include <sys/hash.h> +#include <sys/interrupt.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/limits.h> +#include <sys/mbuf.h> +#include <sys/md5.h> +#include <sys/random.h> +#include <sys/refcount.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> +#include <net/radix_mpath.h> +#include <net/vnet.h> + +#include <net/pfvar.h> +#include <net/pf_mtag.h> +#include <net/if_pflog.h> +#include <net/if_pfsync.h> + +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_icmp.h> +#include <netinet/icmp_var.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */ + +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#endif /* INET6 */ + +#include <machine/in_cksum.h> +#include <security/mac/mac_framework.h> + +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + +/* + * Global variables + */ + +/* state tables */ +VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]); +VNET_DEFINE(struct pf_palist, pf_pabuf); +VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); +VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); +VNET_DEFINE(struct pf_status, pf_status); + +VNET_DEFINE(u_int32_t, ticket_altqs_active); +VNET_DEFINE(u_int32_t, ticket_altqs_inactive); +VNET_DEFINE(int, altqs_inactive_open); +VNET_DEFINE(u_int32_t, ticket_pabuf); + +VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); +#define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) +VNET_DEFINE(u_char, pf_tcp_secret[16]); +#define V_pf_tcp_secret VNET(pf_tcp_secret) +VNET_DEFINE(int, pf_tcp_secret_init); +#define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) +VNET_DEFINE(int, pf_tcp_iss_off); +#define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) + +struct pf_anchor_stackframe { + struct pf_ruleset *rs; + struct pf_rule *r; + struct pf_anchor_node *parent; + struct pf_anchor *child; +}; +VNET_DEFINE(struct pf_anchor_stackframe, pf_anchor_stack[64]); +#define V_pf_anchor_stack VNET(pf_anchor_stack) + +/* + * Queue for pf_intr() sends. + */ +static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); +struct pf_send_entry { + STAILQ_ENTRY(pf_send_entry) pfse_next; + struct mbuf *pfse_m; + enum { + PFSE_IP, + PFSE_IP6, + PFSE_ICMP, + PFSE_ICMP6, + } pfse_type; + union { + struct route ro; + struct { + int type; + int code; + int mtu; + } icmpopts; + } u; +#define pfse_ro u.ro +#define pfse_icmp_type u.icmpopts.type +#define pfse_icmp_code u.icmpopts.code +#define pfse_icmp_mtu u.icmpopts.mtu +}; + +STAILQ_HEAD(pf_send_head, pf_send_entry); +static VNET_DEFINE(struct pf_send_head, pf_sendqueue); +#define V_pf_sendqueue VNET(pf_sendqueue) + +static struct mtx pf_sendqueue_mtx; +#define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) +#define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) + +/* + * Queue for pf_flush_task() tasks. 
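The send queue declared above is one half of a deferred-transmit pattern: pf_send(), later in this file, appends an entry under pf_sendqueue_mtx and schedules a software interrupt, and pf_intr() detaches the whole list in one short critical section, then walks it with no lock held. A user-space sketch, not part of the patch; a pthread mutex and a direct call stand in for the kernel mutex and swi_sched().

/* sendq_sketch.c: detach-and-drain queue; link with -lpthread. */
#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    STAILQ_ENTRY(entry) next;
    int payload;                        /* stands in for the mbuf */
};

static STAILQ_HEAD(equeue, entry) sendq = STAILQ_HEAD_INITIALIZER(sendq);
static pthread_mutex_t sendq_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Producer: enqueue and, in the kernel, swi_sched(). */
static void
send_enqueue(int payload)
{
    struct entry *e;

    if ((e = malloc(sizeof(*e))) == NULL)
        return;
    e->payload = payload;
    pthread_mutex_lock(&sendq_mtx);
    STAILQ_INSERT_TAIL(&sendq, e, next);
    pthread_mutex_unlock(&sendq_mtx);
}

/* Consumer: steal the whole queue, then process it unlocked. */
static void
send_drain(void)
{
    struct equeue queue;
    struct entry *e;

    pthread_mutex_lock(&sendq_mtx);
    queue = sendq;                      /* as pf_intr() copies the head */
    STAILQ_INIT(&sendq);
    pthread_mutex_unlock(&sendq_mtx);

    while ((e = STAILQ_FIRST(&queue)) != NULL) {
        STAILQ_REMOVE_HEAD(&queue, next);
        printf("sending %d\n", e->payload);     /* ip_output() etc. */
        free(e);
    }
}

int
main(void)
{
    for (int i = 0; i < 3; i++)
        send_enqueue(i);
    send_drain();
    return (0);
}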
+ */ +struct pf_flush_entry { + SLIST_ENTRY(pf_flush_entry) next; + struct pf_addr addr; + sa_family_t af; + uint8_t dir; + struct pf_rule *rule; /* never dereferenced */ +}; + +SLIST_HEAD(pf_flush_head, pf_flush_entry); +static VNET_DEFINE(struct pf_flush_head, pf_flushqueue); +#define V_pf_flushqueue VNET(pf_flushqueue) +static VNET_DEFINE(struct task, pf_flushtask); +#define V_pf_flushtask VNET(pf_flushtask) + +static struct mtx pf_flushqueue_mtx; +#define PF_FLUSHQ_LOCK() mtx_lock(&pf_flushqueue_mtx) +#define PF_FLUSHQ_UNLOCK() mtx_unlock(&pf_flushqueue_mtx) + +VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules); +struct mtx pf_unlnkdrules_mtx; + +static VNET_DEFINE(uma_zone_t, pf_sources_z); +#define V_pf_sources_z VNET(pf_sources_z) +static VNET_DEFINE(uma_zone_t, pf_mtag_z); +#define V_pf_mtag_z VNET(pf_mtag_z) +VNET_DEFINE(uma_zone_t, pf_state_z); +VNET_DEFINE(uma_zone_t, pf_state_key_z); + +VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]); +#define PFID_CPUBITS 8 +#define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS) +#define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT) +#define PFID_MAXID (~PFID_CPUMASK) +CTASSERT((1 << PFID_CPUBITS) > MAXCPU); + +static void pf_src_tree_remove_state(struct pf_state *); +static void pf_init_threshold(struct pf_threshold *, u_int32_t, + u_int32_t); +static void pf_add_threshold(struct pf_threshold *); +static int pf_check_threshold(struct pf_threshold *); + +static void pf_change_ap(struct pf_addr *, u_int16_t *, + u_int16_t *, u_int16_t *, struct pf_addr *, + u_int16_t, u_int8_t, sa_family_t); +static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, + struct tcphdr *, struct pf_state_peer *); +static void pf_change_icmp(struct pf_addr *, u_int16_t *, + struct pf_addr *, struct pf_addr *, u_int16_t, + u_int16_t *, u_int16_t *, u_int16_t *, + u_int16_t *, u_int8_t, sa_family_t); +static void pf_send_tcp(struct mbuf *, + const struct pf_rule *, sa_family_t, + const struct pf_addr *, const struct pf_addr *, + u_int16_t, u_int16_t, u_int32_t, u_int32_t, + u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, + u_int16_t, struct ifnet *); +static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, + sa_family_t, struct pf_rule *); +static void pf_detach_state(struct pf_state *); +static int pf_state_key_attach(struct pf_state_key *, + struct pf_state_key *, struct pf_state *); +static void pf_state_key_detach(struct pf_state *, int); +static int pf_state_key_ctor(void *, int, void *, int); +static u_int32_t pf_tcp_iss(struct pf_pdesc *); +static int pf_test_rule(struct pf_rule **, struct pf_state **, + int, struct pfi_kif *, struct mbuf *, int, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **, struct inpcb *); +static int pf_create_state(struct pf_rule *, struct pf_rule *, + struct pf_rule *, struct pf_pdesc *, + struct pf_src_node *, struct pf_state_key *, + struct pf_state_key *, struct mbuf *, int, + u_int16_t, u_int16_t, int *, struct pfi_kif *, + struct pf_state **, int, u_int16_t, u_int16_t, + int); +static int pf_test_fragment(struct pf_rule **, int, + struct pfi_kif *, struct mbuf *, void *, + struct pf_pdesc *, struct pf_rule **, + struct pf_ruleset **); +static int pf_tcp_track_full(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pfi_kif *, struct mbuf *, int, + struct pf_pdesc *, u_short *, int *); +static int pf_tcp_track_sloppy(struct pf_state_peer *, + struct pf_state_peer *, struct pf_state **, + struct pf_pdesc *, u_short *); +static int pf_test_state_tcp(struct 
pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +static int pf_test_state_udp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *); +static int pf_test_state_icmp(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, int, + void *, struct pf_pdesc *, u_short *); +static int pf_test_state_other(struct pf_state **, int, + struct pfi_kif *, struct mbuf *, struct pf_pdesc *); +static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, + sa_family_t); +static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, + sa_family_t); +static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, + int, u_int16_t); +static void pf_set_rt_ifp(struct pf_state *, + struct pf_addr *); +static int pf_check_proto_cksum(struct mbuf *, int, int, + u_int8_t, sa_family_t); +static void pf_print_state_parts(struct pf_state *, + struct pf_state_key *, struct pf_state_key *); +static int pf_addr_wrap_neq(struct pf_addr_wrap *, + struct pf_addr_wrap *); +static struct pf_state *pf_find_state(struct pfi_kif *, + struct pf_state_key_cmp *, u_int); +static int pf_src_connlimit(struct pf_state **); +static void pf_flush_task(void *c, int pending); +static int pf_insert_src_node(struct pf_src_node **, + struct pf_rule *, struct pf_addr *, sa_family_t); +static int pf_purge_expired_states(int); +static void pf_purge_unlinked_rules(void); +static int pf_mtag_init(void *, int, int); +static void pf_mtag_free(struct m_tag *); +#ifdef INET +static void pf_route(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +#endif /* INET */ +#ifdef INET6 +static void pf_change_a6(struct pf_addr *, u_int16_t *, + struct pf_addr *, u_int8_t); +static void pf_route6(struct mbuf **, struct pf_rule *, int, + struct ifnet *, struct pf_state *, + struct pf_pdesc *); +#endif /* INET6 */ + +int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); + +VNET_DECLARE(int, pf_end_threads); + +VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); + +#define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ + (pd)->pf_mtag->flags & PF_PACKET_LOOPED) + +#define STATE_LOOKUP(i, k, d, s, pd) \ + do { \ + (s) = pf_find_state((i), (k), (d)); \ + if ((s) == NULL || (s)->timeout == PFTM_PURGE) \ + return (PF_DROP); \ + if (PACKET_LOOPED(pd)) \ + return (PF_PASS); \ + if ((d) == PF_OUT && \ + (((s)->rule.ptr->rt == PF_ROUTETO && \ + (s)->rule.ptr->direction == PF_OUT) || \ + ((s)->rule.ptr->rt == PF_REPLYTO && \ + (s)->rule.ptr->direction == PF_IN)) && \ + (s)->rt_kif != NULL && \ + (s)->rt_kif != (i)) \ + return (PF_PASS); \ + } while (0) + +#define BOUND_IFACE(r, k) \ + ((r)->rule_flag & PFRULE_IFBOUND) ? 
(k) : V_pfi_all + +#define STATE_INC_COUNTERS(s) \ + do { \ + s->rule.ptr->states_cur++; \ + s->rule.ptr->states_tot++; \ + if (s->anchor.ptr != NULL) { \ + s->anchor.ptr->states_cur++; \ + s->anchor.ptr->states_tot++; \ + } \ + if (s->nat_rule.ptr != NULL) { \ + s->nat_rule.ptr->states_cur++; \ + s->nat_rule.ptr->states_tot++; \ + } \ + } while (0) + +#define STATE_DEC_COUNTERS(s) \ + do { \ + if (s->nat_rule.ptr != NULL) \ + s->nat_rule.ptr->states_cur--; \ + if (s->anchor.ptr != NULL) \ + s->anchor.ptr->states_cur--; \ + s->rule.ptr->states_cur--; \ + } while (0) + +static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); +VNET_DEFINE(struct pf_keyhash *, pf_keyhash); +VNET_DEFINE(struct pf_idhash *, pf_idhash); +VNET_DEFINE(u_long, pf_hashmask); +VNET_DEFINE(struct pf_srchash *, pf_srchash); +VNET_DEFINE(u_long, pf_srchashmask); + +SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)"); + +VNET_DEFINE(u_long, pf_hashsize); +#define V_pf_hashsize VNET(pf_hashsize) +SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, + &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable"); + +VNET_DEFINE(u_long, pf_srchashsize); +#define V_pf_srchashsize VNET(pf_srchashsize) +SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, + &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable"); + +VNET_DEFINE(void *, pf_swi_cookie); + +VNET_DEFINE(uint32_t, pf_hashseed); +#define V_pf_hashseed VNET(pf_hashseed) + +static __inline uint32_t +pf_hashkey(struct pf_state_key *sk) +{ + uint32_t h; + + h = jenkins_hash32((uint32_t *)sk, + sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), + V_pf_hashseed); + + return (h & V_pf_hashmask); +} + +#ifdef INET6 +void +pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + dst->addr32[0] = src->addr32[0]; + break; +#endif /* INET */ + case AF_INET6: + dst->addr32[0] = src->addr32[0]; + dst->addr32[1] = src->addr32[1]; + dst->addr32[2] = src->addr32[2]; + dst->addr32[3] = src->addr32[3]; + break; + } +} +#endif /* INET6 */ + +static void +pf_init_threshold(struct pf_threshold *threshold, + u_int32_t limit, u_int32_t seconds) +{ + threshold->limit = limit * PF_THRESHOLD_MULT; + threshold->seconds = seconds; + threshold->count = 0; + threshold->last = time_uptime; +} + +static void +pf_add_threshold(struct pf_threshold *threshold) +{ + u_int32_t t = time_uptime, diff = t - threshold->last; + + if (diff >= threshold->seconds) + threshold->count = 0; + else + threshold->count -= threshold->count * diff / + threshold->seconds; + threshold->count += PF_THRESHOLD_MULT; + threshold->last = t; +} + +static int +pf_check_threshold(struct pf_threshold *threshold) +{ + return (threshold->count > threshold->limit); +} + +static int +pf_src_connlimit(struct pf_state **state) +{ + struct pfr_addr p; + struct pf_flush_entry *pffe; + int bad = 0; + + PF_STATE_LOCK_ASSERT(*state); + + (*state)->src_node->conn++; + (*state)->src.tcp_est = 1; + pf_add_threshold(&(*state)->src_node->conn_rate); + + if ((*state)->rule.ptr->max_src_conn && + (*state)->rule.ptr->max_src_conn < + (*state)->src_node->conn) { + V_pf_status.lcounters[LCNT_SRCCONN]++; + bad++; + } + + if ((*state)->rule.ptr->max_src_conn_rate.limit && + pf_check_threshold(&(*state)->src_node->conn_rate)) { + V_pf_status.lcounters[LCNT_SRCCONNRATE]++; + bad++; + } + + if (!bad) + return (0); + + /* Kill this state. 
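pf_add_threshold() above tracks connection rates as a fixed-point counter: each event adds PF_THRESHOLD_MULT, and the count decays linearly with the time elapsed since the last event, relative to the configured window; pf_check_threshold() then compares against limit scaled by the same factor. The arithmetic in isolation, not part of the patch; MULT is a stand-in for PF_THRESHOLD_MULT and the timestamps simulate uptime seconds.

/* threshold_sketch.c: decaying rate counter. */
#include <stdint.h>
#include <stdio.h>

#define MULT 1000                       /* fixed-point scale */

struct threshold {
    uint32_t limit;                     /* max events, scaled */
    uint32_t seconds;                   /* measurement window */
    uint32_t count;                     /* decaying count, scaled */
    uint32_t last;                      /* time of last event */
};

static void
add_event(struct threshold *th, uint32_t now)
{
    uint32_t diff = now - th->last;

    if (diff >= th->seconds)
        th->count = 0;                  /* window fully elapsed */
    else
        th->count -= th->count * diff / th->seconds;    /* linear decay */
    th->count += MULT;                  /* one new event */
    th->last = now;
}

int
main(void)
{
    struct threshold th = { 3 * MULT, 10, 0, 0 };   /* 3 per 10 seconds */
    uint32_t times[] = { 1, 2, 3, 4, 20 };

    for (int i = 0; i < 5; i++) {
        add_event(&th, times[i]);
        printf("t=%2u count=%4u over=%s\n", times[i], th.count,
            th.count > th.limit ? "yes" : "no");
    }
    return (0);
}

With these inputs the fourth back-to-back event pushes the count over the limit, and the long gap before t=20 resets it, which is the behavior max_src_conn_rate relies on.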
*/ + (*state)->timeout = PFTM_PURGE; + (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; + + if ((*state)->rule.ptr->overload_tbl == NULL) + return (1); + + V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++; + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("%s: blocking address ", __func__); + pf_print_host(&(*state)->src_node->addr, 0, + (*state)->key[PF_SK_WIRE]->af); + printf("\n"); + } + + bzero(&p, sizeof(p)); + p.pfra_af = (*state)->key[PF_SK_WIRE]->af; + switch ((*state)->key[PF_SK_WIRE]->af) { +#ifdef INET + case AF_INET: + p.pfra_net = 32; + p.pfra_ip4addr = (*state)->src_node->addr.v4; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + p.pfra_net = 128; + p.pfra_ip6addr = (*state)->src_node->addr.v6; + break; +#endif /* INET6 */ + } + + pfr_insert_kentry((*state)->rule.ptr->overload_tbl, &p, time_second); + + if ((*state)->rule.ptr->flush == 0) + return (1); + + /* Schedule flushing task. */ + pffe = malloc(sizeof(*pffe), M_PFTEMP, M_NOWAIT); + if (pffe == NULL) + return (1); /* too bad :( */ + + bcopy(&(*state)->src_node->addr, &pffe->addr, sizeof(pffe->addr)); + pffe->af = (*state)->key[PF_SK_WIRE]->af; + pffe->dir = (*state)->direction; + if ((*state)->rule.ptr->flush & PF_FLUSH_GLOBAL) + pffe->rule = NULL; + else + pffe->rule = (*state)->rule.ptr; + PF_FLUSHQ_LOCK(); + SLIST_INSERT_HEAD(&V_pf_flushqueue, pffe, next); + PF_FLUSHQ_UNLOCK(); + taskqueue_enqueue(taskqueue_swi, &V_pf_flushtask); + + return (1); +} + +static void +pf_flush_task(void *c, int pending) +{ + struct pf_flush_head queue; + struct pf_flush_entry *pffe, *pffe1; + uint32_t killed = 0; + + PF_FLUSHQ_LOCK(); + queue = *(struct pf_flush_head *)c; + SLIST_INIT((struct pf_flush_head *)c); + PF_FLUSHQ_UNLOCK(); + + V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++; + + for (int i = 0; i <= V_pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + struct pf_state_key *sk; + struct pf_state *s; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + sk = s->key[PF_SK_WIRE]; + SLIST_FOREACH(pffe, &queue, next) + if (sk->af == pffe->af && (pffe->rule == NULL || + pffe->rule == s->rule.ptr) && + ((pffe->dir == PF_OUT && + PF_AEQ(&pffe->addr, &sk->addr[1], sk->af)) || + (pffe->dir == PF_IN && + PF_AEQ(&pffe->addr, &sk->addr[0], sk->af)))) { + s->timeout = PFTM_PURGE; + s->src.state = s->dst.state = TCPS_CLOSED; + killed++; + } + } + PF_HASHROW_UNLOCK(ih); + } + SLIST_FOREACH_SAFE(pffe, &queue, next, pffe1) + free(pffe, M_PFTEMP); + if (V_pf_status.debug >= PF_DEBUG_MISC) + printf("%s: %u states killed", __func__, killed); +} + +/* + * Can return locked on failure, so that we can consistently + * allocate and insert a new one. 
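The comment above states pf_find_src_node()'s contract: on a miss, when returnlocked is set, the function keeps the hash row locked so the caller can allocate and insert a node without a second lookup racing against other threads, which is exactly how pf_insert_src_node() uses it. A single-bucket user-space sketch of the pattern, not part of the patch; a pthread mutex stands in for the per-row hash lock.

/* lockedmiss_sketch.c: lookup that returns locked on a miss;
 * link with -lpthread. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
    struct node *next;
    int key;
};

static struct node *bucket;
static pthread_mutex_t bucket_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Unlocks on a hit (or when the caller will not insert); keeps the
 * lock on a miss so the caller may insert atomically. */
static struct node *
find_node(int key, int returnlocked)
{
    struct node *n;

    pthread_mutex_lock(&bucket_mtx);
    for (n = bucket; n != NULL; n = n->next)
        if (n->key == key)
            break;
    if (n != NULL || !returnlocked)
        pthread_mutex_unlock(&bucket_mtx);
    return (n);
}

static struct node *
find_or_insert(int key)
{
    struct node *n;

    if ((n = find_node(key, 1)) != NULL)
        return (n);                     /* hit: already unlocked */
    /* Miss: the bucket lock is still held here. */
    if ((n = malloc(sizeof(*n))) != NULL) {
        n->key = key;
        n->next = bucket;
        bucket = n;
    }
    pthread_mutex_unlock(&bucket_mtx);
    return (n);
}

int
main(void)
{
    find_or_insert(7);
    printf("7 %s\n", find_node(7, 0) ? "found" : "missing");
    return (0);
}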
+ */ +struct pf_src_node * +pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af, + int returnlocked) +{ + struct pf_srchash *sh; + struct pf_src_node *n; + + V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; + + sh = &V_pf_srchash[pf_hashsrc(src, af)]; + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) + if (n->rule.ptr == rule && n->af == af && + ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || + (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) + break; + if (n != NULL || returnlocked == 0) + PF_HASHROW_UNLOCK(sh); + + return (n); +} + +static int +pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, + struct pf_addr *src, sa_family_t af) +{ + + KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK || + rule->rpool.opts & PF_POOL_STICKYADDR), + ("%s for non-tracking rule %p", __func__, rule)); + + if (*sn == NULL) + *sn = pf_find_src_node(src, rule, af, 1); + + if (*sn == NULL) { + struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)]; + + PF_HASHROW_ASSERT(sh); + + if (!rule->max_src_nodes || + rule->src_nodes < rule->max_src_nodes) + (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); + else + V_pf_status.lcounters[LCNT_SRCNODES]++; + if ((*sn) == NULL) { + PF_HASHROW_UNLOCK(sh); + return (-1); + } + + pf_init_threshold(&(*sn)->conn_rate, + rule->max_src_conn_rate.limit, + rule->max_src_conn_rate.seconds); + + (*sn)->af = af; + (*sn)->rule.ptr = rule; + PF_ACPY(&(*sn)->addr, src, af); + LIST_INSERT_HEAD(&sh->nodes, *sn, entry); + (*sn)->creation = time_uptime; + (*sn)->ruletype = rule->action; + if ((*sn)->rule.ptr != NULL) + (*sn)->rule.ptr->src_nodes++; + PF_HASHROW_UNLOCK(sh); + V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++; + V_pf_status.src_nodes++; + } else { + if (rule->max_src_states && + (*sn)->states >= rule->max_src_states) { + V_pf_status.lcounters[LCNT_SRCSTATES]++; + return (-1); + } + } + return (0); +} + +static void +pf_remove_src_node(struct pf_src_node *src) +{ + struct pf_srchash *sh; + + sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)]; + PF_HASHROW_LOCK(sh); + LIST_REMOVE(src, entry); + PF_HASHROW_UNLOCK(sh); +} + +/* Data storage structures initialization. */ +void +pf_initialize() +{ + struct pf_keyhash *kh; + struct pf_idhash *ih; + struct pf_srchash *sh; + u_int i; + + TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize); + if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize)) + V_pf_hashsize = PF_HASHSIZ; + TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize); + if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize)) + V_pf_srchashsize = PF_HASHSIZ / 4; + + V_pf_hashseed = arc4random(); + + /* States and state keys storage. */ + V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z; + uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT); + + V_pf_state_key_z = uma_zcreate("pf state keys", + sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash), + M_PFHASH, M_WAITOK | M_ZERO); + V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash), + M_PFHASH, M_WAITOK | M_ZERO); + V_pf_hashmask = V_pf_hashsize - 1; + for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask; + i++, kh++, ih++) { + mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF); + mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF); + } + + /* Source nodes. 
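pf_initialize() above forces both table sizes to powers of two and stores size - 1 as the mask, which lets pf_hashkey() reduce a 32-bit Jenkins hash to a slot index with a single AND instead of a modulo. A short standalone illustration, not part of the patch; 32768 and the hash value are arbitrary examples.

/* hashmask_sketch.c: power-of-two table sizing. */
#include <stdint.h>
#include <stdio.h>

#define powerof2(x) ((((x) - 1) & (x)) == 0)    /* as in sys/param.h */

int
main(void)
{
    uint32_t size = 32768;              /* example table size */
    uint32_t mask = size - 1;           /* like V_pf_hashmask */
    uint32_t h = 0xdeadbeef;            /* some jenkins_hash32() output */

    if (!powerof2(size))
        return (1);
    /* For power-of-two sizes the two reductions agree, but the AND
     * needs no division. */
    printf("h %% size = %u, h & mask = %u\n", h % size, h & mask);
    return (0);
}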
*/ + V_pf_sources_z = uma_zcreate("pf source nodes", + sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + 0); + V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z; + uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT); + V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash), + M_PFHASH, M_WAITOK|M_ZERO); + V_pf_srchashmask = V_pf_srchashsize - 1; + for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) + mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); + + /* ALTQ */ + TAILQ_INIT(&V_pf_altqs[0]); + TAILQ_INIT(&V_pf_altqs[1]); + TAILQ_INIT(&V_pf_pabuf); + V_pf_altqs_active = &V_pf_altqs[0]; + V_pf_altqs_inactive = &V_pf_altqs[1]; + + /* Mbuf tags */ + V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL, + UMA_ALIGN_PTR, 0); + + /* Send & flush queues. */ + STAILQ_INIT(&V_pf_sendqueue); + SLIST_INIT(&V_pf_flushqueue); + TASK_INIT(&V_pf_flushtask, 0, pf_flush_task, &V_pf_flushqueue); + mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF); + mtx_init(&pf_flushqueue_mtx, "pf flush queue", NULL, MTX_DEF); + + /* Unlinked, but may be referenced rules. */ + TAILQ_INIT(&V_pf_unlinked_rules); + mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF); +} + +void +pf_cleanup() +{ + struct pf_keyhash *kh; + struct pf_idhash *ih; + struct pf_srchash *sh; + struct pf_send_entry *pfse, *next; + u_int i; + + for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask; + i++, kh++, ih++) { + KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", + __func__)); + KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", + __func__)); + mtx_destroy(&kh->lock); + mtx_destroy(&ih->lock); + } + free(V_pf_keyhash, M_PFHASH); + free(V_pf_idhash, M_PFHASH); + + for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) { + KASSERT(LIST_EMPTY(&sh->nodes), + ("%s: source node hash not empty", __func__)); + mtx_destroy(&sh->lock); + } + free(V_pf_srchash, M_PFHASH); + + STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { + m_freem(pfse->pfse_m); + free(pfse, M_PFTEMP); + } + + mtx_destroy(&pf_sendqueue_mtx); + mtx_destroy(&pf_flushqueue_mtx); + mtx_destroy(&pf_unlnkdrules_mtx); + + uma_zdestroy(V_pf_mtag_z); + uma_zdestroy(V_pf_sources_z); + uma_zdestroy(V_pf_state_z); + uma_zdestroy(V_pf_state_key_z); +} + +static int +pf_mtag_init(void *mem, int size, int how) +{ + struct m_tag *t; + + t = (struct m_tag *)mem; + t->m_tag_cookie = MTAG_ABI_COMPAT; + t->m_tag_id = PACKET_TAG_PF; + t->m_tag_len = sizeof(struct pf_mtag); + t->m_tag_free = pf_mtag_free; + + return (0); +} + +static void +pf_mtag_free(struct m_tag *t) +{ + + uma_zfree(V_pf_mtag_z, t); +} + +struct pf_mtag * +pf_get_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) + return ((struct pf_mtag *)(mtag + 1)); + + mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT); + if (mtag == NULL) + return (NULL); + bzero(mtag + 1, sizeof(struct pf_mtag)); + m_tag_prepend(m, mtag); + + return ((struct pf_mtag *)(mtag + 1)); +} + +static int +pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, + struct pf_state *s) +{ + struct pf_keyhash *kh; + struct pf_state_key *sk, *cur; + struct pf_state *si, *olds = NULL; + int idx; + + KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); + KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); + KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); + + /* + * First run: start with 
wire key. + */ + sk = skw; + idx = PF_SK_WIRE; + +keyattach: + kh = &V_pf_keyhash[pf_hashkey(sk)]; + + PF_HASHROW_LOCK(kh); + LIST_FOREACH(cur, &kh->keys, entry) + if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) + break; + + if (cur != NULL) { + /* Key exists. Check for same kif, if none, add to key. */ + TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { + struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; + + PF_HASHROW_LOCK(ih); + if (si->kif == s->kif && + si->direction == s->direction) { + if (sk->proto == IPPROTO_TCP && + si->src.state >= TCPS_FIN_WAIT_2 && + si->dst.state >= TCPS_FIN_WAIT_2) { + si->src.state = si->dst.state = + TCPS_CLOSED; + /* Unlink later or cur can go away. */ + pf_ref_state(si); + olds = si; + } else { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: %s key attach " + "failed on %s: ", + (idx == PF_SK_WIRE) ? + "wire" : "stack", + s->kif->pfik_name); + pf_print_state_parts(s, + (idx == PF_SK_WIRE) ? + sk : NULL, + (idx == PF_SK_STACK) ? + sk : NULL); + printf(", existing: "); + pf_print_state_parts(si, + (idx == PF_SK_WIRE) ? + sk : NULL, + (idx == PF_SK_STACK) ? + sk : NULL); + printf("\n"); + } + PF_HASHROW_UNLOCK(ih); + PF_HASHROW_UNLOCK(kh); + uma_zfree(V_pf_state_key_z, sk); + if (idx == PF_SK_STACK) + pf_detach_state(s); + return (-1); /* collision! */ + } + } + PF_HASHROW_UNLOCK(ih); + } + uma_zfree(V_pf_state_key_z, sk); + s->key[idx] = cur; + } else { + LIST_INSERT_HEAD(&kh->keys, sk, entry); + s->key[idx] = sk; + } + +stateattach: + /* List is sorted, if-bound states before floating. */ + if (s->kif == V_pfi_all) + TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]); + else + TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]); + + /* + * Attach done. See how should we (or should not?) + * attach a second key. + */ + if (sks == skw) { + s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; + idx = PF_SK_STACK; + sks = NULL; + goto stateattach; + } else if (sks != NULL) { + PF_HASHROW_UNLOCK(kh); + if (olds) { + pf_unlink_state(olds, 0); + pf_release_state(olds); + olds = NULL; + } + /* + * Continue attaching with stack key. + */ + sk = sks; + idx = PF_SK_STACK; + sks = NULL; + goto keyattach; + } else + PF_HASHROW_UNLOCK(kh); + + if (olds) { + pf_unlink_state(olds, 0); + pf_release_state(olds); + } + + KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL, + ("%s failure", __func__)); + + return (0); +} + +static void +pf_detach_state(struct pf_state *s) +{ + struct pf_state_key *sks = s->key[PF_SK_STACK]; + struct pf_keyhash *kh; + + if (sks != NULL) { + kh = &V_pf_keyhash[pf_hashkey(sks)]; + PF_HASHROW_LOCK(kh); + if (s->key[PF_SK_STACK] != NULL) + pf_state_key_detach(s, PF_SK_STACK); + /* + * If both point to same key, then we are done. 
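pf_state_key_attach() above keeps each key's state list partitioned by inserting interface-bound states at the head and floating ones at the tail; pf_find_state(), defined later, can therefore stop at the first entry whose kif is either the input interface or the V_pfi_all wildcard. A sketch of that insertion discipline, not part of the patch; ifname == NULL models a floating state.

/* keyorder_sketch.c: if-bound states sort before floating ones. */
#include <sys/queue.h>
#include <stdio.h>

struct state {
    TAILQ_ENTRY(state) entry;
    const char *ifname;                 /* NULL: floating (V_pfi_all) */
};

static TAILQ_HEAD(, state) keylist = TAILQ_HEAD_INITIALIZER(keylist);

static void
attach(struct state *s)
{
    if (s->ifname == NULL)
        TAILQ_INSERT_TAIL(&keylist, s, entry);  /* floating: tail */
    else
        TAILQ_INSERT_HEAD(&keylist, s, entry);  /* if-bound: head */
}

int
main(void)
{
    struct state flt = { .ifname = NULL };
    struct state em0 = { .ifname = "em0" };
    struct state *s;

    attach(&flt);
    attach(&em0);
    TAILQ_FOREACH(s, &keylist, entry)   /* prints em0, then floating */
        printf("%s\n", s->ifname != NULL ? s->ifname : "floating");
    return (0);
}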
+ */ + if (sks == s->key[PF_SK_WIRE]) { + pf_state_key_detach(s, PF_SK_WIRE); + PF_HASHROW_UNLOCK(kh); + return; + } + PF_HASHROW_UNLOCK(kh); + } + + if (s->key[PF_SK_WIRE] != NULL) { + kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; + PF_HASHROW_LOCK(kh); + if (s->key[PF_SK_WIRE] != NULL) + pf_state_key_detach(s, PF_SK_WIRE); + PF_HASHROW_UNLOCK(kh); + } +} + +static void +pf_state_key_detach(struct pf_state *s, int idx) +{ + struct pf_state_key *sk = s->key[idx]; +#ifdef INVARIANTS + struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; + + PF_HASHROW_ASSERT(kh); +#endif + TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); + s->key[idx] = NULL; + + if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { + LIST_REMOVE(sk, entry); + uma_zfree(V_pf_state_key_z, sk); + } +} + +static int +pf_state_key_ctor(void *mem, int size, void *arg, int flags) +{ + struct pf_state_key *sk = mem; + + bzero(sk, sizeof(struct pf_state_key_cmp)); + TAILQ_INIT(&sk->states[PF_SK_WIRE]); + TAILQ_INIT(&sk->states[PF_SK_STACK]); + + return (0); +} + +struct pf_state_key * +pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, + struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) +{ + struct pf_state_key *sk; + + sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); + if (sk == NULL) + return (NULL); + + PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); + PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); + sk->port[pd->sidx] = sport; + sk->port[pd->didx] = dport; + sk->proto = pd->proto; + sk->af = pd->af; + + return (sk); +} + +struct pf_state_key * +pf_state_key_clone(struct pf_state_key *orig) +{ + struct pf_state_key *sk; + + sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); + if (sk == NULL) + return (NULL); + + bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); + + return (sk); +} + +int +pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw, + struct pf_state_key *sks, struct pf_state *s) +{ + struct pf_idhash *ih; + struct pf_state *cur; + + KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), + ("%s: sks not pristine", __func__)); + KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), + ("%s: skw not pristine", __func__)); + KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); + + s->kif = kif; + + if (pf_state_key_attach(skw, sks, s)) + return (-1); + + if (s->id == 0 && s->creatorid == 0) { + /* XXX: should be atomic, but probability of collision low */ + if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID) + V_pf_stateid[curcpu] = 1; + s->id |= (uint64_t )curcpu << PFID_CPUSHIFT; + s->id = htobe64(s->id); + s->creatorid = V_pf_status.hostid; + } + + ih = &V_pf_idhash[PF_IDHASH(s)]; + PF_HASHROW_LOCK(ih); + LIST_FOREACH(cur, &ih->states, entry) + if (cur->id == s->id && cur->creatorid == s->creatorid) + break; + + if (cur != NULL) { + PF_HASHROW_UNLOCK(ih); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state insert failed: " + "id: %016llx creatorid: %08x", + (unsigned long long)be64toh(s->id), + ntohl(s->creatorid)); + printf("\n"); + } + pf_detach_state(s); + return (-1); + } + LIST_INSERT_HEAD(&ih->states, s, entry); + /* One for keys, one for ID hash. */ + refcount_init(&s->refs, 2); + + V_pf_status.fcounters[FCNT_STATE_INSERT]++; + if (pfsync_insert_state_ptr != NULL) + pfsync_insert_state_ptr(s); + + /* Returns locked. */ + return (0); +} + +/* + * Find state by ID: returns with locked row on success. 
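pf_state_insert() above hands out state IDs without atomic operations: each CPU increments a private counter, and the CPU number is stamped into the top PFID_CPUBITS bits so the per-CPU streams can never collide. The scheme in isolation, not part of the patch; four counters model the per-CPU array, and the kernel's final htobe64() byte swap is left out.

/* stateid_sketch.c: per-CPU id allocation. */
#include <stdint.h>
#include <stdio.h>

#define CPUBITS  8                      /* like PFID_CPUBITS */
#define CPUSHIFT (64 - CPUBITS)         /* like PFID_CPUSHIFT */
#define MAXID    (~((uint64_t)0xff << CPUSHIFT))    /* like PFID_MAXID */

static uint64_t stateid[4];             /* one counter per "CPU" */

static uint64_t
alloc_id(unsigned cpu)
{
    uint64_t id;

    if ((id = stateid[cpu]++) == MAXID) /* wrap before the CPU bits */
        stateid[cpu] = 1;
    return (id | (uint64_t)cpu << CPUSHIFT);
}

int
main(void)
{
    printf("cpu0: %016llx\n", (unsigned long long)alloc_id(0));
    printf("cpu1: %016llx\n", (unsigned long long)alloc_id(1));
    printf("cpu1: %016llx\n", (unsigned long long)alloc_id(1));
    return (0);
}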
*/ +struct pf_state * +pf_find_state_byid(uint64_t id, uint32_t creatorid) +{ + struct pf_idhash *ih; + struct pf_state *s; + + V_pf_status.fcounters[FCNT_STATE_SEARCH]++; + + ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))]; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) + if (s->id == id && s->creatorid == creatorid) + break; + + if (s == NULL) + PF_HASHROW_UNLOCK(ih); + + return (s); +} + +/* + * Find state by key. + * Returns with ID hash slot locked on success. + */ +static struct pf_state * +pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir) +{ + struct pf_keyhash *kh; + struct pf_state_key *sk; + struct pf_state *s; + int idx; + + V_pf_status.fcounters[FCNT_STATE_SEARCH]++; + + kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; + + PF_HASHROW_LOCK(kh); + LIST_FOREACH(sk, &kh->keys, entry) + if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) + break; + if (sk == NULL) { + PF_HASHROW_UNLOCK(kh); + return (NULL); + } + + idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); + + /* List is sorted, if-bound states before floating ones. */ + TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) + if (s->kif == V_pfi_all || s->kif == kif) { + PF_STATE_LOCK(s); + PF_HASHROW_UNLOCK(kh); + if (s->timeout == PFTM_UNLINKED) { + /* + * State is being processed + * by pf_unlink_state() in + * another thread. + */ + PF_STATE_UNLOCK(s); + return (NULL); + } + return (s); + } + PF_HASHROW_UNLOCK(kh); + + return (NULL); +} + +struct pf_state * +pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) +{ + struct pf_keyhash *kh; + struct pf_state_key *sk; + struct pf_state *s, *ret = NULL; + int idx, inout = 0; + + V_pf_status.fcounters[FCNT_STATE_SEARCH]++; + + kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; + + PF_HASHROW_LOCK(kh); + LIST_FOREACH(sk, &kh->keys, entry) + if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) + break; + if (sk == NULL) { + PF_HASHROW_UNLOCK(kh); + return (NULL); + } + switch (dir) { + case PF_IN: + idx = PF_SK_WIRE; + break; + case PF_OUT: + idx = PF_SK_STACK; + break; + case PF_INOUT: + idx = PF_SK_WIRE; + inout = 1; + break; + default: + panic("%s: dir %u", __func__, dir); + } +second_run: + TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { + if (more == NULL) { + PF_HASHROW_UNLOCK(kh); + return (s); + } + + if (ret) + (*more)++; + else + ret = s; + } + if (inout == 1) { + inout = 0; + idx = PF_SK_STACK; + goto second_run; + } + PF_HASHROW_UNLOCK(kh); + + return (ret); +} + +/* END state table stuff */ + +static void +pf_send(struct pf_send_entry *pfse) +{ + + PF_SENDQ_LOCK(); + STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); + PF_SENDQ_UNLOCK(); + swi_sched(V_pf_swi_cookie, 0); +} + +void +pf_intr(void *v) +{ + struct pf_send_head queue; + struct pf_send_entry *pfse, *next; + + CURVNET_SET((struct vnet *)v); + + PF_SENDQ_LOCK(); + queue = V_pf_sendqueue; + STAILQ_INIT(&V_pf_sendqueue); + PF_SENDQ_UNLOCK(); + + STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { + switch (pfse->pfse_type) { +#ifdef INET + case PFSE_IP: + ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); + break; + case PFSE_ICMP: + icmp_error(pfse->pfse_m, pfse->pfse_icmp_type, + pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu); + break; +#endif /* INET */ +#ifdef INET6 + case PFSE_IP6: + ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, + NULL); + break; + case PFSE_ICMP6: + icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type, + pfse->pfse_icmp_code, pfse->pfse_icmp_mtu); + break; +#endif /* INET6 */ + default: 
panic("%s: unknown type", __func__); + } + free(pfse, M_PFTEMP); + } + CURVNET_RESTORE(); +} + +void +pf_purge_thread(void *v) +{ + int fullrun; + + CURVNET_SET((struct vnet *)v); + + for (;;) { + PF_RULES_RLOCK(); + rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10); + + if (V_pf_end_threads) { + /* + * To clean up all kifs and rules we need + * two runs: the first one clears reference flags, + * then pf_purge_expired_states() doesn't + * raise them, and the second run frees. + */ + PF_RULES_RUNLOCK(); + pf_purge_unlinked_rules(); + pfi_kif_purge(); + + /* + * Now purge everything. + */ + pf_purge_expired_states(V_pf_hashmask + 1); + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + + /* + * Now all kifs & rules should be unreferenced, + * thus should be successfully freed. + */ + pf_purge_unlinked_rules(); + pfi_kif_purge(); + + /* + * Announce success and exit. + */ + PF_RULES_RLOCK(); + V_pf_end_threads++; + PF_RULES_RUNLOCK(); + wakeup(pf_purge_thread); + kproc_exit(0); + } + PF_RULES_RUNLOCK(); + + /* Process 1/interval fraction of the state table every run. */ + fullrun = pf_purge_expired_states(V_pf_hashmask / + (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); + + /* Purge other expired types every PFTM_INTERVAL seconds. */ + if (fullrun) { + /* + * Order is important: + * - states and src nodes reference rules + * - states and rules reference kifs + */ + pf_purge_expired_fragments(); + pf_purge_expired_src_nodes(); + pf_purge_unlinked_rules(); + pfi_kif_purge(); + } + } + /* not reached */ + CURVNET_RESTORE(); +} + +u_int32_t +pf_state_expires(const struct pf_state *state) +{ + u_int32_t timeout; + u_int32_t start; + u_int32_t end; + u_int32_t states; + + /* handle all PFTM_* > PFTM_MAX here */ + if (state->timeout == PFTM_PURGE) + return (time_uptime); + if (state->timeout == PFTM_UNTIL_PACKET) + return (0); + KASSERT(state->timeout != PFTM_UNLINKED, + ("pf_state_expires: timeout == PFTM_UNLINKED")); + KASSERT((state->timeout < PFTM_MAX), + ("pf_state_expires: timeout > PFTM_MAX")); + timeout = state->rule.ptr->timeout[state->timeout]; + if (!timeout) + timeout = V_pf_default_rule.timeout[state->timeout]; + start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; + if (start) { + end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; + states = state->rule.ptr->states_cur; /* XXXGL */ + } else { + start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; + end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; + states = V_pf_status.states; + } + if (end && states > start && start < end) { + if (states < end) + return (state->expire + timeout * (end - states) / + (end - start)); + else + return (time_uptime); + } + return (state->expire + timeout); +} + +void +pf_purge_expired_src_nodes() +{ + struct pf_srchash *sh; + struct pf_src_node *cur, *next; + int i; + + for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) + if (cur->states <= 0 && cur->expire <= time_uptime) { + if (cur->rule.ptr != NULL) + cur->rule.ptr->src_nodes--; + LIST_REMOVE(cur, entry); + V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + V_pf_status.src_nodes--; + uma_zfree(V_pf_sources_z, cur); + } else if (cur->rule.ptr != NULL) + cur->rule.ptr->rule_flag |= PFRULE_REFS; + PF_HASHROW_UNLOCK(sh); + } +} + +static void +pf_src_tree_remove_state(struct pf_state *s) +{ + u_int32_t timeout; + + if (s->src_node != NULL) { + if (s->src.tcp_est) + --s->src_node->conn; + if (--s->src_node->states <= 0) { + timeout = 
s->rule.ptr->timeout[PFTM_SRC_NODE]; + if (!timeout) + timeout = + V_pf_default_rule.timeout[PFTM_SRC_NODE]; + s->src_node->expire = time_uptime + timeout; + } + } + if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { + if (--s->nat_src_node->states <= 0) { + timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; + if (!timeout) + timeout = + V_pf_default_rule.timeout[PFTM_SRC_NODE]; + s->nat_src_node->expire = time_uptime + timeout; + } + } + s->src_node = s->nat_src_node = NULL; +} + +/* + * Unlink and potentially free a state. Function may be + * called with ID hash row locked, but always returns + * unlocked, since it needs to go through key hash locking. + */ +int +pf_unlink_state(struct pf_state *s, u_int flags) +{ + struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; + + if ((flags & PF_ENTER_LOCKED) == 0) + PF_HASHROW_LOCK(ih); + else + PF_HASHROW_ASSERT(ih); + + if (s->timeout == PFTM_UNLINKED) { + /* + * State is being processed + * by pf_unlink_state() in + * another thread. + */ + PF_HASHROW_UNLOCK(ih); + return (0); /* XXXGL: undefined actually */ + } + + s->timeout = PFTM_UNLINKED; + + if (s->src.state == PF_TCPS_PROXY_DST) { + /* XXX wire key the right one? */ + pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af, + &s->key[PF_SK_WIRE]->addr[1], + &s->key[PF_SK_WIRE]->addr[0], + s->key[PF_SK_WIRE]->port[1], + s->key[PF_SK_WIRE]->port[0], + s->src.seqhi, s->src.seqlo + 1, + TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL); + } + + LIST_REMOVE(s, entry); + pf_src_tree_remove_state(s); + PF_HASHROW_UNLOCK(ih); + + if (pfsync_delete_state_ptr != NULL) + pfsync_delete_state_ptr(s); + + pf_detach_state(s); + refcount_release(&s->refs); + + return (pf_release_state(s)); +} + +void +pf_free_state(struct pf_state *cur) +{ + + KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); + KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, + cur->timeout)); + --cur->rule.ptr->states_cur; + if (cur->nat_rule.ptr != NULL) + --cur->nat_rule.ptr->states_cur; + if (cur->anchor.ptr != NULL) + --cur->anchor.ptr->states_cur; + pf_normalize_tcp_cleanup(cur); + uma_zfree(V_pf_state_z, cur); + V_pf_status.fcounters[FCNT_STATE_REMOVALS]++; +} + +/* + * Called only from pf_purge_thread(), thus serialized. + */ +static int +pf_purge_expired_states(int maxcheck) +{ + static u_int i = 0; + + struct pf_idhash *ih; + struct pf_state *s; + int rv = 0; + + V_pf_status.states = uma_zone_get_cur(V_pf_state_z); + + /* + * Go through hash and unlink states that expire now. + */ + while (maxcheck > 0) { + + /* Wrap to start of hash when we hit the end. */ + if (i > V_pf_hashmask) { + i = 0; + rv = 1; + } + + ih = &V_pf_idhash[i]; +relock: + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + if (pf_state_expires(s) <= time_uptime) { + V_pf_status.states -= + pf_unlink_state(s, PF_ENTER_LOCKED); + goto relock; + } + s->rule.ptr->rule_flag |= PFRULE_REFS; + if (s->nat_rule.ptr != NULL) + s->nat_rule.ptr->rule_flag |= PFRULE_REFS; + if (s->anchor.ptr != NULL) + s->anchor.ptr->rule_flag |= PFRULE_REFS; + s->kif->pfik_flags |= PFI_IFLAG_REFS; + if (s->rt_kif) + s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; + } + PF_HASHROW_UNLOCK(ih); + i++; + maxcheck--; + } + + V_pf_status.states = uma_zone_get_cur(V_pf_state_z); + + return (rv); +} + +static void +pf_purge_unlinked_rules() +{ + struct pf_rulequeue tmpq; + struct pf_rule *r, *r1; + + /* + * Do naive mark-and-sweep garbage collecting of old rules. + * Reference flag is raised by pf_purge_expired_states() + * and pf_purge_expired_src_nodes(). 
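The mark-and-sweep described above needs two passes to free a rule: each sweep frees only entries whose reference flag has not been raised since the previous sweep, and clears the flag on the survivors. A user-space model, not part of the patch; mark() stands in for the purge functions raising PFRULE_REFS on rules still referenced by states or source nodes.

/* marksweep_sketch.c: two-pass rule reclamation. */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

#define REFS 0x1                        /* like PFRULE_REFS */

struct rule {
    TAILQ_ENTRY(rule) entries;
    int flags;
    int id;
};

static TAILQ_HEAD(, rule) unlinked = TAILQ_HEAD_INITIALIZER(unlinked);

static void
mark(struct rule *r)
{
    r->flags |= REFS;                   /* "a state still uses me" */
}

/* Free unmarked rules; unmark the rest, so a rule must be re-marked
 * before the next sweep to survive it. */
static void
sweep(void)
{
    struct rule *r = TAILQ_FIRST(&unlinked), *next;

    while (r != NULL) {
        next = TAILQ_NEXT(r, entries);
        if (!(r->flags & REFS)) {
            TAILQ_REMOVE(&unlinked, r, entries);
            printf("freeing rule %d\n", r->id);
            free(r);
        } else
            r->flags &= ~REFS;
        r = next;
    }
}

int
main(void)
{
    for (int i = 0; i < 3; i++) {
        struct rule *r = calloc(1, sizeof(*r));

        if (r == NULL)
            return (1);
        r->id = i;
        TAILQ_INSERT_TAIL(&unlinked, r, entries);
    }
    mark(TAILQ_FIRST(&unlinked));       /* rule 0 still referenced */
    sweep();                            /* frees 1 and 2, unmarks 0 */
    sweep();                            /* now frees 0 */
    return (0);
}

This two-pass requirement is also why the shutdown path in pf_purge_thread() above calls pf_purge_unlinked_rules() twice.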
+ * + * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, + * use a temporary queue. + */ + TAILQ_INIT(&tmpq); + PF_UNLNKDRULES_LOCK(); + TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { + if (!(r->rule_flag & PFRULE_REFS)) { + TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); + TAILQ_INSERT_TAIL(&tmpq, r, entries); + } else + r->rule_flag &= ~PFRULE_REFS; + } + PF_UNLNKDRULES_UNLOCK(); + + if (!TAILQ_EMPTY(&tmpq)) { + PF_RULES_WLOCK(); + TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { + TAILQ_REMOVE(&tmpq, r, entries); + pf_free_rule(r); + } + PF_RULES_WUNLOCK(); + } +} + +void +pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t a = ntohl(addr->addr32[0]); + printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, + (a>>8)&255, a&255); + if (p) { + p = ntohs(p); + printf(":%u", p); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + u_int16_t b; + u_int8_t i, curstart, curend, maxstart, maxend; + curstart = curend = maxstart = maxend = 255; + for (i = 0; i < 8; i++) { + if (!addr->addr16[i]) { + if (curstart == 255) + curstart = i; + curend = i; + } else { + if ((curend - curstart) > + (maxend - maxstart)) { + maxstart = curstart; + maxend = curend; + } + curstart = curend = 255; + } + } + if ((curend - curstart) > + (maxend - maxstart)) { + maxstart = curstart; + maxend = curend; + } + for (i = 0; i < 8; i++) { + if (i >= maxstart && i <= maxend) { + if (i == 0) + printf(":"); + if (i == maxend) + printf(":"); + } else { + b = ntohs(addr->addr16[i]); + printf("%x", b); + if (i < 7) + printf(":"); + } + } + if (p) { + p = ntohs(p); + printf("[%u]", p); + } + break; + } +#endif /* INET6 */ + } +} + +void +pf_print_state(struct pf_state *s) +{ + pf_print_state_parts(s, NULL, NULL); +} + +static void +pf_print_state_parts(struct pf_state *s, + struct pf_state_key *skwp, struct pf_state_key *sksp) +{ + struct pf_state_key *skw, *sks; + u_int8_t proto, dir; + + /* Do our best to fill these, but they're skipped if NULL */ + skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); + sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); + proto = skw ? skw->proto : (sks ? sks->proto : 0); + dir = s ? 
s->direction : 0; + + switch (proto) { + case IPPROTO_IPV4: + printf("IPv4"); + break; + case IPPROTO_IPV6: + printf("IPv6"); + break; + case IPPROTO_TCP: + printf("TCP"); + break; + case IPPROTO_UDP: + printf("UDP"); + break; + case IPPROTO_ICMP: + printf("ICMP"); + break; + case IPPROTO_ICMPV6: + printf("ICMPv6"); + break; + default: + printf("%u", skw->proto); + break; + } + switch (dir) { + case PF_IN: + printf(" in"); + break; + case PF_OUT: + printf(" out"); + break; + } + if (skw) { + printf(" wire: "); + pf_print_host(&skw->addr[0], skw->port[0], skw->af); + printf(" "); + pf_print_host(&skw->addr[1], skw->port[1], skw->af); + } + if (sks) { + printf(" stack: "); + if (sks != skw) { + pf_print_host(&sks->addr[0], sks->port[0], sks->af); + printf(" "); + pf_print_host(&sks->addr[1], sks->port[1], sks->af); + } else + printf("-"); + } + if (s) { + if (proto == IPPROTO_TCP) { + printf(" [lo=%u high=%u win=%u modulator=%u", + s->src.seqlo, s->src.seqhi, + s->src.max_win, s->src.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", + s->src.wscale & PF_WSCALE_MASK); + printf("]"); + printf(" [lo=%u high=%u win=%u modulator=%u", + s->dst.seqlo, s->dst.seqhi, + s->dst.max_win, s->dst.seqdiff); + if (s->src.wscale && s->dst.wscale) + printf(" wscale=%u", + s->dst.wscale & PF_WSCALE_MASK); + printf("]"); + } + printf(" %u:%u", s->src.state, s->dst.state); + } +} + +void +pf_print_flags(u_int8_t f) +{ + if (f) + printf(" "); + if (f & TH_FIN) + printf("F"); + if (f & TH_SYN) + printf("S"); + if (f & TH_RST) + printf("R"); + if (f & TH_PUSH) + printf("P"); + if (f & TH_ACK) + printf("A"); + if (f & TH_URG) + printf("U"); + if (f & TH_ECE) + printf("E"); + if (f & TH_CWR) + printf("W"); +} + +#define PF_SET_SKIP_STEPS(i) \ + do { \ + while (head[i] != cur) { \ + head[i]->skip[i].ptr = cur; \ + head[i] = TAILQ_NEXT(head[i], entries); \ + } \ + } while (0) + +void +pf_calc_skip_steps(struct pf_rulequeue *rules) +{ + struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; + int i; + + cur = TAILQ_FIRST(rules); + prev = cur; + for (i = 0; i < PF_SKIP_COUNT; ++i) + head[i] = cur; + while (cur != NULL) { + + if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) + PF_SET_SKIP_STEPS(PF_SKIP_IFP); + if (cur->direction != prev->direction) + PF_SET_SKIP_STEPS(PF_SKIP_DIR); + if (cur->af != prev->af) + PF_SET_SKIP_STEPS(PF_SKIP_AF); + if (cur->proto != prev->proto) + PF_SET_SKIP_STEPS(PF_SKIP_PROTO); + if (cur->src.neg != prev->src.neg || + pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); + if (cur->src.port[0] != prev->src.port[0] || + cur->src.port[1] != prev->src.port[1] || + cur->src.port_op != prev->src.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); + if (cur->dst.neg != prev->dst.neg || + pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) + PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); + if (cur->dst.port[0] != prev->dst.port[0] || + cur->dst.port[1] != prev->dst.port[1] || + cur->dst.port_op != prev->dst.port_op) + PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); + + prev = cur; + cur = TAILQ_NEXT(cur, entries); + } + for (i = 0; i < PF_SKIP_COUNT; ++i) + PF_SET_SKIP_STEPS(i); +} + +static int +pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) +{ + if (aw1->type != aw2->type) + return (1); + switch (aw1->type) { + case PF_ADDR_ADDRMASK: + case PF_ADDR_RANGE: + if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0)) + return (1); + if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0)) + return (1); + return (0); + case PF_ADDR_DYNIFTL: + return 
(aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); + case PF_ADDR_NOROUTE: + case PF_ADDR_URPFFAILED: + return (0); + case PF_ADDR_TABLE: + return (aw1->p.tbl != aw2->p.tbl); + default: + printf("invalid address type: %d\n", aw1->type); + return (1); + } +} + +u_int16_t +pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) +{ + u_int32_t l; + + if (udp && !cksum) + return (0x0000); + l = cksum + old - new; + l = (l >> 16) + (l & 65535); + l = l & 65535; + if (udp && !l) + return (0xFFFF); + return (l); +} + +static void +pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, + struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) +{ + struct pf_addr ao; + u_int16_t po = *p; + + PF_ACPY(&ao, a, af); + PF_ACPY(a, an, af); + + *p = pn; + + switch (af) { +#ifdef INET + case AF_INET: + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + ao.addr16[0], an->addr16[0], 0), + ao.addr16[1], an->addr16[1], 0); + *p = pn; + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + po, pn, u); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u), + po, pn, u); + break; +#endif /* INET6 */ + } +} + + +/* Changes a u_int32_t. Uses a void * so there are no align restrictions */ +void +pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) +{ + u_int32_t ao; + + memcpy(&ao, a, sizeof(ao)); + memcpy(a, &an, sizeof(u_int32_t)); + *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), + ao % 65536, an % 65536, u); +} + +#ifdef INET6 +static void +pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) +{ + struct pf_addr ao; + + PF_ACPY(&ao, a, AF_INET6); + PF_ACPY(a, an, AF_INET6); + + *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*c, + ao.addr16[0], an->addr16[0], u), + ao.addr16[1], an->addr16[1], u), + ao.addr16[2], an->addr16[2], u), + ao.addr16[3], an->addr16[3], u), + ao.addr16[4], an->addr16[4], u), + ao.addr16[5], an->addr16[5], u), + ao.addr16[6], an->addr16[6], u), + ao.addr16[7], an->addr16[7], u); +} +#endif /* INET6 */ + +static void +pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, + struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, + u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) +{ + struct pf_addr oia, ooa; + + PF_ACPY(&oia, ia, af); + if (oa) + PF_ACPY(&ooa, oa, af); + + /* Change inner protocol port, fix inner protocol checksum. */ + if (ip != NULL) { + u_int16_t oip = *ip; + u_int32_t opc; + + if (pc != NULL) + opc = *pc; + *ip = np; + if (pc != NULL) + *pc = pf_cksum_fixup(*pc, oip, *ip, u); + *ic = pf_cksum_fixup(*ic, oip, *ip, 0); + if (pc != NULL) + *ic = pf_cksum_fixup(*ic, opc, *pc, 0); + } + /* Change inner ip address, fix inner ip and icmp checksums. 
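+ * As an illustration of the fixup arithmetic: pf_cksum_fixup() + * folds (old - new) into the ones' complement sum, so rewriting + * a single 16-bit word from 0x1234 to 0x1235 lowers the stored + * checksum by exactly one, modulo 0xffff.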
*/ + PF_ACPY(ia, na, af); + switch (af) { +#ifdef INET + case AF_INET: { + u_int32_t oh2c = *h2c; + + *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], 0), + oia.addr16[1], ia->addr16[1], 0); + *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + oia.addr16[0], ia->addr16[0], u), + oia.addr16[1], ia->addr16[1], u), + oia.addr16[2], ia->addr16[2], u), + oia.addr16[3], ia->addr16[3], u), + oia.addr16[4], ia->addr16[4], u), + oia.addr16[5], ia->addr16[5], u), + oia.addr16[6], ia->addr16[6], u), + oia.addr16[7], ia->addr16[7], u); + break; +#endif /* INET6 */ + } + /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */ + if (oa) { + PF_ACPY(oa, na, af); + switch (af) { +#ifdef INET + case AF_INET: + *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, + ooa.addr16[0], oa->addr16[0], 0), + ooa.addr16[1], oa->addr16[1], 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( + pf_cksum_fixup(pf_cksum_fixup(*ic, + ooa.addr16[0], oa->addr16[0], u), + ooa.addr16[1], oa->addr16[1], u), + ooa.addr16[2], oa->addr16[2], u), + ooa.addr16[3], oa->addr16[3], u), + ooa.addr16[4], oa->addr16[4], u), + ooa.addr16[5], oa->addr16[5], u), + ooa.addr16[6], oa->addr16[6], u), + ooa.addr16[7], oa->addr16[7], u); + break; +#endif /* INET6 */ + } + } +} + + +/* + * Need to modulate the sequence numbers in the TCP SACK option + * (credits to Krzysztof Pfaff for report and patch) + */ +static int +pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *dst) +{ + int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; + u_int8_t opts[TCP_MAXOLEN], *opt = opts; + int copyback = 0, i, olen; + struct sackblk sack; + +#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) + if (hlen < TCPOLEN_SACKLEN || + !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) + return 0; + + while (hlen >= TCPOLEN_SACKLEN) { + olen = opt[1]; + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_SACK: + if (olen > hlen) + olen = hlen; + if (olen >= TCPOLEN_SACKLEN) { + for (i = 2; i + TCPOLEN_SACK <= olen; + i += TCPOLEN_SACK) { + memcpy(&sack, &opt[i], sizeof(sack)); + pf_change_a(&sack.start, &th->th_sum, + htonl(ntohl(sack.start) - + dst->seqdiff), 0); + pf_change_a(&sack.end, &th->th_sum, + htonl(ntohl(sack.end) - + dst->seqdiff), 0); + memcpy(&opt[i], &sack, sizeof(sack)); + } + copyback = 1; + } + /* FALLTHROUGH */ + default: + if (olen < 2) + olen = 2; + hlen -= olen; + opt += olen; + } + } + + if (copyback) + m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); + return (copyback); +} + +static void +pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, + const struct pf_addr *saddr, const struct pf_addr *daddr, + u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, + u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, + u_int16_t rtag, struct ifnet *ifp) +{ + struct pf_send_entry *pfse; + struct mbuf *m; + int len, tlen; +#ifdef INET + struct ip *h = NULL; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif 
/* INET6 */ + struct tcphdr *th; + char *opt; + struct pf_mtag *pf_mtag; + + len = 0; + th = NULL; + + /* maximum segment size tcp option */ + tlen = sizeof(struct tcphdr); + if (mss) + tlen += 4; + + switch (af) { +#ifdef INET + case AF_INET: + len = sizeof(struct ip) + tlen; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + len = sizeof(struct ip6_hdr) + tlen; + break; +#endif /* INET6 */ + default: + panic("%s: unsupported af %d", __func__, af); + } + + /* Allocate outgoing queue entry, mbuf and mbuf tag. */ + pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); + if (pfse == NULL) + return; + m = m_gethdr(M_NOWAIT, MT_HEADER); + if (m == NULL) { + free(pfse, M_PFTEMP); + return; + } +#ifdef MAC + mac_netinet_firewall_send(m); +#endif + if ((pf_mtag = pf_get_mtag(m)) == NULL) { + free(pfse, M_PFTEMP); + m_freem(m); + return; + } + if (tag) + m->m_flags |= M_SKIP_FIREWALL; + pf_mtag->tag = rtag; + + if (r != NULL && r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + +#ifdef ALTQ + if (r != NULL && r->qid) { + pf_mtag->qid = r->qid; + + /* add hints for ecn */ + pf_mtag->hdr = mtod(m, struct ip *); + } +#endif /* ALTQ */ + m->m_data += max_linkhdr; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + switch (af) { +#ifdef INET + case AF_INET: + h = mtod(m, struct ip *); + + /* IP header fields included in the TCP checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(tlen); + h->ip_src.s_addr = saddr->v4.s_addr; + h->ip_dst.s_addr = daddr->v4.s_addr; + + th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + h6 = mtod(m, struct ip6_hdr *); + + /* IP header fields included in the TCP checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(tlen); + memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); + memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); + + th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); + break; +#endif /* INET6 */ + } + + /* TCP header */ + th->th_sport = sport; + th->th_dport = dport; + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_off = tlen >> 2; + th->th_flags = flags; + th->th_win = htons(win); + + if (mss) { + opt = (char *)(th + 1); + opt[0] = TCPOPT_MAXSEG; + opt[1] = 4; + HTONS(mss); + bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); + } + + switch (af) { +#ifdef INET + case AF_INET: + /* TCP checksum */ + th->th_sum = in_cksum(m, len); + + /* Finish the IP header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = V_path_mtu_discovery ? IP_DF : 0; + h->ip_len = len; + h->ip_ttl = ttl ? ttl : V_ip_defttl; + h->ip_sum = 0; + + pfse->pfse_type = PFSE_IP; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + /* TCP checksum */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), tlen); + + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + + pfse->pfse_type = PFSE_IP6; + break; +#endif /* INET6 */ + } + pfse->pfse_m = m; + pf_send(pfse); +} + +static void +pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, + struct pf_rule *r) +{ + struct pf_send_entry *pfse; + struct mbuf *m0; + struct pf_mtag *pf_mtag; + + /* Allocate outgoing queue entry, mbuf and mbuf tag. 
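+ * The entry is only queued here; pf_send() defers the actual + * transmission out of the packet processing path.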
*/ + pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); + if (pfse == NULL) + return; + + if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { + free(pfse, M_PFTEMP); + return; + } + + if ((pf_mtag = pf_get_mtag(m0)) == NULL) { + free(pfse, M_PFTEMP); + return; + } + /* XXX: revisit */ + m0->m_flags |= M_SKIP_FIREWALL; + + if (r->rtableid >= 0) + M_SETFIB(m0, r->rtableid); + +#ifdef ALTQ + if (r->qid) { + pf_mtag->qid = r->qid; + /* add hints for ecn */ + pf_mtag->hdr = mtod(m0, struct ip *); + } +#endif /* ALTQ */ + + switch (af) { +#ifdef INET + case AF_INET: + { + struct ip *ip; + + /* icmp_error() expects host byte ordering */ + ip = mtod(m0, struct ip *); + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + + pfse->pfse_type = PFSE_ICMP; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pfse->pfse_type = PFSE_ICMP6; + break; +#endif /* INET6 */ + } + pfse->pfse_m = m0; + pfse->pfse_icmp_type = type; + pfse->pfse_icmp_code = code; + pf_send(pfse); +} + +/* + * Return 1 if the addresses a and b match (with mask m), otherwise return 0. + * If n is 0, they match if they are equal. If n is != 0, they match if they + * are different. + */ +int +pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, + struct pf_addr *b, sa_family_t af) +{ + int match = 0; + + switch (af) { +#ifdef INET + case AF_INET: + if ((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) + match++; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (((a->addr32[0] & m->addr32[0]) == + (b->addr32[0] & m->addr32[0])) && + ((a->addr32[1] & m->addr32[1]) == + (b->addr32[1] & m->addr32[1])) && + ((a->addr32[2] & m->addr32[2]) == + (b->addr32[2] & m->addr32[2])) && + ((a->addr32[3] & m->addr32[3]) == + (b->addr32[3] & m->addr32[3]))) + match++; + break; +#endif /* INET6 */ + } + if (match) { + if (n) + return (0); + else + return (1); + } else { + if (n) + return (1); + else + return (0); + } +} + +/* + * Return 1 if b <= a <= e, otherwise return 0. 
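+ * E.g. with b = 10.0.0.1 and e = 10.0.0.9, a = 10.0.0.5 matches while + * a = 10.0.0.10 does not. For IPv6 the comparison walks the four + * 32-bit words from most to least significant.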
+ */ +int +pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, + struct pf_addr *a, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + if ((a->addr32[0] < b->addr32[0]) || + (a->addr32[0] > e->addr32[0])) + return (0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + int i; + + /* check a >= b */ + for (i = 0; i < 4; ++i) + if (a->addr32[i] > b->addr32[i]) + break; + else if (a->addr32[i] < b->addr32[i]) + return (0); + /* check a <= e */ + for (i = 0; i < 4; ++i) + if (a->addr32[i] < e->addr32[i]) + break; + else if (a->addr32[i] > e->addr32[i]) + return (0); + break; + } +#endif /* INET6 */ + } + return (1); +} + +static int +pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) +{ + switch (op) { + case PF_OP_IRG: + return ((p > a1) && (p < a2)); + case PF_OP_XRG: + return ((p < a1) || (p > a2)); + case PF_OP_RRG: + return ((p >= a1) && (p <= a2)); + case PF_OP_EQ: + return (p == a1); + case PF_OP_NE: + return (p != a1); + case PF_OP_LT: + return (p < a1); + case PF_OP_LE: + return (p <= a1); + case PF_OP_GT: + return (p > a1); + case PF_OP_GE: + return (p >= a1); + } + return (0); /* never reached */ +} + +int +pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) +{ + NTOHS(a1); + NTOHS(a2); + NTOHS(p); + return (pf_match(op, a1, a2, p)); +} + +static int +pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) +{ + if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, u)); +} + +static int +pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) +{ + if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) + return (0); + return (pf_match(op, a1, a2, g)); +} + +int +pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag) +{ + if (*tag == -1) + *tag = mtag; + + return ((!r->match_tag_not && r->match_tag == *tag) || + (r->match_tag_not && r->match_tag != *tag)); +} + +int +pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag) +{ + + KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); + + if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) + return (ENOMEM); + + pd->pf_mtag->tag = tag; + + return (0); +} + +void +pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n, + struct pf_rule **r, struct pf_rule **a, int *match) +{ + struct pf_anchor_stackframe *f; + + PF_RULES_RASSERT(); + + (*r)->anchor->match = 0; + if (match) + *match = 0; + if (*depth >= sizeof(V_pf_anchor_stack) / + sizeof(V_pf_anchor_stack[0])) { + printf("pf_step_into_anchor: stack overflow\n"); + *r = TAILQ_NEXT(*r, entries); + return; + } else if (*depth == 0 && a != NULL) + *a = *r; + f = V_pf_anchor_stack + (*depth)++; + f->rs = *rs; + f->r = *r; + if ((*r)->anchor_wildcard) { + f->parent = &(*r)->anchor->children; + if ((f->child = RB_MIN(pf_anchor_node, f->parent)) == + NULL) { + *r = NULL; + return; + } + *rs = &f->child->ruleset; + } else { + f->parent = NULL; + f->child = NULL; + *rs = &(*r)->anchor->ruleset; + } + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); +} + +int +pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n, + struct pf_rule **r, struct pf_rule **a, int *match) +{ + struct pf_anchor_stackframe *f; + int quick = 0; + + PF_RULES_RASSERT(); + + do { + if (*depth <= 0) + break; + f = V_pf_anchor_stack + *depth - 1; + if (f->parent != NULL && f->child != NULL) { + if (f->child->match || + (match != NULL && *match)) { + f->r->anchor->match = 1; + *match = 0; + } + f->child = RB_NEXT(pf_anchor_node, f->parent, f->child); + if (f->child != NULL) { + *rs = 
&f->child->ruleset; + *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); + if (*r == NULL) + continue; + else + break; + } + } + (*depth)--; + if (*depth == 0 && a != NULL) + *a = NULL; + *rs = f->rs; + if (f->r->anchor->match || (match != NULL && *match)) + quick = f->r->quick; + *r = TAILQ_NEXT(f->r, entries); + } while (*r == NULL); + + return (quick); +} + +#ifdef INET6 +void +pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, + struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + break; +#endif /* INET */ + case AF_INET6: + naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | + ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); + naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | + ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); + naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | + ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); + naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | + ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); + break; + } +} + +void +pf_addr_inc(struct pf_addr *addr, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); + break; +#endif /* INET */ + case AF_INET6: + if (addr->addr32[3] == 0xffffffff) { + addr->addr32[3] = 0; + if (addr->addr32[2] == 0xffffffff) { + addr->addr32[2] = 0; + if (addr->addr32[1] == 0xffffffff) { + addr->addr32[1] = 0; + addr->addr32[0] = + htonl(ntohl(addr->addr32[0]) + 1); + } else + addr->addr32[1] = + htonl(ntohl(addr->addr32[1]) + 1); + } else + addr->addr32[2] = + htonl(ntohl(addr->addr32[2]) + 1); + } else + addr->addr32[3] = + htonl(ntohl(addr->addr32[3]) + 1); + break; + } +} +#endif /* INET6 */ + +int +pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m) +{ + struct pf_addr *saddr, *daddr; + u_int16_t sport, dport; + struct inpcbinfo *pi; + struct inpcb *inp; + + pd->lookup.uid = UID_MAX; + pd->lookup.gid = GID_MAX; + + switch (pd->proto) { + case IPPROTO_TCP: + if (pd->hdr.tcp == NULL) + return (-1); + sport = pd->hdr.tcp->th_sport; + dport = pd->hdr.tcp->th_dport; + pi = &V_tcbinfo; + break; + case IPPROTO_UDP: + if (pd->hdr.udp == NULL) + return (-1); + sport = pd->hdr.udp->uh_sport; + dport = pd->hdr.udp->uh_dport; + pi = &V_udbinfo; + break; + default: + return (-1); + } + if (direction == PF_IN) { + saddr = pd->src; + daddr = pd->dst; + } else { + u_int16_t p; + + p = sport; + sport = dport; + dport = p; + saddr = pd->dst; + daddr = pd->src; + } + switch (pd->af) { +#ifdef INET + case AF_INET: + inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, + dport, INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) { + inp = in_pcblookup_mbuf(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) + return (-1); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, + dport, INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) { + inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, NULL, m); + if (inp == NULL) + return (-1); + } + break; +#endif /* INET6 */ + + default: + return (-1); + } + INP_RLOCK_ASSERT(inp); + pd->lookup.uid = inp->inp_cred->cr_uid; + pd->lookup.gid = inp->inp_cred->cr_groups[0]; + INP_RUNLOCK(inp); + + return (1); 
+} + +static u_int8_t +pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int8_t wscale = 0; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= 3) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_WINDOW: + wscale = opt[2]; + if (wscale > TCP_MAX_WINSHIFT) + wscale = TCP_MAX_WINSHIFT; + wscale |= PF_WSCALE_FLAG; + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return (wscale); +} + +static u_int16_t +pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) +{ + int hlen; + u_int8_t hdr[60]; + u_int8_t *opt, optlen; + u_int16_t mss = V_tcp_mssdflt; + + hlen = th_off << 2; /* hlen <= sizeof(hdr) */ + if (hlen <= sizeof(struct tcphdr)) + return (0); + if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) + return (0); + opt = hdr + sizeof(struct tcphdr); + hlen -= sizeof(struct tcphdr); + while (hlen >= TCPOLEN_MAXSEG) { + switch (*opt) { + case TCPOPT_EOL: + case TCPOPT_NOP: + ++opt; + --hlen; + break; + case TCPOPT_MAXSEG: + bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); + NTOHS(mss); + /* FALLTHROUGH */ + default: + optlen = opt[1]; + if (optlen < 2) + optlen = 2; + hlen -= optlen; + opt += optlen; + break; + } + } + return (mss); +} + +static u_int16_t +pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) +{ +#ifdef INET + struct sockaddr_in *dst; + struct route ro; +#endif /* INET */ +#ifdef INET6 + struct sockaddr_in6 *dst6; + struct route_in6 ro6; +#endif /* INET6 */ + struct rtentry *rt = NULL; + int hlen = 0; + u_int16_t mss = V_tcp_mssdflt; + + switch (af) { +#ifdef INET + case AF_INET: + hlen = sizeof(struct ip); + bzero(&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; + in_rtalloc_ign(&ro, 0, rtableid); + rt = ro.ro_rt; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + hlen = sizeof(struct ip6_hdr); + bzero(&ro6, sizeof(ro6)); + dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; + in6_rtalloc_ign(&ro6, 0, rtableid); + rt = ro6.ro_rt; + break; +#endif /* INET6 */ + } + + if (rt && rt->rt_ifp) { + mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr); + mss = max(V_tcp_mssdflt, mss); + RTFREE(rt); + } + mss = min(mss, offer); + mss = max(mss, 64); /* sanity - at least max opt space */ + return (mss); +} + +static void +pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr) +{ + struct pf_rule *r = s->rule.ptr; + struct pf_src_node *sn = NULL; + + s->rt_kif = NULL; + if (!r->rt || r->rt == PF_FASTROUTE) + return; + switch (s->key[PF_SK_WIRE]->af) { +#ifdef INET + case AF_INET: + pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn); + s->rt_kif = r->rpool.cur->kif; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn); + s->rt_kif = r->rpool.cur->kif; + break; +#endif /* INET6 */ + } +} + +static u_int32_t +pf_tcp_iss(struct pf_pdesc *pd) +{ + MD5_CTX ctx; + u_int32_t digest[4]; + + if (V_pf_tcp_secret_init == 0) { + read_random(&V_pf_tcp_secret, 
sizeof(V_pf_tcp_secret)); + MD5Init(&V_pf_tcp_secret_ctx); + MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, + sizeof(V_pf_tcp_secret)); + V_pf_tcp_secret_init = 1; + } + + ctx = V_pf_tcp_secret_ctx; + + MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short)); + MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short)); + if (pd->af == AF_INET6) { + MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); + MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); + } else { + MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); + MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); + } + MD5Final((u_char *)digest, &ctx); + V_pf_tcp_iss_off += 4096; +#define ISN_RANDOM_INCREMENT (4096 - 1) + return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + + V_pf_tcp_iss_off); +#undef ISN_RANDOM_INCREMENT +} + +static int +pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, + struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, + struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp) +{ + struct pf_rule *nr = NULL; + struct pf_addr * const saddr = pd->src; + struct pf_addr * const daddr = pd->dst; + sa_family_t af = pd->af; + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_src_node *nsn = NULL; + struct tcphdr *th = pd->hdr.tcp; + struct pf_state_key *sk = NULL, *nk = NULL; + u_short reason; + int rewrite = 0, hdrlen = 0; + int tag = -1, rtableid = -1; + int asd = 0; + int match = 0; + int state_icmp = 0; + u_int16_t sport = 0, dport = 0; + u_int16_t bproto_sum = 0, bip_sum = 0; + u_int8_t icmptype = 0, icmpcode = 0; + + PF_RULES_RASSERT(); + + if (inp != NULL) { + INP_LOCK_ASSERT(inp); + pd->lookup.uid = inp->inp_cred->cr_uid; + pd->lookup.gid = inp->inp_cred->cr_groups[0]; + pd->lookup.done = 1; + } + + switch (pd->proto) { + case IPPROTO_TCP: + sport = th->th_sport; + dport = th->th_dport; + hdrlen = sizeof(*th); + break; + case IPPROTO_UDP: + sport = pd->hdr.udp->uh_sport; + dport = pd->hdr.udp->uh_dport; + hdrlen = sizeof(*pd->hdr.udp); + break; +#ifdef INET + case IPPROTO_ICMP: + if (pd->af != AF_INET) + break; + sport = dport = pd->hdr.icmp->icmp_id; + hdrlen = sizeof(*pd->hdr.icmp); + icmptype = pd->hdr.icmp->icmp_type; + icmpcode = pd->hdr.icmp->icmp_code; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + if (af != AF_INET6) + break; + sport = dport = pd->hdr.icmp6->icmp6_id; + hdrlen = sizeof(*pd->hdr.icmp6); + icmptype = pd->hdr.icmp6->icmp6_type; + icmpcode = pd->hdr.icmp6->icmp6_code; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + default: + sport = dport = hdrlen = 0; + break; + } + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + + /* check packet for BINAT/NAT/RDR */ + if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk, + &nk, saddr, daddr, sport, dport)) != NULL) { + KASSERT(sk != NULL, ("%s: null sk", __func__)); + KASSERT(nk != NULL, ("%s: null nk", __func__)); + + if (pd->ip_sum) + bip_sum = *pd->ip_sum; + + switch (pd->proto) { + case IPPROTO_TCP: + bproto_sum = th->th_sum; + pd->proto_sum = &th->th_sum; + + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || + nk->port[pd->sidx] 
!= sport) { + pf_change_ap(saddr, &th->th_sport, pd->ip_sum, + &th->th_sum, &nk->addr[pd->sidx], + nk->port[pd->sidx], 0, af); + pd->sport = &th->th_sport; + sport = th->th_sport; + } + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || + nk->port[pd->didx] != dport) { + pf_change_ap(daddr, &th->th_dport, pd->ip_sum, + &th->th_sum, &nk->addr[pd->didx], + nk->port[pd->didx], 0, af); + dport = th->th_dport; + pd->dport = &th->th_dport; + } + rewrite++; + break; + case IPPROTO_UDP: + bproto_sum = pd->hdr.udp->uh_sum; + pd->proto_sum = &pd->hdr.udp->uh_sum; + + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || + nk->port[pd->sidx] != sport) { + pf_change_ap(saddr, &pd->hdr.udp->uh_sport, + pd->ip_sum, &pd->hdr.udp->uh_sum, + &nk->addr[pd->sidx], + nk->port[pd->sidx], 1, af); + sport = pd->hdr.udp->uh_sport; + pd->sport = &pd->hdr.udp->uh_sport; + } + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || + nk->port[pd->didx] != dport) { + pf_change_ap(daddr, &pd->hdr.udp->uh_dport, + pd->ip_sum, &pd->hdr.udp->uh_sum, + &nk->addr[pd->didx], + nk->port[pd->didx], 1, af); + dport = pd->hdr.udp->uh_dport; + pd->dport = &pd->hdr.udp->uh_dport; + } + rewrite++; + break; +#ifdef INET + case IPPROTO_ICMP: + nk->port[0] = nk->port[1]; + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) + pf_change_a(&daddr->v4.s_addr, pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + + if (nk->port[1] != pd->hdr.icmp->icmp_id) { + pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, sport, + nk->port[1], 0); + pd->hdr.icmp->icmp_id = nk->port[1]; + pd->sport = &pd->hdr.icmp->icmp_id; + } + m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + nk->port[0] = nk->port[1]; + if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) + pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->sidx], 0); + + if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) + pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->didx], 0); + rewrite++; + break; +#endif /* INET6 */ + default: + switch (af) { +#ifdef INET + case AF_INET: + if (PF_ANEQ(saddr, + &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(daddr, + &nk->addr[pd->didx], AF_INET)) + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (PF_ANEQ(saddr, + &nk->addr[pd->sidx], AF_INET6)) + PF_ACPY(saddr, &nk->addr[pd->sidx], af); + + if (PF_ANEQ(daddr, + &nk->addr[pd->didx], AF_INET6)) + PF_ACPY(daddr, &nk->addr[pd->didx], af); + break; +#endif /* INET6 */ + } + break; + } + if (nr->natpass) + r = NULL; + pd->nat_rule = nr; + } + + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, saddr, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + /* tcp/udp only.
port_op always 0 in other cases */ + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + /* tcp/udp only. port_op always 0 in other cases */ + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + /* icmp only. type always 0 in other cases */ + else if (r->type && r->type != icmptype + 1) + r = TAILQ_NEXT(r, entries); + /* icmp only. type always 0 in other cases */ + else if (r->code && r->code != icmpcode + 1) + r = TAILQ_NEXT(r, entries); + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->rule_flag & PFRULE_FRAGMENT) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + (r->flagset & th->th_flags) != r->flags) + r = TAILQ_NEXT(r, entries); + /* tcp/udp only. uid.op always 0 in other cases */ + else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = + pf_socket_lookup(direction, pd, m), 1)) && + !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], + pd->lookup.uid)) + r = TAILQ_NEXT(r, entries); + /* tcp/udp only. gid.op always 0 in other cases */ + else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = + pf_socket_lookup(direction, pd, m), 1)) && + !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], + pd->lookup.gid)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && + r->prob <= arc4random()) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && + (pd->proto != IPPROTO_TCP || !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), + r->os_fingerprint))) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log || (nr != NULL && nr->log)) { + if (rewrite) + m_copyback(m, off, hdrlen, pd->hdr.any); + PFLOG_PACKET(kif, m, af, direction, reason, r->log ? 
r : nr, a, + ruleset, pd, 1); + } + + if ((r->action == PF_DROP) && + ((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURNICMP) || + (r->rule_flag & PFRULE_RETURN))) { + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + PF_ACPY(saddr, &sk->addr[pd->sidx], af); + PF_ACPY(daddr, &sk->addr[pd->didx], af); + if (pd->sport) + *pd->sport = sk->port[pd->sidx]; + if (pd->dport) + *pd->dport = sk->port[pd->didx]; + if (pd->proto_sum) + *pd->proto_sum = bproto_sum; + if (pd->ip_sum) + *pd->ip_sum = bip_sum; + m_copyback(m, off, hdrlen, pd->hdr.any); + } + if (pd->proto == IPPROTO_TCP && + ((r->rule_flag & PFRULE_RETURNRST) || + (r->rule_flag & PFRULE_RETURN)) && + !(th->th_flags & TH_RST)) { + u_int32_t ack = ntohl(th->th_seq) + pd->p_len; + int len = 0; +#ifdef INET + struct ip *h4; +#endif +#ifdef INET6 + struct ip6_hdr *h6; +#endif + + switch (af) { +#ifdef INET + case AF_INET: + h4 = mtod(m, struct ip *); + len = ntohs(h4->ip_len) - off; + break; +#endif +#ifdef INET6 + case AF_INET6: + h6 = mtod(m, struct ip6_hdr *); + len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); + break; +#endif + } + + if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) + REASON_SET(&reason, PFRES_PROTCKSUM); + else { + if (th->th_flags & TH_SYN) + ack++; + if (th->th_flags & TH_FIN) + ack++; + pf_send_tcp(m, r, af, pd->dst, + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, + r->return_ttl, 1, 0, kif->pfik_ifp); + } + } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && + r->return_icmp) + pf_send_icmp(m, r->return_icmp >> 8, + r->return_icmp & 255, af, r); + else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && + r->return_icmp6) + pf_send_icmp(m, r->return_icmp6 >> 8, + r->return_icmp6 & 255, af, r); + } + + if (r->action == PF_DROP) + goto cleanup; + + if (tag > 0 && pf_tag_packet(m, pd, tag)) { + REASON_SET(&reason, PFRES_MEMORY); + goto cleanup; + } + if (rtableid >= 0) + M_SETFIB(m, rtableid); + + if (!state_icmp && (r->keep_state || nr != NULL || + (pd->flags & PFDESC_TCP_NORM))) { + int action; + action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, + sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, + hdrlen); + if (action != PF_PASS) + return (action); + } else { + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + } + + /* copy back packet headers if we performed NAT operations */ + if (rewrite) + m_copyback(m, off, hdrlen, pd->hdr.any); + + if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && + direction == PF_OUT && + pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m)) + /* + * We want the state created, but we don't + * want to send this in case a partner + * firewall has to know about it to allow + * replies through it.
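+ * pfsync holds on to the mbuf and transmits it once the peer + * has acknowledged the state, or when the deferral times out.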
+ */ + return (PF_DEFER); + + return (PF_PASS); + +cleanup: + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + return (PF_DROP); +} + +static int +pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a, + struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk, + struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, + u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm, + int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen) +{ + struct pf_state *s = NULL; + struct pf_src_node *sn = NULL; + struct tcphdr *th = pd->hdr.tcp; + u_int16_t mss = V_tcp_mssdflt; + u_short reason; + + /* check maximums */ + if (r->max_states && (r->states_cur >= r->max_states)) { + V_pf_status.lcounters[LCNT_STATES]++; + REASON_SET(&reason, PFRES_MAXSTATES); + return (PF_DROP); + } + /* src node for filter rule */ + if ((r->rule_flag & PFRULE_SRCTRACK || + r->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto csfailed; + } + /* src node for translation rule */ + if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && + pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) { + REASON_SET(&reason, PFRES_SRCLIMIT); + goto csfailed; + } + s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO); + if (s == NULL) { + REASON_SET(&reason, PFRES_MEMORY); + goto csfailed; + } + s->rule.ptr = r; + s->nat_rule.ptr = nr; + s->anchor.ptr = a; + STATE_INC_COUNTERS(s); + if (r->allow_opts) + s->state_flags |= PFSTATE_ALLOWOPTS; + if (r->rule_flag & PFRULE_STATESLOPPY) + s->state_flags |= PFSTATE_SLOPPY; + s->log = r->log & PF_LOG_ALL; + s->sync_state = PFSYNC_S_NONE; + if (nr != NULL) + s->log |= nr->log & PF_LOG_ALL; + switch (pd->proto) { + case IPPROTO_TCP: + s->src.seqlo = ntohl(th->th_seq); + s->src.seqhi = s->src.seqlo + pd->p_len + 1; + if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && + r->keep_state == PF_STATE_MODULATE) { + /* Generate sequence number modulator */ + if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == + 0) + s->src.seqdiff = 1; + pf_change_a(&th->th_seq, &th->th_sum, + htonl(s->src.seqlo + s->src.seqdiff), 0); + *rewrite = 1; + } else + s->src.seqdiff = 0; + if (th->th_flags & TH_SYN) { + s->src.seqhi++; + s->src.wscale = pf_get_wscale(m, off, + th->th_off, pd->af); + } + s->src.max_win = MAX(ntohs(th->th_win), 1); + if (s->src.wscale & PF_WSCALE_MASK) { + /* Remove scale factor from initial window */ + int win = s->src.max_win; + win += 1 << (s->src.wscale & PF_WSCALE_MASK); + s->src.max_win = (win - 1) >> + (s->src.wscale & PF_WSCALE_MASK); + } + if (th->th_flags & TH_FIN) + s->src.seqhi++; + s->dst.seqhi = 1; + s->dst.max_win = 1; + s->src.state = TCPS_SYN_SENT; + s->dst.state = TCPS_CLOSED; + s->timeout = PFTM_TCP_FIRST_PACKET; + break; + case IPPROTO_UDP: + s->src.state = PFUDPS_SINGLE; + s->dst.state = PFUDPS_NO_TRAFFIC; + s->timeout = PFTM_UDP_FIRST_PACKET; + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif + s->timeout = PFTM_ICMP_FIRST_PACKET; + break; + default: + s->src.state = PFOTHERS_SINGLE; + s->dst.state = PFOTHERS_NO_TRAFFIC; + s->timeout = PFTM_OTHER_FIRST_PACKET; + } + + s->creation = time_uptime; + s->expire = time_uptime; + + if (sn != NULL) { + s->src_node = sn; + s->src_node->states++; + } + if (nsn != NULL) { + /* XXX We only modify one side for now. 
*/ + PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); + s->nat_src_node = nsn; + s->nat_src_node->states++; + } + if (pd->proto == IPPROTO_TCP) { + if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, + off, pd, th, &s->src, &s->dst)) { + REASON_SET(&reason, PFRES_MEMORY); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } + if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && + pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, + &s->src, &s->dst, rewrite)) { + /* This really shouldn't happen!!! */ + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_normalize_tcp_stateful failed on first pkt")); + pf_normalize_tcp_cleanup(s); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } + } + s->direction = pd->dir; + + /* + * sk/nk could already have been set up by pf_get_translation(). + */ + if (nr == NULL) { + KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", + __func__, nr, sk, nk)); + sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); + if (sk == NULL) + goto csfailed; + nk = sk; + } else + KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", + __func__, nr, sk, nk)); + + /* Swap sk/nk for PF_OUT. */ + if (pf_state_insert(BOUND_IFACE(r, kif), + (pd->dir == PF_IN) ? sk : nk, + (pd->dir == PF_IN) ? nk : sk, s)) { + if (pd->proto == IPPROTO_TCP) + pf_normalize_tcp_cleanup(s); + REASON_SET(&reason, PFRES_STATEINS); + pf_src_tree_remove_state(s); + STATE_DEC_COUNTERS(s); + uma_zfree(V_pf_state_z, s); + return (PF_DROP); + } else + *sm = s; + + pf_set_rt_ifp(s, pd->src); /* needs s->state_key set */ + if (tag > 0) + s->tag = tag; + if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == + TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { + s->src.state = PF_TCPS_PROXY_SRC; + /* undo NAT changes, if they have taken place */ + if (nr != NULL) { + struct pf_state_key *skt = s->key[PF_SK_WIRE]; + if (pd->dir == PF_OUT) + skt = s->key[PF_SK_STACK]; + PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); + PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); + if (pd->sport) + *pd->sport = skt->port[pd->sidx]; + if (pd->dport) + *pd->dport = skt->port[pd->didx]; + if (pd->proto_sum) + *pd->proto_sum = bproto_sum; + if (pd->ip_sum) + *pd->ip_sum = bip_sum; + m_copyback(m, off, hdrlen, pd->hdr.any); + } + s->src.seqhi = htonl(arc4random()); + /* Find mss option */ + int rtid = M_GETFIB(m); + mss = pf_get_mss(m, off, th->th_off, pd->af); + mss = pf_calc_mss(pd->src, pd->af, rtid, mss); + mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); + s->src.mss = mss; + pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport, + th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL); + REASON_SET(&reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + + return (PF_PASS); + +csfailed: + if (sk != NULL) + uma_zfree(V_pf_state_key_z, sk); + if (nk != NULL) + uma_zfree(V_pf_state_key_z, nk); + + if (sn != NULL && sn->states == 0 && sn->expire == 0) { + pf_remove_src_node(sn); + V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + V_pf_status.src_nodes--; + uma_zfree(V_pf_sources_z, sn); + } + if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { + pf_remove_src_node(nsn); + V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; + V_pf_status.src_nodes--; + uma_zfree(V_pf_sources_z, nsn); + } + return (PF_DROP); +} + +static int +pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, + struct mbuf *m, void *h, struct 
pf_pdesc *pd, struct pf_rule **am, + struct pf_ruleset **rsm) +{ + struct pf_rule *r, *a = NULL; + struct pf_ruleset *ruleset = NULL; + sa_family_t af = pd->af; + u_short reason; + int tag = -1; + int asd = 0; + int match = 0; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->tos && !(r->tos == pd->tos)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_UDP && + (r->src.port_op || r->dst.port_op)) + r = TAILQ_NEXT(r, entries); + else if (pd->proto == IPPROTO_TCP && + (r->src.port_op || r->dst.port_op || r->flagset)) + r = TAILQ_NEXT(r, entries); + else if ((pd->proto == IPPROTO_ICMP || + pd->proto == IPPROTO_ICMPV6) && + (r->type || r->code)) + r = TAILQ_NEXT(r, entries); + else if (r->prob && r->prob <= + (arc4random() % (UINT_MAX - 1) + 1)) + r = TAILQ_NEXT(r, entries); + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else { + if (r->anchor == NULL) { + match = 1; + *rm = r; + *am = a; + *rsm = ruleset; + if ((*rm)->quick) + break; + r = TAILQ_NEXT(r, entries); + } else + pf_step_into_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match); + } + if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset, + PF_RULESET_FILTER, &r, &a, &match)) + break; + } + r = *rm; + a = *am; + ruleset = *rsm; + + REASON_SET(&reason, PFRES_MATCH); + + if (r->log) + PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd, + 1); + + if (r->action != PF_PASS) + return (PF_DROP); + + if (tag > 0 && pf_tag_packet(m, pd, tag)) { + REASON_SET(&reason, PFRES_MEMORY); + return (PF_DROP); + } + + return (PF_PASS); +} + +static int +pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, + struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off, + struct pf_pdesc *pd, u_short *reason, int *copyback) +{ + struct tcphdr *th = pd->hdr.tcp; + u_int16_t win = ntohs(th->th_win); + u_int32_t ack, end, seq, orig_seq; + u_int8_t sws, dws; + int ackskew; + + if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { + sws = src->wscale & PF_WSCALE_MASK; + dws = dst->wscale & PF_WSCALE_MASK; + } else + sws = dws = 0; + + /* + * Sequence tracking algorithm from Guido van Rooij's paper: + * http://www.madison-gurkha.com/publications/tcp_filtering/ + * tcp_filtering.ps + */ + + orig_seq = seq = ntohl(th->th_seq); + if (src->seqlo == 0) { + /* First packet from this end. 
Set its state */ + + if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && + src->scrub == NULL) { + if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + } + + /* Deferred generation of sequence number modulator */ + if (dst->seqdiff && !src->seqdiff) { + /* use random iss for the TCP server */ + while ((src->seqdiff = arc4random() - seq) == 0) + ; + ack = ntohl(th->th_ack) - dst->seqdiff; + pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } else { + ack = ntohl(th->th_ack); + } + + end = seq + pd->p_len; + if (th->th_flags & TH_SYN) { + end++; + if (dst->wscale & PF_WSCALE_FLAG) { + src->wscale = pf_get_wscale(m, off, th->th_off, + pd->af); + if (src->wscale & PF_WSCALE_FLAG) { + /* Remove scale factor from initial + * window */ + sws = src->wscale & PF_WSCALE_MASK; + win = ((u_int32_t)win + (1 << sws) - 1) + >> sws; + dws = dst->wscale & PF_WSCALE_MASK; + } else { + /* fixup other window */ + dst->max_win <<= dst->wscale & + PF_WSCALE_MASK; + /* in case of a retrans SYN|ACK */ + dst->wscale = 0; + } + } + } + if (th->th_flags & TH_FIN) + end++; + + src->seqlo = seq; + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + + /* + * May need to slide the window (seqhi may have been set by + * the crappy stack check or if we picked up the connection + * after establishment) + */ + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) + src->seqhi = end + MAX(1, dst->max_win << dws); + if (win > src->max_win) + src->max_win = win; + + } else { + ack = ntohl(th->th_ack) - dst->seqdiff; + if (src->seqdiff) { + /* Modulate sequence numbers */ + pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + + src->seqdiff), 0); + pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); + *copyback = 1; + } + end = seq + pd->p_len; + if (th->th_flags & TH_SYN) + end++; + if (th->th_flags & TH_FIN) + end++; + } + + if ((th->th_flags & TH_ACK) == 0) { + /* Let it pass through the ack skew check */ + ack = dst->seqlo; + } else if ((ack == 0 && + (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || + /* broken tcp stacks do not set ack */ + (dst->state < TCPS_SYN_SENT)) { + /* + * Many stacks (ours included) will set the ACK number in a + * FIN|ACK if the SYN times out -- no sequence to ACK. + */ + ack = dst->seqlo; + } + + if (seq == end) { + /* Ease sequencing restrictions on no data packets */ + seq = src->seqlo; + end = seq; + } + + ackskew = dst->seqlo - ack; + + + /* + * Need to demodulate the sequence numbers in any TCP SACK options + * (Selective ACK). We could optionally validate the SACK values + * against the current ACK window, either forwards or backwards, but + * I'm not confident that SACK has been implemented properly + * everywhere. It wouldn't surprise me if several stacks accidentally + * SACK too far backwards of previously ACKed data. There really aren't + * any security implications of bad SACKing unless the target stack + * doesn't validate the option length correctly. Someone trying to + * spoof into a TCP connection won't bother blindly sending SACK + * options anyway.
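+ * Each SACK edge below is shifted by dst->seqdiff, matching the + * demodulation of th_ack above, and th_sum is fixed up + * incrementally for every rewritten block.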
+ */ + if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { + if (pf_modulate_sack(m, off, pd, th, dst)) + *copyback = 1; + } + + +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ + if (SEQ_GEQ(src->seqhi, end) && + /* Last octet inside other's window space */ + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && + /* Retrans: not more than one window back */ + (ackskew >= -MAXACKWINDOW) && + /* Acking not more than one reassembled fragment backwards */ + (ackskew <= (MAXACKWINDOW << sws)) && + /* Acking not more than one window forward */ + ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || + (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || + (pd->flags & PFDESC_IP_REAS) == 0)) { + /* Require an exact/+1 sequence match on resets when possible */ + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + + /* update states */ + if (th->th_flags & TH_SYN) + if (src->state < TCPS_SYN_SENT) + src->state = TCPS_SYN_SENT; + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_ACK) { + if (dst->state == TCPS_SYN_SENT) { + dst->state = TCPS_ESTABLISHED; + if (src->state == TCPS_ESTABLISHED && + (*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } + } else if (dst->state == TCPS_CLOSING) + dst->state = TCPS_FIN_WAIT_2; + } + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* update expire time */ + (*state)->expire = time_uptime; + if (src->state >= TCPS_FIN_WAIT_2 && + dst->state >= TCPS_FIN_WAIT_2) + (*state)->timeout = PFTM_TCP_CLOSED; + else if (src->state >= TCPS_CLOSING && + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_FIN_WAIT; + else if (src->state < TCPS_ESTABLISHED || + dst->state < TCPS_ESTABLISHED) + (*state)->timeout = PFTM_TCP_OPENING; + else if (src->state >= TCPS_CLOSING || + dst->state >= TCPS_CLOSING) + (*state)->timeout = PFTM_TCP_CLOSING; + else + (*state)->timeout = PFTM_TCP_ESTABLISHED; + + /* Fall through to PASS packet */ + + } else if ((dst->state < TCPS_SYN_SENT || + dst->state >= TCPS_FIN_WAIT_2 || + src->state >= TCPS_FIN_WAIT_2) && + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && + /* Within a window forward of the originating packet */ + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { + /* Within a window backward of the originating packet */ + + /* + * This currently handles three situations: + * 1) Stupid stacks will shotgun SYNs before their peer + * replies. + * 2) When PF catches an already established stream (the + * firewall rebooted, the state table was flushed, routes + * changed...) + * 3) Packets get funky immediately after the connection + * closes (this should catch Solaris spurious ACK|FINs + * that web servers like to spew after a close) + * + * This must be a little more careful than the above code + * since packet floods will also be caught here. We don't + * update the TTL here to mitigate the damage of a packet + * flood and so the same code can handle awkward establishment + * and a loosened connection close. 
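+ * Concretely, the code below refreshes seqlo/seqhi and max_win + * but leaves the state's expire timestamp and timeout untouched.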
+ * In the establishment case, a correct peer response will + * validate the connection, go through the normal state code + * and keep updating the state TTL. + */ + + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: loose state match: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " + "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, + pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1], + pd->dir == PF_IN ? "in" : "out", + pd->dir == (*state)->direction ? "fwd" : "rev"); + } + + if (dst->scrub || src->scrub) { + if (pf_normalize_tcp_stateful(m, off, pd, reason, th, + *state, src, dst, copyback)) + return (PF_DROP); + } + + /* update max window */ + if (src->max_win < win) + src->max_win = win; + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) + src->seqlo = end; + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) + dst->seqhi = ack + MAX((win << sws), 1); + + /* + * Cannot set dst->seqhi here since this could be a shotgunned + * SYN and not an already established connection. + */ + + if (th->th_flags & TH_FIN) + if (src->state < TCPS_CLOSING) + src->state = TCPS_CLOSING; + if (th->th_flags & TH_RST) + src->state = dst->state = TCPS_TIME_WAIT; + + /* Fall through to PASS packet */ + + } else { + if ((*state)->dst.state == TCPS_SYN_SENT && + (*state)->src.state == TCPS_SYN_SENT) { + /* Send RST for state mismatches during handshake */ + if (!(th->th_flags & TH_RST)) + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + pd->dst, pd->src, th->th_dport, + th->th_sport, ntohl(th->th_ack), 0, + TH_RST, 0, 0, + (*state)->rule.ptr->return_ttl, 1, 0, + kif->pfik_ifp); + src->seqlo = 0; + src->seqhi = 1; + src->max_win = 1; + } else if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD state: "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " + "pkts=%llu:%llu dir=%s,%s\n", + seq, orig_seq, ack, pd->p_len, ackskew, + (unsigned long long)(*state)->packets[0], + (unsigned long long)(*state)->packets[1], + pd->dir == PF_IN ? "in" : "out", + pd->dir == (*state)->direction ? "fwd" : "rev"); + printf("pf: State failure on: %c %c %c %c | %c %c\n", + SEQ_GEQ(src->seqhi, end) ? ' ' : '1', + SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? + ' ': '2', + (ackskew >= -MAXACKWINDOW) ? ' ' : '3', + (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', + SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', + SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' 
' :'6');
+		}
+		REASON_SET(reason, PFRES_BADSTATE);
+		return (PF_DROP);
+	}
+
+	return (PF_PASS);
+}
+
+static int
+pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
+    struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
+{
+	struct tcphdr *th = pd->hdr.tcp;
+
+	if (th->th_flags & TH_SYN)
+		if (src->state < TCPS_SYN_SENT)
+			src->state = TCPS_SYN_SENT;
+	if (th->th_flags & TH_FIN)
+		if (src->state < TCPS_CLOSING)
+			src->state = TCPS_CLOSING;
+	if (th->th_flags & TH_ACK) {
+		if (dst->state == TCPS_SYN_SENT) {
+			dst->state = TCPS_ESTABLISHED;
+			if (src->state == TCPS_ESTABLISHED &&
+			    (*state)->src_node != NULL &&
+			    pf_src_connlimit(state)) {
+				REASON_SET(reason, PFRES_SRCLIMIT);
+				return (PF_DROP);
+			}
+		} else if (dst->state == TCPS_CLOSING) {
+			dst->state = TCPS_FIN_WAIT_2;
+		} else if (src->state == TCPS_SYN_SENT &&
+		    dst->state < TCPS_SYN_SENT) {
+			/*
+			 * Handle a special sloppy case where we only see one
+			 * half of the connection. If there is an ACK after
+			 * the initial SYN without ever seeing a packet from
+			 * the destination, set the connection to established.
+			 */
+			dst->state = src->state = TCPS_ESTABLISHED;
+			if ((*state)->src_node != NULL &&
+			    pf_src_connlimit(state)) {
+				REASON_SET(reason, PFRES_SRCLIMIT);
+				return (PF_DROP);
+			}
+		} else if (src->state == TCPS_CLOSING &&
+		    dst->state == TCPS_ESTABLISHED &&
+		    dst->seqlo == 0) {
+			/*
+			 * Handle the closing of half connections where we
+			 * don't see the full bidirectional FIN/ACK+ACK
+			 * handshake.
+			 */
+			dst->state = TCPS_CLOSING;
+		}
+	}
+	if (th->th_flags & TH_RST)
+		src->state = dst->state = TCPS_TIME_WAIT;
+
+	/* update expire time */
+	(*state)->expire = time_uptime;
+	if (src->state >= TCPS_FIN_WAIT_2 &&
+	    dst->state >= TCPS_FIN_WAIT_2)
+		(*state)->timeout = PFTM_TCP_CLOSED;
+	else if (src->state >= TCPS_CLOSING &&
+	    dst->state >= TCPS_CLOSING)
+		(*state)->timeout = PFTM_TCP_FIN_WAIT;
+	else if (src->state < TCPS_ESTABLISHED ||
+	    dst->state < TCPS_ESTABLISHED)
+		(*state)->timeout = PFTM_TCP_OPENING;
+	else if (src->state >= TCPS_CLOSING ||
+	    dst->state >= TCPS_CLOSING)
+		(*state)->timeout = PFTM_TCP_CLOSING;
+	else
+		(*state)->timeout = PFTM_TCP_ESTABLISHED;
+
+	return (PF_PASS);
+}
+
+static int
+pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
+    struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
+    u_short *reason)
+{
+	struct pf_state_key_cmp	 key;
+	struct tcphdr		*th = pd->hdr.tcp;
+	int			 copyback = 0;
+	struct pf_state_peer	*src, *dst;
+	struct pf_state_key	*sk;
+
+	bzero(&key, sizeof(key));
+	key.af = pd->af;
+	key.proto = IPPROTO_TCP;
+	if (direction == PF_IN)	{	/* wire side, straight */
+		PF_ACPY(&key.addr[0], pd->src, key.af);
+		PF_ACPY(&key.addr[1], pd->dst, key.af);
+		key.port[0] = th->th_sport;
+		key.port[1] = th->th_dport;
+	} else {			/* stack side, reverse */
+		PF_ACPY(&key.addr[1], pd->src, key.af);
+		PF_ACPY(&key.addr[0], pd->dst, key.af);
+		key.port[1] = th->th_sport;
+		key.port[0] = th->th_dport;
+	}
+
+	STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+	if (direction == (*state)->direction) {
+		src = &(*state)->src;
+		dst = &(*state)->dst;
+	} else {
+		src = &(*state)->dst;
+		dst = &(*state)->src;
+	}
+
+	sk = (*state)->key[pd->didx];
+
+	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
+		if (direction != (*state)->direction) {
+			REASON_SET(reason, PFRES_SYNPROXY);
+			return (PF_SYNPROXY_DROP);
+		}
+		if (th->th_flags & TH_SYN) {
+			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
+				REASON_SET(reason, PFRES_SYNPROXY);
+				return (PF_DROP);
+			}
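+			/*
+			 * First leg of the synproxy handshake: answer the
+			 * client's SYN ourselves with a SYN|ACK carrying
+			 * our own ISN (src.seqhi); the destination is not
+			 * contacted until the client's ACK proves it is
+			 * live.
+			 */
+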
pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, + pd->src, th->th_dport, th->th_sport, + (*state)->src.seqhi, ntohl(th->th_seq) + 1, + TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (!(th->th_flags & TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else if ((*state)->src_node != NULL && + pf_src_connlimit(state)) { + REASON_SET(reason, PFRES_SRCLIMIT); + return (PF_DROP); + } else + (*state)->src.state = PF_TCPS_PROXY_DST; + } + if ((*state)->src.state == PF_TCPS_PROXY_DST) { + if (direction == (*state)->direction) { + if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || + (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || + (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } + (*state)->src.max_win = MAX(ntohs(th->th_win), 1); + if ((*state)->dst.seqhi == 1) + (*state)->dst.seqhi = htonl(arc4random()); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &sk->addr[pd->sidx], &sk->addr[pd->didx], + sk->port[pd->sidx], sk->port[pd->didx], + (*state)->dst.seqhi, 0, TH_SYN, 0, + (*state)->src.mss, 0, 0, (*state)->tag, NULL); + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } else if (((th->th_flags & (TH_SYN|TH_ACK)) != + (TH_SYN|TH_ACK)) || + (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_DROP); + } else { + (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); + (*state)->dst.seqlo = ntohl(th->th_seq); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, + pd->src, th->th_dport, th->th_sport, + ntohl(th->th_ack), ntohl(th->th_seq) + 1, + TH_ACK, (*state)->src.max_win, 0, 0, 0, + (*state)->tag, NULL); + pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, + &sk->addr[pd->sidx], &sk->addr[pd->didx], + sk->port[pd->sidx], sk->port[pd->didx], + (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, + TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL); + (*state)->src.seqdiff = (*state)->dst.seqhi - + (*state)->src.seqlo; + (*state)->dst.seqdiff = (*state)->src.seqhi - + (*state)->dst.seqlo; + (*state)->src.seqhi = (*state)->src.seqlo + + (*state)->dst.max_win; + (*state)->dst.seqhi = (*state)->dst.seqlo + + (*state)->src.max_win; + (*state)->src.wscale = (*state)->dst.wscale = 0; + (*state)->src.state = (*state)->dst.state = + TCPS_ESTABLISHED; + REASON_SET(reason, PFRES_SYNPROXY); + return (PF_SYNPROXY_DROP); + } + } + + if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && + dst->state >= TCPS_FIN_WAIT_2 && + src->state >= TCPS_FIN_WAIT_2) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: state reuse "); + pf_print_state(*state); + pf_print_flags(th->th_flags); + printf("\n"); + } + /* XXX make sure it's the same direction ?? 
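+		 * A fresh SYN on an already closed state: tear the old
+		 * state down now and drop the packet, so that the
+		 * retransmitted SYN can create a new state through the
+		 * ruleset.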
		 */
+		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+		pf_unlink_state(*state, PF_ENTER_LOCKED);
+		*state = NULL;
+		return (PF_DROP);
+	}
+
+	if ((*state)->state_flags & PFSTATE_SLOPPY) {
+		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
+			return (PF_DROP);
+	} else {
+		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
+		    &copyback) == PF_DROP)
+			return (PF_DROP);
+	}
+
+	/* translate source/destination address, if necessary */
+	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+		struct pf_state_key *nk = (*state)->key[pd->didx];
+
+		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+		    nk->port[pd->sidx] != th->th_sport)
+			pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
+			    &th->th_sum, &nk->addr[pd->sidx],
+			    nk->port[pd->sidx], 0, pd->af);
+
+		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+		    nk->port[pd->didx] != th->th_dport)
+			pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
+			    &th->th_sum, &nk->addr[pd->didx],
+			    nk->port[pd->didx], 0, pd->af);
+		copyback = 1;
+	}
+
+	/* Copyback sequence modulation or stateful scrub changes if needed */
+	if (copyback)
+		m_copyback(m, off, sizeof(*th), (caddr_t)th);
+
+	return (PF_PASS);
+}
+
+static int
+pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
+    struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
+{
+	struct pf_state_peer	*src, *dst;
+	struct pf_state_key_cmp	 key;
+	struct udphdr		*uh = pd->hdr.udp;
+
+	bzero(&key, sizeof(key));
+	key.af = pd->af;
+	key.proto = IPPROTO_UDP;
+	if (direction == PF_IN)	{	/* wire side, straight */
+		PF_ACPY(&key.addr[0], pd->src, key.af);
+		PF_ACPY(&key.addr[1], pd->dst, key.af);
+		key.port[0] = uh->uh_sport;
+		key.port[1] = uh->uh_dport;
+	} else {			/* stack side, reverse */
+		PF_ACPY(&key.addr[1], pd->src, key.af);
+		PF_ACPY(&key.addr[0], pd->dst, key.af);
+		key.port[1] = uh->uh_sport;
+		key.port[0] = uh->uh_dport;
+	}
+
+	STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+	if (direction == (*state)->direction) {
+		src = &(*state)->src;
+		dst = &(*state)->dst;
+	} else {
+		src = &(*state)->dst;
+		dst = &(*state)->src;
+	}
+
+	/* update states */
+	if (src->state < PFUDPS_SINGLE)
+		src->state = PFUDPS_SINGLE;
+	if (dst->state == PFUDPS_SINGLE)
+		dst->state = PFUDPS_MULTIPLE;
+
+	/* update expire time */
+	(*state)->expire = time_uptime;
+	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
+		(*state)->timeout = PFTM_UDP_MULTIPLE;
+	else
+		(*state)->timeout = PFTM_UDP_SINGLE;
+
+	/* translate source/destination address, if necessary */
+	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+		struct pf_state_key *nk = (*state)->key[pd->didx];
+
+		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+		    nk->port[pd->sidx] != uh->uh_sport)
+			pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
+			    &uh->uh_sum, &nk->addr[pd->sidx],
+			    nk->port[pd->sidx], 1, pd->af);
+
+		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+		    nk->port[pd->didx] != uh->uh_dport)
+			pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
+			    &uh->uh_sum, &nk->addr[pd->didx],
+			    nk->port[pd->didx], 1, pd->af);
+		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
+	}
+
+	return (PF_PASS);
+}
+
+static int
+pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
+    struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
+{
+	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
+	u_int16_t	 icmpid = 0, *icmpsum;
+	u_int8_t	 icmptype;
+	int		 state_icmp = 0;
+	struct pf_state_key_cmp key;
+
+	bzero(&key, sizeof(key));
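+
+	/*
+	 * Classify the message first: ICMP queries and replies carry
+	 * their own id and match an ICMP state directly, while error
+	 * messages (unreachable, time exceeded, ...) quote the packet
+	 * that triggered them and are matched against the state of the
+	 * embedded TCP/UDP/ICMP payload instead.
+	 */
+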
switch (pd->proto) { +#ifdef INET + case IPPROTO_ICMP: + icmptype = pd->hdr.icmp->icmp_type; + icmpid = pd->hdr.icmp->icmp_id; + icmpsum = &pd->hdr.icmp->icmp_cksum; + + if (icmptype == ICMP_UNREACH || + icmptype == ICMP_SOURCEQUENCH || + icmptype == ICMP_REDIRECT || + icmptype == ICMP_TIMXCEED || + icmptype == ICMP_PARAMPROB) + state_icmp++; + break; +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: + icmptype = pd->hdr.icmp6->icmp6_type; + icmpid = pd->hdr.icmp6->icmp6_id; + icmpsum = &pd->hdr.icmp6->icmp6_cksum; + + if (icmptype == ICMP6_DST_UNREACH || + icmptype == ICMP6_PACKET_TOO_BIG || + icmptype == ICMP6_TIME_EXCEEDED || + icmptype == ICMP6_PARAM_PROB) + state_icmp++; + break; +#endif /* INET6 */ + } + + if (!state_icmp) { + + /* + * ICMP query/reply message not related to a TCP/UDP packet. + * Search for an ICMP state. + */ + key.af = pd->af; + key.proto = pd->proto; + key.port[0] = key.port[1] = icmpid; + if (direction == PF_IN) { /* wire side, straight */ + PF_ACPY(&key.addr[0], pd->src, key.af); + PF_ACPY(&key.addr[1], pd->dst, key.af); + } else { /* stack side, reverse */ + PF_ACPY(&key.addr[1], pd->src, key.af); + PF_ACPY(&key.addr[0], pd->dst, key.af); + } + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + (*state)->expire = time_uptime; + (*state)->timeout = PFTM_ICMP_ERROR_REPLY; + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = (*state)->key[pd->didx]; + + switch (pd->af) { +#ifdef INET + case AF_INET: + if (PF_ANEQ(pd->src, + &nk->addr[pd->sidx], AF_INET)) + pf_change_a(&saddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->sidx].v4.s_addr, 0); + + if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], + AF_INET)) + pf_change_a(&daddr->v4.s_addr, + pd->ip_sum, + nk->addr[pd->didx].v4.s_addr, 0); + + if (nk->port[0] != + pd->hdr.icmp->icmp_id) { + pd->hdr.icmp->icmp_cksum = + pf_cksum_fixup( + pd->hdr.icmp->icmp_cksum, icmpid, + nk->port[pd->sidx], 0); + pd->hdr.icmp->icmp_id = + nk->port[pd->sidx]; + } + + m_copyback(m, off, ICMP_MINLEN, + (caddr_t )pd->hdr.icmp); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (PF_ANEQ(pd->src, + &nk->addr[pd->sidx], AF_INET6)) + pf_change_a6(saddr, + &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->sidx], 0); + + if (PF_ANEQ(pd->dst, + &nk->addr[pd->didx], AF_INET6)) + pf_change_a6(daddr, + &pd->hdr.icmp6->icmp6_cksum, + &nk->addr[pd->didx], 0); + + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t )pd->hdr.icmp6); + break; +#endif /* INET6 */ + } + } + return (PF_PASS); + + } else { + /* + * ICMP error message in response to a TCP/UDP packet. + * Extract the inner TCP/UDP header and search for that state. + */ + + struct pf_pdesc pd2; + bzero(&pd2, sizeof pd2); +#ifdef INET + struct ip h2; +#endif /* INET */ +#ifdef INET6 + struct ip6_hdr h2_6; + int terminal = 0; +#endif /* INET6 */ + int ipoff2 = 0; + int off2 = 0; + + pd2.af = pd->af; + /* Payload packet is from the opposite direction. */ + pd2.sidx = (direction == PF_IN) ? 1 : 0; + pd2.didx = (direction == PF_IN) ? 
0 : 1; + switch (pd->af) { +#ifdef INET + case AF_INET: + /* offset of h2 in mbuf chain */ + ipoff2 = off + ICMP_MINLEN; + + if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip)\n")); + return (PF_DROP); + } + /* + * ICMP error messages don't refer to non-first + * fragments + */ + if (h2.ip_off & htons(IP_OFFMASK)) { + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + } + + /* offset of protocol header that follows h2 */ + off2 = ipoff2 + (h2.ip_hl << 2); + + pd2.proto = h2.ip_p; + pd2.src = (struct pf_addr *)&h2.ip_src; + pd2.dst = (struct pf_addr *)&h2.ip_dst; + pd2.ip_sum = &h2.ip_sum; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + ipoff2 = off + sizeof(struct icmp6_hdr); + + if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), + NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(ip6)\n")); + return (PF_DROP); + } + pd2.proto = h2_6.ip6_nxt; + pd2.src = (struct pf_addr *)&h2_6.ip6_src; + pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; + pd2.ip_sum = NULL; + off2 = ipoff2 + sizeof(h2_6); + do { + switch (pd2.proto) { + case IPPROTO_FRAGMENT: + /* + * ICMPv6 error messages for + * non-first fragments + */ + REASON_SET(reason, PFRES_FRAG); + return (PF_DROP); + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off2, &opt6, + sizeof(opt6), NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMPv6 short opt\n")); + return (PF_DROP); + } + if (pd2.proto == IPPROTO_AH) + off2 += (opt6.ip6e_len + 2) * 4; + else + off2 += (opt6.ip6e_len + 1) * 8; + pd2.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + break; +#endif /* INET6 */ + } + + switch (pd2.proto) { + case IPPROTO_TCP: { + struct tcphdr th; + u_int32_t seq; + struct pf_state_peer *src, *dst; + u_int8_t dws; + int copyback = 0; + + /* + * Only the first 8 bytes of the TCP header can be + * expected. Don't access any TCP header fields after + * th_seq, an ackskew test is not possible. 
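+			 * (RFC 792 guarantees only that the IP header plus
+			 * the first 8 octets of the offending datagram are
+			 * quoted; that covers the TCP ports and sequence
+			 * number, but the acknowledgment field may be
+			 * missing.)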
+ */ + if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, + pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(tcp)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_TCP; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[pd2.sidx] = th.th_sport; + key.port[pd2.didx] = th.th_dport; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + if (direction == (*state)->direction) { + src = &(*state)->dst; + dst = &(*state)->src; + } else { + src = &(*state)->src; + dst = &(*state)->dst; + } + + if (src->wscale && dst->wscale) + dws = dst->wscale & PF_WSCALE_MASK; + else + dws = 0; + + /* Demodulate sequence number */ + seq = ntohl(th.th_seq) - src->seqdiff; + if (src->seqdiff) { + pf_change_a(&th.th_seq, icmpsum, + htonl(seq), 0); + copyback = 1; + } + + if (!((*state)->state_flags & PFSTATE_SLOPPY) && + (!SEQ_GEQ(src->seqhi, seq) || + !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: BAD ICMP %d:%d ", + icmptype, pd->hdr.icmp->icmp_code); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" state: "); + pf_print_state(*state); + printf(" seq=%u\n", seq); + } + REASON_SET(reason, PFRES_BADSTATE); + return (PF_DROP); + } else { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf: OK ICMP %d:%d ", + icmptype, pd->hdr.icmp->icmp_code); + pf_print_host(pd->src, 0, pd->af); + printf(" -> "); + pf_print_host(pd->dst, 0, pd->af); + printf(" state: "); + pf_print_state(*state); + printf(" seq=%u\n", seq); + } + } + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], pd2.af) || + nk->port[pd2.sidx] != th.th_sport) + pf_change_icmp(pd2.src, &th.th_sport, + daddr, &nk->addr[pd2.sidx], + nk->port[pd2.sidx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + + if (PF_ANEQ(pd2.dst, + &nk->addr[pd2.didx], pd2.af) || + nk->port[pd2.didx] != th.th_dport) + pf_change_icmp(pd2.dst, &th.th_dport, + NULL, /* XXX Inbound NAT? 
 */
+				    &nk->addr[pd2.didx],
+				    nk->port[pd2.didx], NULL,
+				    pd2.ip_sum, icmpsum,
+				    pd->ip_sum, 0, pd2.af);
+				copyback = 1;
+			}
+
+			if (copyback) {
+				switch (pd2.af) {
+#ifdef INET
+				case AF_INET:
+					m_copyback(m, off, ICMP_MINLEN,
+					    (caddr_t)pd->hdr.icmp);
+					m_copyback(m, ipoff2, sizeof(h2),
+					    (caddr_t)&h2);
+					break;
+#endif /* INET */
+#ifdef INET6
+				case AF_INET6:
+					m_copyback(m, off,
+					    sizeof(struct icmp6_hdr),
+					    (caddr_t)pd->hdr.icmp6);
+					m_copyback(m, ipoff2, sizeof(h2_6),
+					    (caddr_t)&h2_6);
+					break;
+#endif /* INET6 */
+				}
+				m_copyback(m, off2, 8, (caddr_t)&th);
+			}
+
+			return (PF_PASS);
+			break;
+		}
+		case IPPROTO_UDP: {
+			struct udphdr uh;
+
+			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
+			    NULL, reason, pd2.af)) {
+				DPFPRINTF(PF_DEBUG_MISC,
+				    ("pf: ICMP error message too short "
+				    "(udp)\n"));
+				return (PF_DROP);
+			}
+
+			key.af = pd2.af;
+			key.proto = IPPROTO_UDP;
+			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+			key.port[pd2.sidx] = uh.uh_sport;
+			key.port[pd2.didx] = uh.uh_dport;
+
+			STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+			/* translate source/destination address, if necessary */
+			if ((*state)->key[PF_SK_WIRE] !=
+			    (*state)->key[PF_SK_STACK]) {
+				struct pf_state_key *nk =
+				    (*state)->key[pd->didx];
+
+				if (PF_ANEQ(pd2.src,
+				    &nk->addr[pd2.sidx], pd2.af) ||
+				    nk->port[pd2.sidx] != uh.uh_sport)
+					pf_change_icmp(pd2.src, &uh.uh_sport,
+					    daddr, &nk->addr[pd2.sidx],
+					    nk->port[pd2.sidx], &uh.uh_sum,
+					    pd2.ip_sum, icmpsum,
+					    pd->ip_sum, 1, pd2.af);
+
+				if (PF_ANEQ(pd2.dst,
+				    &nk->addr[pd2.didx], pd2.af) ||
+				    nk->port[pd2.didx] != uh.uh_dport)
+					pf_change_icmp(pd2.dst, &uh.uh_dport,
+					    NULL, /* XXX Inbound NAT? */
+					    &nk->addr[pd2.didx],
+					    nk->port[pd2.didx], &uh.uh_sum,
+					    pd2.ip_sum, icmpsum,
+					    pd->ip_sum, 1, pd2.af);
+
+				switch (pd2.af) {
+#ifdef INET
+				case AF_INET:
+					m_copyback(m, off, ICMP_MINLEN,
+					    (caddr_t)pd->hdr.icmp);
+					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+					break;
+#endif /* INET */
+#ifdef INET6
+				case AF_INET6:
+					m_copyback(m, off,
+					    sizeof(struct icmp6_hdr),
+					    (caddr_t)pd->hdr.icmp6);
+					m_copyback(m, ipoff2, sizeof(h2_6),
+					    (caddr_t)&h2_6);
+					break;
+#endif /* INET6 */
+				}
+				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
+			}
+			return (PF_PASS);
+			break;
+		}
+#ifdef INET
+		case IPPROTO_ICMP: {
+			struct icmp		iih;
+
+			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
+			    NULL, reason, pd2.af)) {
+				DPFPRINTF(PF_DEBUG_MISC,
+				    ("pf: ICMP error message too short "
+				    "(icmp)\n"));
+				return (PF_DROP);
+			}
+
+			key.af = pd2.af;
+			key.proto = IPPROTO_ICMP;
+			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+			key.port[0] = key.port[1] = iih.icmp_id;
+
+			STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+			/* translate source/destination address, if necessary */
+			if ((*state)->key[PF_SK_WIRE] !=
+			    (*state)->key[PF_SK_STACK]) {
+				struct pf_state_key *nk =
+				    (*state)->key[pd->didx];
+
+				if (PF_ANEQ(pd2.src,
+				    &nk->addr[pd2.sidx], pd2.af) ||
+				    nk->port[pd2.sidx] != iih.icmp_id)
+					pf_change_icmp(pd2.src, &iih.icmp_id,
+					    daddr, &nk->addr[pd2.sidx],
+					    nk->port[pd2.sidx], NULL,
+					    pd2.ip_sum, icmpsum,
+					    pd->ip_sum, 0, AF_INET);
+
+				if (PF_ANEQ(pd2.dst,
+				    &nk->addr[pd2.didx], pd2.af) ||
+				    nk->port[pd2.didx] != iih.icmp_id)
+					pf_change_icmp(pd2.dst, &iih.icmp_id,
+					    NULL, /* XXX Inbound NAT?
*/ + &nk->addr[pd2.didx], + nk->port[pd2.didx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET); + + m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); + m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); + m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); + } + return (PF_PASS); + break; + } +#endif /* INET */ +#ifdef INET6 + case IPPROTO_ICMPV6: { + struct icmp6_hdr iih; + + if (!pf_pull_hdr(m, off2, &iih, + sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: ICMP error message too short " + "(icmp6)\n")); + return (PF_DROP); + } + + key.af = pd2.af; + key.proto = IPPROTO_ICMPV6; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[0] = key.port[1] = iih.icmp6_id; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], pd2.af) || + nk->port[pd2.sidx] != iih.icmp6_id) + pf_change_icmp(pd2.src, &iih.icmp6_id, + daddr, &nk->addr[pd2.sidx], + nk->port[pd2.sidx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET6); + + if (PF_ANEQ(pd2.dst, + &nk->addr[pd2.didx], pd2.af) || + nk->port[pd2.didx] != iih.icmp6_id) + pf_change_icmp(pd2.dst, &iih.icmp6_id, + NULL, /* XXX Inbound NAT? */ + &nk->addr[pd2.didx], + nk->port[pd2.didx], NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, AF_INET6); + + m_copyback(m, off, sizeof(struct icmp6_hdr), + (caddr_t)pd->hdr.icmp6); + m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); + m_copyback(m, off2, sizeof(struct icmp6_hdr), + (caddr_t)&iih); + } + return (PF_PASS); + break; + } +#endif /* INET6 */ + default: { + key.af = pd2.af; + key.proto = pd2.proto; + PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); + PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); + key.port[0] = key.port[1] = 0; + + STATE_LOOKUP(kif, &key, direction, *state, pd); + + /* translate source/destination address, if necessary */ + if ((*state)->key[PF_SK_WIRE] != + (*state)->key[PF_SK_STACK]) { + struct pf_state_key *nk = + (*state)->key[pd->didx]; + + if (PF_ANEQ(pd2.src, + &nk->addr[pd2.sidx], pd2.af)) + pf_change_icmp(pd2.src, NULL, daddr, + &nk->addr[pd2.sidx], 0, NULL, + pd2.ip_sum, icmpsum, + pd->ip_sum, 0, pd2.af); + + if (PF_ANEQ(pd2.dst, + &nk->addr[pd2.didx], pd2.af)) + pf_change_icmp(pd2.src, NULL, + NULL, /* XXX Inbound NAT? 
 */
+				    &nk->addr[pd2.didx], 0, NULL,
+				    pd2.ip_sum, icmpsum,
+				    pd->ip_sum, 0, pd2.af);
+
+				switch (pd2.af) {
+#ifdef INET
+				case AF_INET:
+					m_copyback(m, off, ICMP_MINLEN,
+					    (caddr_t)pd->hdr.icmp);
+					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+					break;
+#endif /* INET */
+#ifdef INET6
+				case AF_INET6:
+					m_copyback(m, off,
+					    sizeof(struct icmp6_hdr),
+					    (caddr_t)pd->hdr.icmp6);
+					m_copyback(m, ipoff2, sizeof(h2_6),
+					    (caddr_t)&h2_6);
+					break;
+#endif /* INET6 */
+				}
+			}
+			return (PF_PASS);
+			break;
+		}
+		}
+	}
+}
+
+static int
+pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
+    struct mbuf *m, struct pf_pdesc *pd)
+{
+	struct pf_state_peer	*src, *dst;
+	struct pf_state_key_cmp	 key;
+
+	bzero(&key, sizeof(key));
+	key.af = pd->af;
+	key.proto = pd->proto;
+	if (direction == PF_IN)	{
+		PF_ACPY(&key.addr[0], pd->src, key.af);
+		PF_ACPY(&key.addr[1], pd->dst, key.af);
+		key.port[0] = key.port[1] = 0;
+	} else {
+		PF_ACPY(&key.addr[1], pd->src, key.af);
+		PF_ACPY(&key.addr[0], pd->dst, key.af);
+		key.port[1] = key.port[0] = 0;
+	}
+
+	STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+	if (direction == (*state)->direction) {
+		src = &(*state)->src;
+		dst = &(*state)->dst;
+	} else {
+		src = &(*state)->dst;
+		dst = &(*state)->src;
+	}
+
+	/* update states */
+	if (src->state < PFOTHERS_SINGLE)
+		src->state = PFOTHERS_SINGLE;
+	if (dst->state == PFOTHERS_SINGLE)
+		dst->state = PFOTHERS_MULTIPLE;
+
+	/* update expire time */
+	(*state)->expire = time_uptime;
+	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
+		(*state)->timeout = PFTM_OTHER_MULTIPLE;
+	else
+		(*state)->timeout = PFTM_OTHER_SINGLE;
+
+	/* translate source/destination address, if necessary */
+	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+		struct pf_state_key *nk = (*state)->key[pd->didx];
+
+		KASSERT(nk, ("%s: nk is null", __func__));
+		KASSERT(pd, ("%s: pd is null", __func__));
+		KASSERT(pd->src, ("%s: pd->src is null", __func__));
+		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
+		switch (pd->af) {
+#ifdef INET
+		case AF_INET:
+			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
+				pf_change_a(&pd->src->v4.s_addr,
+				    pd->ip_sum,
+				    nk->addr[pd->sidx].v4.s_addr,
+				    0);
+
+			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
+				pf_change_a(&pd->dst->v4.s_addr,
+				    pd->ip_sum,
+				    nk->addr[pd->didx].v4.s_addr,
+				    0);
+
+			break;
+#endif /* INET */
+#ifdef INET6
+		case AF_INET6:
+			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
+				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
+
+			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
+				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
+#endif /* INET6 */
+		}
+	}
+	return (PF_PASS);
+}
+
+/*
+ * ipoff and off are measured from the start of the mbuf chain.
+ * h must be at "ipoff" on the mbuf chain.
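+ * On success, the requested header is copied into the caller-supplied
+ * buffer p and p is returned; on failure, NULL is returned and
+ * actionp/reasonp are set. A non-first fragment whose offset lies past
+ * the requested header is passed rather than dropped, since only the
+ * first fragment can carry it.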
+ */ +void * +pf_pull_hdr(struct mbuf *m, int off, void *p, int len, + u_short *actionp, u_short *reasonp, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, struct ip *); + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + if (fragoff) { + if (fragoff >= len) + ACTION_SET(actionp, PF_PASS); + else { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_FRAG); + } + return (NULL); + } + if (m->m_pkthdr.len < off + len || + ntohs(h->ip_len) < off + len) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + + if (m->m_pkthdr.len < off + len || + (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < + (unsigned)(off + len)) { + ACTION_SET(actionp, PF_DROP); + REASON_SET(reasonp, PFRES_SHORT); + return (NULL); + } + break; + } +#endif /* INET6 */ + } + m_copydata(m, off, len, p); + return (p); +} + +int +pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, + int rtableid) +{ +#ifdef RADIX_MPATH + struct radix_node_head *rnh; +#endif + struct sockaddr_in *dst; + int ret = 1; + int check_mpath; +#ifdef INET6 + struct sockaddr_in6 *dst6; + struct route_in6 ro; +#else + struct route ro; +#endif + struct radix_node *rn; + struct rtentry *rt; + struct ifnet *ifp; + + check_mpath = 0; +#ifdef RADIX_MPATH + /* XXX: stick to table 0 for now */ + rnh = rt_tables_get_rnh(0, af); + if (rnh != NULL && rn_mpath_capable(rnh)) + check_mpath = 1; +#endif + bzero(&ro, sizeof(ro)); + switch (af) { + case AF_INET: + dst = satosin(&ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = addr->v4; + break; +#ifdef INET6 + case AF_INET6: + /* + * Skip check for addresses with embedded interface scope, + * as they would always match anyway. + */ + if (IN6_IS_SCOPE_EMBED(&addr->v6)) + goto out; + dst6 = (struct sockaddr_in6 *)&ro.ro_dst; + dst6->sin6_family = AF_INET6; + dst6->sin6_len = sizeof(*dst6); + dst6->sin6_addr = addr->v6; + break; +#endif /* INET6 */ + default: + return (0); + } + + /* Skip checks for ipsec interfaces */ + if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) + goto out; + + switch (af) { +#ifdef INET6 + case AF_INET6: + in6_rtalloc_ign(&ro, 0, rtableid); + break; +#endif +#ifdef INET + case AF_INET: + in_rtalloc_ign((struct route *)&ro, 0, rtableid); + break; +#endif + default: + rtalloc_ign((struct route *)&ro, 0); /* No/default FIB. 
 */
+		break;
+	}
+
+	if (ro.ro_rt != NULL) {
+		/* No interface given, this is a no-route check */
+		if (kif == NULL)
+			goto out;
+
+		if (kif->pfik_ifp == NULL) {
+			ret = 0;
+			goto out;
+		}
+
+		/* Perform uRPF check if passed input interface */
+		ret = 0;
+		rn = (struct radix_node *)ro.ro_rt;
+		do {
+			rt = (struct rtentry *)rn;
+			ifp = rt->rt_ifp;
+
+			if (kif->pfik_ifp == ifp)
+				ret = 1;
+#ifdef RADIX_MPATH
+			rn = rn_mpath_next(rn);
+#endif
+		} while (check_mpath == 1 && rn != NULL && ret == 0);
+	} else
+		ret = 0;
+out:
+	if (ro.ro_rt != NULL)
+		RTFREE(ro.ro_rt);
+	return (ret);
+}
+
+#ifdef INET
+static void
+pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
+    struct pf_state *s, struct pf_pdesc *pd)
+{
+	struct mbuf		*m0, *m1;
+	struct sockaddr_in	 dst;
+	struct ip		*ip;
+	struct ifnet		*ifp = NULL;
+	struct pf_addr		 naddr;
+	struct pf_src_node	*sn = NULL;
+	int			 error = 0;
+	int			 sw_csum;
+
+	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
+	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
+	    __func__));
+
+	if ((pd->pf_mtag == NULL &&
+	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
+	    pd->pf_mtag->routed++ > 3) {
+		m0 = *m;
+		*m = NULL;
+		goto bad_locked;
+	}
+
+	if (r->rt == PF_DUPTO) {
+		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
+			if (s)
+				PF_STATE_UNLOCK(s);
+			return;
+		}
+	} else {
+		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
+			if (s)
+				PF_STATE_UNLOCK(s);
+			return;
+		}
+		m0 = *m;
+	}
+
+	ip = mtod(m0, struct ip *);
+
+	bzero(&dst, sizeof(dst));
+	dst.sin_family = AF_INET;
+	dst.sin_len = sizeof(dst);
+	dst.sin_addr = ip->ip_dst;
+
+	if (r->rt == PF_FASTROUTE) {
+		struct rtentry *rt;
+
+		if (s)
+			PF_STATE_UNLOCK(s);
+		rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
+		if (rt == NULL) {
+			KMOD_IPSTAT_INC(ips_noroute);
+			error = EHOSTUNREACH;
+			goto bad;
+		}
+
+		ifp = rt->rt_ifp;
+		rt->rt_rmx.rmx_pksent++;
+
+		if (rt->rt_flags & RTF_GATEWAY)
+			bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
+		RTFREE_LOCKED(rt);
+	} else {
+		if (TAILQ_EMPTY(&r->rpool.list)) {
+			DPFPRINTF(PF_DEBUG_URGENT,
+			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+			goto bad_locked;
+		}
+		if (s == NULL) {
+			pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
+			    &naddr, NULL, &sn);
+			if (!PF_AZERO(&naddr, AF_INET))
+				dst.sin_addr.s_addr = naddr.v4.s_addr;
+			ifp = r->rpool.cur->kif ?
+			    r->rpool.cur->kif->pfik_ifp : NULL;
+		} else {
+			if (!PF_AZERO(&s->rt_addr, AF_INET))
+				dst.sin_addr.s_addr =
+				    s->rt_addr.v4.s_addr;
+			ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+			PF_STATE_UNLOCK(s);
+		}
+	}
+	if (ifp == NULL)
+		goto bad;
+
+	if (oifp != ifp) {
+		if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+			goto bad;
+		else if (m0 == NULL)
+			goto done;
+		if (m0->m_len < sizeof(struct ip)) {
+			DPFPRINTF(PF_DEBUG_URGENT,
+			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
+			goto bad;
+		}
+		ip = mtod(m0, struct ip *);
+	}
+
+	if (ifp->if_flags & IFF_LOOPBACK)
+		m0->m_flags |= M_SKIP_FIREWALL;
+
+	/* Back to host byte order. */
+	ip->ip_len = ntohs(ip->ip_len);
+	ip->ip_off = ntohs(ip->ip_off);
+
+	/* Copied from FreeBSD 10.0-CURRENT ip_output.
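+	 * Finalize whatever checksums the outgoing interface cannot
+	 * offload, then transmit directly if the packet fits the MTU
+	 * (or the hardware fragments/TSOs for us), and fragment it in
+	 * software otherwise.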
*/ + m0->m_pkthdr.csum_flags |= CSUM_IP; + sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist; + if (sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m0); + sw_csum &= ~CSUM_DELAY_DATA; + } +#ifdef SCTP + if (sw_csum & CSUM_SCTP) { + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + sw_csum &= ~CSUM_SCTP; + } +#endif + m0->m_pkthdr.csum_flags &= ifp->if_hwassist; + + /* + * If small enough for interface, or the interface will take + * care of the fragmentation for us, we can just send directly. + */ + if (ip->ip_len <= ifp->if_mtu || + (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || + ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) + ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); + m0->m_flags &= ~(M_PROTOFLAGS); + error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); + goto done; + } + + /* Balk when DF bit is set or the interface didn't support TSO. */ + if ((ip->ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { + error = EMSGSIZE; + KMOD_IPSTAT_INC(ips_cantfrag); + if (r->rt != PF_DUPTO) { + icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, + ifp->if_mtu); + goto done; + } else + goto bad; + } + + error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum); + if (error) + goto bad; + + for (; m0; m0 = m1) { + m1 = m0->m_nextpkt; + m0->m_nextpkt = NULL; + if (error == 0) { + m0->m_flags &= ~(M_PROTOFLAGS); + error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); + } else + m_freem(m0); + } + + if (error == 0) + KMOD_IPSTAT_INC(ips_fragmented); + +done: + if (r->rt != PF_DUPTO) + *m = NULL; + return; + +bad_locked: + if (s) + PF_STATE_UNLOCK(s); +bad: + m_freem(m0); + goto done; +} +#endif /* INET */ + +#ifdef INET6 +static void +pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, + struct pf_state *s, struct pf_pdesc *pd) +{ + struct mbuf *m0; + struct sockaddr_in6 dst; + struct ip6_hdr *ip6; + struct ifnet *ifp = NULL; + struct pf_addr naddr; + struct pf_src_node *sn = NULL; + + KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); + KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", + __func__)); + + if ((pd->pf_mtag == NULL && + ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || + pd->pf_mtag->routed++ > 3) { + m0 = *m; + *m = NULL; + goto bad_locked; + } + + if (r->rt == PF_DUPTO) { + if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + } else { + if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { + if (s) + PF_STATE_UNLOCK(s); + return; + } + m0 = *m; + } + + ip6 = mtod(m0, struct ip6_hdr *); + + bzero(&dst, sizeof(dst)); + dst.sin6_family = AF_INET6; + dst.sin6_len = sizeof(dst); + dst.sin6_addr = ip6->ip6_dst; + + /* Cheat. XXX why only in the v6 case??? */ + if (r->rt == PF_FASTROUTE) { + if (s) + PF_STATE_UNLOCK(s); + m0->m_flags |= M_SKIP_FIREWALL; + ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); + return; + } + + if (TAILQ_EMPTY(&r->rpool.list)) { + DPFPRINTF(PF_DEBUG_URGENT, + ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); + goto bad_locked; + } + if (s == NULL) { + pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, + &naddr, NULL, &sn); + if (!PF_AZERO(&naddr, AF_INET6)) + PF_ACPY((struct pf_addr *)&dst.sin6_addr, + &naddr, AF_INET6); + ifp = r->rpool.cur->kif ? 
r->rpool.cur->kif->pfik_ifp : NULL;
+	} else {
+		if (!PF_AZERO(&s->rt_addr, AF_INET6))
+			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
+			    &s->rt_addr, AF_INET6);
+		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+	}
+
+	if (s)
+		PF_STATE_UNLOCK(s);
+
+	if (ifp == NULL)
+		goto bad;
+
+	if (oifp != ifp) {
+		if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+			goto bad;
+		else if (m0 == NULL)
+			goto done;
+		if (m0->m_len < sizeof(struct ip6_hdr)) {
+			DPFPRINTF(PF_DEBUG_URGENT,
+			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
+			    __func__));
+			goto bad;
+		}
+		ip6 = mtod(m0, struct ip6_hdr *);
+	}
+
+	if (ifp->if_flags & IFF_LOOPBACK)
+		m0->m_flags |= M_SKIP_FIREWALL;
+
+	/*
+	 * If the packet is too large for the outgoing interface,
+	 * send back an icmp6 error.
+	 */
+	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
+		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
+	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
+		nd6_output(ifp, ifp, m0, &dst, NULL);
+	else {
+		in6_ifstat_inc(ifp, ifs6_in_toobig);
+		if (r->rt != PF_DUPTO)
+			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+		else
+			goto bad;
+	}
+
+done:
+	if (r->rt != PF_DUPTO)
+		*m = NULL;
+	return;
+
+bad_locked:
+	if (s)
+		PF_STATE_UNLOCK(s);
+bad:
+	m_freem(m0);
+	goto done;
+}
+#endif /* INET6 */
+
+/*
+ * FreeBSD supports cksum offloads for the following drivers.
+ *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
+ *  ti(4), txp(4), xl(4)
+ *
+ * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
+ *  network driver performed cksum including pseudo header; csum_data
+ *  needs to be verified
+ * CSUM_DATA_VALID :
+ *  network driver performed cksum, needs an additional pseudo header
+ *  cksum computation with partial csum_data (i.e. lack of H/W support
+ *  for pseudo header, for instance hme(4), sk(4) and possibly gem(4))
+ *
+ * After validating the cksum of the packet, set both flags
+ * CSUM_DATA_VALID and CSUM_PSEUDO_HDR in order to avoid recomputation
+ * of the cksum in the upper TCP/UDP layer.
+ * Also, set csum_data to 0xffff to force cksum validation.
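+ * For example, a driver setting CSUM_DATA_VALID | CSUM_PSEUDO_HDR has
+ * verified the full TCP/UDP cksum and csum_data holds the final result,
+ * so a value of 0xffff means the packet was good; with CSUM_DATA_VALID
+ * alone, csum_data is a partial sum that still has to be folded with a
+ * software-computed pseudo header cksum below.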
+ */ +static int +pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) +{ + u_int16_t sum = 0; + int hw_assist = 0; + struct ip *ip; + + if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) + return (1); + if (m->m_pkthdr.len < off + len) + return (1); + + switch (p) { + case IPPROTO_TCP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_TCP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_UDP: + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { + sum = m->m_pkthdr.csum_data; + } else { + ip = mtod(m, struct ip *); + sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + } + sum ^= 0xffff; + ++hw_assist; + } + break; + case IPPROTO_ICMP: +#ifdef INET6 + case IPPROTO_ICMPV6: +#endif /* INET6 */ + break; + default: + return (1); + } + + if (!hw_assist) { + switch (af) { + case AF_INET: + if (p == IPPROTO_ICMP) { + if (m->m_len < off) + return (1); + m->m_data += off; + m->m_len -= off; + sum = in_cksum(m, len); + m->m_data -= off; + m->m_len += off; + } else { + if (m->m_len < sizeof(struct ip)) + return (1); + sum = in4_cksum(m, p, off, len); + } + break; +#ifdef INET6 + case AF_INET6: + if (m->m_len < sizeof(struct ip6_hdr)) + return (1); + sum = in6_cksum(m, p, off, len); + break; +#endif /* INET6 */ + default: + return (1); + } + } + if (sum) { + switch (p) { + case IPPROTO_TCP: + { + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + break; + } + case IPPROTO_UDP: + { + KMOD_UDPSTAT_INC(udps_badsum); + break; + } +#ifdef INET + case IPPROTO_ICMP: + { + KMOD_ICMPSTAT_INC(icps_checksum); + break; + } +#endif +#ifdef INET6 + case IPPROTO_ICMPV6: + { + KMOD_ICMP6STAT_INC(icp6s_checksum); + break; + } +#endif /* INET6 */ + } + return (1); + } else { + if (p == IPPROTO_TCP || p == IPPROTO_UDP) { + m->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + } + return (0); +} + + +#ifdef INET +int +pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0; + struct ip *h = NULL; + struct m_tag *ipfwtag; + struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, dirndx, pqid = 0; + + M_ASSERTPKTHDR(m); + + if (!V_pf_status.running) + return (PF_PASS); + + memset(&pd, 0, sizeof(pd)); + + kif = (struct pfi_kif *)ifp->if_pf_kif; + + if (kif == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) + return (PF_PASS); + + if (m->m_flags & M_SKIP_FIREWALL) + return (PF_PASS); + + if (m->m_pkthdr.len < (int)sizeof(struct ip)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + pd.pf_mtag = pf_find_mtag(m); + + PF_RULES_RLOCK(); + + if (ip_divert_ptr != NULL && + ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { + struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); + if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = 
PF_DROP; + goto done; + } + pd.pf_mtag->flags |= PF_PACKET_LOOPED; + m_tag_delete(m, ipfwtag); + } + if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) { + m->m_flags |= M_FASTFWD_OURS; + pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT; + } + } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { + /* We do IP header normalization and packet reassembly here */ + action = PF_DROP; + goto done; + } + m = *m0; /* pf_normalize messes with m0 */ + h = mtod(m, struct ip *); + + off = h->ip_hl << 2; + if (off < (int)sizeof(struct ip)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + pd.src = (struct pf_addr *)&h->ip_src; + pd.dst = (struct pf_addr *)&h->ip_dst; + pd.sport = pd.dport = NULL; + pd.ip_sum = &h->ip_sum; + pd.proto_sum = NULL; + pd.proto = h->ip_p; + pd.dir = dir; + pd.sidx = (dir == PF_IN) ? 0 : 1; + pd.didx = (dir == PF_IN) ? 1 : 0; + pd.af = AF_INET; + pd.tos = h->ip_tos; + pd.tot_len = ntohs(h->ip_len); + + /* handle fragments that didn't get reassembled by normalization */ + if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + goto done; + } + + switch (h->ip_p) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + if ((th.th_flags & TH_ACK) && pd.p_len == 0) + pqid = 1; + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_ICMP: { + struct icmp ih; + + pd.hdr.icmp = &ih; + if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, + &action, &reason, AF_INET)) { + log = action != PF_PASS; + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +#ifdef INET6 + case IPPROTO_ICMPV6: { + action = PF_DROP; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping IPv4 packet with ICMPv6 payload\n")); + goto done; + } +#endif + + default: + action = pf_test_state_other(&s, dir, kif, m, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log 
= s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +done: + PF_RULES_RUNLOCK(); + if (action == PF_PASS && h->ip_hl > 5 && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with ip options\n")); + } + + if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + +#ifdef ALTQ + if (action == PF_PASS && r->qid) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (pqid || (pd.tos & IPTOS_LOWDELAY)) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* add hints for ecn */ + pd.pf_mtag->hdr = h; + + } +#endif /* ALTQ */ + + /* + * connections redirected to loopback should not match sockets + * bound specifically to loopback due to security implications, + * see tcp_input() and in_pcblookup_listen(). + */ + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + m->m_flags |= M_SKIP_FIREWALL; + + if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && + !PACKET_LOOPED(&pd)) { + + ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); + if (ipfwtag != NULL) { + ((struct ipfw_rule_ref *)(ipfwtag+1))->info = + ntohs(r->divert.port); + ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; + + if (s) + PF_STATE_UNLOCK(s); + + m_tag_prepend(m, ipfwtag); + if (m->m_flags & M_FASTFWD_OURS) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate tag\n")); + } + pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT; + m->m_flags &= ~M_FASTFWD_OURS; + } + ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT); + *m0 = NULL; + + return (action); + } else { + /* XXX: ipfw has the same behaviour! */ + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: failed to allocate divert tag\n")); + } + } + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd, + (s == NULL)); + } + + kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 
0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL && r == &V_pf_default_rule) + tr = nr; + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, + (s == NULL) ? pd.src : + &s->key[(s->direction == PF_IN)]-> + addr[(s->direction == PF_OUT)], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, + (s == NULL) ? pd.dst : + &s->key[(s->direction == PF_IN)]-> + addr[(s->direction == PF_IN)], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->dst.neg); + } + + switch (action) { + case PF_SYNPROXY_DROP: + m_freem(*m0); + case PF_DEFER: + *m0 = NULL; + action = PF_PASS; + break; + default: + /* pf_route() returns unlocked. */ + if (r->rt) { + pf_route(m0, r, dir, kif->pfik_ifp, s, &pd); + return (action); + } + break; + } + if (s) + PF_STATE_UNLOCK(s); + + return (action); +} +#endif /* INET */ + +#ifdef INET6 +int +pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) +{ + struct pfi_kif *kif; + u_short action, reason = 0, log = 0; + struct mbuf *m = *m0, *n = NULL; + struct ip6_hdr *h = NULL; + struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; + struct pf_state *s = NULL; + struct pf_ruleset *ruleset = NULL; + struct pf_pdesc pd; + int off, terminal = 0, dirndx, rh_cnt = 0; + + M_ASSERTPKTHDR(m); + + if (!V_pf_status.running) + return (PF_PASS); + + memset(&pd, 0, sizeof(pd)); + pd.pf_mtag = pf_find_mtag(m); + + if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED) + return (PF_PASS); + + kif = (struct pfi_kif *)ifp->if_pf_kif; + if (kif == NULL) { + DPFPRINTF(PF_DEBUG_URGENT, + ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); + return (PF_DROP); + } + if (kif->pfik_flags & PFI_IFLAG_SKIP) + return (PF_PASS); + + if (m->m_pkthdr.len < (int)sizeof(*h)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + + PF_RULES_RLOCK(); + + /* We do IP header normalization and packet reassembly here */ + if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { + action = PF_DROP; + goto done; + } + m = *m0; /* pf_normalize messes with m0 */ + h = mtod(m, struct ip6_hdr *); + +#if 1 + /* + * we do not support jumbogram yet. if we keep going, zero ip6_plen + * will do something bad, so drop the packet for now. + */ + if (htons(h->ip6_plen) == 0) { + action = PF_DROP; + REASON_SET(&reason, PFRES_NORM); /*XXX*/ + goto done; + } +#endif + + pd.src = (struct pf_addr *)&h->ip6_src; + pd.dst = (struct pf_addr *)&h->ip6_dst; + pd.sport = pd.dport = NULL; + pd.ip_sum = NULL; + pd.proto_sum = NULL; + pd.dir = dir; + pd.sidx = (dir == PF_IN) ? 0 : 1; + pd.didx = (dir == PF_IN) ? 
1 : 0; + pd.af = AF_INET6; + pd.tos = 0; + pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); + + off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); + pd.proto = h->ip6_nxt; + do { + switch (pd.proto) { + case IPPROTO_FRAGMENT: + action = pf_test_fragment(&r, dir, kif, m, h, + &pd, &a, &ruleset); + if (action == PF_DROP) + REASON_SET(&reason, PFRES_FRAG); + goto done; + case IPPROTO_ROUTING: { + struct ip6_rthdr rthdr; + + if (rh_cnt++) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 more than one rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, + &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short rthdr\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + log = 1; + goto done; + } + if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 rthdr0\n")); + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + goto done; + } + /* FALLTHROUGH */ + } + case IPPROTO_AH: + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct ip6_ext opt6; + + if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), + NULL, &reason, pd.af)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: IPv6 short opt\n")); + action = PF_DROP; + log = 1; + goto done; + } + if (pd.proto == IPPROTO_AH) + off += (opt6.ip6e_len + 2) * 4; + else + off += (opt6.ip6e_len + 1) * 8; + pd.proto = opt6.ip6e_nxt; + /* goto the next header */ + break; + } + default: + terminal++; + break; + } + } while (!terminal); + + /* if there's no routing header, use unmodified mbuf for checksumming */ + if (!n) + n = m; + + switch (pd.proto) { + + case IPPROTO_TCP: { + struct tcphdr th; + + pd.hdr.tcp = &th; + if (!pf_pull_hdr(m, off, &th, sizeof(th), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + pd.p_len = pd.tot_len - off - (th.th_off << 2); + action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); + if (action == PF_DROP) + goto done; + action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, + &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_UDP: { + struct udphdr uh; + + pd.hdr.udp = &uh; + if (!pf_pull_hdr(m, off, &uh, sizeof(uh), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + if (uh.uh_dport == 0 || + ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || + ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_SHORT); + goto done; + } + action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + case IPPROTO_ICMP: { + action = PF_DROP; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping IPv6 packet with ICMPv4 payload\n")); + goto done; + } + + case IPPROTO_ICMPV6: { + struct icmp6_hdr ih; + + pd.hdr.icmp6 = &ih; + if (!pf_pull_hdr(m, off, &ih, sizeof(ih), + &action, &reason, AF_INET6)) { + log = action != PF_PASS; + goto done; + } + action = pf_test_state_icmp(&s, dir, kif, + m, off, h, &pd, &reason); + if (action == PF_PASS) { + if (pfsync_update_state_ptr 
!= NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + + default: + action = pf_test_state_other(&s, dir, kif, m, &pd); + if (action == PF_PASS) { + if (pfsync_update_state_ptr != NULL) + pfsync_update_state_ptr(s); + r = s->rule.ptr; + a = s->anchor.ptr; + log = s->log; + } else if (s == NULL) + action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, + &a, &ruleset, inp); + break; + } + +done: + PF_RULES_RUNLOCK(); + if (n != m) { + m_freem(n); + n = NULL; + } + + /* handle dangerous IPv6 extension headers. */ + if (action == PF_PASS && rh_cnt && + !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_IPOPTIONS); + log = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: dropping packet with dangerous v6 headers\n")); + } + + if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (r->rtableid >= 0) + M_SETFIB(m, r->rtableid); + +#ifdef ALTQ + if (action == PF_PASS && r->qid) { + if (pd.pf_mtag == NULL && + ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { + action = PF_DROP; + REASON_SET(&reason, PFRES_MEMORY); + } + if (pd.tos & IPTOS_LOWDELAY) + pd.pf_mtag->qid = r->pqid; + else + pd.pf_mtag->qid = r->qid; + /* add hints for ecn */ + pd.pf_mtag->hdr = h; + } +#endif /* ALTQ */ + + if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || + pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && + (s->nat_rule.ptr->action == PF_RDR || + s->nat_rule.ptr->action == PF_BINAT) && + IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) + m->m_flags |= M_SKIP_FIREWALL; + + /* XXX: Anybody working on it?! */ + if (r->divert.port) + printf("pf: divert(9) is not supported for IPv6\n"); + + if (log) { + struct pf_rule *lr; + + if (s != NULL && s->nat_rule.ptr != NULL && + s->nat_rule.ptr->log & PF_LOG_ALL) + lr = s->nat_rule.ptr; + else + lr = r; + PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset, + &pd, (s == NULL)); + } + + kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; + kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; + + if (action == PF_PASS || r->action == PF_DROP) { + dirndx = (dir == PF_OUT); + r->packets[dirndx]++; + r->bytes[dirndx] += pd.tot_len; + if (a != NULL) { + a->packets[dirndx]++; + a->bytes[dirndx] += pd.tot_len; + } + if (s != NULL) { + if (s->nat_rule.ptr != NULL) { + s->nat_rule.ptr->packets[dirndx]++; + s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; + } + if (s->src_node != NULL) { + s->src_node->packets[dirndx]++; + s->src_node->bytes[dirndx] += pd.tot_len; + } + if (s->nat_src_node != NULL) { + s->nat_src_node->packets[dirndx]++; + s->nat_src_node->bytes[dirndx] += pd.tot_len; + } + dirndx = (dir == s->direction) ? 0 : 1; + s->packets[dirndx]++; + s->bytes[dirndx] += pd.tot_len; + } + tr = r; + nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; + if (nr != NULL && r == &V_pf_default_rule) + tr = nr; + if (tr->src.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->src.addr.p.tbl, + (s == NULL) ? pd.src : + &s->key[(s->direction == PF_IN)]->addr[0], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->src.neg); + if (tr->dst.addr.type == PF_ADDR_TABLE) + pfr_update_stats(tr->dst.addr.p.tbl, + (s == NULL) ? 
pd.dst : + &s->key[(s->direction == PF_IN)]->addr[1], + pd.af, pd.tot_len, dir == PF_OUT, + r->action == PF_PASS, tr->dst.neg); + } + + switch (action) { + case PF_SYNPROXY_DROP: + m_freem(*m0); + case PF_DEFER: + *m0 = NULL; + action = PF_PASS; + break; + default: + /* pf_route6() returns unlocked. */ + if (r->rt) { + pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); + return (action); + } + break; + } + + if (s) + PF_STATE_UNLOCK(s); + + return (action); +} +#endif /* INET6 */ diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c new file mode 100644 index 0000000..c010b65 --- /dev/null +++ b/sys/netpfil/pf/pf_if.c @@ -0,0 +1,859 @@ +/* $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $ */ + +/* + * Copyright 2005 Henning Brauer <henning@openbsd.org> + * Copyright 2005 Ryan McBride <mcbride@openbsd.org> + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2003 Cedric Berger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> + +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/pfvar.h> +#include <net/route.h> + +VNET_DEFINE(struct pfi_kif *, pfi_all); +static VNET_DEFINE(long, pfi_update); +#define V_pfi_update VNET(pfi_update) +#define PFI_BUFFER_MAX 0x10000 + +static VNET_DEFINE(struct pfr_addr *, pfi_buffer); +static VNET_DEFINE(int, pfi_buffer_cnt); +static VNET_DEFINE(int, pfi_buffer_max); +#define V_pfi_buffer VNET(pfi_buffer) +#define V_pfi_buffer_cnt VNET(pfi_buffer_cnt) +#define V_pfi_buffer_max VNET(pfi_buffer_max) + +eventhandler_tag pfi_attach_cookie; +eventhandler_tag pfi_detach_cookie; +eventhandler_tag pfi_attach_group_cookie; +eventhandler_tag pfi_change_group_cookie; +eventhandler_tag pfi_detach_group_cookie; +eventhandler_tag pfi_ifaddr_event_cookie; + +static void pfi_attach_ifnet(struct ifnet *); +static void pfi_attach_ifgroup(struct ifg_group *); + +static void pfi_kif_update(struct pfi_kif *); +static void pfi_dynaddr_update(struct pfi_dynaddr *dyn); +static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int, + int); +static void pfi_instance_add(struct ifnet *, int, int); +static void pfi_address_add(struct sockaddr *, int, int); +static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *); +static int pfi_skip_if(const char *, struct pfi_kif *); +static int pfi_unmask(void *); +static void pfi_attach_ifnet_event(void * __unused, struct ifnet *); +static void pfi_detach_ifnet_event(void * __unused, struct ifnet *); +static void pfi_attach_group_event(void *, struct ifg_group *); +static void pfi_change_group_event(void *, char *); +static void pfi_detach_group_event(void *, struct ifg_group *); +static void pfi_ifaddr_event(void * __unused, struct ifnet *); + +RB_HEAD(pfi_ifhead, pfi_kif); +static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); +static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare); +static VNET_DEFINE(struct pfi_ifhead, pfi_ifs); +#define V_pfi_ifs VNET(pfi_ifs) + +#define PFI_BUFFER_MAX 0x10000 +MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database"); + +LIST_HEAD(pfi_list, pfi_kif); +static VNET_DEFINE(struct pfi_list, pfi_unlinked_kifs); +#define V_pfi_unlinked_kifs VNET(pfi_unlinked_kifs) +static struct mtx pfi_unlnkdkifs_mtx; + +void +pfi_initialize(void) +{ + struct ifg_group *ifg; + struct ifnet *ifp; + struct pfi_kif *kif; + + V_pfi_buffer_max = 64; + V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer), + PFI_MTYPE, M_WAITOK); + + mtx_init(&pfi_unlnkdkifs_mtx, "pf unlinked interfaces", NULL, MTX_DEF); + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WLOCK(); + V_pfi_all = pfi_kif_attach(kif, IFG_ALL); + PF_RULES_WUNLOCK(); + + IFNET_RLOCK(); + TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) + pfi_attach_ifgroup(ifg); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) + pfi_attach_ifnet(ifp); + IFNET_RUNLOCK(); + + pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, + pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event, + pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); + pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event, + pfi_attach_group_event, curvnet, EVENTHANDLER_PRI_ANY); + pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event, + pfi_change_group_event, curvnet, EVENTHANDLER_PRI_ANY); + pfi_detach_group_cookie = 
EVENTHANDLER_REGISTER(group_detach_event, + pfi_detach_group_event, curvnet, EVENTHANDLER_PRI_ANY); + pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event, + pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY); +} + +void +pfi_cleanup(void) +{ + struct pfi_kif *p; + + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie); + EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie); + EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie); + EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie); + EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie); + + V_pfi_all = NULL; + while ((p = RB_MIN(pfi_ifhead, &V_pfi_ifs))) { + RB_REMOVE(pfi_ifhead, &V_pfi_ifs, p); + free(p, PFI_MTYPE); + } + + while ((p = LIST_FIRST(&V_pfi_unlinked_kifs))) { + LIST_REMOVE(p, pfik_list); + free(p, PFI_MTYPE); + } + + mtx_destroy(&pfi_unlnkdkifs_mtx); + + free(V_pfi_buffer, PFI_MTYPE); +} + +struct pfi_kif * +pfi_kif_find(const char *kif_name) +{ + struct pfi_kif_cmp s; + + PF_RULES_ASSERT(); + + bzero(&s, sizeof(s)); + strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name)); + + return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s)); +} + +struct pfi_kif * +pfi_kif_attach(struct pfi_kif *kif, const char *kif_name) +{ + struct pfi_kif *kif1; + + PF_RULES_WASSERT(); + KASSERT(kif != NULL, ("%s: null kif", __func__)); + + kif1 = pfi_kif_find(kif_name); + if (kif1 != NULL) { + free(kif, PFI_MTYPE); + return (kif1); + } + + bzero(kif, sizeof(*kif)); + strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name)); + /* + * It seems that the value of time_second is in an uninitialized state + * when pf sets interface statistics clear time in boot phase if pf + * was statically linked to kernel. Instead of setting a bogus + * time value, have pfi_get_ifaces handle this case. In + * pfi_get_ifaces it uses time_second if it sees the time is 0. + */ + kif->pfik_tzero = time_second > 1 ? time_second : 0; + TAILQ_INIT(&kif->pfik_dynaddrs); + + RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif); + + return (kif); +} + +void +pfi_kif_ref(struct pfi_kif *kif) +{ + + PF_RULES_WASSERT(); + kif->pfik_rulerefs++; +} + +void +pfi_kif_unref(struct pfi_kif *kif) +{ + + PF_RULES_WASSERT(); + KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif)); + + kif->pfik_rulerefs--; + + if (kif->pfik_rulerefs > 0) + return; + + /* kif referencing an existing ifnet or group should exist. */ + if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all) + return; + + RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif); + + kif->pfik_flags |= PFI_IFLAG_REFS; + + mtx_lock(&pfi_unlnkdkifs_mtx); + LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list); + mtx_unlock(&pfi_unlnkdkifs_mtx); +} + +void +pfi_kif_purge(void) +{ + struct pfi_kif *kif, *kif1; + + /* + * Do naive mark-and-sweep garbage collecting of old kifs. + * Reference flag is raised by pf_purge_expired_states(). + */ + mtx_lock(&pfi_unlnkdkifs_mtx); + LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) { + if (!(kif->pfik_flags & PFI_IFLAG_REFS)) { + LIST_REMOVE(kif, pfik_list); + free(kif, PFI_MTYPE); + } else + kif->pfik_flags &= ~PFI_IFLAG_REFS; + } + mtx_unlock(&pfi_unlnkdkifs_mtx); +} + +int +pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif) +{ + struct ifg_list *p; + + if (rule_kif == NULL || rule_kif == packet_kif) + return (1); + + if (rule_kif->pfik_group != NULL) + /* XXXGL: locking?
*/ + TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next) + if (p->ifgl_group == rule_kif->pfik_group) + return (1); + + return (0); +} + +static void +pfi_attach_ifnet(struct ifnet *ifp) +{ + struct pfi_kif *kif; + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, ifp->if_xname); + + kif->pfik_ifp = ifp; + ifp->if_pf_kif = kif; + + pfi_kif_update(kif); + PF_RULES_WUNLOCK(); +} + +static void +pfi_attach_ifgroup(struct ifg_group *ifg) +{ + struct pfi_kif *kif; + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, ifg->ifg_group); + + kif->pfik_group = ifg; + ifg->ifg_pf_kif = kif; + PF_RULES_WUNLOCK(); +} + +int +pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af) +{ + switch (af) { +#ifdef INET + case AF_INET: + switch (dyn->pfid_acnt4) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr4, + &dyn->pfid_mask4, a, AF_INET)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET)); + } + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + switch (dyn->pfid_acnt6) { + case 0: + return (0); + case 1: + return (PF_MATCHA(0, &dyn->pfid_addr6, + &dyn->pfid_mask6, a, AF_INET6)); + default: + return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6)); + } + break; +#endif /* INET6 */ + default: + return (0); + } +} + +int +pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af) +{ + struct pfi_dynaddr *dyn; + char tblname[PF_TABLE_NAME_SIZE]; + struct pf_ruleset *ruleset = NULL; + struct pfi_kif *kif; + int rv = 0; + + PF_RULES_WASSERT(); + KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u", + __func__, aw->type)); + KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn)); + + if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL) + return (ENOMEM); + + if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) { + free(dyn, PFI_MTYPE); + return (ENOMEM); + } + + if (!strcmp(aw->v.ifname, "self")) + dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL); + else + dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname); + pfi_kif_ref(dyn->pfid_kif); + + dyn->pfid_net = pfi_unmask(&aw->v.a.mask); + if (af == AF_INET && dyn->pfid_net == 32) + dyn->pfid_net = 128; + strlcpy(tblname, aw->v.ifname, sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NETWORK) + strlcat(tblname, ":network", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_BROADCAST) + strlcat(tblname, ":broadcast", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_PEER) + strlcat(tblname, ":peer", sizeof(tblname)); + if (aw->iflags & PFI_AFLAG_NOALIAS) + strlcat(tblname, ":0", sizeof(tblname)); + if (dyn->pfid_net != 128) + snprintf(tblname + strlen(tblname), + sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net); + if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) { + rv = ENOMEM; + goto _bad; + } + + if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) { + rv = ENOMEM; + goto _bad; + } + + dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE; + dyn->pfid_iflags = aw->iflags; + dyn->pfid_af = af; + + TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); + aw->p.dyn = dyn; + pfi_kif_update(dyn->pfid_kif); + + return (0); + +_bad: + if (dyn->pfid_kt != NULL) + pfr_detach_table(dyn->pfid_kt); + if (ruleset != NULL) + pf_remove_if_empty_ruleset(ruleset); + if (dyn->pfid_kif != NULL) + pfi_kif_unref(dyn->pfid_kif); + free(dyn, PFI_MTYPE); + + return (rv); +} + +static void +pfi_kif_update(struct pfi_kif 
*kif) +{ + struct ifg_list *ifgl; + struct pfi_dynaddr *p; + + PF_RULES_WASSERT(); + + /* update all dynaddr */ + TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry) + pfi_dynaddr_update(p); + + /* again for all groups kif is member of */ + if (kif->pfik_ifp != NULL) { + IF_ADDR_RLOCK(kif->pfik_ifp); + TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next) + pfi_kif_update((struct pfi_kif *) + ifgl->ifgl_group->ifg_pf_kif); + IF_ADDR_RUNLOCK(kif->pfik_ifp); + } +} + +static void +pfi_dynaddr_update(struct pfi_dynaddr *dyn) +{ + struct pfi_kif *kif; + struct pfr_ktable *kt; + + PF_RULES_WASSERT(); + KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt, + ("%s: bad argument", __func__)); + + kif = dyn->pfid_kif; + kt = dyn->pfid_kt; + + if (kt->pfrkt_larg != V_pfi_update) { + /* this table needs to be brought up-to-date */ + pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags); + kt->pfrkt_larg = V_pfi_update; + } + pfr_dynaddr_update(kt, dyn); +} + +static void +pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags) +{ + int e, size2 = 0; + struct ifg_member *ifgm; + + V_pfi_buffer_cnt = 0; + + if (kif->pfik_ifp != NULL) + pfi_instance_add(kif->pfik_ifp, net, flags); + else if (kif->pfik_group != NULL) { + IFNET_RLOCK(); + TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next) + pfi_instance_add(ifgm->ifgm_ifp, net, flags); + IFNET_RUNLOCK(); + } + + if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2, + NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK))) + printf("%s: cannot set %d new addresses into table %s: %d\n", + __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e); +} + +static void +pfi_instance_add(struct ifnet *ifp, int net, int flags) +{ + struct ifaddr *ia; + int got4 = 0, got6 = 0; + int net2, af; + + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_list) { + if (ia->ifa_addr == NULL) + continue; + af = ia->ifa_addr->sa_family; + if (af != AF_INET && af != AF_INET6) + continue; + /* + * XXX: For point-to-point interfaces, (ifname:0) and IPv4, + * jump over addresses without a proper route to work + * around a problem with ppp not fully removing the + * address used during IPCP. 
+ */ + if ((ifp->if_flags & IFF_POINTOPOINT) && + !(ia->ifa_flags & IFA_ROUTE) && + (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET)) + continue; + if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) + continue; + if ((flags & PFI_AFLAG_BROADCAST) && + !(ifp->if_flags & IFF_BROADCAST)) + continue; + if ((flags & PFI_AFLAG_PEER) && + !(ifp->if_flags & IFF_POINTOPOINT)) + continue; + if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && + IN6_IS_ADDR_LINKLOCAL( + &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) + continue; + if (flags & PFI_AFLAG_NOALIAS) { + if (af == AF_INET && got4) + continue; + if (af == AF_INET6 && got6) + continue; + } + if (af == AF_INET) + got4 = 1; + else if (af == AF_INET6) + got6 = 1; + net2 = net; + if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) { + if (af == AF_INET) + net2 = pfi_unmask(&((struct sockaddr_in *) + ia->ifa_netmask)->sin_addr); + else if (af == AF_INET6) + net2 = pfi_unmask(&((struct sockaddr_in6 *) + ia->ifa_netmask)->sin6_addr); + } + if (af == AF_INET && net2 > 32) + net2 = 32; + if (flags & PFI_AFLAG_BROADCAST) + pfi_address_add(ia->ifa_broadaddr, af, net2); + else if (flags & PFI_AFLAG_PEER) + pfi_address_add(ia->ifa_dstaddr, af, net2); + else + pfi_address_add(ia->ifa_addr, af, net2); + } + IF_ADDR_RUNLOCK(ifp); +} + +static void +pfi_address_add(struct sockaddr *sa, int af, int net) +{ + struct pfr_addr *p; + int i; + + if (V_pfi_buffer_cnt >= V_pfi_buffer_max) { + int new_max = V_pfi_buffer_max * 2; + + if (new_max > PFI_BUFFER_MAX) { + printf("%s: address buffer full (%d/%d)\n", __func__, + V_pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE, + M_NOWAIT); + if (p == NULL) { + printf("%s: no memory to grow buffer (%d/%d)\n", + __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX); + return; + } + memcpy(p, V_pfi_buffer, V_pfi_buffer_cnt * sizeof(*V_pfi_buffer)); + /* no need to zero buffer */ + free(V_pfi_buffer, PFI_MTYPE); + V_pfi_buffer = p; + V_pfi_buffer_max = new_max; + } + if (af == AF_INET && net > 32) + net = 128; + p = V_pfi_buffer + V_pfi_buffer_cnt++; + bzero(p, sizeof(*p)); + p->pfra_af = af; + p->pfra_net = net; + if (af == AF_INET) + p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr; + else if (af == AF_INET6) { + p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr; + if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr)) + p->pfra_ip6addr.s6_addr16[1] = 0; + } + /* mask network address bits */ + if (net < 128) + ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8)); + for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++) + ((caddr_t)p)[i] = 0; +} + +void +pfi_dynaddr_remove(struct pfi_dynaddr *dyn) +{ + + KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__)); + KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__)); + + TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry); + pfi_kif_unref(dyn->pfid_kif); + pfr_detach_table(dyn->pfid_kt); + free(dyn, PFI_MTYPE); +} + +void +pfi_dynaddr_copyout(struct pf_addr_wrap *aw) +{ + + KASSERT(aw->type == PF_ADDR_DYNIFTL, + ("%s: type %u", __func__, aw->type)); + + if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL) + return; + aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6; +} + +static int +pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q) +{ + return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ)); +} + +void +pfi_update_status(const char *name, struct pf_status *pfs) +{ + struct pfi_kif *p; + struct pfi_kif_cmp key; + struct ifg_member p_member, *ifgm; + TAILQ_HEAD(, ifg_member) ifg_members; + 
int i, j, k; + + strlcpy(key.pfik_name, name, sizeof(key.pfik_name)); + p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key); + if (p == NULL) + return; + + if (p->pfik_group != NULL) { + bcopy(&p->pfik_group->ifg_members, &ifg_members, + sizeof(ifg_members)); + } else { + /* build a temporary list for p only */ + bzero(&p_member, sizeof(p_member)); + p_member.ifgm_ifp = p->pfik_ifp; + TAILQ_INIT(&ifg_members); + TAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next); + } + if (pfs) { + bzero(pfs->pcounters, sizeof(pfs->pcounters)); + bzero(pfs->bcounters, sizeof(pfs->bcounters)); + } + TAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) { + if (ifgm->ifgm_ifp == NULL) + continue; + p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif; + + /* just clear statistics */ + if (pfs == NULL) { + bzero(p->pfik_packets, sizeof(p->pfik_packets)); + bzero(p->pfik_bytes, sizeof(p->pfik_bytes)); + p->pfik_tzero = time_second; + continue; + } + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) { + pfs->pcounters[i][j][k] += + p->pfik_packets[i][j][k]; + pfs->bcounters[i][j] += + p->pfik_bytes[i][j][k]; + } + } +} + +void +pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size) +{ + struct pfi_kif *p, *nextp; + int n = 0; + + for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) { + nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); + if (pfi_skip_if(name, p)) + continue; + if (*size <= n++) + break; + if (!p->pfik_tzero) + p->pfik_tzero = time_second; + bcopy(p, buf++, sizeof(*buf)); + nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p); + } + *size = n; +} + +static int +pfi_skip_if(const char *filter, struct pfi_kif *p) +{ + int n; + + if (filter == NULL || !*filter) + return (0); + if (!strcmp(p->pfik_name, filter)) + return (0); /* exact match */ + n = strlen(filter); + if (n < 1 || n >= IFNAMSIZ) + return (1); /* sanity check */ + if (filter[n-1] >= '0' && filter[n-1] <= '9') + return (1); /* only do exact match in that case */ + if (strncmp(p->pfik_name, filter, n)) + return (1); /* prefix doesn't match */ + return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9'); +} + +int +pfi_set_flags(const char *name, int flags) +{ + struct pfi_kif *p; + + RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags |= flags; + } + return (0); +} + +int +pfi_clear_flags(const char *name, int flags) +{ + struct pfi_kif *p; + + RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) { + if (pfi_skip_if(name, p)) + continue; + p->pfik_flags &= ~flags; + } + return (0); +} + +/* from pf_print_state.c */ +static int +pfi_unmask(void *addr) +{ + struct pf_addr *m = addr; + int i = 31, j = 0, b = 0; + u_int32_t tmp; + + while (j < 4 && m->addr32[j] == 0xffffffff) { + b += 32; + j++; + } + if (j < 4) { + tmp = ntohl(m->addr32[j]); + for (i = 31; tmp & (1 << i); --i) + b++; + } + return (b); +} + +static void +pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + + CURVNET_SET(ifp->if_vnet); + pfi_attach_ifnet(ifp); +#ifdef ALTQ + PF_RULES_WLOCK(); + pf_altq_ifnet_event(ifp, 0); + PF_RULES_WUNLOCK(); +#endif + CURVNET_RESTORE(); +} + +static void +pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp) +{ + struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif; + + CURVNET_SET(ifp->if_vnet); + PF_RULES_WLOCK(); + V_pfi_update++; + pfi_kif_update(kif); + + kif->pfik_ifp = NULL; + ifp->if_pf_kif = NULL; +#ifdef ALTQ + pf_altq_ifnet_event(ifp, 1); +#endif + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_attach_group_event(void *arg , struct ifg_group 
*ifg) +{ + + CURVNET_SET((struct vnet *)arg); + pfi_attach_ifgroup(ifg); + CURVNET_RESTORE(); +} + +static void +pfi_change_group_event(void *arg, char *gname) +{ + struct pfi_kif *kif; + + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + + CURVNET_SET((struct vnet *)arg); + PF_RULES_WLOCK(); + V_pfi_update++; + kif = pfi_kif_attach(kif, gname); + pfi_kif_update(kif); + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_detach_group_event(void *arg, struct ifg_group *ifg) +{ + struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif; + + CURVNET_SET((struct vnet *)arg); + PF_RULES_WLOCK(); + V_pfi_update++; + + kif->pfik_group = NULL; + ifg->ifg_pf_kif = NULL; + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} + +static void +pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp) +{ + + CURVNET_SET(ifp->if_vnet); + PF_RULES_WLOCK(); + if (ifp && ifp->if_pf_kif) { + V_pfi_update++; + pfi_kif_update(ifp->if_pf_kif); + } + PF_RULES_WUNLOCK(); + CURVNET_RESTORE(); +} diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c new file mode 100644 index 0000000..032f051 --- /dev/null +++ b/sys/netpfil/pf/pf_ioctl.c @@ -0,0 +1,3774 @@ +/* $OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. 
+ * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_bpf.h" +#include "opt_pf.h" + +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/endian.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/interrupt.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/md5.h> +#include <sys/ucred.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/pfil.h> +#include <net/pfvar.h> +#include <net/if_pfsync.h> +#include <net/if_pflog.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + +#ifdef ALTQ +#include <altq/altq.h> +#endif + +static int pfattach(void); +static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t, + u_int8_t, u_int8_t, u_int8_t); + +static void pf_mv_pool(struct pf_palist *, struct pf_palist *); +static void pf_empty_pool(struct pf_palist *); +static int pfioctl(struct cdev *, u_long, caddr_t, int, + struct thread *); +#ifdef ALTQ +static int pf_begin_altq(u_int32_t *); +static int pf_rollback_altq(u_int32_t); +static int pf_commit_altq(u_int32_t); +static int pf_enable_altq(struct pf_altq *); +static int pf_disable_altq(struct pf_altq *); +static u_int32_t pf_qname2qid(char *); +static void pf_qid_unref(u_int32_t); +#endif /* ALTQ */ +static int pf_begin_rules(u_int32_t *, int, const char *); +static int pf_rollback_rules(u_int32_t, int, char *); +static int pf_setup_pfsync_matching(struct pf_ruleset *); +static void pf_hash_rule(MD5_CTX *, struct pf_rule *); +static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); +static int pf_commit_rules(u_int32_t, int, char *); +static int pf_addr_setup(struct pf_ruleset *, + struct pf_addr_wrap *, sa_family_t); +static void pf_addr_copyout(struct pf_addr_wrap *); + +VNET_DEFINE(struct pf_rule, pf_default_rule); + +#ifdef ALTQ +static VNET_DEFINE(int, pf_altq_running); +#define V_pf_altq_running VNET(pf_altq_running) +#endif + +#define TAGID_MAX 50000 +struct pf_tagname { + TAILQ_ENTRY(pf_tagname) entries; + char name[PF_TAG_NAME_SIZE]; + uint16_t tag; + int ref; +}; + +TAILQ_HEAD(pf_tags, pf_tagname); +#define V_pf_tags VNET(pf_tags) +VNET_DEFINE(struct pf_tags, pf_tags); +#define V_pf_qids VNET(pf_qids) +VNET_DEFINE(struct pf_tags, pf_qids); +static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names"); +static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db"); +static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules"); + +#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE) +#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE +#endif + +static u_int16_t tagname2tag(struct pf_tags *, char *); +static u_int16_t pf_tagname2tag(char *); +static void tag_unref(struct pf_tags *, u_int16_t); + +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + +struct cdev *pf_dev; + +/* + * XXX - These are new and need to be checked when moving to a new version + */ +static void pf_clear_states(void); +static int pf_clear_tables(void); +static void pf_clear_srcnodes(struct pf_src_node *); +static void pf_tbladdr_copyout(struct pf_addr_wrap *); + +/* + * Wrapper functions for pfil(9) hooks + */ +#ifdef INET +static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct 
inpcb *inp); +static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +#endif +#ifdef INET6 +static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, + int dir, struct inpcb *inp); +#endif + +static int hook_pf(void); +static int dehook_pf(void); +static int shutdown_pf(void); +static int pf_load(void); +static int pf_unload(void); + +static struct cdevsw pf_cdevsw = { + .d_ioctl = pfioctl, + .d_name = PF_NAME, + .d_version = D_VERSION, +}; + +static volatile VNET_DEFINE(int, pf_pfil_hooked); +#define V_pf_pfil_hooked VNET(pf_pfil_hooked) +VNET_DEFINE(int, pf_end_threads); + +struct rwlock pf_rules_lock; + +/* pfsync */ +pfsync_state_import_t *pfsync_state_import_ptr = NULL; +pfsync_insert_state_t *pfsync_insert_state_ptr = NULL; +pfsync_update_state_t *pfsync_update_state_ptr = NULL; +pfsync_delete_state_t *pfsync_delete_state_ptr = NULL; +pfsync_clear_states_t *pfsync_clear_states_ptr = NULL; +pfsync_defer_t *pfsync_defer_ptr = NULL; +/* pflog */ +pflog_packet_t *pflog_packet_ptr = NULL; + +static int +pfattach(void) +{ + u_int32_t *my_timeout = V_pf_default_rule.timeout; + int error; + + pf_initialize(); + pfr_initialize(); + pfi_initialize(); + pf_normalize_init(); + + V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT; + V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT; + + RB_INIT(&V_pf_anchors); + pf_init_ruleset(&pf_main_ruleset); + + /* default rule should never be garbage collected */ + V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next; + V_pf_default_rule.action = PF_PASS; + V_pf_default_rule.nr = -1; + V_pf_default_rule.rtableid = -1; + + /* initialize default timeouts */ + my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL; + my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL; + my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL; + my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL; + my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL; + my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL; + my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL; + my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL; + my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL; + my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL; + my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL; + my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL; + my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL; + my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL; + my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL; + my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL; + my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL; + my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL; + my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START; + my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END; + + bzero(&V_pf_status, sizeof(V_pf_status)); + V_pf_status.debug = PF_DEBUG_URGENT; + + V_pf_pfil_hooked = 0; + + /* XXX do our best to avoid a conflict */ + V_pf_status.hostid = arc4random(); + + if ((error = kproc_create(pf_purge_thread, curvnet, NULL, 0, 0, + "pf purge")) != 0) + /* XXXGL: leaked all above. */ + return (error); + if ((error = swi_add(NULL, "pf send", pf_intr, curvnet, SWI_NET, + INTR_MPSAFE, &V_pf_swi_cookie)) != 0) + /* XXXGL: leaked all above. 
*/ + return (error); + + return (0); +} + +static struct pf_pool * +pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action, + u_int32_t rule_number, u_int8_t r_last, u_int8_t active, + u_int8_t check_ticket) +{ + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num; + + ruleset = pf_find_ruleset(anchor); + if (ruleset == NULL) + return (NULL); + rs_num = pf_get_ruleset_number(rule_action); + if (rs_num >= PF_RULESET_MAX) + return (NULL); + if (active) { + if (check_ticket && ticket != + ruleset->rules[rs_num].active.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + } else { + if (check_ticket && ticket != + ruleset->rules[rs_num].inactive.ticket) + return (NULL); + if (r_last) + rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + else + rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); + } + if (!r_last) { + while ((rule != NULL) && (rule->nr != rule_number)) + rule = TAILQ_NEXT(rule, entries); + } + if (rule == NULL) + return (NULL); + + return (&rule->rpool); +} + +static void +pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb) +{ + struct pf_pooladdr *mv_pool_pa; + + while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) { + TAILQ_REMOVE(poola, mv_pool_pa, entries); + TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries); + } +} + +static void +pf_empty_pool(struct pf_palist *poola) +{ + struct pf_pooladdr *pa; + + while ((pa = TAILQ_FIRST(poola)) != NULL) { + switch (pa->addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(pa->addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(pa->addr.p.tbl); + break; + } + if (pa->kif) + pfi_kif_unref(pa->kif); + TAILQ_REMOVE(poola, pa, entries); + free(pa, M_PFRULE); + } +} + +static void +pf_unlink_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule) +{ + + PF_RULES_WASSERT(); + + TAILQ_REMOVE(rulequeue, rule, entries); + + PF_UNLNKDRULES_LOCK(); + rule->rule_flag |= PFRULE_REFS; + TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries); + PF_UNLNKDRULES_UNLOCK(); +} + +void +pf_free_rule(struct pf_rule *rule) +{ + + PF_RULES_WASSERT(); + + if (rule->tag) + tag_unref(&V_pf_tags, rule->tag); + if (rule->match_tag) + tag_unref(&V_pf_tags, rule->match_tag); +#ifdef ALTQ + if (rule->pqid != rule->qid) + pf_qid_unref(rule->pqid); + pf_qid_unref(rule->qid); +#endif + switch (rule->src.addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(rule->src.addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(rule->src.addr.p.tbl); + break; + } + switch (rule->dst.addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(rule->dst.addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(rule->dst.addr.p.tbl); + break; + } + if (rule->overload_tbl) + pfr_detach_table(rule->overload_tbl); + if (rule->kif) + pfi_kif_unref(rule->kif); + pf_anchor_remove(rule); + pf_empty_pool(&rule->rpool.list); + free(rule, M_PFRULE); +} + +static u_int16_t +tagname2tag(struct pf_tags *head, char *tagname) +{ + struct pf_tagname *tag, *p = NULL; + u_int16_t new_tagid = 1; + + PF_RULES_WASSERT(); + + TAILQ_FOREACH(tag, head, entries) + if (strcmp(tagname, tag->name) == 0) { + tag->ref++; + return (tag->tag); + } + + /* + * to avoid fragmentation, we do a linear search from the beginning + * and take the first free slot we find. if there is none or the list + * is empty, append a new entry at the end. 
+ */ + + /* new entry */ + if (!TAILQ_EMPTY(head)) + for (p = TAILQ_FIRST(head); p != NULL && + p->tag == new_tagid; p = TAILQ_NEXT(p, entries)) + new_tagid = p->tag + 1; + + if (new_tagid > TAGID_MAX) + return (0); + + /* allocate and fill new struct pf_tagname */ + tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO); + if (tag == NULL) + return (0); + strlcpy(tag->name, tagname, sizeof(tag->name)); + tag->tag = new_tagid; + tag->ref++; + + if (p != NULL) /* insert new entry before p */ + TAILQ_INSERT_BEFORE(p, tag, entries); + else /* either list empty or no free slot in between */ + TAILQ_INSERT_TAIL(head, tag, entries); + + return (tag->tag); +} + +static void +tag_unref(struct pf_tags *head, u_int16_t tag) +{ + struct pf_tagname *p, *next; + + PF_RULES_WASSERT(); + + for (p = TAILQ_FIRST(head); p != NULL; p = next) { + next = TAILQ_NEXT(p, entries); + if (tag == p->tag) { + if (--p->ref == 0) { + TAILQ_REMOVE(head, p, entries); + free(p, M_PFTAG); + } + break; + } + } +} + +static u_int16_t +pf_tagname2tag(char *tagname) +{ + return (tagname2tag(&V_pf_tags, tagname)); +} + +#ifdef ALTQ +static u_int32_t +pf_qname2qid(char *qname) +{ + return ((u_int32_t)tagname2tag(&V_pf_qids, qname)); +} + +static void +pf_qid_unref(u_int32_t qid) +{ + tag_unref(&V_pf_qids, (u_int16_t)qid); +} + +static int +pf_begin_altq(u_int32_t *ticket) +{ + struct pf_altq *altq; + int error = 0; + + PF_RULES_WASSERT(); + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + if (error) + return (error); + *ticket = ++V_ticket_altqs_inactive; + V_altqs_inactive_open = 1; + return (0); +} + +static int +pf_rollback_altq(u_int32_t ticket) +{ + struct pf_altq *altq; + int error = 0; + + PF_RULES_WASSERT(); + + if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) + return (0); + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + error = altq_remove(altq); + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + V_altqs_inactive_open = 0; + return (error); +} + +static int +pf_commit_altq(u_int32_t ticket) +{ + struct pf_altqqueue *old_altqs; + struct pf_altq *altq; + int err, error = 0; + + PF_RULES_WASSERT(); + + if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive) + return (EBUSY); + + /* swap altqs, keep the old. 
*/ + old_altqs = V_pf_altqs_active; + V_pf_altqs_active = V_pf_altqs_inactive; + V_pf_altqs_inactive = old_altqs; + V_ticket_altqs_active = V_ticket_altqs_inactive; + + /* Attach new disciplines */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* attach the discipline */ + error = altq_pfattach(altq); + if (error == 0 && V_pf_altq_running) + error = pf_enable_altq(altq); + if (error != 0) + return (error); + } + } + + /* Purge the old altq list */ + while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) { + TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries); + if (altq->qname[0] == 0 && + (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) { + /* detach and destroy the discipline */ + if (V_pf_altq_running) + error = pf_disable_altq(altq); + err = altq_pfdetach(altq); + if (err != 0 && error == 0) + error = err; + err = altq_remove(altq); + if (err != 0 && error == 0) + error = err; + } else + pf_qid_unref(altq->qid); + free(altq, M_PFALTQ); + } + + V_altqs_inactive_open = 0; + return (error); +} + +static int +pf_enable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int error = 0; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + if (ifp->if_snd.altq_type != ALTQT_NONE) + error = altq_enable(&ifp->if_snd); + + /* set tokenbucket regulator */ + if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + tb.rate = altq->ifbandwidth; + tb.depth = altq->tbrsize; + error = tbr_set(&ifp->if_snd, &tb); + } + + return (error); +} + +static int +pf_disable_altq(struct pf_altq *altq) +{ + struct ifnet *ifp; + struct tb_profile tb; + int error; + + if ((ifp = ifunit(altq->ifname)) == NULL) + return (EINVAL); + + /* + * when the discipline is no longer referenced, it was overridden + * by a new one. if so, just return. 
+ */ + if (altq->altq_disc != ifp->if_snd.altq_disc) + return (0); + + error = altq_disable(&ifp->if_snd); + + if (error == 0) { + /* clear tokenbucket regulator */ + tb.rate = 0; + error = tbr_set(&ifp->if_snd, &tb); + } + + return (error); +} + +void +pf_altq_ifnet_event(struct ifnet *ifp, int remove) +{ + struct ifnet *ifp1; + struct pf_altq *a1, *a2, *a3; + u_int32_t ticket; + int error = 0; + + /* Interrupt userland queue modifications */ + if (V_altqs_inactive_open) + pf_rollback_altq(V_ticket_altqs_inactive); + + /* Start new altq ruleset */ + if (pf_begin_altq(&ticket)) + return; + + /* Copy the current active set */ + TAILQ_FOREACH(a1, V_pf_altqs_active, entries) { + a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT); + if (a2 == NULL) { + error = ENOMEM; + break; + } + bcopy(a1, a2, sizeof(struct pf_altq)); + + if (a2->qname[0] != 0) { + if ((a2->qid = pf_qname2qid(a2->qname)) == 0) { + error = EBUSY; + free(a2, M_PFALTQ); + break; + } + a2->altq_disc = NULL; + TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) { + if (strncmp(a3->ifname, a2->ifname, + IFNAMSIZ) == 0 && a3->qname[0] == 0) { + a2->altq_disc = a3->altq_disc; + break; + } + } + } + /* Deactivate the interface in question */ + a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED; + if ((ifp1 = ifunit(a2->ifname)) == NULL || + (remove && ifp1 == ifp)) { + a2->local_flags |= PFALTQ_FLAG_IF_REMOVED; + } else { + error = altq_add(a2); + + if (ticket != V_ticket_altqs_inactive) + error = EBUSY; + + if (error) { + free(a2, M_PFALTQ); + break; + } + } + + TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries); + } + + if (error != 0) + pf_rollback_altq(ticket); + else + pf_commit_altq(ticket); +} +#endif /* ALTQ */ + +static int +pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_or_create_ruleset(anchor); + if (rs == NULL) + return (EINVAL); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + *ticket = ++rs->rules[rs_num].inactive.ticket; + rs->rules[rs_num].inactive.open = 1; + return (0); +} + +static int +pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + rs->rules[rs_num].inactive.ticket != ticket) + return (0); + while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) { + pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule); + rs->rules[rs_num].inactive.rcount--; + } + rs->rules[rs_num].inactive.open = 0; + return (0); +} + +#define PF_MD5_UPD(st, elm) \ + MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm)) + +#define PF_MD5_UPD_STR(st, elm) \ + MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm)) + +#define PF_MD5_UPD_HTONL(st, elm, stor) do { \ + (stor) = htonl((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\ +} while (0) + +#define PF_MD5_UPD_HTONS(st, elm, stor) do { \ + (stor) = htons((st)->elm); \ + MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\ +} while (0) + +static void +pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) +{ + PF_MD5_UPD(pfr, addr.type); + switch (pfr->addr.type) { + case PF_ADDR_DYNIFTL: + PF_MD5_UPD(pfr, 
addr.v.ifname); + PF_MD5_UPD(pfr, addr.iflags); + break; + case PF_ADDR_TABLE: + PF_MD5_UPD(pfr, addr.v.tblname); + break; + case PF_ADDR_ADDRMASK: + /* XXX ignore af? */ + PF_MD5_UPD(pfr, addr.v.a.addr.addr32); + PF_MD5_UPD(pfr, addr.v.a.mask.addr32); + break; + } + + PF_MD5_UPD(pfr, port[0]); + PF_MD5_UPD(pfr, port[1]); + PF_MD5_UPD(pfr, neg); + PF_MD5_UPD(pfr, port_op); +} + +static void +pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule) +{ + u_int16_t x; + u_int32_t y; + + pf_hash_rule_addr(ctx, &rule->src); + pf_hash_rule_addr(ctx, &rule->dst); + PF_MD5_UPD_STR(rule, label); + PF_MD5_UPD_STR(rule, ifname); + PF_MD5_UPD_STR(rule, match_tagname); + PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */ + PF_MD5_UPD_HTONL(rule, os_fingerprint, y); + PF_MD5_UPD_HTONL(rule, prob, y); + PF_MD5_UPD_HTONL(rule, uid.uid[0], y); + PF_MD5_UPD_HTONL(rule, uid.uid[1], y); + PF_MD5_UPD(rule, uid.op); + PF_MD5_UPD_HTONL(rule, gid.gid[0], y); + PF_MD5_UPD_HTONL(rule, gid.gid[1], y); + PF_MD5_UPD(rule, gid.op); + PF_MD5_UPD_HTONL(rule, rule_flag, y); + PF_MD5_UPD(rule, action); + PF_MD5_UPD(rule, direction); + PF_MD5_UPD(rule, af); + PF_MD5_UPD(rule, quick); + PF_MD5_UPD(rule, ifnot); + PF_MD5_UPD(rule, match_tag_not); + PF_MD5_UPD(rule, natpass); + PF_MD5_UPD(rule, keep_state); + PF_MD5_UPD(rule, proto); + PF_MD5_UPD(rule, type); + PF_MD5_UPD(rule, code); + PF_MD5_UPD(rule, flags); + PF_MD5_UPD(rule, flagset); + PF_MD5_UPD(rule, allow_opts); + PF_MD5_UPD(rule, rt); + PF_MD5_UPD(rule, tos); +} + +static int +pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) +{ + struct pf_ruleset *rs; + struct pf_rule *rule, **old_array; + struct pf_rulequeue *old_rules; + int error; + u_int32_t old_rcount; + + PF_RULES_WASSERT(); + + if (rs_num < 0 || rs_num >= PF_RULESET_MAX) + return (EINVAL); + rs = pf_find_ruleset(anchor); + if (rs == NULL || !rs->rules[rs_num].inactive.open || + ticket != rs->rules[rs_num].inactive.ticket) + return (EBUSY); + + /* Calculate checksum for the main ruleset */ + if (rs == &pf_main_ruleset) { + error = pf_setup_pfsync_matching(rs); + if (error != 0) + return (error); + } + + /* Swap rules, keep the old. */ + old_rules = rs->rules[rs_num].active.ptr; + old_rcount = rs->rules[rs_num].active.rcount; + old_array = rs->rules[rs_num].active.ptr_array; + + rs->rules[rs_num].active.ptr = + rs->rules[rs_num].inactive.ptr; + rs->rules[rs_num].active.ptr_array = + rs->rules[rs_num].inactive.ptr_array; + rs->rules[rs_num].active.rcount = + rs->rules[rs_num].inactive.rcount; + rs->rules[rs_num].inactive.ptr = old_rules; + rs->rules[rs_num].inactive.ptr_array = old_array; + rs->rules[rs_num].inactive.rcount = old_rcount; + + rs->rules[rs_num].active.ticket = + rs->rules[rs_num].inactive.ticket; + pf_calc_skip_steps(rs->rules[rs_num].active.ptr); + + + /* Purge the old rule list. */ + while ((rule = TAILQ_FIRST(old_rules)) != NULL) + pf_unlink_rule(old_rules, rule); + if (rs->rules[rs_num].inactive.ptr_array) + free(rs->rules[rs_num].inactive.ptr_array, M_TEMP); + rs->rules[rs_num].inactive.ptr_array = NULL; + rs->rules[rs_num].inactive.rcount = 0; + rs->rules[rs_num].inactive.open = 0; + pf_remove_if_empty_ruleset(rs); + + return (0); +} + +static int +pf_setup_pfsync_matching(struct pf_ruleset *rs) +{ + MD5_CTX ctx; + struct pf_rule *rule; + int rs_cnt; + u_int8_t digest[PF_MD5_DIGEST_LENGTH]; + + MD5Init(&ctx); + for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) { + /* XXX PF_RULESET_SCRUB as well? 
*/ + if (rs_cnt == PF_RULESET_SCRUB) + continue; + + if (rs->rules[rs_cnt].inactive.ptr_array) + free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP); + rs->rules[rs_cnt].inactive.ptr_array = NULL; + + if (rs->rules[rs_cnt].inactive.rcount) { + rs->rules[rs_cnt].inactive.ptr_array = + malloc(sizeof(caddr_t) * + rs->rules[rs_cnt].inactive.rcount, + M_TEMP, M_NOWAIT); + + if (!rs->rules[rs_cnt].inactive.ptr_array) + return (ENOMEM); + } + + TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr, + entries) { + pf_hash_rule(&ctx, rule); + (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule; + } + } + + MD5Final(digest, &ctx); + memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum)); + return (0); +} + +static int +pf_addr_setup(struct pf_ruleset *ruleset, struct pf_addr_wrap *addr, + sa_family_t af) +{ + int error = 0; + + switch (addr->type) { + case PF_ADDR_TABLE: + addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname); + if (addr->p.tbl == NULL) + error = ENOMEM; + break; + case PF_ADDR_DYNIFTL: + error = pfi_dynaddr_setup(addr, af); + break; + } + + return (error); +} + +static void +pf_addr_copyout(struct pf_addr_wrap *addr) +{ + + switch (addr->type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_copyout(addr); + break; + case PF_ADDR_TABLE: + pf_tbladdr_copyout(addr); + break; + } +} + +static int +pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) +{ + int error = 0; + + CURVNET_SET(TD_TO_VNET(td)); + + /* XXX keep in sync with switch() below */ + if (securelevel_gt(td->td_ucred, 2)) + switch (cmd) { + case DIOCGETRULES: + case DIOCGETRULE: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCSETSTATUSIF: + case DIOCGETSTATUS: + case DIOCCLRSTATUS: + case DIOCNATLOOK: + case DIOCSETDEBUG: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCCLRRULECTRS: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRCLRTSTATS: + case DIOCRCLRADDRS: + case DIOCRADDADDRS: + case DIOCRDELADDRS: + case DIOCRSETADDRS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRCLRASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCCLRSRCNODES: + case DIOCIGETIFACES: + case DIOCGIFSPEED: + case DIOCSETIFFLAG: + case DIOCCLRIFFLAG: + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: + case DIOCRDELTABLES: + case DIOCRSETTFLAGS: + if (((struct pfioc_table *)addr)->pfrio_flags & + PFR_FLAG_DUMMY) + break; /* dummy operation ok */ + return (EPERM); + default: + return (EPERM); + } + + if (!(flags & FWRITE)) + switch (cmd) { + case DIOCGETRULES: + case DIOCGETADDRS: + case DIOCGETADDR: + case DIOCGETSTATE: + case DIOCGETSTATUS: + case DIOCGETSTATES: + case DIOCGETTIMEOUT: + case DIOCGETLIMIT: + case DIOCGETALTQS: + case DIOCGETALTQ: + case DIOCGETQSTATS: + case DIOCGETRULESETS: + case DIOCGETRULESET: + case DIOCNATLOOK: + case DIOCRGETTABLES: + case DIOCRGETTSTATS: + case DIOCRGETADDRS: + case DIOCRGETASTATS: + case DIOCRTSTADDRS: + case DIOCOSFPGET: + case DIOCGETSRCNODES: + case DIOCIGETIFACES: + case DIOCGIFSPEED: + break; + case DIOCRCLRTABLES: + case DIOCRADDTABLES: + case DIOCRDELTABLES: + case DIOCRCLRTSTATS: + case DIOCRCLRADDRS: + case DIOCRADDADDRS: + case DIOCRDELADDRS: + case DIOCRSETADDRS: + case DIOCRSETTFLAGS: + if (((struct pfioc_table *)addr)->pfrio_flags & + PFR_FLAG_DUMMY) { + flags |= FWRITE; /* need write lock for dummy */ + break; /* dummy operation ok */ + 
} + return (EACCES); + case DIOCGETRULE: + if (((struct pfioc_rule *)addr)->action == + PF_GET_CLR_CNTR) + return (EACCES); + break; + default: + return (EACCES); + } + + switch (cmd) { + case DIOCSTART: + PF_RULES_WLOCK(); + if (V_pf_status.running) + error = EEXIST; + else { + int cpu; + + PF_RULES_WUNLOCK(); + error = hook_pf(); + if (error) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: pfil registration failed\n")); + break; + } + PF_RULES_WLOCK(); + V_pf_status.running = 1; + V_pf_status.since = time_second; + + CPU_FOREACH(cpu) + V_pf_stateid[cpu] = time_second; + + DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); + } + PF_RULES_WUNLOCK(); + break; + + case DIOCSTOP: + PF_RULES_WLOCK(); + if (!V_pf_status.running) + error = ENOENT; + else { + V_pf_status.running = 0; + PF_RULES_WUNLOCK(); + error = dehook_pf(); + if (error) { + V_pf_status.running = 1; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: pfil unregistration failed\n")); + } + PF_RULES_WLOCK(); + V_pf_status.since = time_second; + DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); + } + PF_RULES_WUNLOCK(); + break; + + case DIOCADDRULE: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *rule, *tail; + struct pf_pooladdr *pa; + struct pfi_kif *kif = NULL; + int rs_num; + + if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } +#ifndef INET + if (pr->rule.af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pr->rule.af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + + rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK); + bcopy(&pr->rule, rule, sizeof(struct pf_rule)); + if (rule->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + rule->cuid = td->td_ucred->cr_ruid; + rule->cpid = td->td_proc ? 
td->td_proc->p_pid : 0; + TAILQ_INIT(&rule->rpool.list); + +#define ERROUT(x) { error = (x); goto DIOCADDRULE_error; } + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) + ERROUT(EINVAL); + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) + ERROUT(EINVAL); + if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) { + DPFPRINTF(PF_DEBUG_MISC, + ("ticket: %d != [%d]%d\n", pr->ticket, rs_num, + ruleset->rules[rs_num].inactive.ticket)); + ERROUT(EBUSY); + } + if (pr->pool_ticket != V_ticket_pabuf) { + DPFPRINTF(PF_DEBUG_MISC, + ("pool_ticket: %d != %d\n", pr->pool_ticket, + V_ticket_pabuf)); + ERROUT(EBUSY); + } + + tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr, + pf_rulequeue); + if (tail) + rule->nr = tail->nr + 1; + else + rule->nr = 0; + if (rule->ifname[0]) { + rule->kif = pfi_kif_attach(kif, rule->ifname); + pfi_kif_ref(rule->kif); + } else + rule->kif = NULL; + + if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs) + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (rule->qname[0] != 0) { + if ((rule->qid = pf_qname2qid(rule->qname)) == 0) + error = EBUSY; + else if (rule->pqname[0] != 0) { + if ((rule->pqid = + pf_qname2qid(rule->pqname)) == 0) + error = EBUSY; + } else + rule->pqid = rule->qid; + } +#endif + if (rule->tagname[0]) + if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0) + error = EBUSY; + if (rule->match_tagname[0]) + if ((rule->match_tag = + pf_tagname2tag(rule->match_tagname)) == 0) + error = EBUSY; + if (rule->rt && !rule->direction) + error = EINVAL; + if (!rule->log) + rule->logif = 0; + if (rule->logif >= PFLOGIFS_MAX) + error = EINVAL; + if (pf_addr_setup(ruleset, &rule->src.addr, rule->af)) + error = ENOMEM; + if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af)) + error = ENOMEM; + if (pf_anchor_setup(rule, ruleset, pr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(pa, &V_pf_pabuf, entries) + if (pa->addr.type == PF_ADDR_TABLE) { + pa->addr.p.tbl = pfr_attach_table(ruleset, + pa->addr.v.tblname); + if (pa->addr.p.tbl == NULL) + error = ENOMEM; + } + + if (rule->overload_tblname[0]) { + if ((rule->overload_tbl = pfr_attach_table(ruleset, + rule->overload_tblname)) == NULL) + error = EINVAL; + else + rule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&V_pf_pabuf, &rule->rpool.list); + if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) || + (rule->action == PF_BINAT)) && rule->anchor == NULL) || + (rule->rt > PF_FASTROUTE)) && + (TAILQ_FIRST(&rule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_free_rule(rule); + PF_RULES_WUNLOCK(); + break; + } + + rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list); + rule->evaluations = rule->packets[0] = rule->packets[1] = + rule->bytes[0] = rule->bytes[1] = 0; + TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr, + rule, entries); + ruleset->rules[rs_num].inactive.rcount++; + PF_RULES_WUNLOCK(); + break; + +#undef ERROUT +DIOCADDRULE_error: + PF_RULES_WUNLOCK(); + free(rule, M_PFRULE); + if (kif) + free(kif, PFI_MTYPE); + break; + } + + case DIOCGETRULES: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *tail; + int rs_num; + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + 
PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr, + pf_rulequeue); + if (tail) + pr->nr = tail->nr + 1; + else + pr->nr = 0; + pr->ticket = ruleset->rules[rs_num].active.ticket; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETRULE: { + struct pfioc_rule *pr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *rule; + int rs_num, i; + + PF_RULES_WLOCK(); + pr->anchor[sizeof(pr->anchor) - 1] = 0; + ruleset = pf_find_ruleset(pr->anchor); + if (ruleset == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + rs_num = pf_get_ruleset_number(pr->rule.action); + if (rs_num >= PF_RULESET_MAX) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + if (pr->ticket != ruleset->rules[rs_num].active.ticket) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); + while ((rule != NULL) && (rule->nr != pr->nr)) + rule = TAILQ_NEXT(rule, entries); + if (rule == NULL) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + bcopy(rule, &pr->rule, sizeof(struct pf_rule)); + if (pf_anchor_copyout(ruleset, rule, pr)) { + PF_RULES_WUNLOCK(); + error = EBUSY; + break; + } + pf_addr_copyout(&pr->rule.src.addr); + pf_addr_copyout(&pr->rule.dst.addr); + for (i = 0; i < PF_SKIP_COUNT; ++i) + if (rule->skip[i].ptr == NULL) + pr->rule.skip[i].nr = -1; + else + pr->rule.skip[i].nr = + rule->skip[i].ptr->nr; + + if (pr->action == PF_GET_CLR_CNTR) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + rule->states_tot = 0; + } + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCHANGERULE: { + struct pfioc_rule *pcr = (struct pfioc_rule *)addr; + struct pf_ruleset *ruleset; + struct pf_rule *oldrule = NULL, *newrule = NULL; + struct pfi_kif *kif = NULL; + struct pf_pooladdr *pa; + u_int32_t nr = 0; + int rs_num; + + if (pcr->action < PF_CHANGE_ADD_HEAD || + pcr->action > PF_CHANGE_GET_TICKET) { + error = EINVAL; + break; + } + if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) { + error = EINVAL; + break; + } + + if (pcr->action != PF_CHANGE_REMOVE) { +#ifndef INET + if (pcr->rule.af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pcr->rule.af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + newrule = malloc(sizeof(*newrule), M_PFRULE, M_WAITOK); + bcopy(&pcr->rule, newrule, sizeof(struct pf_rule)); + newrule->cuid = td->td_ucred->cr_ruid; + newrule->cpid = td->td_proc ? td->td_proc->p_pid : 0; + TAILQ_INIT(&newrule->rpool.list); + /* Initialize refcounting. 
*/ + newrule->states_cur = 0; + newrule->entries.tqe_prev = NULL; + + if (newrule->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + } + +#define ERROUT(x) { error = (x); goto DIOCCHANGERULE_error; } + + PF_RULES_WLOCK(); + if (!(pcr->action == PF_CHANGE_REMOVE || + pcr->action == PF_CHANGE_GET_TICKET) && + pcr->pool_ticket != V_ticket_pabuf) + ERROUT(EBUSY); + + ruleset = pf_find_ruleset(pcr->anchor); + if (ruleset == NULL) + ERROUT(EINVAL); + + rs_num = pf_get_ruleset_number(pcr->rule.action); + if (rs_num >= PF_RULESET_MAX) + ERROUT(EINVAL); + + if (pcr->action == PF_CHANGE_GET_TICKET) { + pcr->ticket = ++ruleset->rules[rs_num].active.ticket; + ERROUT(0); + } else if (pcr->ticket != + ruleset->rules[rs_num].active.ticket) + ERROUT(EINVAL); + + if (pcr->action != PF_CHANGE_REMOVE) { + if (newrule->ifname[0]) { + newrule->kif = pfi_kif_attach(kif, + newrule->ifname); + pfi_kif_ref(newrule->kif); + } else + newrule->kif = NULL; + + if (newrule->rtableid > 0 && + newrule->rtableid >= rt_numfibs) + error = EBUSY; + +#ifdef ALTQ + /* set queue IDs */ + if (newrule->qname[0] != 0) { + if ((newrule->qid = + pf_qname2qid(newrule->qname)) == 0) + error = EBUSY; + else if (newrule->pqname[0] != 0) { + if ((newrule->pqid = + pf_qname2qid(newrule->pqname)) == 0) + error = EBUSY; + } else + newrule->pqid = newrule->qid; + } +#endif /* ALTQ */ + if (newrule->tagname[0]) + if ((newrule->tag = + pf_tagname2tag(newrule->tagname)) == 0) + error = EBUSY; + if (newrule->match_tagname[0]) + if ((newrule->match_tag = pf_tagname2tag( + newrule->match_tagname)) == 0) + error = EBUSY; + if (newrule->rt && !newrule->direction) + error = EINVAL; + if (!newrule->log) + newrule->logif = 0; + if (newrule->logif >= PFLOGIFS_MAX) + error = EINVAL; + if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af)) + error = ENOMEM; + if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af)) + error = ENOMEM; + if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call)) + error = EINVAL; + TAILQ_FOREACH(pa, &V_pf_pabuf, entries) + if (pa->addr.type == PF_ADDR_TABLE) { + pa->addr.p.tbl = + pfr_attach_table(ruleset, + pa->addr.v.tblname); + if (pa->addr.p.tbl == NULL) + error = ENOMEM; + } + + if (newrule->overload_tblname[0]) { + if ((newrule->overload_tbl = pfr_attach_table( + ruleset, newrule->overload_tblname)) == + NULL) + error = EINVAL; + else + newrule->overload_tbl->pfrkt_flags |= + PFR_TFLAG_ACTIVE; + } + + pf_mv_pool(&V_pf_pabuf, &newrule->rpool.list); + if (((((newrule->action == PF_NAT) || + (newrule->action == PF_RDR) || + (newrule->action == PF_BINAT) || + (newrule->rt > PF_FASTROUTE)) && + !newrule->anchor)) && + (TAILQ_FIRST(&newrule->rpool.list) == NULL)) + error = EINVAL; + + if (error) { + pf_free_rule(newrule); + PF_RULES_WUNLOCK(); + break; + } + + newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list); + newrule->evaluations = 0; + newrule->packets[0] = newrule->packets[1] = 0; + newrule->bytes[0] = newrule->bytes[1] = 0; + } + pf_empty_pool(&V_pf_pabuf); + + if (pcr->action == PF_CHANGE_ADD_HEAD) + oldrule = TAILQ_FIRST( + ruleset->rules[rs_num].active.ptr); + else if (pcr->action == PF_CHANGE_ADD_TAIL) + oldrule = TAILQ_LAST( + ruleset->rules[rs_num].active.ptr, pf_rulequeue); + else { + oldrule = TAILQ_FIRST( + ruleset->rules[rs_num].active.ptr); + while ((oldrule != NULL) && (oldrule->nr != pcr->nr)) + oldrule = TAILQ_NEXT(oldrule, entries); + if (oldrule == NULL) { + if (newrule != NULL) + pf_free_rule(newrule); + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + } + + if 
(pcr->action == PF_CHANGE_REMOVE) {
+            pf_unlink_rule(ruleset->rules[rs_num].active.ptr,
+                oldrule);
+            ruleset->rules[rs_num].active.rcount--;
+        } else {
+            if (oldrule == NULL)
+                TAILQ_INSERT_TAIL(
+                    ruleset->rules[rs_num].active.ptr,
+                    newrule, entries);
+            else if (pcr->action == PF_CHANGE_ADD_HEAD ||
+                pcr->action == PF_CHANGE_ADD_BEFORE)
+                TAILQ_INSERT_BEFORE(oldrule, newrule, entries);
+            else
+                TAILQ_INSERT_AFTER(
+                    ruleset->rules[rs_num].active.ptr,
+                    oldrule, newrule, entries);
+            ruleset->rules[rs_num].active.rcount++;
+        }
+
+        nr = 0;
+        TAILQ_FOREACH(oldrule,
+            ruleset->rules[rs_num].active.ptr, entries)
+            oldrule->nr = nr++;
+
+        ruleset->rules[rs_num].active.ticket++;
+
+        pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr);
+        pf_remove_if_empty_ruleset(ruleset);
+
+        PF_RULES_WUNLOCK();
+        break;
+
+#undef ERROUT
+DIOCCHANGERULE_error:
+        PF_RULES_WUNLOCK();
+        if (newrule != NULL)
+            free(newrule, M_PFRULE);
+        if (kif != NULL)
+            free(kif, PFI_MTYPE);
+        break;
+    }
+
+    case DIOCCLRSTATES: {
+        struct pf_state *s;
+        struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+        u_int i, killed = 0;
+
+        for (i = 0; i <= V_pf_hashmask; i++) {
+            struct pf_idhash *ih = &V_pf_idhash[i];
+
+relock_DIOCCLRSTATES:
+            PF_HASHROW_LOCK(ih);
+            LIST_FOREACH(s, &ih->states, entry)
+                if (!psk->psk_ifname[0] ||
+                    !strcmp(psk->psk_ifname,
+                    s->kif->pfik_name)) {
+                    /*
+                     * Don't send out individual
+                     * delete messages.
+                     */
+                    s->state_flags |= PFSTATE_NOSYNC;
+                    pf_unlink_state(s, PF_ENTER_LOCKED);
+                    killed++;
+                    goto relock_DIOCCLRSTATES;
+                }
+            PF_HASHROW_UNLOCK(ih);
+        }
+        psk->psk_killed = killed;
+        if (pfsync_clear_states_ptr != NULL)
+            pfsync_clear_states_ptr(V_pf_status.hostid, psk->psk_ifname);
+        break;
+    }
+
+    case DIOCKILLSTATES: {
+        struct pf_state *s;
+        struct pf_state_key *sk;
+        struct pf_addr *srcaddr, *dstaddr;
+        u_int16_t srcport, dstport;
+        struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+        u_int i, killed = 0;
+
+        if (psk->psk_pfcmp.id) {
+            if (psk->psk_pfcmp.creatorid == 0)
+                psk->psk_pfcmp.creatorid = V_pf_status.hostid;
+            if ((s = pf_find_state_byid(psk->psk_pfcmp.id,
+                psk->psk_pfcmp.creatorid))) {
+                pf_unlink_state(s, PF_ENTER_LOCKED);
+                psk->psk_killed = 1;
+            }
+            break;
+        }
+
+        for (i = 0; i <= V_pf_hashmask; i++) {
+            struct pf_idhash *ih = &V_pf_idhash[i];
+
+relock_DIOCKILLSTATES:
+            PF_HASHROW_LOCK(ih);
+            LIST_FOREACH(s, &ih->states, entry) {
+                sk = s->key[PF_SK_WIRE];
+                if (s->direction == PF_OUT) {
+                    srcaddr = &sk->addr[1];
+                    dstaddr = &sk->addr[0];
+                    srcport = sk->port[1];
+                    dstport = sk->port[0];
+                } else {
+                    srcaddr = &sk->addr[0];
+                    dstaddr = &sk->addr[1];
+                    srcport = sk->port[0];
+                    dstport = sk->port[1];
+                }
+
+                if ((!psk->psk_af || sk->af == psk->psk_af)
+                    && (!psk->psk_proto || psk->psk_proto ==
+                    sk->proto) &&
+                    PF_MATCHA(psk->psk_src.neg,
+                    &psk->psk_src.addr.v.a.addr,
+                    &psk->psk_src.addr.v.a.mask,
+                    srcaddr, sk->af) &&
+                    PF_MATCHA(psk->psk_dst.neg,
+                    &psk->psk_dst.addr.v.a.addr,
+                    &psk->psk_dst.addr.v.a.mask,
+                    dstaddr, sk->af) &&
+                    (psk->psk_src.port_op == 0 ||
+                    pf_match_port(psk->psk_src.port_op,
+                    psk->psk_src.port[0], psk->psk_src.port[1],
+                    srcport)) &&
+                    (psk->psk_dst.port_op == 0 ||
+                    pf_match_port(psk->psk_dst.port_op,
+                    psk->psk_dst.port[0], psk->psk_dst.port[1],
+                    dstport)) &&
+                    (!psk->psk_label[0] ||
+                    (s->rule.ptr->label[0] &&
+                    !strcmp(psk->psk_label,
+                    s->rule.ptr->label))) &&
+                    (!psk->psk_ifname[0] ||
+                    !strcmp(psk->psk_ifname,
+                    s->kif->pfik_name))) {
+                    pf_unlink_state(s, PF_ENTER_LOCKED);
+                    killed++;
+                    goto relock_DIOCKILLSTATES;
+                }
+            }
+            PF_HASHROW_UNLOCK(ih);
+        }
+        psk->psk_killed = killed;
+        break;
+    }
+
+    case DIOCADDSTATE: {
+        struct pfioc_state *ps = (struct pfioc_state *)addr;
+        struct pfsync_state *sp = &ps->state;
+
+        if (sp->timeout >= PFTM_MAX &&
+            sp->timeout != PFTM_UNTIL_PACKET) {
+            error = EINVAL;
+            break;
+        }
+        if (pfsync_state_import_ptr != NULL) {
+            PF_RULES_RLOCK();
+            error = pfsync_state_import_ptr(sp, PFSYNC_SI_IOCTL);
+            PF_RULES_RUNLOCK();
+        } else
+            error = EOPNOTSUPP;
+        break;
+    }
+
+    case DIOCGETSTATE: {
+        struct pfioc_state *ps = (struct pfioc_state *)addr;
+        struct pf_state *s;
+
+        s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
+        if (s == NULL) {
+            error = ENOENT;
+            break;
+        }
+
+        pfsync_state_export(&ps->state, s);
+        PF_STATE_UNLOCK(s);
+        break;
+    }
+
+    case DIOCGETSTATES: {
+        struct pfioc_states *ps = (struct pfioc_states *)addr;
+        struct pf_state *s;
+        struct pfsync_state *pstore, *p;
+        int i, nr;
+
+        if (ps->ps_len == 0) {
+            nr = uma_zone_get_cur(V_pf_state_z);
+            ps->ps_len = sizeof(struct pfsync_state) * nr;
+            break;
+        }
+
+        p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK);
+        nr = 0;
+
+        for (i = 0; i <= V_pf_hashmask; i++) {
+            struct pf_idhash *ih = &V_pf_idhash[i];
+
+            PF_HASHROW_LOCK(ih);
+            LIST_FOREACH(s, &ih->states, entry) {
+
+                if (s->timeout == PFTM_UNLINKED)
+                    continue;
+
+                if ((nr+1) * sizeof(*p) > ps->ps_len) {
+                    PF_HASHROW_UNLOCK(ih);
+                    goto DIOCGETSTATES_full;
+                }
+                pfsync_state_export(p, s);
+                p++;
+                nr++;
+            }
+            PF_HASHROW_UNLOCK(ih);
+        }
+DIOCGETSTATES_full:
+        error = copyout(pstore, ps->ps_states,
+            sizeof(struct pfsync_state) * nr);
+        if (error) {
+            free(pstore, M_TEMP);
+            break;
+        }
+        ps->ps_len = sizeof(struct pfsync_state) * nr;
+        free(pstore, M_TEMP);
+
+        break;
+    }
+
+    case DIOCGETSTATUS: {
+        struct pf_status *s = (struct pf_status *)addr;
+        PF_RULES_RLOCK();
+        bcopy(&V_pf_status, s, sizeof(struct pf_status));
+        pfi_update_status(s->ifname, s);
+        PF_RULES_RUNLOCK();
+        break;
+    }
+
+    case DIOCSETSTATUSIF: {
+        struct pfioc_if *pi = (struct pfioc_if *)addr;
+
+        if (pi->ifname[0] == 0) {
+            bzero(V_pf_status.ifname, IFNAMSIZ);
+            break;
+        }
+        PF_RULES_WLOCK();
+        strlcpy(V_pf_status.ifname, pi->ifname, IFNAMSIZ);
+        PF_RULES_WUNLOCK();
+        break;
+    }
+
+    case DIOCCLRSTATUS: {
+        PF_RULES_WLOCK();
+        bzero(V_pf_status.counters, sizeof(V_pf_status.counters));
+        bzero(V_pf_status.fcounters, sizeof(V_pf_status.fcounters));
+        bzero(V_pf_status.scounters, sizeof(V_pf_status.scounters));
+        V_pf_status.since = time_second;
+        if (*V_pf_status.ifname)
+            pfi_update_status(V_pf_status.ifname, NULL);
+        PF_RULES_WUNLOCK();
+        break;
+    }
+
+    case DIOCNATLOOK: {
+        struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr;
+        struct pf_state_key *sk;
+        struct pf_state *state;
+        struct pf_state_key_cmp key;
+        int m = 0, direction = pnl->direction;
+        int sidx, didx;
+
+        /* NATLOOK src and dst are reversed, so reverse sidx/didx */
+        sidx = (direction == PF_IN) ? 1 : 0;
+        didx = (direction == PF_IN) ?
0 : 1; + + if (!pnl->proto || + PF_AZERO(&pnl->saddr, pnl->af) || + PF_AZERO(&pnl->daddr, pnl->af) || + ((pnl->proto == IPPROTO_TCP || + pnl->proto == IPPROTO_UDP) && + (!pnl->dport || !pnl->sport))) + error = EINVAL; + else { + key.af = pnl->af; + key.proto = pnl->proto; + PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af); + key.port[sidx] = pnl->sport; + PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af); + key.port[didx] = pnl->dport; + + state = pf_find_state_all(&key, direction, &m); + + if (m > 1) + error = E2BIG; /* more than one state */ + else if (state != NULL) { + /* XXXGL: not locked read */ + sk = state->key[sidx]; + PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af); + pnl->rsport = sk->port[sidx]; + PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af); + pnl->rdport = sk->port[didx]; + } else + error = ENOENT; + } + break; + } + + case DIOCSETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + int old; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX || + pt->seconds < 0) { + error = EINVAL; + break; + } + PF_RULES_WLOCK(); + old = V_pf_default_rule.timeout[pt->timeout]; + if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0) + pt->seconds = 1; + V_pf_default_rule.timeout[pt->timeout] = pt->seconds; + if (pt->timeout == PFTM_INTERVAL && pt->seconds < old) + wakeup(pf_purge_thread); + pt->seconds = old; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETTIMEOUT: { + struct pfioc_tm *pt = (struct pfioc_tm *)addr; + + if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) { + error = EINVAL; + break; + } + PF_RULES_RLOCK(); + pt->seconds = V_pf_default_rule.timeout[pt->timeout]; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) { + error = EINVAL; + break; + } + PF_RULES_RLOCK(); + pl->limit = V_pf_limits[pl->index].limit; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCSETLIMIT: { + struct pfioc_limit *pl = (struct pfioc_limit *)addr; + int old_limit; + + PF_RULES_WLOCK(); + if (pl->index < 0 || pl->index >= PF_LIMIT_MAX || + V_pf_limits[pl->index].zone == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + uma_zone_set_max(V_pf_limits[pl->index].zone, pl->limit); + old_limit = V_pf_limits[pl->index].limit; + V_pf_limits[pl->index].limit = pl->limit; + pl->limit = old_limit; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCSETDEBUG: { + u_int32_t *level = (u_int32_t *)addr; + + PF_RULES_WLOCK(); + V_pf_status.debug = *level; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCLRRULECTRS: { + /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */ + struct pf_ruleset *ruleset = &pf_main_ruleset; + struct pf_rule *rule; + + PF_RULES_WLOCK(); + TAILQ_FOREACH(rule, + ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) { + rule->evaluations = 0; + rule->packets[0] = rule->packets[1] = 0; + rule->bytes[0] = rule->bytes[1] = 0; + } + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGIFSPEED: { + struct pf_ifspeed *psp = (struct pf_ifspeed *)addr; + struct pf_ifspeed ps; + struct ifnet *ifp; + + if (psp->ifname[0] != 0) { + /* Can we completely trust user-land? 
*/ + strlcpy(ps.ifname, psp->ifname, IFNAMSIZ); + ifp = ifunit(ps.ifname); + if (ifp != NULL) + psp->baudrate = ifp->if_baudrate; + else + error = EINVAL; + } else + error = EINVAL; + break; + } + +#ifdef ALTQ + case DIOCSTARTALTQ: { + struct pf_altq *altq; + + PF_RULES_WLOCK(); + /* enable all altq interfaces on active list */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { + error = pf_enable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + V_pf_altq_running = 1; + PF_RULES_WUNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n")); + break; + } + + case DIOCSTOPALTQ: { + struct pf_altq *altq; + + PF_RULES_WLOCK(); + /* disable all altq interfaces on active list */ + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) { + if (altq->qname[0] == 0 && (altq->local_flags & + PFALTQ_FLAG_IF_REMOVED) == 0) { + error = pf_disable_altq(altq); + if (error != 0) + break; + } + } + if (error == 0) + V_pf_altq_running = 0; + PF_RULES_WUNLOCK(); + DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n")); + break; + } + + case DIOCADDALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq, *a; + struct ifnet *ifp; + + altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK); + bcopy(&pa->altq, altq, sizeof(struct pf_altq)); + altq->local_flags = 0; + + PF_RULES_WLOCK(); + if (pa->ticket != V_ticket_altqs_inactive) { + PF_RULES_WUNLOCK(); + free(altq, M_PFALTQ); + error = EBUSY; + break; + } + + /* + * if this is for a queue, find the discipline and + * copy the necessary fields + */ + if (altq->qname[0] != 0) { + if ((altq->qid = pf_qname2qid(altq->qname)) == 0) { + PF_RULES_WUNLOCK(); + error = EBUSY; + free(altq, M_PFALTQ); + break; + } + altq->altq_disc = NULL; + TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) { + if (strncmp(a->ifname, altq->ifname, + IFNAMSIZ) == 0 && a->qname[0] == 0) { + altq->altq_disc = a->altq_disc; + break; + } + } + } + + if ((ifp = ifunit(altq->ifname)) == NULL) + altq->local_flags |= PFALTQ_FLAG_IF_REMOVED; + else + error = altq_add(altq); + + if (error) { + PF_RULES_WUNLOCK(); + free(altq, M_PFALTQ); + break; + } + + TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries); + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETALTQS: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + + PF_RULES_RLOCK(); + pa->nr = 0; + TAILQ_FOREACH(altq, V_pf_altqs_active, entries) + pa->nr++; + pa->ticket = V_ticket_altqs_active; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETALTQ: { + struct pfioc_altq *pa = (struct pfioc_altq *)addr; + struct pf_altq *altq; + u_int32_t nr; + + PF_RULES_RLOCK(); + if (pa->ticket != V_ticket_altqs_active) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + nr = 0; + altq = TAILQ_FIRST(V_pf_altqs_active); + while ((altq != NULL) && (nr < pa->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + bcopy(altq, &pa->altq, sizeof(struct pf_altq)); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCCHANGEALTQ: + /* CHANGEALTQ not supported yet! 
*/ + error = ENODEV; + break; + + case DIOCGETQSTATS: { + struct pfioc_qstats *pq = (struct pfioc_qstats *)addr; + struct pf_altq *altq; + u_int32_t nr; + int nbytes; + + PF_RULES_RLOCK(); + if (pq->ticket != V_ticket_altqs_active) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + nbytes = pq->nbytes; + nr = 0; + altq = TAILQ_FIRST(V_pf_altqs_active); + while ((altq != NULL) && (nr < pq->nr)) { + altq = TAILQ_NEXT(altq, entries); + nr++; + } + if (altq == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + + if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) { + PF_RULES_RUNLOCK(); + error = ENXIO; + break; + } + PF_RULES_RUNLOCK(); + error = altq_getqstats(altq, pq->buf, &nbytes); + if (error == 0) { + pq->scheduler = altq->scheduler; + pq->nbytes = nbytes; + } + break; + } +#endif /* ALTQ */ + + case DIOCBEGINADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + + PF_RULES_WLOCK(); + pf_empty_pool(&V_pf_pabuf); + pp->ticket = ++V_ticket_pabuf; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCADDADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pooladdr *pa; + struct pfi_kif *kif = NULL; + +#ifndef INET + if (pp->af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pp->af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + if (pp->addr.addr.type != PF_ADDR_ADDRMASK && + pp->addr.addr.type != PF_ADDR_DYNIFTL && + pp->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK); + bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr)); + if (pa->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + PF_RULES_WLOCK(); + if (pp->ticket != V_ticket_pabuf) { + PF_RULES_WUNLOCK(); + if (pa->ifname[0]) + free(kif, PFI_MTYPE); + free(pa, M_PFRULE); + error = EBUSY; + break; + } + if (pa->ifname[0]) { + pa->kif = pfi_kif_attach(kif, pa->ifname); + pfi_kif_ref(pa->kif); + } else + pa->kif = NULL; + if (pa->addr.type == PF_ADDR_DYNIFTL && ((error = + pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) { + if (pa->ifname[0]) + pfi_kif_unref(pa->kif); + PF_RULES_WUNLOCK(); + free(pa, M_PFRULE); + break; + } + TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCGETADDRS: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *pa; + + PF_RULES_RLOCK(); + pp->nr = 0; + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 0); + if (pool == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + TAILQ_FOREACH(pa, &pool->list, entries) + pp->nr++; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETADDR: { + struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *pa; + u_int32_t nr = 0; + + PF_RULES_RLOCK(); + pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, + pp->r_num, 0, 1, 1); + if (pool == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + pa = TAILQ_FIRST(&pool->list); + while ((pa != NULL) && (nr < pp->nr)) { + pa = TAILQ_NEXT(pa, entries); + nr++; + } + if (pa == NULL) { + PF_RULES_RUNLOCK(); + error = EBUSY; + break; + } + bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr)); + pf_addr_copyout(&pp->addr.addr); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCCHANGEADDR: { + struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr; + struct pf_pool *pool; + struct pf_pooladdr *oldpa = NULL, *newpa = NULL; + struct pf_ruleset *ruleset; + 
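+        /*
+         * As in DIOCCHANGERULE above, allocations that may sleep
+         * (newpa, kif) are performed before PF_RULES_WLOCK() is
+         * taken below.
+         */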
struct pfi_kif *kif = NULL; + + if (pca->action < PF_CHANGE_ADD_HEAD || + pca->action > PF_CHANGE_REMOVE) { + error = EINVAL; + break; + } + if (pca->addr.addr.type != PF_ADDR_ADDRMASK && + pca->addr.addr.type != PF_ADDR_DYNIFTL && + pca->addr.addr.type != PF_ADDR_TABLE) { + error = EINVAL; + break; + } + + if (pca->action != PF_CHANGE_REMOVE) { +#ifndef INET + if (pca->af == AF_INET) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET */ +#ifndef INET6 + if (pca->af == AF_INET6) { + error = EAFNOSUPPORT; + break; + } +#endif /* INET6 */ + newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK); + bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr)); + if (newpa->ifname[0]) + kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK); + } + +#define ERROUT(x) { error = (x); goto DIOCCHANGEADDR_error; } + PF_RULES_WLOCK(); + ruleset = pf_find_ruleset(pca->anchor); + if (ruleset == NULL) + ERROUT(EBUSY); + + pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action, + pca->r_num, pca->r_last, 1, 1); + if (pool == NULL) + ERROUT(EBUSY); + + if (pca->action != PF_CHANGE_REMOVE) { + if (newpa->ifname[0]) { + newpa->kif = pfi_kif_attach(kif, newpa->ifname); + pfi_kif_ref(newpa->kif); + } else + newpa->kif = NULL; + + switch (newpa->addr.type) { + case PF_ADDR_DYNIFTL: + error = pfi_dynaddr_setup(&newpa->addr, + pca->af); + break; + case PF_ADDR_TABLE: + newpa->addr.p.tbl = pfr_attach_table(ruleset, + newpa->addr.v.tblname); + if (newpa->addr.p.tbl == NULL) + error = ENOMEM; + break; + } + if (error) { + if (newpa->kif) + pfi_kif_unref(newpa->kif); + PF_RULES_WUNLOCK(); + free(newpa, M_PFRULE); + break; + } + } + + if (pca->action == PF_CHANGE_ADD_HEAD) + oldpa = TAILQ_FIRST(&pool->list); + else if (pca->action == PF_CHANGE_ADD_TAIL) + oldpa = TAILQ_LAST(&pool->list, pf_palist); + else { + int i = 0; + + oldpa = TAILQ_FIRST(&pool->list); + while ((oldpa != NULL) && (i < pca->nr)) { + oldpa = TAILQ_NEXT(oldpa, entries); + i++; + } + if (oldpa == NULL) { + PF_RULES_WUNLOCK(); + error = EINVAL; + break; + } + } + + if (pca->action == PF_CHANGE_REMOVE) { + TAILQ_REMOVE(&pool->list, oldpa, entries); + switch (oldpa->addr.type) { + case PF_ADDR_DYNIFTL: + pfi_dynaddr_remove(oldpa->addr.p.dyn); + break; + case PF_ADDR_TABLE: + pfr_detach_table(oldpa->addr.p.tbl); + break; + } + if (oldpa->kif) + pfi_kif_unref(oldpa->kif); + free(oldpa, M_PFRULE); + } else { + if (oldpa == NULL) + TAILQ_INSERT_TAIL(&pool->list, newpa, entries); + else if (pca->action == PF_CHANGE_ADD_HEAD || + pca->action == PF_CHANGE_ADD_BEFORE) + TAILQ_INSERT_BEFORE(oldpa, newpa, entries); + else + TAILQ_INSERT_AFTER(&pool->list, oldpa, + newpa, entries); + } + + pool->cur = TAILQ_FIRST(&pool->list); + PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr, + pca->af); + PF_RULES_WUNLOCK(); + break; + +#undef ERROUT +DIOCCHANGEADDR_error: + PF_RULES_WUNLOCK(); + if (newpa != NULL) + free(newpa, M_PFRULE); + if (kif != NULL) + free(kif, PFI_MTYPE); + break; + } + + case DIOCGETRULESETS: { + struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + + PF_RULES_RLOCK(); + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + PF_RULES_RUNLOCK(); + error = ENOENT; + break; + } + pr->nr = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) + if (anchor->parent == NULL) + pr->nr++; + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + pr->nr++; + } + 
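+        /* pr->nr now holds the number of immediate sub-anchors. */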
PF_RULES_RUNLOCK(); + break; + } + + case DIOCGETRULESET: { + struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor; + u_int32_t nr = 0; + + PF_RULES_RLOCK(); + pr->path[sizeof(pr->path) - 1] = 0; + if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { + PF_RULES_RUNLOCK(); + error = ENOENT; + break; + } + pr->name[0] = 0; + if (ruleset->anchor == NULL) { + /* XXX kludge for pf_main_ruleset */ + RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors) + if (anchor->parent == NULL && nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } else { + RB_FOREACH(anchor, pf_anchor_node, + &ruleset->anchor->children) + if (nr++ == pr->nr) { + strlcpy(pr->name, anchor->name, + sizeof(pr->name)); + break; + } + } + if (!pr->name[0]) + error = EBUSY; + PF_RULES_RUNLOCK(); + break; + } + + case DIOCRCLRTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCRADDTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_add_tables(pfrts, io->pfrio_size, + &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRDELTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_del_tables(pfrts, io->pfrio_size, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRGETTABLES: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_tables(&io->pfrio_table, pfrts, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfrts, io->pfrio_buffer, totlen); + free(pfrts, M_TEMP); + break; + } + + case DIOCRGETTSTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_tstats *pfrtstats; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_tstats)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_tstats); + pfrtstats = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_WLOCK(); + error = pfr_get_tstats(&io->pfrio_table, pfrtstats, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0) + error = copyout(pfrtstats, io->pfrio_buffer, totlen); + free(pfrtstats, M_TEMP); + break; + } + + case DIOCRCLRTSTATS: { + 
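+        /*
+         * All DIOCR* table ioctls share one calling convention:
+         * pfrio_buffer points at an array of pfrio_size elements of
+         * pfrio_esize bytes each, checked at the top of every case.
+         * A minimal userland sketch (hypothetical names, no error
+         * handling; "dev" is an open /dev/pf descriptor):
+         *
+         *    struct pfr_table tbl = { .pfrt_name = "mytable" };
+         *    struct pfioc_table io = { .pfrio_buffer = &tbl,
+         *        .pfrio_esize = sizeof(tbl), .pfrio_size = 1 };
+         *    ioctl(dev, DIOCRADDTABLES, &io);
+         */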
struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_tstats(pfrts, io->pfrio_size, + &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRSETTFLAGS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_table *pfrts; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_table)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_table); + pfrts = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfrts, totlen); + if (error) { + free(pfrts, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_set_tflags(pfrts, io->pfrio_size, + io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange, + &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfrts, M_TEMP); + break; + } + + case DIOCRCLRADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + + if (io->pfrio_esize != 0) { + error = ENODEV; + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, + io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCRADDADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_add_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRDELADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_del_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRSETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = (io->pfrio_size + io->pfrio_size2) * + sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_set_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, + &io->pfrio_ndel, &io->pfrio_nchange, 
io->pfrio_flags | + PFR_FLAG_USERIOCTL, 0); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRGETADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_addrs(&io->pfrio_table, pfras, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRGETASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_astats *pfrastats; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_astats)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_astats); + pfrastats = malloc(totlen, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + error = pfr_get_astats(&io->pfrio_table, pfrastats, + &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfrastats, io->pfrio_buffer, totlen); + free(pfrastats, M_TEMP); + break; + } + + case DIOCRCLRASTATS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_clr_astats(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRTSTADDRS: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_RLOCK(); + error = pfr_tst_addrs(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | + PFR_FLAG_USERIOCTL); + PF_RULES_RUNLOCK(); + if (error == 0) + error = copyout(pfras, io->pfrio_buffer, totlen); + free(pfras, M_TEMP); + break; + } + + case DIOCRINADEFINE: { + struct pfioc_table *io = (struct pfioc_table *)addr; + struct pfr_addr *pfras; + size_t totlen; + + if (io->pfrio_esize != sizeof(struct pfr_addr)) { + error = ENODEV; + break; + } + totlen = io->pfrio_size * sizeof(struct pfr_addr); + pfras = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->pfrio_buffer, pfras, totlen); + if (error) { + free(pfras, M_TEMP); + break; + } + PF_RULES_WLOCK(); + error = pfr_ina_define(&io->pfrio_table, pfras, + io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, + io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); + PF_RULES_WUNLOCK(); + free(pfras, M_TEMP); + break; + } + + case DIOCOSFPADD: { + struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + PF_RULES_WLOCK(); + error = pf_osfp_add(io); + 
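+        /* The new fingerprint entry is linked in under the writer lock. */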
PF_RULES_WUNLOCK(); + break; + } + + case DIOCOSFPGET: { + struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr; + PF_RULES_RLOCK(); + error = pf_osfp_get(io); + PF_RULES_RUNLOCK(); + break; + } + + case DIOCXBEGIN: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioes, *ioe; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_begin_altq(&ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_begin(&table, + &ioe->ticket, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; + } + default: + if ((error = pf_begin_rules(&ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; + } + break; + } + } + PF_RULES_WUNLOCK(); + error = copyout(ioes, io->array, totlen); + free(ioes, M_TEMP); + break; + } + + case DIOCXROLLBACK: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe, *ioes; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if ((error = pf_rollback_altq(ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_rollback(&table, + ioe->ticket, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + default: + if ((error = pf_rollback_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + break; + } + + case DIOCXCOMMIT: { + struct pfioc_trans *io = (struct pfioc_trans *)addr; + struct pfioc_trans_e *ioe, *ioes; + struct pf_ruleset *rs; + size_t totlen; + int i; + + if (io->esize != sizeof(*ioe)) { + error = ENODEV; + break; + } + totlen = sizeof(struct pfioc_trans_e) * io->size; + ioes = malloc(totlen, M_TEMP, M_WAITOK); + error = copyin(io->array, ioes, totlen); + if (error) { + free(ioes, M_TEMP); + break; + } + PF_RULES_WLOCK(); + /* First makes sure everything will succeed. 
*/ + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if (ioe->anchor[0]) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + if (!V_altqs_inactive_open || ioe->ticket != + V_ticket_altqs_inactive) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || !rs->topen || ioe->ticket != + rs->tticket) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; + default: + if (ioe->rs_num < 0 || ioe->rs_num >= + PF_RULESET_MAX) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EINVAL; + goto fail; + } + rs = pf_find_ruleset(ioe->anchor); + if (rs == NULL || + !rs->rules[ioe->rs_num].inactive.open || + rs->rules[ioe->rs_num].inactive.ticket != + ioe->ticket) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + error = EBUSY; + goto fail; + } + break; + } + } + /* Now do the commit - no errors should happen here. */ + for (i = 0, ioe = ioes; i < io->size; i++, ioe++) { + switch (ioe->rs_num) { +#ifdef ALTQ + case PF_RULESET_ALTQ: + if ((error = pf_commit_altq(ioe->ticket))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; +#endif /* ALTQ */ + case PF_RULESET_TABLE: + { + struct pfr_table table; + + bzero(&table, sizeof(table)); + strlcpy(table.pfrt_anchor, ioe->anchor, + sizeof(table.pfrt_anchor)); + if ((error = pfr_ina_commit(&table, + ioe->ticket, NULL, NULL, 0))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + default: + if ((error = pf_commit_rules(ioe->ticket, + ioe->rs_num, ioe->anchor))) { + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + goto fail; /* really bad */ + } + break; + } + } + PF_RULES_WUNLOCK(); + free(ioes, M_TEMP); + break; + } + + case DIOCGETSRCNODES: { + struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr; + struct pf_srchash *sh; + struct pf_src_node *n, *p, *pstore; + uint32_t i, nr = 0; + + if (psn->psn_len == 0) { + for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) + nr++; + PF_HASHROW_UNLOCK(sh); + } + psn->psn_len = sizeof(struct pf_src_node) * nr; + break; + } + + p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK); + for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) { + int secs = time_uptime, diff; + + if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len) + break; + + bcopy(n, p, sizeof(struct pf_src_node)); + if (n->rule.ptr != NULL) + p->rule.nr = n->rule.ptr->nr; + p->creation = secs - p->creation; + if (p->expire > secs) + p->expire -= secs; + else + p->expire = 0; + + /* Adjust the connection rate estimate. 
*/ + diff = secs - n->conn_rate.last; + if (diff >= n->conn_rate.seconds) + p->conn_rate.count = 0; + else + p->conn_rate.count -= + n->conn_rate.count * diff / + n->conn_rate.seconds; + p++; + nr++; + } + PF_HASHROW_UNLOCK(sh); + } + error = copyout(pstore, psn->psn_src_nodes, + sizeof(struct pf_src_node) * nr); + if (error) { + free(pstore, M_TEMP); + break; + } + psn->psn_len = sizeof(struct pf_src_node) * nr; + free(pstore, M_TEMP); + break; + } + + case DIOCCLRSRCNODES: { + + pf_clear_srcnodes(NULL); + pf_purge_expired_src_nodes(); + V_pf_status.src_nodes = 0; + break; + } + + case DIOCKILLSRCNODES: { + struct pfioc_src_node_kill *psnk = + (struct pfioc_src_node_kill *)addr; + struct pf_srchash *sh; + struct pf_src_node *sn; + u_int i, killed = 0; + + for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask; + i++, sh++) { + /* + * XXXGL: we don't ever acquire sources hash lock + * but if we ever do, the below call to pf_clear_srcnodes() + * would lead to a LOR. + */ + PF_HASHROW_LOCK(sh); + LIST_FOREACH(sn, &sh->nodes, entry) + if (PF_MATCHA(psnk->psnk_src.neg, + &psnk->psnk_src.addr.v.a.addr, + &psnk->psnk_src.addr.v.a.mask, + &sn->addr, sn->af) && + PF_MATCHA(psnk->psnk_dst.neg, + &psnk->psnk_dst.addr.v.a.addr, + &psnk->psnk_dst.addr.v.a.mask, + &sn->raddr, sn->af)) { + /* Handle state to src_node linkage */ + if (sn->states != 0) + pf_clear_srcnodes(sn); + sn->expire = 1; + killed++; + } + PF_HASHROW_UNLOCK(sh); + } + + if (killed > 0) + pf_purge_expired_src_nodes(); + + psnk->psnk_killed = killed; + break; + } + + case DIOCSETHOSTID: { + u_int32_t *hostid = (u_int32_t *)addr; + + PF_RULES_WLOCK(); + if (*hostid == 0) + V_pf_status.hostid = arc4random(); + else + V_pf_status.hostid = *hostid; + PF_RULES_WUNLOCK(); + break; + } + + case DIOCOSFPFLUSH: + PF_RULES_WLOCK(); + pf_osfp_flush(); + PF_RULES_WUNLOCK(); + break; + + case DIOCIGETIFACES: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + struct pfi_kif *ifstore; + size_t bufsiz; + + if (io->pfiio_esize != sizeof(struct pfi_kif)) { + error = ENODEV; + break; + } + + bufsiz = io->pfiio_size * sizeof(struct pfi_kif); + ifstore = malloc(bufsiz, M_TEMP, M_WAITOK); + PF_RULES_RLOCK(); + pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size); + PF_RULES_RUNLOCK(); + error = copyout(ifstore, io->pfiio_buffer, bufsiz); + free(ifstore, M_TEMP); + break; + } + + case DIOCSETIFFLAG: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + + PF_RULES_WLOCK(); + error = pfi_set_flags(io->pfiio_name, io->pfiio_flags); + PF_RULES_WUNLOCK(); + break; + } + + case DIOCCLRIFFLAG: { + struct pfioc_iface *io = (struct pfioc_iface *)addr; + + PF_RULES_WLOCK(); + error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags); + PF_RULES_WUNLOCK(); + break; + } + + default: + error = ENODEV; + break; + } +fail: + CURVNET_RESTORE(); + + return (error); +} + +void +pfsync_state_export(struct pfsync_state *sp, struct pf_state *st) +{ + bzero(sp, sizeof(struct pfsync_state)); + + /* copy from state key */ + sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0]; + sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1]; + sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0]; + sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1]; + sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0]; + sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1]; + sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0]; + sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1]; + sp->proto = st->key[PF_SK_WIRE]->proto; + sp->af = 
st->key[PF_SK_WIRE]->af; + + /* copy from state */ + strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname)); + bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr)); + sp->creation = htonl(time_uptime - st->creation); + sp->expire = pf_state_expires(st); + if (sp->expire <= time_uptime) + sp->expire = htonl(0); + else + sp->expire = htonl(sp->expire - time_uptime); + + sp->direction = st->direction; + sp->log = st->log; + sp->timeout = st->timeout; + sp->state_flags = st->state_flags; + if (st->src_node) + sp->sync_flags |= PFSYNC_FLAG_SRCNODE; + if (st->nat_src_node) + sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE; + + sp->id = st->id; + sp->creatorid = st->creatorid; + pf_state_peer_hton(&st->src, &sp->src); + pf_state_peer_hton(&st->dst, &sp->dst); + + if (st->rule.ptr == NULL) + sp->rule = htonl(-1); + else + sp->rule = htonl(st->rule.ptr->nr); + if (st->anchor.ptr == NULL) + sp->anchor = htonl(-1); + else + sp->anchor = htonl(st->anchor.ptr->nr); + if (st->nat_rule.ptr == NULL) + sp->nat_rule = htonl(-1); + else + sp->nat_rule = htonl(st->nat_rule.ptr->nr); + + pf_state_counter_hton(st->packets[0], sp->packets[0]); + pf_state_counter_hton(st->packets[1], sp->packets[1]); + pf_state_counter_hton(st->bytes[0], sp->bytes[0]); + pf_state_counter_hton(st->bytes[1], sp->bytes[1]); + +} + +static void +pf_tbladdr_copyout(struct pf_addr_wrap *aw) +{ + struct pfr_ktable *kt; + + KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type)); + + kt = aw->p.tbl; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + aw->p.tbl = NULL; + aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ? + kt->pfrkt_cnt : -1; +} + +/* + * XXX - Check for version missmatch!!! + */ +static void +pf_clear_states(void) +{ + struct pf_state *s; + u_int i; + + for (i = 0; i <= V_pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; +relock: + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + s->timeout = PFTM_PURGE; + /* Don't send out individual delete messages. */ + s->sync_state = PFSTATE_NOSYNC; + pf_unlink_state(s, PF_ENTER_LOCKED); + goto relock; + } + PF_HASHROW_UNLOCK(ih); + } +} + +static int +pf_clear_tables(void) +{ + struct pfioc_table io; + int error; + + bzero(&io, sizeof(io)); + + error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel, + io.pfrio_flags); + + return (error); +} + +static void +pf_clear_srcnodes(struct pf_src_node *n) +{ + struct pf_state *s; + int i; + + for (i = 0; i <= V_pf_hashmask; i++) { + struct pf_idhash *ih = &V_pf_idhash[i]; + + PF_HASHROW_LOCK(ih); + LIST_FOREACH(s, &ih->states, entry) { + if (n == NULL || n == s->src_node) + s->src_node = NULL; + if (n == NULL || n == s->nat_src_node) + s->nat_src_node = NULL; + } + PF_HASHROW_UNLOCK(ih); + } + + if (n == NULL) { + struct pf_srchash *sh; + + for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask; + i++, sh++) { + PF_HASHROW_LOCK(sh); + LIST_FOREACH(n, &sh->nodes, entry) { + n->expire = 1; + n->states = 0; + } + PF_HASHROW_UNLOCK(sh); + } + } else { + /* XXX: hash slot should already be locked here. */ + n->expire = 1; + n->states = 0; + } +} +/* + * XXX - Check for version missmatch!!! + */ + +/* + * Duplicate pfctl -Fa operation to get rid of as much as we can. 
+ */
+static int
+shutdown_pf(void)
+{
+    int error = 0;
+    u_int32_t t[5];
+    char nn = '\0';
+
+    V_pf_status.running = 0;
+    do {
+        if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
+            != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
+            break;
+        }
+        if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
+            != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
+            break;        /* XXX: rollback? */
+        }
+        if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
+            != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
+            break;        /* XXX: rollback? */
+        }
+        if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
+            != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
+            break;        /* XXX: rollback? */
+        }
+        if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
+            != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
+            break;        /* XXX: rollback? */
+        }
+
+        /* XXX: these should always succeed here */
+        pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
+        pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
+        pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
+        pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
+        pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
+
+        if ((error = pf_clear_tables()) != 0)
+            break;
+
+#ifdef ALTQ
+        if ((error = pf_begin_altq(&t[0])) != 0) {
+            DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
+            break;
+        }
+        pf_commit_altq(t[0]);
+#endif
+
+        pf_clear_states();
+
+        pf_clear_srcnodes(NULL);
+
+        /* status does not use malloced mem so no need to clean up */
+        /* fingerprints and interfaces have their own cleanup code */
+    } while(0);
+
+    return (error);
+}
+
+#ifdef INET
+static int
+pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+    /*
+     * XXX Wed Jul 9 22:03:16 2003 UTC
+     * OpenBSD has changed its byte ordering convention on ip_len/ip_off
+     * in its network stack.  OpenBSD's network stack used to convert
+     * ip_len/ip_off to host byte order first, as FreeBSD does.  This is
+     * no longer the case, so we have to convert back to network byte
+     * order before handing the packet to pf.
+     */
+    struct ip *h = NULL;
+    int chk;
+
+    if ((*m)->m_pkthdr.len >= (int)sizeof(struct ip)) {
+        /* if m_pkthdr.len is less than ip header, pf will handle it. */
+        h = mtod(*m, struct ip *);
+        HTONS(h->ip_len);
+        HTONS(h->ip_off);
+    }
+    CURVNET_SET(ifp->if_vnet);
+    chk = pf_test(PF_IN, ifp, m, inp);
+    CURVNET_RESTORE();
+    if (chk && *m) {
+        m_freem(*m);
+        *m = NULL;
+    }
+    if (*m != NULL) {
+        /* pf_test can change ip header location */
+        h = mtod(*m, struct ip *);
+        NTOHS(h->ip_len);
+        NTOHS(h->ip_off);
+    }
+    return chk;
+}
+
+static int
+pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+    /*
+     * XXX Wed Jul 9 22:03:16 2003 UTC
+     * OpenBSD has changed its byte ordering convention on ip_len/ip_off
+     * in its network stack.  OpenBSD's network stack used to convert
+     * ip_len/ip_off to host byte order first, as FreeBSD does.  This is
+     * no longer the case, so we have to convert back to network byte
+     * order before handing the packet to pf.
+     */
+    struct ip *h = NULL;
+    int chk;
+
+    /* We need a proper CSUM before we start (see OpenBSD ip_output) */
+    if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+        in_delayed_cksum(*m);
+        (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+    }
+    if ((*m)->m_pkthdr.len >= (int)sizeof(*h)) {
+        /* if m_pkthdr.len is less than ip header, pf will handle it. */
+        h = mtod(*m, struct ip *);
+        HTONS(h->ip_len);
+        HTONS(h->ip_off);
+    }
+    CURVNET_SET(ifp->if_vnet);
+    chk = pf_test(PF_OUT, ifp, m, inp);
+    CURVNET_RESTORE();
+    if (chk && *m) {
+        m_freem(*m);
+        *m = NULL;
+    }
+    if (*m != NULL) {
+        /* pf_test can change ip header location */
+        h = mtod(*m, struct ip *);
+        NTOHS(h->ip_len);
+        NTOHS(h->ip_off);
+    }
+    return chk;
+}
+#endif
+
+#ifdef INET6
+static int
+pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+
+    /*
+     * IPv6 is not affected by ip_len/ip_off byte order changes.
+     */
+    int chk;
+
+    /*
+     * In case of loopback traffic IPv6 uses the real interface in
+     * order to support scoped addresses. In order to support stateful
+     * filtering we change this to lo0, as is the case with IPv4.
+     */
+    CURVNET_SET(ifp->if_vnet);
+    chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp);
+    CURVNET_RESTORE();
+    if (chk && *m) {
+        m_freem(*m);
+        *m = NULL;
+    }
+    return chk;
+}
+
+static int
+pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+    struct inpcb *inp)
+{
+    /*
+     * IPv6 is not affected by ip_len/ip_off byte order changes.
+     */
+    int chk;
+
+    /* We need a proper CSUM before we start (see OpenBSD ip_output) */
+    if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+#ifdef INET
+        /* XXX-BZ copy&paste error from r126261? */
+        in_delayed_cksum(*m);
+#endif
+        (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+    }
+    CURVNET_SET(ifp->if_vnet);
+    chk = pf_test6(PF_OUT, ifp, m, inp);
+    CURVNET_RESTORE();
+    if (chk && *m) {
+        m_freem(*m);
+        *m = NULL;
+    }
+    return chk;
+}
+#endif /* INET6 */
+
+static int
+hook_pf(void)
+{
+#ifdef INET
+    struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+    struct pfil_head *pfh_inet6;
+#endif
+
+    if (V_pf_pfil_hooked)
+        return (0);
+
+#ifdef INET
+    pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+    if (pfh_inet == NULL)
+        return (ESRCH); /* XXX */
+    pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet);
+    pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet);
+#endif
+#ifdef INET6
+    pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+    if (pfh_inet6 == NULL) {
+#ifdef INET
+        pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+            pfh_inet);
+        pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+            pfh_inet);
+#endif
+        return (ESRCH); /* XXX */
+    }
+    pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6);
+    pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6);
+#endif
+
+    V_pf_pfil_hooked = 1;
+    return (0);
+}
+
+static int
+dehook_pf(void)
+{
+#ifdef INET
+    struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+    struct pfil_head *pfh_inet6;
+#endif
+
+    if (V_pf_pfil_hooked == 0)
+        return (0);
+
+#ifdef INET
+    pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+    if (pfh_inet == NULL)
+        return (ESRCH); /* XXX */
+    pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+        pfh_inet);
+    pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+        pfh_inet);
+#endif
+#ifdef INET6
+    pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+    if (pfh_inet6 == NULL)
+        return (ESRCH); /* XXX */
+    pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK,
+        pfh_inet6);
+    pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK,
+        pfh_inet6);
+#endif
+
+    V_pf_pfil_hooked = 0;
+    return (0);
+}
+
+static int
+pf_load(void)
+{
+    int error;
+
+    VNET_ITERATOR_DECL(vnet_iter);
+
+    VNET_LIST_RLOCK();
+    VNET_FOREACH(vnet_iter) {
+        CURVNET_SET(vnet_iter);
+        V_pf_pfil_hooked = 0;
+        V_pf_end_threads = 0;
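+        /* Start each vnet with clean per-vnet pf state. */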
TAILQ_INIT(&V_pf_tags); + TAILQ_INIT(&V_pf_qids); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + rw_init(&pf_rules_lock, "pf rulesets"); + + pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME); + if ((error = pfattach()) != 0) + return (error); + + return (0); +} + +static int +pf_unload(void) +{ + int error = 0; + + PF_RULES_WLOCK(); + V_pf_status.running = 0; + PF_RULES_WUNLOCK(); + swi_remove(V_pf_swi_cookie); + error = dehook_pf(); + if (error) { + /* + * Should not happen! + * XXX Due to error code ESRCH, kldunload will show + * a message like 'No such process'. + */ + printf("%s : pfil unregisteration fail\n", __FUNCTION__); + return error; + } + PF_RULES_WLOCK(); + shutdown_pf(); + V_pf_end_threads = 1; + while (V_pf_end_threads < 2) { + wakeup_one(pf_purge_thread); + rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftmo", 0); + } + pf_normalize_cleanup(); + pfi_cleanup(); + pfr_cleanup(); + pf_osfp_flush(); + pf_cleanup(); + PF_RULES_WUNLOCK(); + destroy_dev(pf_dev); + rw_destroy(&pf_rules_lock); + + return (error); +} + +static int +pf_modevent(module_t mod, int type, void *data) +{ + int error = 0; + + switch(type) { + case MOD_LOAD: + error = pf_load(); + break; + case MOD_QUIESCE: + /* + * Module should not be unloaded due to race conditions. + */ + error = EPERM; + break; + case MOD_UNLOAD: + error = pf_unload(); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static moduledata_t pf_mod = { + "pf", + pf_modevent, + 0 +}; + +DECLARE_MODULE(pf, pf_mod, SI_SUB_PSEUDO, SI_ORDER_FIRST); +MODULE_VERSION(pf, PF_MODVER); diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c new file mode 100644 index 0000000..5b47852 --- /dev/null +++ b/sys/netpfil/pf/pf_lb.c @@ -0,0 +1,663 @@ +/* $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2008 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. 
+ * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_pf.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/pfvar.h> +#include <net/if_pflog.h> +#include <net/pf_mtag.h> + +#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x + +static void pf_hash(struct pf_addr *, struct pf_addr *, + struct pf_poolhashkey *, sa_family_t); +static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, + int, int, struct pfi_kif *, + struct pf_addr *, u_int16_t, struct pf_addr *, + u_int16_t, int); +static int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *, + struct pf_addr *, struct pf_addr *, u_int16_t, + struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t, + struct pf_src_node **); + +#define mix(a,b,c) \ + do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ + } while (0) + +/* + * hash function based on bridge_hash in if_bridge.c + */ +static void +pf_hash(struct pf_addr *inaddr, struct pf_addr *hash, + struct pf_poolhashkey *key, sa_family_t af) +{ + u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0]; + + switch (af) { +#ifdef INET + case AF_INET: + a += inaddr->addr32[0]; + b += key->key32[1]; + mix(a, b, c); + hash->addr32[0] = c + key->key32[2]; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + a += inaddr->addr32[0]; + b += inaddr->addr32[2]; + mix(a, b, c); + hash->addr32[0] = c; + a += inaddr->addr32[1]; + b += inaddr->addr32[3]; + c += key->key32[1]; + mix(a, b, c); + hash->addr32[1] = c; + a += inaddr->addr32[2]; + b += inaddr->addr32[1]; + c += key->key32[2]; + mix(a, b, c); + hash->addr32[2] = c; + a += inaddr->addr32[3]; + b += inaddr->addr32[0]; + c += key->key32[3]; + mix(a, b, c); + hash->addr32[3] = c; + break; +#endif /* INET6 */ + } +} + +static struct pf_rule * +pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, + int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, + struct pf_addr *daddr, u_int16_t dport, int rs_num) +{ + struct pf_rule *r, *rm = NULL; + struct pf_ruleset *ruleset = NULL; + int tag = -1; + int rtableid = -1; + int asd = 0; + + r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr); + while (r && rm == NULL) { + struct pf_rule_addr *src = NULL, *dst = NULL; + struct pf_addr_wrap *xdst = NULL; + + if (r->action == PF_BINAT && direction == PF_IN) { + src = &r->dst; + if (r->rpool.cur != NULL) + xdst = &r->rpool.cur->addr; + } else { + src = &r->src; + dst = &r->dst; + } + + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != direction) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != pd->af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, + src->neg, kif, M_GETFIB(m))) + r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : + PF_SKIP_DST_ADDR].ptr; + else if (src->port_op && !pf_match_port(src->port_op, + src->port[0], src->port[1], sport)) + r = r->skip[src == &r->src ? 
PF_SKIP_SRC_PORT : + PF_SKIP_DST_PORT].ptr; + else if (dst != NULL && + PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL, + M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, + 0, NULL, M_GETFIB(m))) + r = TAILQ_NEXT(r, entries); + else if (dst != NULL && dst->port_op && + !pf_match_port(dst->port_op, dst->port[0], + dst->port[1], dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != + IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m, + off, pd->hdr.tcp), r->os_fingerprint))) + r = TAILQ_NEXT(r, entries); + else { + if (r->tag) + tag = r->tag; + if (r->rtableid >= 0) + rtableid = r->rtableid; + if (r->anchor == NULL) { + rm = r; + } else + pf_step_into_anchor(&asd, &ruleset, rs_num, + &r, NULL, NULL); + } + if (r == NULL) + pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r, + NULL, NULL); + } + + if (tag > 0 && pf_tag_packet(m, pd, tag)) + return (NULL); + if (rtableid >= 0) + M_SETFIB(m, rtableid); + + if (rm != NULL && (rm->action == PF_NONAT || + rm->action == PF_NORDR || rm->action == PF_NOBINAT)) + return (NULL); + return (rm); +} + +static int +pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, + struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport, + struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high, + struct pf_src_node **sn) +{ + struct pf_state_key_cmp key; + struct pf_addr init_addr; + u_int16_t cut; + + bzero(&init_addr, sizeof(init_addr)); + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + + if (proto == IPPROTO_ICMP) { + low = 1; + high = 65535; + } + + do { + key.af = af; + key.proto = proto; + PF_ACPY(&key.addr[1], daddr, key.af); + PF_ACPY(&key.addr[0], naddr, key.af); + key.port[1] = dport; + + /* + * port search; start random, step; + * similar 2 portloop in in_pcbbind + */ + if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || + proto == IPPROTO_ICMP)) { + key.port[0] = dport; + if (pf_find_state_all(&key, PF_IN, NULL) == NULL) + return (0); + } else if (low == 0 && high == 0) { + key.port[0] = *nport; + if (pf_find_state_all(&key, PF_IN, NULL) == NULL) + return (0); + } else if (low == high) { + key.port[0] = htons(low); + if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { + *nport = htons(low); + return (0); + } + } else { + u_int16_t tmp; + + if (low > high) { + tmp = low; + low = high; + high = tmp; + } + /* low < high */ + cut = htonl(arc4random()) % (1 + high - low) + low; + /* low <= cut <= high */ + for (tmp = cut; tmp <= high; ++(tmp)) { + key.port[0] = htons(tmp); + if (pf_find_state_all(&key, PF_IN, NULL) == + NULL) { + *nport = htons(tmp); + return (0); + } + } + for (tmp = cut - 1; tmp >= low; --(tmp)) { + key.port[0] = htons(tmp); + if (pf_find_state_all(&key, PF_IN, NULL) == + NULL) { + *nport = htons(tmp); + return (0); + } + } + } + + switch (r->rpool.opts & PF_POOL_TYPEMASK) { + case PF_POOL_RANDOM: + case PF_POOL_ROUNDROBIN: + if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) + return (1); + break; + case PF_POOL_NONE: + case PF_POOL_SRCHASH: + case PF_POOL_BITMASK: + default: + return (1); + } + } while (! 
PF_AEQ(&init_addr, naddr, af)); + return (1); /* none available */ +} + +int +pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, + struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn) +{ + struct pf_pool *rpool = &r->rpool; + struct pf_addr *raddr = NULL, *rmask = NULL; + + if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && + (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + *sn = pf_find_src_node(saddr, r, af, 0); + if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { + PF_ACPY(naddr, &(*sn)->raddr, af); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + printf("pf_map_addr: src tracking maps "); + pf_print_host(saddr, 0, af); + printf(" to "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + return (0); + } + } + + if (rpool->cur->addr.type == PF_ADDR_NOROUTE) + return (1); + if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + switch (af) { +#ifdef INET + case AF_INET: + if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr4; + rmask = &rpool->cur->addr.p.dyn->pfid_mask4; + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 && + (rpool->opts & PF_POOL_TYPEMASK) != + PF_POOL_ROUNDROBIN) + return (1); + raddr = &rpool->cur->addr.p.dyn->pfid_addr6; + rmask = &rpool->cur->addr.p.dyn->pfid_mask6; + break; +#endif /* INET6 */ + } + } else if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) + return (1); /* unsupported */ + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + } + + switch (rpool->opts & PF_POOL_TYPEMASK) { + case PF_POOL_NONE: + PF_ACPY(naddr, raddr, af); + break; + case PF_POOL_BITMASK: + PF_POOLMASK(naddr, raddr, rmask, saddr, af); + break; + case PF_POOL_RANDOM: + if (init_addr != NULL && PF_AZERO(init_addr, af)) { + switch (af) { +#ifdef INET + case AF_INET: + rpool->counter.addr32[0] = htonl(arc4random()); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (rmask->addr32[3] != 0xffffffff) + rpool->counter.addr32[3] = + htonl(arc4random()); + else + break; + if (rmask->addr32[2] != 0xffffffff) + rpool->counter.addr32[2] = + htonl(arc4random()); + else + break; + if (rmask->addr32[1] != 0xffffffff) + rpool->counter.addr32[1] = + htonl(arc4random()); + else + break; + if (rmask->addr32[0] != 0xffffffff) + rpool->counter.addr32[0] = + htonl(arc4random()); + break; +#endif /* INET6 */ + } + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + PF_ACPY(init_addr, naddr, af); + + } else { + PF_AINC(&rpool->counter, af); + PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); + } + break; + case PF_POOL_SRCHASH: + { + unsigned char hash[16]; + + pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); + PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); + break; + } + case PF_POOL_ROUNDROBIN: + { + struct pf_pooladdr *acur = rpool->cur; + + /* + * XXXGL: in the round-robin case we need to store + * the round-robin machine state in the rule, thus + * the forwarding thread needs to modify the rule. + * + * This is done w/o locking, because performance is assumed + * more important than round-robin precision. + * + * In the simplest case we just update the "rpool->cur" + * pointer. However, if the pool contains tables or dynamic + * addresses, then "tblidx" is also used to store machine + * state. Since "tblidx" is an int, concurrent access to it can't + * lead to inconsistency, only to loss of precision. + * + * Things get worse if the table contains not hosts but + * prefixes. In this case the counter also stores machine state, + * and for an IPv6 address the counter can't be updated + * atomically. Using round-robin on a table containing IPv6 + * (or even IPv4) prefixes would probably cause a panic. + */ + + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + if (!pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, af)) + goto get_addr; + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, af)) + goto get_addr; + } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) + goto get_addr; + + try_next: + if (TAILQ_NEXT(rpool->cur, entries) == NULL) + rpool->cur = TAILQ_FIRST(&rpool->list); + else + rpool->cur = TAILQ_NEXT(rpool->cur, entries); + if (rpool->cur->addr.type == PF_ADDR_TABLE) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.tbl, + &rpool->tblidx, &rpool->counter, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { + rpool->tblidx = -1; + if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, + &rpool->tblidx, &rpool->counter, af)) { + /* table contains no address of type 'af' */ + if (rpool->cur != acur) + goto try_next; + return (1); + } + } else { + raddr = &rpool->cur->addr.v.a.addr; + rmask = &rpool->cur->addr.v.a.mask; + PF_ACPY(&rpool->counter, raddr, af); + } + + get_addr: + PF_ACPY(naddr, &rpool->counter, af); + if (init_addr != NULL && PF_AZERO(init_addr, af)) + PF_ACPY(init_addr, naddr, af); + PF_AINC(&rpool->counter, af); + break; + } + } + if (*sn != NULL) + PF_ACPY(&(*sn)->raddr, naddr, af); + + if (V_pf_status.debug >= PF_DEBUG_MISC && + (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { + printf("pf_map_addr: selected address "); + pf_print_host(naddr, 0, af); + printf("\n"); + } + + return (0); +} + +struct pf_rule * +pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, + struct pfi_kif *kif, struct pf_src_node **sn, + struct pf_state_key **skp, struct pf_state_key **nkp, + struct pf_addr *saddr, struct pf_addr *daddr, + u_int16_t sport, u_int16_t dport) +{ + struct pf_rule *r = NULL; + struct pf_addr *naddr; + uint16_t *nport; + + PF_RULES_RASSERT(); + KASSERT(*skp == NULL, ("*skp not NULL")); + KASSERT(*nkp == NULL, ("*nkp not NULL")); + + if (direction == PF_OUT) { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + sport, daddr, dport, PF_RULESET_BINAT); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_NAT); + } else { + r = pf_match_translation(pd, m, off, direction, kif, saddr, + sport, daddr, dport, PF_RULESET_RDR); + if (r == NULL) + r = pf_match_translation(pd, m, off, direction, kif, + saddr, sport, daddr, dport, PF_RULESET_BINAT); + } + + if (r == NULL) + return (NULL); + + switch (r->action) { + case PF_NONAT: + case PF_NOBINAT: + case PF_NORDR: + return (NULL); + } + + *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport); + if (*skp == NULL) + return (NULL); + *nkp = pf_state_key_clone(*skp); + if (*nkp == NULL) { + uma_zfree(V_pf_state_key_z, *skp); + *skp = NULL; + return (NULL); + } + + /* XXX We only modify one side for now.
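That is, only addr[1] and port[1] of the new key (*nkp) are rewritten below; the other side stays a copy of *skp.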
*/ + naddr = &(*nkp)->addr[1]; + nport = &(*nkp)->port[1]; + + switch (r->action) { + case PF_NAT: + if (pf_get_sport(pd->af, pd->proto, r, saddr, daddr, dport, + naddr, nport, r->rpool.proxy_port[0], + r->rpool.proxy_port[1], sn)) { + DPFPRINTF(PF_DEBUG_MISC, + ("pf: NAT proxy port allocation (%u-%u) failed\n", + r->rpool.proxy_port[0], r->rpool.proxy_port[1])); + goto notrans; + } + break; + case PF_BINAT: + switch (direction) { + case PF_OUT: + if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){ + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt4 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr4, + &r->rpool.cur->addr.p.dyn-> + pfid_mask4, saddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->rpool.cur->addr.p.dyn-> + pfid_acnt6 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->rpool.cur->addr.p.dyn-> + pfid_addr6, + &r->rpool.cur->addr.p.dyn-> + pfid_mask6, saddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, + &r->rpool.cur->addr.v.a.addr, + &r->rpool.cur->addr.v.a.mask, saddr, + pd->af); + break; + case PF_IN: + if (r->src.addr.type == PF_ADDR_DYNIFTL) { + switch (pd->af) { +#ifdef INET + case AF_INET: + if (r->src.addr.p.dyn-> pfid_acnt4 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->src.addr.p.dyn->pfid_addr4, + &r->src.addr.p.dyn->pfid_mask4, + daddr, AF_INET); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (r->src.addr.p.dyn->pfid_acnt6 < 1) + goto notrans; + PF_POOLMASK(naddr, + &r->src.addr.p.dyn->pfid_addr6, + &r->src.addr.p.dyn->pfid_mask6, + daddr, AF_INET6); + break; +#endif /* INET6 */ + } + } else + PF_POOLMASK(naddr, &r->src.addr.v.a.addr, + &r->src.addr.v.a.mask, daddr, pd->af); + break; + } + break; + case PF_RDR: { + if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn)) + goto notrans; + if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) + PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask, + daddr, pd->af); + + if (r->rpool.proxy_port[1]) { + uint32_t tmp_nport; + + tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) % + (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] + + 1)) + r->rpool.proxy_port[0]; + + /* Wrap around if necessary. */ + if (tmp_nport > 65535) + tmp_nport -= 65535; + *nport = htons((uint16_t)tmp_nport); + } else if (r->rpool.proxy_port[0]) + *nport = htons(r->rpool.proxy_port[0]); + break; + } + default: + panic("%s: unknown action %u", __func__, r->action); + } + + /* Return success only if translation really happened. */ + if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) + return (r); + +notrans: + uma_zfree(V_pf_state_key_z, *nkp); + uma_zfree(V_pf_state_key_z, *skp); + *skp = *nkp = NULL; + + return (NULL); +} diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c new file mode 100644 index 0000000..9063fe8 --- /dev/null +++ b/sys/netpfil/pf/pf_norm.c @@ -0,0 +1,1999 @@ +/* $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ + +/* + * Copyright 2001 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_pf.h" + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/socket.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> +#include <net/pf_mtag.h> +#include <net/if_pflog.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + +struct pf_frent { + LIST_ENTRY(pf_frent) fr_next; + union { + struct { + struct ip *_fr_ip; + struct mbuf *_fr_m; + } _frag; + struct { + uint16_t _fr_off; + uint16_t _fr_end; + } _cache; + } _u; +}; +#define fr_ip _u._frag._fr_ip +#define fr_m _u._frag._fr_m +#define fr_off _u._cache._fr_off +#define fr_end _u._cache._fr_end + +struct pf_fragment { + RB_ENTRY(pf_fragment) fr_entry; + TAILQ_ENTRY(pf_fragment) frag_next; + struct in_addr fr_src; + struct in_addr fr_dst; + u_int8_t fr_p; /* protocol of this fragment */ + u_int8_t fr_flags; /* status flags */ +#define PFFRAG_SEENLAST 0x0001 /* Seen the last fragment for this */ +#define PFFRAG_NOBUFFER 0x0002 /* Non-buffering fragment cache */ +#define PFFRAG_DROP 0x0004 /* Drop all fragments */ +#define BUFFER_FRAGMENTS(fr) (!((fr)->fr_flags & PFFRAG_NOBUFFER)) + u_int16_t fr_id; /* fragment id for reassemble */ + u_int16_t fr_max; /* fragment data max */ + u_int32_t fr_timeout; + LIST_HEAD(, pf_frent) fr_queue; +}; + +static struct mtx pf_frag_mtx; +#define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx) +#define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx) +#define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED) + +VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ + +static VNET_DEFINE(uma_zone_t, pf_frent_z); +#define V_pf_frent_z VNET(pf_frent_z) +static VNET_DEFINE(uma_zone_t, pf_frag_z); +#define V_pf_frag_z VNET(pf_frag_z) + +TAILQ_HEAD(pf_fragqueue, pf_fragment); +TAILQ_HEAD(pf_cachequeue, pf_fragment); +static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue); +#define V_pf_fragqueue VNET(pf_fragqueue) +static VNET_DEFINE(struct pf_cachequeue, pf_cachequeue); +#define V_pf_cachequeue VNET(pf_cachequeue) +RB_HEAD(pf_frag_tree, pf_fragment); +static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree); +#define V_pf_frag_tree VNET(pf_frag_tree) +static VNET_DEFINE(struct pf_frag_tree, pf_cache_tree); +#define V_pf_cache_tree VNET(pf_cache_tree) 
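+ +/* + * Fragment offsets arrive in ip_off in units of 8 bytes; everything + * below works in byte offsets, computed as in this compiled-out, + * purely illustrative helper (the name is hypothetical, not part of + * pf): + */ +#if 0 +static u_int16_t +example_frag_bounds(struct ip *ip, u_int16_t *end) +{ + u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; /* byte offset */ + u_int16_t len = ntohs(ip->ip_len) - (ip->ip_hl << 2); /* payload length */ + + *end = off + len; /* one past the last payload byte */ + return (off); +} +#endif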
+static int pf_frag_compare(struct pf_fragment *, + struct pf_fragment *); +static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); +static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); + +/* Private prototypes */ +static void pf_free_fragment(struct pf_fragment *); +static void pf_remove_fragment(struct pf_fragment *); +static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, + struct tcphdr *, int, sa_family_t); +#ifdef INET +static void pf_ip2key(struct pf_fragment *, struct ip *); +static void pf_scrub_ip(struct mbuf **, u_int32_t, u_int8_t, + u_int8_t); +static void pf_flush_fragments(void); +static struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *); +static struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **, + struct pf_frent *, int); +static struct mbuf *pf_fragcache(struct mbuf **, struct ip*, + struct pf_fragment **, int, int, int *); +#endif /* INET */ +#ifdef INET6 +static void pf_scrub_ip6(struct mbuf **, u_int8_t); +#endif +#define DPFPRINTF(x) do { \ + if (V_pf_status.debug >= PF_DEBUG_MISC) { \ + printf("%s: ", __func__); \ + printf x ; \ + } \ +} while(0) + +void +pf_normalize_init(void) +{ + + V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_state_scrub_z = uma_zcreate("pf state scrubs", + sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; + V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; + uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); + + mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF); + + TAILQ_INIT(&V_pf_fragqueue); + TAILQ_INIT(&V_pf_cachequeue); +} + +void +pf_normalize_cleanup(void) +{ + + uma_zdestroy(V_pf_state_scrub_z); + uma_zdestroy(V_pf_frent_z); + uma_zdestroy(V_pf_frag_z); + + mtx_destroy(&pf_frag_mtx); +} + +static int +pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +{ + int diff; + + if ((diff = a->fr_id - b->fr_id)) + return (diff); + else if ((diff = a->fr_p - b->fr_p)) + return (diff); + else if (a->fr_src.s_addr < b->fr_src.s_addr) + return (-1); + else if (a->fr_src.s_addr > b->fr_src.s_addr) + return (1); + else if (a->fr_dst.s_addr < b->fr_dst.s_addr) + return (-1); + else if (a->fr_dst.s_addr > b->fr_dst.s_addr) + return (1); + return (0); +} + +void +pf_purge_expired_fragments(void) +{ + struct pf_fragment *frag; + u_int32_t expire = time_uptime - + V_pf_default_rule.timeout[PFTM_FRAG]; + + PF_FRAG_LOCK(); + while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { + KASSERT((BUFFER_FRAGMENTS(frag)), + ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__)); + if (frag->fr_timeout > expire) + break; + + DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + pf_free_fragment(frag); + } + + while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) { + KASSERT((!BUFFER_FRAGMENTS(frag)), + ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__)); + if (frag->fr_timeout > expire) + break; + + DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); + pf_free_fragment(frag); + KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) || + TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag), + ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s", + __FUNCTION__)); + } + PF_FRAG_UNLOCK(); +} + +#ifdef INET +/* + * Try to flush old fragments to make space for new ones + */ +static void 
+pf_flush_fragments(void) +{ + struct pf_fragment *frag, *cache; + int goal; + + PF_FRAG_ASSERT(); + + goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; + DPFPRINTF(("trying to free %d frag entries\n", goal)); + while (goal < uma_zone_get_cur(V_pf_frent_z)) { + frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); + if (frag) + pf_free_fragment(frag); + cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue); + if (cache) + pf_free_fragment(cache); + if (frag == NULL && cache == NULL) + break; + } +} +#endif /* INET */ + +/* Frees the fragments and all associated entries */ +static void +pf_free_fragment(struct pf_fragment *frag) +{ + struct pf_frent *frent; + + PF_FRAG_ASSERT(); + + /* Free all fragments */ + if (BUFFER_FRAGMENTS(frag)) { + for (frent = LIST_FIRST(&frag->fr_queue); frent; + frent = LIST_FIRST(&frag->fr_queue)) { + LIST_REMOVE(frent, fr_next); + + m_freem(frent->fr_m); + uma_zfree(V_pf_frent_z, frent); + } + } else { + for (frent = LIST_FIRST(&frag->fr_queue); frent; + frent = LIST_FIRST(&frag->fr_queue)) { + LIST_REMOVE(frent, fr_next); + + KASSERT((LIST_EMPTY(&frag->fr_queue) || + LIST_FIRST(&frag->fr_queue)->fr_off > + frent->fr_end), + ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >" + " frent->fr_end): %s", __func__)); + + uma_zfree(V_pf_frent_z, frent); + } + } + + pf_remove_fragment(frag); +} + +#ifdef INET +static void +pf_ip2key(struct pf_fragment *key, struct ip *ip) +{ + key->fr_p = ip->ip_p; + key->fr_id = ip->ip_id; + key->fr_src.s_addr = ip->ip_src.s_addr; + key->fr_dst.s_addr = ip->ip_dst.s_addr; +} + +static struct pf_fragment * +pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree) +{ + struct pf_fragment key; + struct pf_fragment *frag; + + PF_FRAG_ASSERT(); + + pf_ip2key(&key, ip); + + frag = RB_FIND(pf_frag_tree, tree, &key); + if (frag != NULL) { + /* XXX Are we sure we want to update the timeout? */ + frag->fr_timeout = time_uptime; + if (BUFFER_FRAGMENTS(frag)) { + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); + } else { + TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); + TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next); + } + } + + return (frag); +} +#endif /* INET */ + +/* Removes a fragment from the fragment queue and frees the fragment */ + +static void +pf_remove_fragment(struct pf_fragment *frag) +{ + + PF_FRAG_ASSERT(); + + if (BUFFER_FRAGMENTS(frag)) { + RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); + uma_zfree(V_pf_frag_z, frag); + } else { + RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag); + TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); + uma_zfree(V_pf_frag_z, frag); + } +} + +#ifdef INET +#define FR_IP_OFF(fr) ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3) +static struct mbuf * +pf_reassemble(struct mbuf **m0, struct pf_fragment **frag, + struct pf_frent *frent, int mff) +{ + struct mbuf *m = *m0, *m2; + struct pf_frent *frea, *next; + struct pf_frent *frep = NULL; + struct ip *ip = frent->fr_ip; + int hlen = ip->ip_hl << 2; + u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; + u_int16_t ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4; + u_int16_t max = ip_len + off; + + PF_FRAG_ASSERT(); + KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)), + ("!
(*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); + + /* Strip off ip header */ + m->m_data += hlen; + m->m_len -= hlen; + + /* Create a new reassembly queue for this packet */ + if (*frag == NULL) { + *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (*frag == NULL) + goto drop_fragment; + } + + (*frag)->fr_flags = 0; + (*frag)->fr_max = 0; + (*frag)->fr_src = frent->fr_ip->ip_src; + (*frag)->fr_dst = frent->fr_ip->ip_dst; + (*frag)->fr_p = frent->fr_ip->ip_p; + (*frag)->fr_id = frent->fr_ip->ip_id; + (*frag)->fr_timeout = time_uptime; + LIST_INIT(&(*frag)->fr_queue); + + RB_INSERT(pf_frag_tree, &V_pf_frag_tree, *frag); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, *frag, frag_next); + + /* We do not have a previous fragment */ + frep = NULL; + goto insert; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. + */ + LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) { + if (FR_IP_OFF(frea) > off) + break; + frep = frea; + } + + KASSERT((frep != NULL || frea != NULL), + ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));; + + if (frep != NULL && + FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * + 4 > off) + { + u_int16_t precut; + + precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - + frep->fr_ip->ip_hl * 4 - off; + if (precut >= ip_len) + goto drop_fragment; + m_adj(frent->fr_m, precut); + DPFPRINTF(("overlap -%d\n", precut)); + /* Enforce 8 byte boundaries */ + ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3)); + off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; + ip_len -= precut; + ip->ip_len = htons(ip_len); + } + + for (; frea != NULL && ip_len + off > FR_IP_OFF(frea); + frea = next) + { + u_int16_t aftercut; + + aftercut = ip_len + off - FR_IP_OFF(frea); + DPFPRINTF(("adjust overlap %d\n", aftercut)); + if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl + * 4) + { + frea->fr_ip->ip_len = + htons(ntohs(frea->fr_ip->ip_len) - aftercut); + frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) + + (aftercut >> 3)); + m_adj(frea->fr_m, aftercut); + break; + } + + /* This fragment is completely overlapped, lose it */ + next = LIST_NEXT(frea, fr_next); + m_freem(frea->fr_m); + LIST_REMOVE(frea, fr_next); + uma_zfree(V_pf_frent_z, frea); + } + + insert: + /* Update maximum data size */ + if ((*frag)->fr_max < max) + (*frag)->fr_max = max; + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (frep == NULL) + LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next); + else + LIST_INSERT_AFTER(frep, frent, fr_next); + + /* Check if we are completely reassembled */ + if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) + return (NULL); + + /* Check if we have all the data */ + off = 0; + for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) { + next = LIST_NEXT(frep, fr_next); + + off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4; + if (off < (*frag)->fr_max && + (next == NULL || FR_IP_OFF(next) != off)) + { + DPFPRINTF(("missing fragment at %d, next %d, max %d\n", + off, next == NULL ? 
-1 : FR_IP_OFF(next), + (*frag)->fr_max)); + return (NULL); + } + } + DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max)); + if (off < (*frag)->fr_max) + return (NULL); + + /* We have all the data */ + frent = LIST_FIRST(&(*frag)->fr_queue); + KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__)); + if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) { + DPFPRINTF(("drop: too big: %d\n", off)); + pf_free_fragment(*frag); + *frag = NULL; + return (NULL); + } + next = LIST_NEXT(frent, fr_next); + + /* Magic from ip_input */ + ip = frent->fr_ip; + m = frent->fr_m; + m2 = m->m_next; + m->m_next = NULL; + m_cat(m, m2); + uma_zfree(V_pf_frent_z, frent); + for (frent = next; frent != NULL; frent = next) { + next = LIST_NEXT(frent, fr_next); + + m2 = frent->fr_m; + uma_zfree(V_pf_frent_z, frent); + m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data; + m_cat(m, m2); + } + + while (m->m_pkthdr.csum_data & 0xffff0000) + m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + + (m->m_pkthdr.csum_data >> 16); + ip->ip_src = (*frag)->fr_src; + ip->ip_dst = (*frag)->fr_dst; + + /* Remove from fragment queue */ + pf_remove_fragment(*frag); + *frag = NULL; + + hlen = ip->ip_hl << 2; + ip->ip_len = htons(off + hlen); + m->m_len += hlen; + m->m_data -= hlen; + + /* some debugging cruft by sklower, below, will go away soon */ + /* XXX this should be done elsewhere */ + if (m->m_flags & M_PKTHDR) { + int plen = 0; + for (m2 = m; m2; m2 = m2->m_next) + plen += m2->m_len; + m->m_pkthdr.len = plen; + } + + DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); + return (m); + + drop_fragment: + /* Oops - fail safe - drop packet */ + uma_zfree(V_pf_frent_z, frent); + m_freem(m); + return (NULL); +} + +static struct mbuf * +pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff, + int drop, int *nomem) +{ + struct mbuf *m = *m0; + struct pf_frent *frp, *fra, *cur = NULL; + int ip_len = ntohs(h->ip_len) - (h->ip_hl << 2); + u_int16_t off = ntohs(h->ip_off) << 3; + u_int16_t max = ip_len + off; + int hosed = 0; + + PF_FRAG_ASSERT(); + KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)), + ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); + + /* Create a new range queue for this packet */ + if (*frag == NULL) { + *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (*frag == NULL) { + pf_flush_fragments(); + *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); + if (*frag == NULL) + goto no_mem; + } + + /* Get an entry for the queue */ + cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (cur == NULL) { + uma_zfree(V_pf_frag_z, *frag); + *frag = NULL; + goto no_mem; + } + + (*frag)->fr_flags = PFFRAG_NOBUFFER; + (*frag)->fr_max = 0; + (*frag)->fr_src = h->ip_src; + (*frag)->fr_dst = h->ip_dst; + (*frag)->fr_p = h->ip_p; + (*frag)->fr_id = h->ip_id; + (*frag)->fr_timeout = time_uptime; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INIT(&(*frag)->fr_queue); + LIST_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next); + + RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag); + TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next); + + DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max)); + + goto pass; + } + + /* + * Find a fragment after the current one: + * - off contains the real shifted offset. 
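+ * - frp is left as the last entry whose offset is at or before off, + * fra as the first entry past it; either may be NULL at the ends.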
+ */ + frp = NULL; + LIST_FOREACH(fra, &(*frag)->fr_queue, fr_next) { + if (fra->fr_off > off) + break; + frp = fra; + } + + KASSERT((frp != NULL || fra != NULL), + ("!(frp != NULL || fra != NULL): %s", __FUNCTION__)); + + if (frp != NULL) { + int precut; + + precut = frp->fr_end - off; + if (precut >= ip_len) { + /* Fragment is entirely a duplicate */ + DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n", + h->ip_id, frp->fr_off, frp->fr_end, off, max)); + goto drop_fragment; + } + if (precut == 0) { + /* They are adjacent. Fixup cache entry */ + DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n", + h->ip_id, frp->fr_off, frp->fr_end, off, max)); + frp->fr_end = max; + } else if (precut > 0) { + /* The first part of this payload overlaps with a + * fragment that has already been passed. + * Need to trim off the first part of the payload. + * But to do so easily, we need to create another + * mbuf to throw the original header into. + */ + + DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n", + h->ip_id, precut, frp->fr_off, frp->fr_end, off, + max)); + + off += precut; + max -= precut; + /* Update the previous frag to encompass this one */ + frp->fr_end = max; + + if (!drop) { + /* XXX Optimization opportunity + * This is a very heavy way to trim the payload. + * we could do it much faster by diddling mbuf + * internals but that would be even less legible + * than this mbuf magic. For my next trick, + * I'll pull a rabbit out of my laptop. + */ + *m0 = m_dup(m, M_NOWAIT); + if (*m0 == NULL) + goto no_mem; + /* From KAME Project : We have missed this! */ + m_adj(*m0, (h->ip_hl << 2) - + (*m0)->m_pkthdr.len); + + KASSERT(((*m0)->m_next == NULL), + ("(*m0)->m_next != NULL: %s", + __FUNCTION__)); + m_adj(m, precut + (h->ip_hl << 2)); + m_cat(*m0, m); + m = *m0; + if (m->m_flags & M_PKTHDR) { + int plen = 0; + struct mbuf *t; + for (t = m; t; t = t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + + + h = mtod(m, struct ip *); + + KASSERT(((int)m->m_len == + ntohs(h->ip_len) - precut), + ("m->m_len != ntohs(h->ip_len) - precut: %s", + __FUNCTION__)); + h->ip_off = htons(ntohs(h->ip_off) + + (precut >> 3)); + h->ip_len = htons(ntohs(h->ip_len) - precut); + } else { + hosed++; + } + } else { + /* There is a gap between fragments */ + + DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n", + h->ip_id, -precut, frp->fr_off, frp->fr_end, off, + max)); + + cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (cur == NULL) + goto no_mem; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INSERT_AFTER(frp, cur, fr_next); + } + } + + if (fra != NULL) { + int aftercut; + int merge = 0; + + aftercut = max - fra->fr_off; + if (aftercut == 0) { + /* Adjacent fragments */ + DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n", + h->ip_id, off, max, fra->fr_off, fra->fr_end)); + fra->fr_off = off; + merge = 1; + } else if (aftercut > 0) { + /* Need to chop off the tail of this fragment */ + DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n", + h->ip_id, aftercut, off, max, fra->fr_off, + fra->fr_end)); + fra->fr_off = off; + max -= aftercut; + + merge = 1; + + if (!drop) { + m_adj(m, -aftercut); + if (m->m_flags & M_PKTHDR) { + int plen = 0; + struct mbuf *t; + for (t = m; t; t = t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + h = mtod(m, struct ip *); + KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut), + ("m->m_len != ntohs(h->ip_len) - aftercut: %s", + __FUNCTION__)); + h->ip_len = htons(ntohs(h->ip_len) - aftercut); + } else { + hosed++; + } + } else if (frp == NULL) { + /* There 
is a gap between fragments */ + DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n", + h->ip_id, -aftercut, off, max, fra->fr_off, + fra->fr_end)); + + cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (cur == NULL) + goto no_mem; + + cur->fr_off = off; + cur->fr_end = max; + LIST_INSERT_BEFORE(fra, cur, fr_next); + } + + + /* Need to glue together two separate fragment descriptors */ + if (merge) { + if (cur && fra->fr_off <= cur->fr_end) { + /* Need to merge in a previous 'cur' */ + DPFPRINTF(("fragcache[%d]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + h->ip_id, cur->fr_off, cur->fr_end, off, + max, fra->fr_off, fra->fr_end)); + fra->fr_off = cur->fr_off; + LIST_REMOVE(cur, fr_next); + uma_zfree(V_pf_frent_z, cur); + cur = NULL; + + } else if (frp && fra->fr_off <= frp->fr_end) { + /* Need to merge in a modified 'frp' */ + KASSERT((cur == NULL), ("cur != NULL: %s", + __FUNCTION__)); + DPFPRINTF(("fragcache[%d]: adjacent(merge " + "%d-%d) %d-%d (%d-%d)\n", + h->ip_id, frp->fr_off, frp->fr_end, off, + max, fra->fr_off, fra->fr_end)); + fra->fr_off = frp->fr_off; + LIST_REMOVE(frp, fr_next); + uma_zfree(V_pf_frent_z, frp); + frp = NULL; + + } + } + } + + if (hosed) { + /* + * We must keep tracking the overall fragment even when + * we're going to drop it anyway so that we know when to + * free the overall descriptor. Thus we drop the frag late. + */ + goto drop_fragment; + } + + + pass: + /* Update maximum data size */ + if ((*frag)->fr_max < max) + (*frag)->fr_max = max; + + /* This is the last segment */ + if (!mff) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + /* Check if we are completely reassembled */ + if (((*frag)->fr_flags & PFFRAG_SEENLAST) && + LIST_FIRST(&(*frag)->fr_queue)->fr_off == 0 && + LIST_FIRST(&(*frag)->fr_queue)->fr_end == (*frag)->fr_max) { + /* Remove from fragment queue */ + DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id, + (*frag)->fr_max)); + pf_free_fragment(*frag); + *frag = NULL; + } + + return (m); + + no_mem: + *nomem = 1; + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + m_freem(m); + return (NULL); + + drop_fragment: + + /* Still need to pay attention to !IP_MF */ + if (!mff && *frag != NULL) + (*frag)->fr_flags |= PFFRAG_SEENLAST; + + if (drop) { + /* This fragment has been deemed bad. 
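The caller passed drop because a PFRULE_FRAGDROP scrub rule matched, so poison the whole reassembly via PFFRAG_DROP.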
Don't reass */ + if (((*frag)->fr_flags & PFFRAG_DROP) == 0) + DPFPRINTF(("fragcache[%d]: dropping overall fragment\n", + h->ip_id)); + (*frag)->fr_flags |= PFFRAG_DROP; + } + + m_freem(m); + return (NULL); +} + +int +pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, + struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct pf_frent *frent; + struct pf_fragment *frag = NULL; + struct ip *h = mtod(m, struct ip *); + int mff = (ntohs(h->ip_off) & IP_MF); + int hlen = h->ip_hl << 2; + u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + u_int16_t max; + int ip_len; + int ip_off; + int tag = -1; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != h->ip_p) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip_src.s_addr, AF_INET, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->match_tag && !pf_match_tag(m, r, &tag, + pd->pf_mtag ? pd->pf_mtag->tag : 0)) + r = TAILQ_NEXT(r, entries); + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (hlen < (int)sizeof(struct ip)) + goto drop; + + if (hlen > ntohs(h->ip_len)) + goto drop; + + /* Clear IP_DF if the rule uses the no-df option */ + if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(~IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* We will need other tests here */ + if (!fragoff && !mff) + goto no_fragment; + + /* We're dealing with a fragment now. Don't allow fragments + * with IP_DF to enter the cache. If the flag was cleared by + * no-df above, fine. Otherwise drop it. 
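+ * A fragment carrying IP_DF is self-contradictory anyway, since DF + * asks routers not to fragment the packet (RFC 791).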
+ */ + if (h->ip_off & htons(IP_DF)) { + DPFPRINTF(("IP_DF\n")); + goto bad; + } + + ip_len = ntohs(h->ip_len) - hlen; + ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; + + /* All fragments are 8 byte aligned */ + if (mff && (ip_len & 0x7)) { + DPFPRINTF(("mff and %d\n", ip_len)); + goto bad; + } + + /* Respect maximum length */ + if (fragoff + ip_len > IP_MAXPACKET) { + DPFPRINTF(("max packet %d\n", fragoff + ip_len)); + goto bad; + } + max = fragoff + ip_len; + + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { + + /* Fully buffer all of the fragments */ + PF_FRAG_LOCK(); + frag = pf_find_fragment(h, &V_pf_frag_tree); + + /* Check if we saw the last fragment already */ + if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && + max > frag->fr_max) + goto bad; + + /* Get an entry for the fragment queue */ + frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); + if (frent == NULL) { + PF_FRAG_UNLOCK(); + REASON_SET(reason, PFRES_MEMORY); + return (PF_DROP); + } + frent->fr_ip = h; + frent->fr_m = m; + + /* Might return a completely reassembled mbuf, or NULL */ + DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); + *m0 = m = pf_reassemble(m0, &frag, frent, mff); + PF_FRAG_UNLOCK(); + + if (m == NULL) + return (PF_DROP); + + /* use mtag from concatenated mbuf chain */ + pd->pf_mtag = pf_find_mtag(m); +#ifdef DIAGNOSTIC + if (pd->pf_mtag == NULL) { + printf("%s: pf_find_mtag returned NULL(1)\n", __func__); + if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { + m_freem(m); + *m0 = NULL; + goto no_mem; + } + } +#endif + if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) + goto drop; + + h = mtod(m, struct ip *); + } else { + /* non-buffering fragment cache (drops or masks overlaps) */ + int nomem = 0; + + if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) { + /* + * Already passed the fragment cache in the + * input direction. If we continued, it would + * appear to be a dup and would be dropped. + */ + goto fragment_pass; + } + + PF_FRAG_LOCK(); + frag = pf_find_fragment(h, &V_pf_cache_tree); + + /* Check if we saw the last fragment already */ + if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && + max > frag->fr_max) { + if (r->rule_flag & PFRULE_FRAGDROP) + frag->fr_flags |= PFFRAG_DROP; + goto bad; + } + + *m0 = m = pf_fragcache(m0, h, &frag, mff, + (r->rule_flag & PFRULE_FRAGDROP) ? 
1 : 0, &nomem); + PF_FRAG_UNLOCK(); + if (m == NULL) { + if (nomem) + goto no_mem; + goto drop; + } + + /* use mtag from copied and trimmed mbuf chain */ + pd->pf_mtag = pf_find_mtag(m); +#ifdef DIAGNOSTIC + if (pd->pf_mtag == NULL) { + printf("%s: pf_find_mtag returned NULL(2)\n", __func__); + if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { + m_freem(m); + *m0 = NULL; + goto no_mem; + } + } +#endif + if (dir == PF_IN) + pd->pf_mtag->flags |= PF_TAG_FRAGCACHE; + + if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) + goto drop; + goto fragment_pass; + } + + no_fragment: + /* At this point, only IP_DF is allowed in ip_off */ + if (h->ip_off & ~htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* not missing a return here */ + + fragment_pass: + pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos); + + if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) + pd->flags |= PFDESC_IP_REAS; + return (PF_PASS); + + no_mem: + REASON_SET(reason, PFRES_MEMORY); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); + + drop: + REASON_SET(reason, PFRES_NORM); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); + + bad: + DPFPRINTF(("dropping bad fragment\n")); + + /* Free associated fragments */ + if (frag != NULL) { + pf_free_fragment(frag); + PF_FRAG_UNLOCK(); + } + + REASON_SET(reason, PFRES_FRAG); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, + 1); + + return (PF_DROP); +} +#endif + +#ifdef INET6 +int +pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, + u_short *reason, struct pf_pdesc *pd) +{ + struct mbuf *m = *m0; + struct pf_rule *r; + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + int off; + struct ip6_ext ext; + struct ip6_opt opt; + struct ip6_opt_jumbo jumbo; + struct ip6_frag frag; + u_int32_t jumbolen = 0, plen; + u_int16_t fragoff = 0; + int optend; + int ooff; + u_int8_t proto; + int terminal; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != AF_INET6) + r = r->skip[PF_SKIP_AF].ptr; +#if 0 /* header chain! 
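r->proto can't be checked against ip6_nxt here: ip6_nxt may name an extension header rather than the upper-layer protocol, so the match is only valid after the header chain below has been walked.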
*/ + else if (r->proto && r->proto != h->ip6_nxt) + r = r->skip[PF_SKIP_PROTO].ptr; +#endif + else if (PF_MISMATCHAW(&r->src.addr, + (struct pf_addr *)&h->ip6_src, AF_INET6, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, + (struct pf_addr *)&h->ip6_dst, AF_INET6, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else + break; + } + + if (r == NULL || r->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + /* Check for illegal packets */ + if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) + goto drop; + + off = sizeof(struct ip6_hdr); + proto = h->ip6_nxt; + terminal = 0; + do { + switch (proto) { + case IPPROTO_FRAGMENT: + goto fragment; + break; + case IPPROTO_AH: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + if (proto == IPPROTO_AH) + off += (ext.ip6e_len + 2) * 4; + else + off += (ext.ip6e_len + 1) * 8; + proto = ext.ip6e_nxt; + break; + case IPPROTO_HOPOPTS: + if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, + NULL, AF_INET6)) + goto shortpkt; + optend = off + (ext.ip6e_len + 1) * 8; + ooff = off + sizeof(ext); + do { + if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, + sizeof(opt.ip6o_type), NULL, NULL, + AF_INET6)) + goto shortpkt; + if (opt.ip6o_type == IP6OPT_PAD1) { + ooff++; + continue; + } + if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), + NULL, NULL, AF_INET6)) + goto shortpkt; + if (ooff + sizeof(opt) + opt.ip6o_len > optend) + goto drop; + switch (opt.ip6o_type) { + case IP6OPT_JUMBO: + if (h->ip6_plen != 0) + goto drop; + if (!pf_pull_hdr(m, ooff, &jumbo, + sizeof(jumbo), NULL, NULL, + AF_INET6)) + goto shortpkt; + memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, + sizeof(jumbolen)); + jumbolen = ntohl(jumbolen); + if (jumbolen <= IPV6_MAXPACKET) + goto drop; + if (sizeof(struct ip6_hdr) + jumbolen != + m->m_pkthdr.len) + goto drop; + break; + default: + break; + } + ooff += sizeof(opt) + opt.ip6o_len; + } while (ooff < optend); + + off = optend; + proto = ext.ip6e_nxt; + break; + default: + terminal = 1; + break; + } + } while (!terminal); + + /* jumbo payload option must be present, or plen > 0 */ + if (ntohs(h->ip6_plen) == 0) + plen = jumbolen; + else + plen = ntohs(h->ip6_plen); + if (plen == 0) + goto drop; + if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) + goto shortpkt; + + pf_scrub_ip6(&m, r->min_ttl); + + return (PF_PASS); + + fragment: + if (ntohs(h->ip6_plen) == 0 || jumbolen) + goto drop; + plen = ntohs(h->ip6_plen); + + if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) + goto shortpkt; + fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK); + if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET) + goto badfrag; + + /* do something about it */ + /* remember to set pd->flags |= PFDESC_IP_REAS */ + return (PF_PASS); + + shortpkt: + REASON_SET(reason, PFRES_SHORT); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); + + drop: + REASON_SET(reason, PFRES_NORM); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); + + badfrag: + REASON_SET(reason, PFRES_FRAG); + if (r != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); +} +#endif /* INET6 */ + +int +pf_normalize_tcp(int dir, struct pfi_kif *kif, struct 
mbuf *m, int ipoff, + int off, void *h, struct pf_pdesc *pd) +{ + struct pf_rule *r, *rm = NULL; + struct tcphdr *th = pd->hdr.tcp; + int rewrite = 0; + u_short reason; + u_int8_t flags; + sa_family_t af = pd->af; + + PF_RULES_RASSERT(); + + r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); + while (r != NULL) { + r->evaluations++; + if (pfi_kif_match(r->kif, kif) == r->ifnot) + r = r->skip[PF_SKIP_IFP].ptr; + else if (r->direction && r->direction != dir) + r = r->skip[PF_SKIP_DIR].ptr; + else if (r->af && r->af != af) + r = r->skip[PF_SKIP_AF].ptr; + else if (r->proto && r->proto != pd->proto) + r = r->skip[PF_SKIP_PROTO].ptr; + else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, + r->src.neg, kif, M_GETFIB(m))) + r = r->skip[PF_SKIP_SRC_ADDR].ptr; + else if (r->src.port_op && !pf_match_port(r->src.port_op, + r->src.port[0], r->src.port[1], th->th_sport)) + r = r->skip[PF_SKIP_SRC_PORT].ptr; + else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, + r->dst.neg, NULL, M_GETFIB(m))) + r = r->skip[PF_SKIP_DST_ADDR].ptr; + else if (r->dst.port_op && !pf_match_port(r->dst.port_op, + r->dst.port[0], r->dst.port[1], th->th_dport)) + r = r->skip[PF_SKIP_DST_PORT].ptr; + else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( + pf_osfp_fingerprint(pd, m, off, th), + r->os_fingerprint)) + r = TAILQ_NEXT(r, entries); + else { + rm = r; + break; + } + } + + if (rm == NULL || rm->action == PF_NOSCRUB) + return (PF_PASS); + else { + r->packets[dir == PF_OUT]++; + r->bytes[dir == PF_OUT] += pd->tot_len; + } + + if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) + pd->flags |= PFDESC_TCP_NORM; + + flags = th->th_flags; + if (flags & TH_SYN) { + /* Illegal packet */ + if (flags & TH_RST) + goto tcp_drop; + + if (flags & TH_FIN) + flags &= ~TH_FIN; + } else { + /* Illegal packet */ + if (!(flags & (TH_ACK|TH_RST))) + goto tcp_drop; + } + + if (!(flags & TH_ACK)) { + /* These flags are only valid if ACK is set */ + if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) + goto tcp_drop; + } + + /* Check for illegal header length */ + if (th->th_off < (sizeof(struct tcphdr) >> 2)) + goto tcp_drop; + + /* If flags changed, or reserved data set, then adjust */ + if (flags != th->th_flags || th->th_x2 != 0) { + u_int16_t ov, nv; + + ov = *(u_int16_t *)(&th->th_ack + 1); + th->th_flags = flags; + th->th_x2 = 0; + nv = *(u_int16_t *)(&th->th_ack + 1); + + th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0); + rewrite = 1; + } + + /* Remove urgent pointer, if TH_URG is not set */ + if (!(flags & TH_URG) && th->th_urp) { + th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0); + th->th_urp = 0; + rewrite = 1; + } + + /* Process options */ + if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) + rewrite = 1; + + /* copy back packet headers if we sanitized */ + if (rewrite) + m_copyback(m, off, sizeof(*th), (caddr_t)th); + + return (PF_PASS); + + tcp_drop: + REASON_SET(&reason, PFRES_NORM); + if (rm != NULL && r->log) + PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd, + 1); + return (PF_DROP); +} + +int +pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, + struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) +{ + u_int32_t tsval, tsecr; + u_int8_t hdr[60]; + u_int8_t *opt; + + KASSERT((src->scrub == NULL), + ("pf_normalize_tcp_init: src->scrub != NULL")); + + src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); + if (src->scrub == NULL) + return (1); + + switch (pd->af) { +#ifdef INET + case AF_INET: { + struct ip *h = mtod(m, 
struct ip *); + src->scrub->pfss_ttl = h->ip_ttl; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + src->scrub->pfss_ttl = h->ip6_hlim; + break; + } +#endif /* INET6 */ + } + + + /* + * All normalizations below are only begun if we see the start of + * the connections. They must all set an enabled bit in pfss_flags + */ + if ((th->th_flags & TH_SYN) == 0) + return (0); + + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + if (opt[1] >= TCPOLEN_TIMESTAMP) { + src->scrub->pfss_flags |= + PFSS_TIMESTAMP; + src->scrub->pfss_ts_mod = + htonl(arc4random()); + + /* note PFSS_PAWS not set yet */ + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + src->scrub->pfss_tsval0 = ntohl(tsval); + src->scrub->pfss_tsval = ntohl(tsval); + src->scrub->pfss_tsecr = ntohl(tsecr); + getmicrouptime(&src->scrub->pfss_last); + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + } + + return (0); +} + +void +pf_normalize_tcp_cleanup(struct pf_state *state) +{ + if (state->src.scrub) + uma_zfree(V_pf_state_scrub_z, state->src.scrub); + if (state->dst.scrub) + uma_zfree(V_pf_state_scrub_z, state->dst.scrub); + + /* Someday... flush the TCP segment reassembly descriptors. */ +} + +int +pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, + u_short *reason, struct tcphdr *th, struct pf_state *state, + struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) +{ + struct timeval uptime; + u_int32_t tsval, tsecr; + u_int tsval_from_last; + u_int8_t hdr[60]; + u_int8_t *opt; + int copyback = 0; + int got_ts = 0; + + KASSERT((src->scrub || dst->scrub), + ("%s: src->scrub && dst->scrub!", __func__)); + + /* + * Enforce the minimum TTL seen for this connection. Negate a common + * technique to evade an intrusion detection system and confuse + * firewall state code. + */ + switch (pd->af) { +#ifdef INET + case AF_INET: { + if (src->scrub) { + struct ip *h = mtod(m, struct ip *); + if (h->ip_ttl > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip_ttl; + h->ip_ttl = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: { + if (src->scrub) { + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + if (h->ip6_hlim > src->scrub->pfss_ttl) + src->scrub->pfss_ttl = h->ip6_hlim; + h->ip6_hlim = src->scrub->pfss_ttl; + } + break; + } +#endif /* INET6 */ + } + + if (th->th_off > (sizeof(struct tcphdr) >> 2) && + ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || + (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && + pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { + /* Diddle with TCP options */ + int hlen; + opt = hdr + sizeof(struct tcphdr); + hlen = (th->th_off << 2) - sizeof(struct tcphdr); + while (hlen >= TCPOLEN_TIMESTAMP) { + switch (*opt) { + case TCPOPT_EOL: /* FALLTHROUGH */ + case TCPOPT_NOP: + opt++; + hlen--; + break; + case TCPOPT_TIMESTAMP: + /* Modulate the timestamps. Can be used for + * NAT detection, OS uptime determination or + * reboot detection. + */ + + if (got_ts) { + /* Huh? 
Multiple timestamps!? */ + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("multiple TS??")); + pf_print_state(state); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + if (opt[1] >= TCPOLEN_TIMESTAMP) { + memcpy(&tsval, &opt[2], + sizeof(u_int32_t)); + if (tsval && src->scrub && + (src->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsval = ntohl(tsval); + pf_change_a(&opt[2], + &th->th_sum, + htonl(tsval + + src->scrub->pfss_ts_mod), + 0); + copyback = 1; + } + + /* Modulate TS reply iff valid (!0) */ + memcpy(&tsecr, &opt[6], + sizeof(u_int32_t)); + if (tsecr && dst->scrub && + (dst->scrub->pfss_flags & + PFSS_TIMESTAMP)) { + tsecr = ntohl(tsecr) + - dst->scrub->pfss_ts_mod; + pf_change_a(&opt[6], + &th->th_sum, htonl(tsecr), + 0); + copyback = 1; + } + got_ts = 1; + } + /* FALLTHROUGH */ + default: + hlen -= MAX(opt[1], 2); + opt += MAX(opt[1], 2); + break; + } + } + if (copyback) { + /* Copyback the options, caller copys back header */ + *writeback = 1; + m_copyback(m, off + sizeof(struct tcphdr), + (th->th_off << 2) - sizeof(struct tcphdr), hdr + + sizeof(struct tcphdr)); + } + } + + + /* + * Must invalidate PAWS checks on connections idle for too long. + * The fastest allowed timestamp clock is 1ms. That turns out to + * be about 24 days before it wraps. XXX Right now our lowerbound + * TS echo check only works for the first 12 days of a connection + * when the TS has exhausted half its 32bit space + */ +#define TS_MAX_IDLE (24*24*60*60) +#define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ + + getmicrouptime(&uptime); + if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && + (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || + time_uptime - state->creation > TS_MAX_CONN)) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("src idled out of PAWS\n")); + pf_print_state(state); + printf("\n"); + } + src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) + | PFSS_PAWS_IDLED; + } + if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && + uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("dst idled out of PAWS\n")); + pf_print_state(state); + printf("\n"); + } + dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) + | PFSS_PAWS_IDLED; + } + + if (got_ts && src->scrub && dst->scrub && + (src->scrub->pfss_flags & PFSS_PAWS) && + (dst->scrub->pfss_flags & PFSS_PAWS)) { + /* Validate that the timestamps are "in-window". + * RFC1323 describes TCP Timestamp options that allow + * measurement of RTT (round trip time) and PAWS + * (protection against wrapped sequence numbers). PAWS + * gives us a set of rules for rejecting packets on + * long fat pipes (packets that were somehow delayed + * in transit longer than the time it took to send the + * full TCP sequence space of 4Gb). We can use these + * rules and infer a few others that will let us treat + * the 32bit timestamp and the 32bit echoed timestamp + * as sequence numbers to prevent a blind attacker from + * inserting packets into a connection. + * + * RFC1323 tells us: + * - The timestamp on this packet must be greater than + * or equal to the last value echoed by the other + * endpoint. The RFC says those will be discarded + * since it is a dup that has already been acked. + * This gives us a lowerbound on the timestamp. 
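 *   For example: if the peer last echoed timestamp 5000, a segment
 *   arriving now with tsval 4000 is either an ancient duplicate or a
 *   forgery, and is exactly what the SEQ_LT(tsval,
 *   dst->scrub->pfss_tsecr) test below rejects.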
+ * timestamp >= other last echoed timestamp + * - The timestamp will be less than or equal to + * the last timestamp plus the time between the + * last packet and now. The RFC defines the max + * clock rate as 1ms. We will allow clocks to be + * up to 10% fast and will allow a total difference + * or 30 seconds due to a route change. And this + * gives us an upperbound on the timestamp. + * timestamp <= last timestamp + max ticks + * We have to be careful here. Windows will send an + * initial timestamp of zero and then initialize it + * to a random value after the 3whs; presumably to + * avoid a DoS by having to call an expensive RNG + * during a SYN flood. Proof MS has at least one + * good security geek. + * + * - The TCP timestamp option must also echo the other + * endpoints timestamp. The timestamp echoed is the + * one carried on the earliest unacknowledged segment + * on the left edge of the sequence window. The RFC + * states that the host will reject any echoed + * timestamps that were larger than any ever sent. + * This gives us an upperbound on the TS echo. + * tescr <= largest_tsval + * - The lowerbound on the TS echo is a little more + * tricky to determine. The other endpoint's echoed + * values will not decrease. But there may be + * network conditions that re-order packets and + * cause our view of them to decrease. For now the + * only lowerbound we can safely determine is that + * the TS echo will never be less than the original + * TS. XXX There is probably a better lowerbound. + * Remove TS_MAX_CONN with better lowerbound check. + * tescr >= other original TS + * + * It is also important to note that the fastest + * timestamp clock of 1ms will wrap its 32bit space in + * 24 days. So we just disable TS checking after 24 + * days of idle time. We actually must use a 12d + * connection limit until we can come up with a better + * lowerbound to the TS echo check. + */ + struct timeval delta_ts; + int ts_fudge; + + + /* + * PFTM_TS_DIFF is how many seconds of leeway to allow + * a host's timestamp. This can happen if the previous + * packet got delayed in transit for much longer than + * this packet. + */ + if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) + ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; + + /* Calculate max ticks since the last timestamp */ +#define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */ +#define TS_MICROSECS 1000000 /* microseconds per second */ + delta_ts = uptime; + timevalsub(&delta_ts, &src->scrub->pfss_last); + tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; + tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); + + if ((src->state >= TCPS_ESTABLISHED && + dst->state >= TCPS_ESTABLISHED) && + (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || + SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || + (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || + SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { + /* Bad RFC1323 implementation or an insertion attack. + * + * - Solaris 2.6 and 2.7 are known to send another ACK + * after the FIN,FIN|ACK,ACK closing that carries + * an old timestamp. + */ + + DPFPRINTF(("Timestamp failed %c%c%c%c\n", + SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', + SEQ_GT(tsval, src->scrub->pfss_tsval + + tsval_from_last) ? '1' : ' ', + SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', + SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? 
'3' : ' ')); + DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " + "idle: %jus %lums\n", + tsval, tsecr, tsval_from_last, + (uintmax_t)delta_ts.tv_sec, + delta_ts.tv_usec / 1000)); + DPFPRINTF((" src->tsval: %u tsecr: %u\n", + src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); + DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" + "\n", dst->scrub->pfss_tsval, + dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); + if (V_pf_status.debug >= PF_DEBUG_MISC) { + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + + /* XXX I'd really like to require tsecr but it's optional */ + + } else if (!got_ts && (th->th_flags & TH_RST) == 0 && + ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) + || pd->p_len > 0 || (th->th_flags & TH_SYN)) && + src->scrub && dst->scrub && + (src->scrub->pfss_flags & PFSS_PAWS) && + (dst->scrub->pfss_flags & PFSS_PAWS)) { + /* Didn't send a timestamp. Timestamps aren't really useful + * when: + * - connection opening or closing (often not even sent). + * but we must not let an attacker to put a FIN on a + * data packet to sneak it through our ESTABLISHED check. + * - on a TCP reset. RFC suggests not even looking at TS. + * - on an empty ACK. The TS will not be echoed so it will + * probably not help keep the RTT calculation in sync and + * there isn't as much danger when the sequence numbers + * got wrapped. So some stacks don't include TS on empty + * ACKs :-( + * + * To minimize the disruption to mostly RFC1323 conformant + * stacks, we will only require timestamps on data packets. + * + * And what do ya know, we cannot require timestamps on data + * packets. There appear to be devices that do legitimate + * TCP connection hijacking. There are HTTP devices that allow + * a 3whs (with timestamps) and then buffer the HTTP request. + * If the intermediate device has the HTTP response cache, it + * will spoof the response but not bother timestamping its + * packets. So we can look for the presence of a timestamp in + * the first data packet and if there, require it in all future + * packets. + */ + + if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { + /* + * Hey! Someone tried to sneak a packet in. Or the + * stack changed its RFC1323 behavior?!?! + */ + if (V_pf_status.debug >= PF_DEBUG_MISC) { + DPFPRINTF(("Did not receive expected RFC1323 " + "timestamp\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + REASON_SET(reason, PFRES_TS); + return (PF_DROP); + } + } + + + /* + * We will note if a host sends his data packets with or without + * timestamps. And require all data packets to contain a timestamp + * if the first does. PAWS implicitly requires that all data packets be + * timestamped. But I think there are middle-man devices that hijack + * TCP streams immediately after the 3whs and don't timestamp their + * packets (seen in a WWW accelerator or cache). + */ + if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & + (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { + if (got_ts) + src->scrub->pfss_flags |= PFSS_DATA_TS; + else { + src->scrub->pfss_flags |= PFSS_DATA_NOTS; + if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && + (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { + /* Don't warn if other host rejected RFC1323 */ + DPFPRINTF(("Broken RFC1323 stack did not " + "timestamp data packet. 
Disabled PAWS " + "security.\n")); + pf_print_state(state); + pf_print_flags(th->th_flags); + printf("\n"); + } + } + } + + + /* + * Update PAWS values + */ + if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & + (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { + getmicrouptime(&src->scrub->pfss_last); + if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsval = tsval; + + if (tsecr) { + if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || + (src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_tsecr = tsecr; + + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && + (SEQ_LT(tsval, src->scrub->pfss_tsval0) || + src->scrub->pfss_tsval0 == 0)) { + /* tsval0 MUST be the lowest timestamp */ + src->scrub->pfss_tsval0 = tsval; + } + + /* Only fully initialized after a TS gets echoed */ + if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) + src->scrub->pfss_flags |= PFSS_PAWS; + } + } + + /* I have a dream.... TCP segment reassembly.... */ + return (0); +} + +static int +pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, + int off, sa_family_t af) +{ + u_int16_t *mss; + int thoff; + int opt, cnt, optlen = 0; + int rewrite = 0; + u_char opts[TCP_MAXOLEN]; + u_char *optp = opts; + + thoff = th->th_off << 2; + cnt = thoff - sizeof(struct tcphdr); + + if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, + NULL, NULL, af)) + return (rewrite); + + for (; cnt > 0; cnt -= optlen, optp += optlen) { + opt = optp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = optp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + mss = (u_int16_t *)(optp + 2); + if ((ntohs(*mss)) > r->max_mss) { + th->th_sum = pf_cksum_fixup(th->th_sum, + *mss, htons(r->max_mss), 0); + *mss = htons(r->max_mss); + rewrite = 1; + } + break; + default: + break; + } + } + + if (rewrite) + m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); + + return (rewrite); +} + +#ifdef INET +static void +pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos) +{ + struct mbuf *m = *m0; + struct ip *h = mtod(m, struct ip *); + + /* Clear IP_DF if no-df was requested */ + if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) { + u_int16_t ip_off = h->ip_off; + + h->ip_off &= htons(~IP_DF); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); + } + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (min_ttl && h->ip_ttl < min_ttl) { + u_int16_t ip_ttl = h->ip_ttl; + + h->ip_ttl = min_ttl; + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); + } + + /* Enforce tos */ + if (flags & PFRULE_SET_TOS) { + u_int16_t ov, nv; + + ov = *(u_int16_t *)h; + h->ip_tos = tos; + nv = *(u_int16_t *)h; + + h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); + } + + /* random-id, but not for fragments */ + if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { + u_int16_t ip_id = h->ip_id; + + h->ip_id = ip_randomid(); + h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); + } +} +#endif /* INET */ + +#ifdef INET6 +static void +pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl) +{ + struct mbuf *m = *m0; + struct ip6_hdr *h = mtod(m, struct ip6_hdr *); + + /* Enforce a minimum ttl, may cause endless packet loops */ + if (min_ttl && h->ip6_hlim < min_ttl) + h->ip6_hlim = min_ttl; +} +#endif diff --git a/sys/netpfil/pf/pf_osfp.c b/sys/netpfil/pf/pf_osfp.c new file mode 100644 index 0000000..29d4a40 
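An aside on a pattern used throughout pf_norm.c above: pf_scrub_ip() and pf_normalize_tcpopt() never recompute a checksum over the whole packet; each time a 16-bit header word changes they repair the existing sum with pf_cksum_fixup(). The helper below is a minimal sketch of that incremental update (RFC 1624, eq. 3). The name cksum_fixup16 and the three-argument form are inventions for this sketch; pf's pf_cksum_fixup() takes a fourth argument, 0 in every call above, covering UDP's zero-means-no-checksum case.

	/*
	 * Sketch: incremental Internet checksum update (RFC 1624, eq. 3):
	 *
	 *	HC' = ~(~HC + ~m + m')
	 *
	 * where HC is the old checksum, m the old 16-bit word and m' its
	 * replacement.
	 */
	static u_int16_t
	cksum_fixup16(u_int16_t cksum, u_int16_t old, u_int16_t new)
	{
		u_int32_t l;

		l = (u_int32_t)(u_int16_t)~cksum + (u_int16_t)~old + new;
		l = (l >> 16) + (l & 0xffff);	/* fold carries back in, */
		l = (l >> 16) + (l & 0xffff);	/* possibly twice */
		return (~(u_int16_t)l);
	}

With it, the IP_DF-clearing step of pf_scrub_ip() would read: h->ip_sum = cksum_fixup16(h->ip_sum, ip_off, h->ip_off), where ip_off holds the pre-modification value, exactly as the original code saves it.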
--- /dev/null +++ b/sys/netpfil/pf/pf_osfp.c @@ -0,0 +1,526 @@ +/* $OpenBSD: pf_osfp.c,v 1.14 2008/06/12 18:17:01 henning Exp $ */ + +/* + * Copyright (c) 2003 Mike Frantzen <frantzen@w4g.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <net/if.h> +#include <net/pfvar.h> + +#include <netinet/ip6.h> +#include <netinet6/in6_var.h> + +static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints"); +#define DPFPRINTF(format, x...) \ + if (V_pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) + +SLIST_HEAD(pf_osfp_list, pf_os_fingerprint); +static VNET_DEFINE(struct pf_osfp_list, pf_osfp_list) = + SLIST_HEAD_INITIALIZER(); +#define V_pf_osfp_list VNET(pf_osfp_list) + +static struct pf_osfp_enlist *pf_osfp_fingerprint_hdr(const struct ip *, + const struct ip6_hdr *, + const struct tcphdr *); +static struct pf_os_fingerprint *pf_osfp_find(struct pf_osfp_list *, + struct pf_os_fingerprint *, u_int8_t); +static struct pf_os_fingerprint *pf_osfp_find_exact(struct pf_osfp_list *, + struct pf_os_fingerprint *); +static void pf_osfp_insert(struct pf_osfp_list *, + struct pf_os_fingerprint *); +#ifdef PFDEBUG +static struct pf_os_fingerprint *pf_osfp_validate(void); +#endif + +/* + * Passively fingerprint the OS of the host (IPv4 TCP SYN packets only) + * Returns the list of possible OSes. 
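 * (Despite the IPv4-only wording above, the body below also accepts
 * IPv6 SYNs and marks them PF_OSFP_INET6.)  The TCP header and its
 * options are pulled into a local buffer and handed to
 * pf_osfp_fingerprint_hdr() for the actual matching; NULL is returned
 * when the packet is not TCP or the header cannot be pulled.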
+ */ +struct pf_osfp_enlist * +pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off, + const struct tcphdr *tcp) +{ + struct ip *ip; + struct ip6_hdr *ip6; + char hdr[60]; + + if ((pd->af != PF_INET && pd->af != PF_INET6) || + pd->proto != IPPROTO_TCP || (tcp->th_off << 2) < sizeof(*tcp)) + return (NULL); + + if (pd->af == PF_INET) { + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)NULL; + } else { + ip = (struct ip *)NULL; + ip6 = mtod(m, struct ip6_hdr *); + } + if (!pf_pull_hdr(m, off, hdr, tcp->th_off << 2, NULL, NULL, + pd->af)) return (NULL); + + return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr)); +} + +static struct pf_osfp_enlist * +pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const struct tcphdr *tcp) +{ + struct pf_os_fingerprint fp, *fpresult; + int cnt, optlen = 0; + const u_int8_t *optp; + char srcname[128]; + + if ((tcp->th_flags & (TH_SYN|TH_ACK)) != TH_SYN) + return (NULL); + if (ip) { + if ((ip->ip_off & htons(IP_OFFMASK)) != 0) + return (NULL); + } + + memset(&fp, 0, sizeof(fp)); + + if (ip) { + fp.fp_psize = ntohs(ip->ip_len); + fp.fp_ttl = ip->ip_ttl; + if (ip->ip_off & htons(IP_DF)) + fp.fp_flags |= PF_OSFP_DF; + strlcpy(srcname, inet_ntoa(ip->ip_src), sizeof(srcname)); + } +#ifdef INET6 + else if (ip6) { + /* jumbo payload? */ + fp.fp_psize = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); + fp.fp_ttl = ip6->ip6_hlim; + fp.fp_flags |= PF_OSFP_DF; + fp.fp_flags |= PF_OSFP_INET6; + strlcpy(srcname, ip6_sprintf((struct in6_addr *)&ip6->ip6_src), + sizeof(srcname)); + } +#endif + else + return (NULL); + fp.fp_wsize = ntohs(tcp->th_win); + + + cnt = (tcp->th_off << 2) - sizeof(*tcp); + optp = (const u_int8_t *)((const char *)tcp + sizeof(*tcp)); + for (; cnt > 0; cnt -= optlen, optp += optlen) { + if (*optp == TCPOPT_EOL) + break; + + fp.fp_optcnt++; + if (*optp == TCPOPT_NOP) { + fp.fp_tcpopts = (fp.fp_tcpopts << PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_NOP; + optlen = 1; + } else { + if (cnt < 2) + return (NULL); + optlen = optp[1]; + if (optlen > cnt || optlen < 2) + return (NULL); + switch (*optp) { + case TCPOPT_MAXSEG: + if (optlen >= TCPOLEN_MAXSEG) + memcpy(&fp.fp_mss, &optp[2], + sizeof(fp.fp_mss)); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_MSS; + NTOHS(fp.fp_mss); + break; + case TCPOPT_WINDOW: + if (optlen >= TCPOLEN_WINDOW) + memcpy(&fp.fp_wscale, &optp[2], + sizeof(fp.fp_wscale)); + NTOHS(fp.fp_wscale); + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | + PF_OSFP_TCPOPT_WSCALE; + break; + case TCPOPT_SACK_PERMITTED: + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_SACK; + break; + case TCPOPT_TIMESTAMP: + if (optlen >= TCPOLEN_TIMESTAMP) { + u_int32_t ts; + memcpy(&ts, &optp[2], sizeof(ts)); + if (ts == 0) + fp.fp_flags |= PF_OSFP_TS0; + + } + fp.fp_tcpopts = (fp.fp_tcpopts << + PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_TS; + break; + default: + return (NULL); + } + } + optlen = MAX(optlen, 1); /* paranoia */ + } + + DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) " + "(TS=%s,M=%s%d,W=%s%d)\n", + srcname, ntohs(tcp->th_sport), + fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0, + fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt, + (fp.fp_flags & PF_OSFP_TS0) ? "0" : "", + (fp.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fp.fp_mss, + (fp.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fp.fp_flags & PF_OSFP_WSCALE_DC) ? 
"*" : "", + fp.fp_wscale); + + if ((fpresult = pf_osfp_find(&V_pf_osfp_list, &fp, + PF_OSFP_MAXTTL_OFFSET))) + return (&fpresult->fp_oses); + return (NULL); +} + +/* Match a fingerprint ID against a list of OSes */ +int +pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os) +{ + struct pf_osfp_entry *entry; + int os_class, os_version, os_subtype; + int en_class, en_version, en_subtype; + + if (os == PF_OSFP_ANY) + return (1); + if (list == NULL) { + DPFPRINTF("osfp no match against %x\n", os); + return (os == PF_OSFP_UNKNOWN); + } + PF_OSFP_UNPACK(os, os_class, os_version, os_subtype); + SLIST_FOREACH(entry, list, fp_entry) { + PF_OSFP_UNPACK(entry->fp_os, en_class, en_version, en_subtype); + if ((os_class == PF_OSFP_ANY || en_class == os_class) && + (os_version == PF_OSFP_ANY || en_version == os_version) && + (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) { + DPFPRINTF("osfp matched %s %s %s %x==%x\n", + entry->fp_class_nm, entry->fp_version_nm, + entry->fp_subtype_nm, os, entry->fp_os); + return (1); + } + } + DPFPRINTF("fingerprint 0x%x didn't match\n", os); + return (0); +} + +/* Flush the fingerprint list */ +void +pf_osfp_flush(void) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + + while ((fp = SLIST_FIRST(&V_pf_osfp_list))) { + SLIST_REMOVE_HEAD(&V_pf_osfp_list, fp_next); + while ((entry = SLIST_FIRST(&fp->fp_oses))) { + SLIST_REMOVE_HEAD(&fp->fp_oses, fp_entry); + free(entry, M_PFOSFP); + } + free(fp, M_PFOSFP); + } +} + + +/* Add a fingerprint */ +int +pf_osfp_add(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp, fpadd; + struct pf_osfp_entry *entry; + + PF_RULES_WASSERT(); + + memset(&fpadd, 0, sizeof(fpadd)); + fpadd.fp_tcpopts = fpioc->fp_tcpopts; + fpadd.fp_wsize = fpioc->fp_wsize; + fpadd.fp_psize = fpioc->fp_psize; + fpadd.fp_mss = fpioc->fp_mss; + fpadd.fp_flags = fpioc->fp_flags; + fpadd.fp_optcnt = fpioc->fp_optcnt; + fpadd.fp_wscale = fpioc->fp_wscale; + fpadd.fp_ttl = fpioc->fp_ttl; + +#if 0 /* XXX RYAN wants to fix logging */ + DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " + "(TS=%s,M=%s%d,W=%s%d) %x\n", + fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, + fpioc->fp_os.fp_subtype_nm, + (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MSS) ? "S" : + (fpadd.fp_flags & PF_OSFP_WSIZE_MTU) ? "T" : + (fpadd.fp_flags & PF_OSFP_WSIZE_DC) ? "*" : "", + fpadd.fp_wsize, + fpadd.fp_ttl, + (fpadd.fp_flags & PF_OSFP_DF) ? 1 : 0, + (fpadd.fp_flags & PF_OSFP_PSIZE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_PSIZE_DC) ? "*" : "", + fpadd.fp_psize, + (long long int)fpadd.fp_tcpopts, fpadd.fp_optcnt, + (fpadd.fp_flags & PF_OSFP_TS0) ? "0" : "", + (fpadd.fp_flags & PF_OSFP_MSS_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_MSS_DC) ? "*" : "", + fpadd.fp_mss, + (fpadd.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" : + (fpadd.fp_flags & PF_OSFP_WSCALE_DC) ? 
"*" : "", + fpadd.fp_wscale, + fpioc->fp_os.fp_os); +#endif + + if ((fp = pf_osfp_find_exact(&V_pf_osfp_list, &fpadd))) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (PF_OSFP_ENTRY_EQ(entry, &fpioc->fp_os)) + return (EEXIST); + } + if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT)) + == NULL) + return (ENOMEM); + } else { + if ((fp = malloc(sizeof(*fp), M_PFOSFP, M_ZERO | M_NOWAIT)) + == NULL) + return (ENOMEM); + fp->fp_tcpopts = fpioc->fp_tcpopts; + fp->fp_wsize = fpioc->fp_wsize; + fp->fp_psize = fpioc->fp_psize; + fp->fp_mss = fpioc->fp_mss; + fp->fp_flags = fpioc->fp_flags; + fp->fp_optcnt = fpioc->fp_optcnt; + fp->fp_wscale = fpioc->fp_wscale; + fp->fp_ttl = fpioc->fp_ttl; + SLIST_INIT(&fp->fp_oses); + if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT)) + == NULL) { + free(fp, M_PFOSFP); + return (ENOMEM); + } + pf_osfp_insert(&V_pf_osfp_list, fp); + } + memcpy(entry, &fpioc->fp_os, sizeof(*entry)); + + /* Make sure the strings are NUL terminated */ + entry->fp_class_nm[sizeof(entry->fp_class_nm)-1] = '\0'; + entry->fp_version_nm[sizeof(entry->fp_version_nm)-1] = '\0'; + entry->fp_subtype_nm[sizeof(entry->fp_subtype_nm)-1] = '\0'; + + SLIST_INSERT_HEAD(&fp->fp_oses, entry, fp_entry); + +#ifdef PFDEBUG + if ((fp = pf_osfp_validate())) + printf("Invalid fingerprint list\n"); +#endif /* PFDEBUG */ + return (0); +} + + +/* Find a fingerprint in the list */ +static struct pf_os_fingerprint * +pf_osfp_find(struct pf_osfp_list *list, struct pf_os_fingerprint *find, + u_int8_t ttldiff) +{ + struct pf_os_fingerprint *f; + +#define MATCH_INT(_MOD, _DC, _field) \ + if ((f->fp_flags & _DC) == 0) { \ + if ((f->fp_flags & _MOD) == 0) { \ + if (f->_field != find->_field) \ + continue; \ + } else { \ + if (f->_field == 0 || find->_field % f->_field) \ + continue; \ + } \ + } + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts != find->fp_tcpopts || + f->fp_optcnt != find->fp_optcnt || + f->fp_ttl < find->fp_ttl || + f->fp_ttl - find->fp_ttl > ttldiff || + (f->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)) != + (find->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0))) + continue; + + MATCH_INT(PF_OSFP_PSIZE_MOD, PF_OSFP_PSIZE_DC, fp_psize) + MATCH_INT(PF_OSFP_MSS_MOD, PF_OSFP_MSS_DC, fp_mss) + MATCH_INT(PF_OSFP_WSCALE_MOD, PF_OSFP_WSCALE_DC, fp_wscale) + if ((f->fp_flags & PF_OSFP_WSIZE_DC) == 0) { + if (f->fp_flags & PF_OSFP_WSIZE_MSS) { + if (find->fp_mss == 0) + continue; + +/* + * Some "smart" NAT devices and DSL routers will tweak the MSS size and + * will set it to whatever is suitable for the link type. 
+ */ +#define SMART_MSS 1460 + if ((find->fp_wsize % find->fp_mss || + find->fp_wsize / find->fp_mss != + f->fp_wsize) && + (find->fp_wsize % SMART_MSS || + find->fp_wsize / SMART_MSS != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MTU) { + if (find->fp_mss == 0) + continue; + +#define MTUOFF (sizeof(struct ip) + sizeof(struct tcphdr)) +#define SMART_MTU (SMART_MSS + MTUOFF) + if ((find->fp_wsize % (find->fp_mss + MTUOFF) || + find->fp_wsize / (find->fp_mss + MTUOFF) != + f->fp_wsize) && + (find->fp_wsize % SMART_MTU || + find->fp_wsize / SMART_MTU != + f->fp_wsize)) + continue; + } else if (f->fp_flags & PF_OSFP_WSIZE_MOD) { + if (f->fp_wsize == 0 || find->fp_wsize % + f->fp_wsize) + continue; + } else { + if (f->fp_wsize != find->fp_wsize) + continue; + } + } + return (f); + } + + return (NULL); +} + +/* Find an exact fingerprint in the list */ +static struct pf_os_fingerprint * +pf_osfp_find_exact(struct pf_osfp_list *list, struct pf_os_fingerprint *find) +{ + struct pf_os_fingerprint *f; + + SLIST_FOREACH(f, list, fp_next) { + if (f->fp_tcpopts == find->fp_tcpopts && + f->fp_wsize == find->fp_wsize && + f->fp_psize == find->fp_psize && + f->fp_mss == find->fp_mss && + f->fp_flags == find->fp_flags && + f->fp_optcnt == find->fp_optcnt && + f->fp_wscale == find->fp_wscale && + f->fp_ttl == find->fp_ttl) + return (f); + } + + return (NULL); +} + +/* Insert a fingerprint into the list */ +static void +pf_osfp_insert(struct pf_osfp_list *list, struct pf_os_fingerprint *ins) +{ + struct pf_os_fingerprint *f, *prev = NULL; + + /* XXX need to go semi tree based. can key on tcp options */ + + SLIST_FOREACH(f, list, fp_next) + prev = f; + if (prev) + SLIST_INSERT_AFTER(prev, ins, fp_next); + else + SLIST_INSERT_HEAD(list, ins, fp_next); +} + +/* Fill a fingerprint by its number (from an ioctl) */ +int +pf_osfp_get(struct pf_osfp_ioctl *fpioc) +{ + struct pf_os_fingerprint *fp; + struct pf_osfp_entry *entry; + int num = fpioc->fp_getnum; + int i = 0; + + + memset(fpioc, 0, sizeof(*fpioc)); + SLIST_FOREACH(fp, &V_pf_osfp_list, fp_next) { + SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) { + if (i++ == num) { + fpioc->fp_mss = fp->fp_mss; + fpioc->fp_wsize = fp->fp_wsize; + fpioc->fp_flags = fp->fp_flags; + fpioc->fp_psize = fp->fp_psize; + fpioc->fp_ttl = fp->fp_ttl; + fpioc->fp_wscale = fp->fp_wscale; + fpioc->fp_getnum = num; + memcpy(&fpioc->fp_os, entry, + sizeof(fpioc->fp_os)); + return (0); + } + } + } + + return (EBUSY); +} + + +#ifdef PFDEBUG +/* Validate that each signature is reachable */ +static struct pf_os_fingerprint * +pf_osfp_validate(void) +{ + struct pf_os_fingerprint *f, *f2, find; + + SLIST_FOREACH(f, &V_pf_osfp_list, fp_next) { + memcpy(&find, f, sizeof(find)); + + /* We do a few MSS/th_win percolations to make things unique */ + if (find.fp_mss == 0) + find.fp_mss = 128; + if (f->fp_flags & PF_OSFP_WSIZE_MSS) + find.fp_wsize *= find.fp_mss; + else if (f->fp_flags & PF_OSFP_WSIZE_MTU) + find.fp_wsize *= (find.fp_mss + 40); + else if (f->fp_flags & PF_OSFP_WSIZE_MOD) + find.fp_wsize *= 2; + if (f != (f2 = pf_osfp_find(&V_pf_osfp_list, &find, 0))) { + if (f2) + printf("Found \"%s %s %s\" instead of " + "\"%s %s %s\"\n", + SLIST_FIRST(&f2->fp_oses)->fp_class_nm, + SLIST_FIRST(&f2->fp_oses)->fp_version_nm, + SLIST_FIRST(&f2->fp_oses)->fp_subtype_nm, + SLIST_FIRST(&f->fp_oses)->fp_class_nm, + SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + else + printf("Couldn't find \"%s %s %s\"\n", + 
SLIST_FIRST(&f->fp_oses)->fp_class_nm, + SLIST_FIRST(&f->fp_oses)->fp_version_nm, + SLIST_FIRST(&f->fp_oses)->fp_subtype_nm); + return (f); + } + } + return (NULL); +} +#endif /* PFDEBUG */ diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c new file mode 100644 index 0000000..77652a6 --- /dev/null +++ b/sys/netpfil/pf/pf_ruleset.c @@ -0,0 +1,424 @@ +/* $OpenBSD: pf_ruleset.c,v 1.2 2008/12/18 15:31:37 dhill Exp $ */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002,2003 Henning Brauer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/socket.h> +#ifdef _KERNEL +# include <sys/systm.h> +# include <sys/refcount.h> +#endif /* _KERNEL */ +#include <sys/mbuf.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <net/if.h> +#include <net/pfvar.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif /* INET6 */ + + +#ifdef _KERNEL +#define DPFPRINTF(format, x...) \ + if (V_pf_status.debug >= PF_DEBUG_NOISY) \ + printf(format , ##x) +#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO) +#define rs_free(x) free(x, M_TEMP) + +#else +/* Userland equivalents so we can lend code to pfctl et al. */ + +#include <arpa/inet.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#define rs_malloc(x) calloc(1, x) +#define rs_free(x) free(x) + +#ifdef PFDEBUG +#include <sys/stdarg.h> +#define DPFPRINTF(format, x...) fprintf(stderr, format , ##x) +#else +#define DPFPRINTF(format, x...) ((void)0) +#endif /* PFDEBUG */ +#endif /* _KERNEL */ + +#ifdef _KERNEL +VNET_DEFINE(struct pf_anchor_global, pf_anchors); +VNET_DEFINE(struct pf_anchor, pf_main_anchor); +#else /* ! 
_KERNEL */ +struct pf_anchor_global pf_anchors; +struct pf_anchor pf_main_anchor; +#undef V_pf_anchors +#define V_pf_anchors pf_anchors +#undef pf_main_ruleset +#define pf_main_ruleset pf_main_anchor.ruleset +#endif /* _KERNEL */ + +static __inline int pf_anchor_compare(struct pf_anchor *, struct pf_anchor *); + +static struct pf_anchor *pf_find_anchor(const char *); + +RB_GENERATE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); +RB_GENERATE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); + +static __inline int +pf_anchor_compare(struct pf_anchor *a, struct pf_anchor *b) +{ + int c = strcmp(a->path, b->path); + + return (c ? (c < 0 ? -1 : 1) : 0); +} + +int +pf_get_ruleset_number(u_int8_t action) +{ + switch (action) { + case PF_SCRUB: + case PF_NOSCRUB: + return (PF_RULESET_SCRUB); + break; + case PF_PASS: + case PF_DROP: + return (PF_RULESET_FILTER); + break; + case PF_NAT: + case PF_NONAT: + return (PF_RULESET_NAT); + break; + case PF_BINAT: + case PF_NOBINAT: + return (PF_RULESET_BINAT); + break; + case PF_RDR: + case PF_NORDR: + return (PF_RULESET_RDR); + break; + default: + return (PF_RULESET_MAX); + break; + } +} + +void +pf_init_ruleset(struct pf_ruleset *ruleset) +{ + int i; + + memset(ruleset, 0, sizeof(struct pf_ruleset)); + for (i = 0; i < PF_RULESET_MAX; i++) { + TAILQ_INIT(&ruleset->rules[i].queues[0]); + TAILQ_INIT(&ruleset->rules[i].queues[1]); + ruleset->rules[i].active.ptr = &ruleset->rules[i].queues[0]; + ruleset->rules[i].inactive.ptr = &ruleset->rules[i].queues[1]; + } +} + +static struct pf_anchor * +pf_find_anchor(const char *path) +{ + struct pf_anchor *key, *found; + + key = (struct pf_anchor *)rs_malloc(sizeof(*key)); + if (key == NULL) + return (NULL); + strlcpy(key->path, path, sizeof(key->path)); + found = RB_FIND(pf_anchor_global, &V_pf_anchors, key); + rs_free(key); + return (found); +} + +struct pf_ruleset * +pf_find_ruleset(const char *path) +{ + struct pf_anchor *anchor; + + while (*path == '/') + path++; + if (!*path) + return (&pf_main_ruleset); + anchor = pf_find_anchor(path); + if (anchor == NULL) + return (NULL); + else + return (&anchor->ruleset); +} + +struct pf_ruleset * +pf_find_or_create_ruleset(const char *path) +{ + char *p, *q, *r; + struct pf_ruleset *ruleset; + struct pf_anchor *anchor = NULL, *dup, *parent = NULL; + + if (path[0] == 0) + return (&pf_main_ruleset); + while (*path == '/') + path++; + ruleset = pf_find_ruleset(path); + if (ruleset != NULL) + return (ruleset); + p = (char *)rs_malloc(MAXPATHLEN); + if (p == NULL) + return (NULL); + strlcpy(p, path, MAXPATHLEN); + while (parent == NULL && (q = strrchr(p, '/')) != NULL) { + *q = 0; + if ((ruleset = pf_find_ruleset(p)) != NULL) { + parent = ruleset->anchor; + break; + } + } + if (q == NULL) + q = p; + else + q++; + strlcpy(p, path, MAXPATHLEN); + if (!*q) { + rs_free(p); + return (NULL); + } + while ((r = strchr(q, '/')) != NULL || *q) { + if (r != NULL) + *r = 0; + if (!*q || strlen(q) >= PF_ANCHOR_NAME_SIZE || + (parent != NULL && strlen(parent->path) >= + MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)) { + rs_free(p); + return (NULL); + } + anchor = (struct pf_anchor *)rs_malloc(sizeof(*anchor)); + if (anchor == NULL) { + rs_free(p); + return (NULL); + } + RB_INIT(&anchor->children); + strlcpy(anchor->name, q, sizeof(anchor->name)); + if (parent != NULL) { + strlcpy(anchor->path, parent->path, + sizeof(anchor->path)); + strlcat(anchor->path, "/", sizeof(anchor->path)); + } + strlcat(anchor->path, anchor->name, sizeof(anchor->path)); + if ((dup = 
RB_INSERT(pf_anchor_global, &V_pf_anchors, anchor)) != + NULL) { + printf("pf_find_or_create_ruleset: RB_INSERT1 " + "'%s' '%s' collides with '%s' '%s'\n", + anchor->path, anchor->name, dup->path, dup->name); + rs_free(anchor); + rs_free(p); + return (NULL); + } + if (parent != NULL) { + anchor->parent = parent; + if ((dup = RB_INSERT(pf_anchor_node, &parent->children, + anchor)) != NULL) { + printf("pf_find_or_create_ruleset: " + "RB_INSERT2 '%s' '%s' collides with " + "'%s' '%s'\n", anchor->path, anchor->name, + dup->path, dup->name); + RB_REMOVE(pf_anchor_global, &V_pf_anchors, + anchor); + rs_free(anchor); + rs_free(p); + return (NULL); + } + } + pf_init_ruleset(&anchor->ruleset); + anchor->ruleset.anchor = anchor; + parent = anchor; + if (r != NULL) + q = r + 1; + else + *q = 0; + } + rs_free(p); + return (&anchor->ruleset); +} + +void +pf_remove_if_empty_ruleset(struct pf_ruleset *ruleset) +{ + struct pf_anchor *parent; + int i; + + while (ruleset != NULL) { + if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL || + !RB_EMPTY(&ruleset->anchor->children) || + ruleset->anchor->refcnt > 0 || ruleset->tables > 0 || + ruleset->topen) + return; + for (i = 0; i < PF_RULESET_MAX; ++i) + if (!TAILQ_EMPTY(ruleset->rules[i].active.ptr) || + !TAILQ_EMPTY(ruleset->rules[i].inactive.ptr) || + ruleset->rules[i].inactive.open) + return; + RB_REMOVE(pf_anchor_global, &V_pf_anchors, ruleset->anchor); + if ((parent = ruleset->anchor->parent) != NULL) + RB_REMOVE(pf_anchor_node, &parent->children, + ruleset->anchor); + rs_free(ruleset->anchor); + if (parent == NULL) + return; + ruleset = &parent->ruleset; + } +} + +int +pf_anchor_setup(struct pf_rule *r, const struct pf_ruleset *s, + const char *name) +{ + char *p, *path; + struct pf_ruleset *ruleset; + + r->anchor = NULL; + r->anchor_relative = 0; + r->anchor_wildcard = 0; + if (!name[0]) + return (0); + path = (char *)rs_malloc(MAXPATHLEN); + if (path == NULL) + return (1); + if (name[0] == '/') + strlcpy(path, name + 1, MAXPATHLEN); + else { + /* relative path */ + r->anchor_relative = 1; + if (s->anchor == NULL || !s->anchor->path[0]) + path[0] = 0; + else + strlcpy(path, s->anchor->path, MAXPATHLEN); + while (name[0] == '.' && name[1] == '.' && name[2] == '/') { + if (!path[0]) { + printf("pf_anchor_setup: .. 
beyond root\n"); + rs_free(path); + return (1); + } + if ((p = strrchr(path, '/')) != NULL) + *p = 0; + else + path[0] = 0; + r->anchor_relative++; + name += 3; + } + if (path[0]) + strlcat(path, "/", MAXPATHLEN); + strlcat(path, name, MAXPATHLEN); + } + if ((p = strrchr(path, '/')) != NULL && !strcmp(p, "/*")) { + r->anchor_wildcard = 1; + *p = 0; + } + ruleset = pf_find_or_create_ruleset(path); + rs_free(path); + if (ruleset == NULL || ruleset->anchor == NULL) { + printf("pf_anchor_setup: ruleset\n"); + return (1); + } + r->anchor = ruleset->anchor; + r->anchor->refcnt++; + return (0); +} + +int +pf_anchor_copyout(const struct pf_ruleset *rs, const struct pf_rule *r, + struct pfioc_rule *pr) +{ + pr->anchor_call[0] = 0; + if (r->anchor == NULL) + return (0); + if (!r->anchor_relative) { + strlcpy(pr->anchor_call, "/", sizeof(pr->anchor_call)); + strlcat(pr->anchor_call, r->anchor->path, + sizeof(pr->anchor_call)); + } else { + char *a, *p; + int i; + + a = (char *)rs_malloc(MAXPATHLEN); + if (a == NULL) + return (1); + if (rs->anchor == NULL) + a[0] = 0; + else + strlcpy(a, rs->anchor->path, MAXPATHLEN); + for (i = 1; i < r->anchor_relative; ++i) { + if ((p = strrchr(a, '/')) == NULL) + p = a; + *p = 0; + strlcat(pr->anchor_call, "../", + sizeof(pr->anchor_call)); + } + if (strncmp(a, r->anchor->path, strlen(a))) { + printf("pf_anchor_copyout: '%s' '%s'\n", a, + r->anchor->path); + rs_free(a); + return (1); + } + if (strlen(r->anchor->path) > strlen(a)) + strlcat(pr->anchor_call, r->anchor->path + (a[0] ? + strlen(a) + 1 : 0), sizeof(pr->anchor_call)); + rs_free(a); + } + if (r->anchor_wildcard) + strlcat(pr->anchor_call, pr->anchor_call[0] ? "/*" : "*", + sizeof(pr->anchor_call)); + return (0); +} + +void +pf_anchor_remove(struct pf_rule *r) +{ + if (r->anchor == NULL) + return; + if (r->anchor->refcnt <= 0) { + printf("pf_anchor_remove: broken refcount\n"); + r->anchor = NULL; + return; + } + if (!--r->anchor->refcnt) + pf_remove_if_empty_ruleset(&r->anchor->ruleset); + r->anchor = NULL; +} diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c new file mode 100644 index 0000000..fa88045 --- /dev/null +++ b/sys/netpfil/pf/pf_table.c @@ -0,0 +1,2191 @@ +/* $OpenBSD: pf_table.c,v 1.79 2008/10/08 06:24:50 mcbride Exp $ */ + +/* + * Copyright (c) 2002 Cedric Berger + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/refcount.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <vm/uma.h> + +#include <net/if.h> +#include <net/vnet.h> +#include <net/pfvar.h> + +#define ACCEPT_FLAGS(flags, oklist) \ + do { \ + if ((flags & ~(oklist)) & \ + PFR_FLAG_ALLMASK) \ + return (EINVAL); \ + } while (0) + +#define FILLIN_SIN(sin, addr) \ + do { \ + (sin).sin_len = sizeof(sin); \ + (sin).sin_family = AF_INET; \ + (sin).sin_addr = (addr); \ + } while (0) + +#define FILLIN_SIN6(sin6, addr) \ + do { \ + (sin6).sin6_len = sizeof(sin6); \ + (sin6).sin6_family = AF_INET6; \ + (sin6).sin6_addr = (addr); \ + } while (0) + +#define SWAP(type, a1, a2) \ + do { \ + type tmp = a1; \ + a1 = a2; \ + a2 = tmp; \ + } while (0) + +#define SUNION2PF(su, af) (((af)==AF_INET) ? \ + (struct pf_addr *)&(su)->sin.sin_addr : \ + (struct pf_addr *)&(su)->sin6.sin6_addr) + +#define AF_BITS(af) (((af)==AF_INET)?32:128) +#define ADDR_NETWORK(ad) ((ad)->pfra_net < AF_BITS((ad)->pfra_af)) +#define KENTRY_NETWORK(ke) ((ke)->pfrke_net < AF_BITS((ke)->pfrke_af)) +#define KENTRY_RNF_ROOT(ke) \ + ((((struct radix_node *)(ke))->rn_flags & RNF_ROOT) != 0) + +#define NO_ADDRESSES (-1) +#define ENQUEUE_UNMARKED_ONLY (1) +#define INVERT_NEG_FLAG (1) + +struct pfr_walktree { + enum pfrw_op { + PFRW_MARK, + PFRW_SWEEP, + PFRW_ENQUEUE, + PFRW_GET_ADDRS, + PFRW_GET_ASTATS, + PFRW_POOL_GET, + PFRW_DYNADDR_UPDATE + } pfrw_op; + union { + struct pfr_addr *pfrw1_addr; + struct pfr_astats *pfrw1_astats; + struct pfr_kentryworkq *pfrw1_workq; + struct pfr_kentry *pfrw1_kentry; + struct pfi_dynaddr *pfrw1_dyn; + } pfrw_1; + int pfrw_free; +}; +#define pfrw_addr pfrw_1.pfrw1_addr +#define pfrw_astats pfrw_1.pfrw1_astats +#define pfrw_workq pfrw_1.pfrw1_workq +#define pfrw_kentry pfrw_1.pfrw1_kentry +#define pfrw_dyn pfrw_1.pfrw1_dyn +#define pfrw_cnt pfrw_free + +#define senderr(e) do { rv = (e); goto _bad; } while (0) + +static MALLOC_DEFINE(M_PFTABLE, "pf_table", "pf(4) tables structures"); +static VNET_DEFINE(uma_zone_t, pfr_kentry_z); +#define V_pfr_kentry_z VNET(pfr_kentry_z) +static VNET_DEFINE(uma_zone_t, pfr_kcounters_z); +#define V_pfr_kcounters_z VNET(pfr_kcounters_z) + +static struct pf_addr pfr_ffaddr = { + .addr32 = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff } +}; + +static void pfr_copyout_addr(struct pfr_addr *, + struct pfr_kentry *ke); +static int pfr_validate_addr(struct pfr_addr *); +static void pfr_enqueue_addrs(struct pfr_ktable *, + struct pfr_kentryworkq *, int *, int); +static void pfr_mark_addrs(struct pfr_ktable *); +static struct pfr_kentry + *pfr_lookup_addr(struct pfr_ktable *, + struct pfr_addr *, int); +static struct pfr_kentry *pfr_create_kentry(struct pfr_addr *); +static void pfr_destroy_kentries(struct pfr_kentryworkq *); +static void 
pfr_destroy_kentry(struct pfr_kentry *); +static void pfr_insert_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *, long); +static void pfr_remove_kentries(struct pfr_ktable *, + struct pfr_kentryworkq *); +static void pfr_clstats_kentries(struct pfr_kentryworkq *, long, + int); +static void pfr_reset_feedback(struct pfr_addr *, int); +static void pfr_prepare_network(union sockaddr_union *, int, int); +static int pfr_route_kentry(struct pfr_ktable *, + struct pfr_kentry *); +static int pfr_unroute_kentry(struct pfr_ktable *, + struct pfr_kentry *); +static int pfr_walktree(struct radix_node *, void *); +static int pfr_validate_table(struct pfr_table *, int, int); +static int pfr_fix_anchor(char *); +static void pfr_commit_ktable(struct pfr_ktable *, long); +static void pfr_insert_ktables(struct pfr_ktableworkq *); +static void pfr_insert_ktable(struct pfr_ktable *); +static void pfr_setflags_ktables(struct pfr_ktableworkq *); +static void pfr_setflags_ktable(struct pfr_ktable *, int); +static void pfr_clstats_ktables(struct pfr_ktableworkq *, long, + int); +static void pfr_clstats_ktable(struct pfr_ktable *, long, int); +static struct pfr_ktable + *pfr_create_ktable(struct pfr_table *, long, int); +static void pfr_destroy_ktables(struct pfr_ktableworkq *, int); +static void pfr_destroy_ktable(struct pfr_ktable *, int); +static int pfr_ktable_compare(struct pfr_ktable *, + struct pfr_ktable *); +static struct pfr_ktable + *pfr_lookup_table(struct pfr_table *); +static void pfr_clean_node_mask(struct pfr_ktable *, + struct pfr_kentryworkq *); +static int pfr_table_count(struct pfr_table *, int); +static int pfr_skip_table(struct pfr_table *, + struct pfr_ktable *, int); +static struct pfr_kentry + *pfr_kentry_byidx(struct pfr_ktable *, int, int); + +static RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare); +static RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare); + +struct pfr_ktablehead pfr_ktables; +struct pfr_table pfr_nulltable; +int pfr_ktable_cnt; + +void +pfr_initialize(void) +{ + + V_pfr_kentry_z = uma_zcreate("pf table entries", + sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, + 0); + V_pfr_kcounters_z = uma_zcreate("pf table counters", + sizeof(struct pfr_kcounters), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + V_pf_limits[PF_LIMIT_TABLE_ENTRIES].zone = V_pfr_kentry_z; + V_pf_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT; +} + +void +pfr_cleanup(void) +{ + + uma_zdestroy(V_pfr_kentry_z); + uma_zdestroy(V_pfr_kcounters_z); +} + +int +pfr_clr_addrs(struct pfr_table *tbl, int *ndel, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + pfr_enqueue_addrs(kt, &workq, ndel, 0); + + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_remove_kentries(kt, &workq); + KASSERT(kt->pfrkt_cnt == 0, ("%s: non-null pfrkt_cnt", __func__)); + } + return (0); +} + +int +pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p, *q; + struct pfr_addr *ad; + int i, rv, xadd = 0; + long tzero = time_second; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | 
PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0); + if (tmpkt == NULL) + return (ENOMEM); + SLIST_INIT(&workq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + q = pfr_lookup_addr(tmpkt, ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + if (q != NULL) + ad->pfra_fback = PFR_FB_DUPLICATE; + else if (p == NULL) + ad->pfra_fback = PFR_FB_ADDED; + else if (p->pfrke_not != ad->pfra_not) + ad->pfra_fback = PFR_FB_CONFLICT; + else + ad->pfra_fback = PFR_FB_NONE; + } + if (p == NULL && q == NULL) { + p = pfr_create_kentry(ad); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(tmpkt, p)) { + pfr_destroy_kentry(p); + ad->pfra_fback = PFR_FB_NONE; + } else { + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xadd++; + } + } + } + pfr_clean_node_mask(tmpkt, &workq); + if (!(flags & PFR_FLAG_DUMMY)) + pfr_insert_kentries(kt, &workq, tzero); + else + pfr_destroy_kentries(&workq); + if (nadd != NULL) + *nadd = xadd; + pfr_destroy_ktable(tmpkt, 0); + return (0); +_bad: + pfr_clean_node_mask(tmpkt, &workq); + pfr_destroy_kentries(&workq); + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + pfr_destroy_ktable(tmpkt, 0); + return (rv); +} + +int +pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *ndel, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + struct pfr_kentry *p; + struct pfr_addr *ad; + int i, rv, xdel = 0, log = 1; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + /* + * there are two algorithms to choose from here. + * with: + * n: number of addresses to delete + * N: number of addresses in the table + * + * one is O(N) and is better for large 'n' + * one is O(n*LOG(N)) and is better for small 'n' + * + * following code try to decide which one is best. 
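 * (The shift loop below computes log ~ log2(N) + 2; for a table of
 * N = 65536 entries it yields log = 18, so deleting more than
 * 65536 / 18 = 3640 addresses takes the O(N) full-table mark pass
 * instead of per-address lookups.)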
+ */ + for (i = kt->pfrkt_cnt; i > 0; i >>= 1) + log++; + if (size > kt->pfrkt_cnt/log) { + /* full table scan */ + pfr_mark_addrs(kt); + } else { + /* iterate over addresses to delete */ + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (p != NULL) + p->pfrke_mark = 0; + } + } + SLIST_INIT(&workq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + if (p == NULL) + ad->pfra_fback = PFR_FB_NONE; + else if (p->pfrke_not != ad->pfra_not) + ad->pfra_fback = PFR_FB_CONFLICT; + else if (p->pfrke_mark) + ad->pfra_fback = PFR_FB_DUPLICATE; + else + ad->pfra_fback = PFR_FB_DELETED; + } + if (p != NULL && p->pfrke_not == ad->pfra_not && + !p->pfrke_mark) { + p->pfrke_mark = 1; + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xdel++; + } + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_remove_kentries(kt, &workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + return (rv); +} + +int +pfr_set_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *size2, int *nadd, int *ndel, int *nchange, int flags, + u_int32_t ignore_pfrt_flags) +{ + struct pfr_ktable *kt, *tmpkt; + struct pfr_kentryworkq addq, delq, changeq; + struct pfr_kentry *p, *q; + struct pfr_addr ad; + int i, rv, xadd = 0, xdel = 0, xchange = 0; + long tzero = time_second; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, ignore_pfrt_flags, flags & + PFR_FLAG_USERIOCTL)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_flags & PFR_TFLAG_CONST) + return (EPERM); + tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0); + if (tmpkt == NULL) + return (ENOMEM); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&delq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + /* + * XXXGL: undertand pf_if usage of this function + * and make ad a moving pointer + */ + bcopy(addr + i, &ad, sizeof(ad)); + if (pfr_validate_addr(&ad)) + senderr(EINVAL); + ad.pfra_fback = PFR_FB_NONE; + p = pfr_lookup_addr(kt, &ad, 1); + if (p != NULL) { + if (p->pfrke_mark) { + ad.pfra_fback = PFR_FB_DUPLICATE; + goto _skip; + } + p->pfrke_mark = 1; + if (p->pfrke_not != ad.pfra_not) { + SLIST_INSERT_HEAD(&changeq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_CHANGED; + xchange++; + } + } else { + q = pfr_lookup_addr(tmpkt, &ad, 1); + if (q != NULL) { + ad.pfra_fback = PFR_FB_DUPLICATE; + goto _skip; + } + p = pfr_create_kentry(&ad); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(tmpkt, p)) { + pfr_destroy_kentry(p); + ad.pfra_fback = PFR_FB_NONE; + } else { + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + ad.pfra_fback = PFR_FB_ADDED; + xadd++; + } + } +_skip: + if (flags & PFR_FLAG_FEEDBACK) + bcopy(&ad, addr + i, sizeof(ad)); + } + pfr_enqueue_addrs(kt, &delq, &xdel, ENQUEUE_UNMARKED_ONLY); + if ((flags & PFR_FLAG_FEEDBACK) && *size2) { + if (*size2 < size+xdel) { + *size2 = size+xdel; + senderr(0); + } + i = 0; + SLIST_FOREACH(p, &delq, pfrke_workq) { + pfr_copyout_addr(&ad, p); + ad.pfra_fback = PFR_FB_DELETED; + bcopy(&ad, addr + size + i, sizeof(ad)); + i++; + } + } + pfr_clean_node_mask(tmpkt, &addq); + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + 
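/* Changed entries stay routed; pfr_clstats_kentries() with INVERT_NEG_FLAG flips pfrke_not and restarts their stats at tzero. */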
pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG); + } else + pfr_destroy_kentries(&addq); + if (nadd != NULL) + *nadd = xadd; + if (ndel != NULL) + *ndel = xdel; + if (nchange != NULL) + *nchange = xchange; + if ((flags & PFR_FLAG_FEEDBACK) && size2) + *size2 = size+xdel; + pfr_destroy_ktable(tmpkt, 0); + return (0); +_bad: + pfr_clean_node_mask(tmpkt, &addq); + pfr_destroy_kentries(&addq); + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + pfr_destroy_ktable(tmpkt, 0); + return (rv); +} + +int +pfr_tst_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nmatch, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentry *p; + struct pfr_addr *ad; + int i, xmatch = 0; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_REPLACE); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + return (EINVAL); + if (ADDR_NETWORK(ad)) + return (EINVAL); + p = pfr_lookup_addr(kt, ad, 0); + if (flags & PFR_FLAG_REPLACE) + pfr_copyout_addr(ad, p); + ad->pfra_fback = (p == NULL) ? PFR_FB_NONE : + (p->pfrke_not ? PFR_FB_NOTMATCH : PFR_FB_MATCH); + if (p != NULL && !p->pfrke_not) + xmatch++; + } + if (nmatch != NULL) + *nmatch = xmatch; + return (0); +} + +int +pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + int rv; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, 0); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ADDRS; + w.pfrw_addr = addr; + w.pfrw_free = kt->pfrkt_cnt; + rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); + if (!rv) + rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w); + if (rv) + return (rv); + + KASSERT(w.pfrw_free == 0, ("%s: corruption detected (%d)", __func__, + w.pfrw_free)); + + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size, + int flags) +{ + struct pfr_ktable *kt; + struct pfr_walktree w; + struct pfr_kentryworkq workq; + int rv; + long tzero = time_second; + + PF_RULES_RASSERT(); + + /* XXX PFR_FLAG_CLSTATS disabled */ + ACCEPT_FLAGS(flags, 0); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + if (kt->pfrkt_cnt > *size) { + *size = kt->pfrkt_cnt; + return (0); + } + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_GET_ASTATS; + w.pfrw_astats = addr; + w.pfrw_free = kt->pfrkt_cnt; + rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w); + if (!rv) + rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w); + if (!rv && (flags & PFR_FLAG_CLSTATS)) { + pfr_enqueue_addrs(kt, &workq, NULL, 0); + pfr_clstats_kentries(&workq, tzero, 0); + } + if (rv) + return (rv); + + if (w.pfrw_free) { + printf("pfr_get_astats: corruption detected (%d).\n", + w.pfrw_free); + return (ENOTTY); + } + *size = kt->pfrkt_cnt; + return (0); +} + +int +pfr_clr_astats(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nzero, int flags) +{ + struct pfr_ktable *kt; + struct pfr_kentryworkq workq; + struct 
pfr_kentry *p; + struct pfr_addr *ad; + int i, rv, xzero = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK); + if (pfr_validate_table(tbl, 0, 0)) + return (EINVAL); + kt = pfr_lookup_table(tbl); + if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (ESRCH); + SLIST_INIT(&workq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + p = pfr_lookup_addr(kt, ad, 1); + if (flags & PFR_FLAG_FEEDBACK) { + ad->pfra_fback = (p != NULL) ? + PFR_FB_CLEARED : PFR_FB_NONE; + } + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrke_workq); + xzero++; + } + } + + if (!(flags & PFR_FLAG_DUMMY)) + pfr_clstats_kentries(&workq, 0, 0); + if (nzero != NULL) + *nzero = xzero; + return (0); +_bad: + if (flags & PFR_FLAG_FEEDBACK) + pfr_reset_feedback(addr, size); + return (rv); +} + +static int +pfr_validate_addr(struct pfr_addr *ad) +{ + int i; + + switch (ad->pfra_af) { +#ifdef INET + case AF_INET: + if (ad->pfra_net > 32) + return (-1); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + if (ad->pfra_net > 128) + return (-1); + break; +#endif /* INET6 */ + default: + return (-1); + } + if (ad->pfra_net < 128 && + (((caddr_t)ad)[ad->pfra_net/8] & (0xFF >> (ad->pfra_net%8)))) + return (-1); + for (i = (ad->pfra_net+7)/8; i < sizeof(ad->pfra_u); i++) + if (((caddr_t)ad)[i]) + return (-1); + if (ad->pfra_not && ad->pfra_not != 1) + return (-1); + if (ad->pfra_fback) + return (-1); + return (0); +} + +static void +pfr_enqueue_addrs(struct pfr_ktable *kt, struct pfr_kentryworkq *workq, + int *naddr, int sweep) +{ + struct pfr_walktree w; + + SLIST_INIT(workq); + bzero(&w, sizeof(w)); + w.pfrw_op = sweep ? PFRW_SWEEP : PFRW_ENQUEUE; + w.pfrw_workq = workq; + if (kt->pfrkt_ip4 != NULL) + if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, + &w)) + printf("pfr_enqueue_addrs: IPv4 walktree failed.\n"); + if (kt->pfrkt_ip6 != NULL) + if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, + &w)) + printf("pfr_enqueue_addrs: IPv6 walktree failed.\n"); + if (naddr != NULL) + *naddr = w.pfrw_cnt; +} + +static void +pfr_mark_addrs(struct pfr_ktable *kt) +{ + struct pfr_walktree w; + + bzero(&w, sizeof(w)); + w.pfrw_op = PFRW_MARK; + if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w)) + printf("pfr_mark_addrs: IPv4 walktree failed.\n"); + if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w)) + printf("pfr_mark_addrs: IPv6 walktree failed.\n"); +} + + +static struct pfr_kentry * +pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact) +{ + union sockaddr_union sa, mask; + struct radix_node_head *head = NULL; + struct pfr_kentry *ke; + + bzero(&sa, sizeof(sa)); + if (ad->pfra_af == AF_INET) { + FILLIN_SIN(sa.sin, ad->pfra_ip4addr); + head = kt->pfrkt_ip4; + } else if ( ad->pfra_af == AF_INET6 ) { + FILLIN_SIN6(sa.sin6, ad->pfra_ip6addr); + head = kt->pfrkt_ip6; + } + if (ADDR_NETWORK(ad)) { + pfr_prepare_network(&mask, ad->pfra_af, ad->pfra_net); + ke = (struct pfr_kentry *)rn_lookup(&sa, &mask, head); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + } else { + ke = (struct pfr_kentry *)rn_match(&sa, head); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + if (exact && ke && KENTRY_NETWORK(ke)) + ke = NULL; + } + return (ke); +} + +static struct pfr_kentry * +pfr_create_kentry(struct pfr_addr *ad) +{ + struct pfr_kentry *ke; + + ke = uma_zalloc(V_pfr_kentry_z, M_NOWAIT | M_ZERO); + if (ke == NULL) + return (NULL); + + if (ad->pfra_af == AF_INET) + 
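+		/* keep the address in sockaddr form; pfr_route_kentry()
+		 * keys the radix tree on pfrke_sa */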
FILLIN_SIN(ke->pfrke_sa.sin, ad->pfra_ip4addr); + else if (ad->pfra_af == AF_INET6) + FILLIN_SIN6(ke->pfrke_sa.sin6, ad->pfra_ip6addr); + ke->pfrke_af = ad->pfra_af; + ke->pfrke_net = ad->pfra_net; + ke->pfrke_not = ad->pfra_not; + return (ke); +} + +static void +pfr_destroy_kentries(struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p, *q; + + for (p = SLIST_FIRST(workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrke_workq); + pfr_destroy_kentry(p); + } +} + +static void +pfr_destroy_kentry(struct pfr_kentry *ke) +{ + if (ke->pfrke_counters) + uma_zfree(V_pfr_kcounters_z, ke->pfrke_counters); + uma_zfree(V_pfr_kentry_z, ke); +} + +static void +pfr_insert_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq, long tzero) +{ + struct pfr_kentry *p; + int rv, n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + rv = pfr_route_kentry(kt, p); + if (rv) { + printf("pfr_insert_kentries: cannot route entry " + "(code=%d).\n", rv); + break; + } + p->pfrke_tzero = tzero; + n++; + } + kt->pfrkt_cnt += n; +} + +int +pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero) +{ + struct pfr_kentry *p; + int rv; + + p = pfr_lookup_addr(kt, ad, 1); + if (p != NULL) + return (0); + p = pfr_create_kentry(ad); + if (p == NULL) + return (EINVAL); + + rv = pfr_route_kentry(kt, p); + if (rv) + return (rv); + + p->pfrke_tzero = tzero; + kt->pfrkt_cnt++; + + return (0); +} + +static void +pfr_remove_kentries(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + int n = 0; + + SLIST_FOREACH(p, workq, pfrke_workq) { + pfr_unroute_kentry(kt, p); + n++; + } + kt->pfrkt_cnt -= n; + pfr_destroy_kentries(workq); +} + +static void +pfr_clean_node_mask(struct pfr_ktable *kt, + struct pfr_kentryworkq *workq) +{ + struct pfr_kentry *p; + + SLIST_FOREACH(p, workq, pfrke_workq) + pfr_unroute_kentry(kt, p); +} + +static void +pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange) +{ + struct pfr_kentry *p; + + SLIST_FOREACH(p, workq, pfrke_workq) { + if (negchange) + p->pfrke_not = !p->pfrke_not; + if (p->pfrke_counters) { + uma_zfree(V_pfr_kcounters_z, p->pfrke_counters); + p->pfrke_counters = NULL; + } + p->pfrke_tzero = tzero; + } +} + +static void +pfr_reset_feedback(struct pfr_addr *addr, int size) +{ + struct pfr_addr *ad; + int i; + + for (i = 0, ad = addr; i < size; i++, ad++) + ad->pfra_fback = PFR_FB_NONE; +} + +static void +pfr_prepare_network(union sockaddr_union *sa, int af, int net) +{ + int i; + + bzero(sa, sizeof(*sa)); + if (af == AF_INET) { + sa->sin.sin_len = sizeof(sa->sin); + sa->sin.sin_family = AF_INET; + sa->sin.sin_addr.s_addr = net ? htonl(-1 << (32-net)) : 0; + } else if (af == AF_INET6) { + sa->sin6.sin6_len = sizeof(sa->sin6); + sa->sin6.sin6_family = AF_INET6; + for (i = 0; i < 4; i++) { + if (net <= 32) { + sa->sin6.sin6_addr.s6_addr32[i] = + net ? 
htonl(-1 << (32-net)) : 0; + break; + } + sa->sin6.sin6_addr.s6_addr32[i] = 0xFFFFFFFF; + net -= 32; + } + } +} + +static int +pfr_route_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_node_head *head = NULL; + + bzero(ke->pfrke_node, sizeof(ke->pfrke_node)); + if (ke->pfrke_af == AF_INET) + head = kt->pfrkt_ip4; + else if (ke->pfrke_af == AF_INET6) + head = kt->pfrkt_ip6; + + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); + rn = rn_addroute(&ke->pfrke_sa, &mask, head, ke->pfrke_node); + } else + rn = rn_addroute(&ke->pfrke_sa, NULL, head, ke->pfrke_node); + + return (rn == NULL ? -1 : 0); +} + +static int +pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke) +{ + union sockaddr_union mask; + struct radix_node *rn; + struct radix_node_head *head = NULL; + + if (ke->pfrke_af == AF_INET) + head = kt->pfrkt_ip4; + else if (ke->pfrke_af == AF_INET6) + head = kt->pfrkt_ip6; + + if (KENTRY_NETWORK(ke)) { + pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net); + rn = rn_delete(&ke->pfrke_sa, &mask, head); + } else + rn = rn_delete(&ke->pfrke_sa, NULL, head); + + if (rn == NULL) { + printf("pfr_unroute_kentry: delete failed.\n"); + return (-1); + } + return (0); +} + +static void +pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke) +{ + bzero(ad, sizeof(*ad)); + if (ke == NULL) + return; + ad->pfra_af = ke->pfrke_af; + ad->pfra_net = ke->pfrke_net; + ad->pfra_not = ke->pfrke_not; + if (ad->pfra_af == AF_INET) + ad->pfra_ip4addr = ke->pfrke_sa.sin.sin_addr; + else if (ad->pfra_af == AF_INET6) + ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr; +} + +static int +pfr_walktree(struct radix_node *rn, void *arg) +{ + struct pfr_kentry *ke = (struct pfr_kentry *)rn; + struct pfr_walktree *w = arg; + + switch (w->pfrw_op) { + case PFRW_MARK: + ke->pfrke_mark = 0; + break; + case PFRW_SWEEP: + if (ke->pfrke_mark) + break; + /* FALLTHROUGH */ + case PFRW_ENQUEUE: + SLIST_INSERT_HEAD(w->pfrw_workq, ke, pfrke_workq); + w->pfrw_cnt++; + break; + case PFRW_GET_ADDRS: + if (w->pfrw_free-- > 0) { + pfr_copyout_addr(w->pfrw_addr, ke); + w->pfrw_addr++; + } + break; + case PFRW_GET_ASTATS: + if (w->pfrw_free-- > 0) { + struct pfr_astats as; + + pfr_copyout_addr(&as.pfras_a, ke); + + if (ke->pfrke_counters) { + bcopy(ke->pfrke_counters->pfrkc_packets, + as.pfras_packets, sizeof(as.pfras_packets)); + bcopy(ke->pfrke_counters->pfrkc_bytes, + as.pfras_bytes, sizeof(as.pfras_bytes)); + } else { + bzero(as.pfras_packets, sizeof(as.pfras_packets)); + bzero(as.pfras_bytes, sizeof(as.pfras_bytes)); + as.pfras_a.pfra_fback = PFR_FB_NOCOUNT; + } + as.pfras_tzero = ke->pfrke_tzero; + + bcopy(&as, w->pfrw_astats, sizeof(as)); + w->pfrw_astats++; + } + break; + case PFRW_POOL_GET: + if (ke->pfrke_not) + break; /* negative entries are ignored */ + if (!w->pfrw_cnt--) { + w->pfrw_kentry = ke; + return (1); /* finish search */ + } + break; + case PFRW_DYNADDR_UPDATE: + { + union sockaddr_union pfr_mask; + + if (ke->pfrke_af == AF_INET) { + if (w->pfrw_dyn->pfid_acnt4++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET, ke->pfrke_net); + w->pfrw_dyn->pfid_addr4 = *SUNION2PF(&ke->pfrke_sa, + AF_INET); + w->pfrw_dyn->pfid_mask4 = *SUNION2PF(&pfr_mask, + AF_INET); + } else if (ke->pfrke_af == AF_INET6){ + if (w->pfrw_dyn->pfid_acnt6++ > 0) + break; + pfr_prepare_network(&pfr_mask, AF_INET6, ke->pfrke_net); + w->pfrw_dyn->pfid_addr6 = *SUNION2PF(&ke->pfrke_sa, + AF_INET6); + 
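+			/* as in the IPv4 case above, only the first
+			 * entry per family is cached */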
w->pfrw_dyn->pfid_mask6 = *SUNION2PF(&pfr_mask, + AF_INET6); + } + break; + } + } + return (0); +} + +int +pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + int xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + if (pfr_table_count(filter, flags) < 0) + return (ENOENT); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (!strcmp(p->pfrkt_anchor, PF_RESERVED_ANCHOR)) + continue; + if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_setflags_ktables(&workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags) +{ + struct pfr_ktableworkq addq, changeq; + struct pfr_ktable *p, *q, *r, key; + int i, rv, xadd = 0; + long tzero = time_second; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + for (i = 0; i < size; i++) { + bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + senderr(EINVAL); + key.pfrkt_flags |= PFR_TFLAG_ACTIVE; + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p == NULL) { + p = pfr_create_ktable(&key.pfrkt_t, tzero, 1); + if (p == NULL) + senderr(ENOMEM); + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(p, q)) + goto _skip; + } + SLIST_INSERT_HEAD(&addq, p, pfrkt_workq); + xadd++; + if (!key.pfrkt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(key.pfrkt_anchor, sizeof(key.pfrkt_anchor)); + r = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (r != NULL) { + p->pfrkt_root = r; + goto _skip; + } + SLIST_FOREACH(q, &addq, pfrkt_workq) { + if (!pfr_ktable_compare(&key, q)) { + p->pfrkt_root = q; + goto _skip; + } + } + key.pfrkt_flags = 0; + r = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (r == NULL) + senderr(ENOMEM); + SLIST_INSERT_HEAD(&addq, r, pfrkt_workq); + p->pfrkt_root = r; + } else if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &changeq, pfrkt_workq) + if (!pfr_ktable_compare(&key, q)) + goto _skip; + p->pfrkt_nflags = (p->pfrkt_flags & + ~PFR_TFLAG_USRMASK) | key.pfrkt_flags; + SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq); + xadd++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_insert_ktables(&addq); + pfr_setflags_ktables(&changeq); + } else + pfr_destroy_ktables(&addq, 0); + if (nadd != NULL) + *nadd = xadd; + return (0); +_bad: + pfr_destroy_ktables(&addq, 0); + return (rv); +} + +int +pfr_del_tables(struct pfr_table *tbl, int size, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } +_skip: + ; + } + + if 
(!(flags & PFR_FLAG_DUMMY)) + pfr_setflags_ktables(&workq); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_get_tables(struct pfr_table *filter, struct pfr_table *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + int n, nn; + + PF_RULES_RASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + bcopy(&p->pfrkt_t, tbl++, sizeof(*tbl)); + } + + KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n)); + + *size = nn; + return (0); +} + +int +pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size, + int flags) +{ + struct pfr_ktable *p; + struct pfr_ktableworkq workq; + int n, nn; + long tzero = time_second; + + /* XXX PFR_FLAG_CLSTATS disabled */ + ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS); + if (pfr_fix_anchor(filter->pfrt_anchor)) + return (EINVAL); + n = nn = pfr_table_count(filter, flags); + if (n < 0) + return (ENOENT); + if (n > *size) { + *size = n; + return (0); + } + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (pfr_skip_table(filter, p, flags)) + continue; + if (n-- <= 0) + continue; + bcopy(&p->pfrkt_ts, tbl++, sizeof(*tbl)); + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + } + if (flags & PFR_FLAG_CLSTATS) + pfr_clstats_ktables(&workq, tzero, + flags & PFR_FLAG_ADDRSTOO); + + KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n)); + + *size = nn; + return (0); +} + +int +pfr_clr_tstats(struct pfr_table *tbl, int size, int *nzero, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, key; + int i, xzero = 0; + long tzero = time_second; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, 0)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL) { + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xzero++; + } + } + if (!(flags & PFR_FLAG_DUMMY)) + pfr_clstats_ktables(&workq, tzero, flags & PFR_FLAG_ADDRSTOO); + if (nzero != NULL) + *nzero = xzero; + return (0); +} + +int +pfr_set_tflags(struct pfr_table *tbl, int size, int setflag, int clrflag, + int *nchange, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p, *q, key; + int i, xchange = 0, xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + if ((setflag & ~PFR_TFLAG_USRMASK) || + (clrflag & ~PFR_TFLAG_USRMASK) || + (setflag & clrflag)) + return (EINVAL); + SLIST_INIT(&workq); + for (i = 0; i < size; i++) { + bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t)); + if (pfr_validate_table(&key.pfrkt_t, 0, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) { + p->pfrkt_nflags = (p->pfrkt_flags | setflag) & + ~clrflag; + if (p->pfrkt_nflags == p->pfrkt_flags) + goto _skip; + SLIST_FOREACH(q, &workq, pfrkt_workq) + if (!pfr_ktable_compare(p, q)) + goto _skip; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if ((p->pfrkt_flags & PFR_TFLAG_PERSIST) && + (clrflag & PFR_TFLAG_PERSIST) && + !(p->pfrkt_flags & PFR_TFLAG_REFERENCED)) + xdel++; + else + xchange++; + } +_skip: + ; + } + if (!(flags & PFR_FLAG_DUMMY)) + 
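+		/* commit the flag words staged in pfrkt_nflags above */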
pfr_setflags_ktables(&workq); + if (nchange != NULL) + *nchange = xchange; + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_begin(struct pfr_table *trs, u_int32_t *ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_or_create_ruleset(trs->pfrt_anchor); + if (rs == NULL) + return (ENOMEM); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + if (ticket != NULL) + *ticket = ++rs->tticket; + rs->topen = 1; + } else + pf_remove_if_empty_ruleset(rs); + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_define(struct pfr_table *tbl, struct pfr_addr *addr, int size, + int *nadd, int *naddr, u_int32_t ticket, int flags) +{ + struct pfr_ktableworkq tableq; + struct pfr_kentryworkq addrq; + struct pfr_ktable *kt, *rt, *shadow, key; + struct pfr_kentry *p; + struct pfr_addr *ad; + struct pf_ruleset *rs; + int i, rv, xadd = 0, xaddr = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO); + if (size && !(flags & PFR_FLAG_ADDRSTOO)) + return (EINVAL); + if (pfr_validate_table(tbl, PFR_TFLAG_USRMASK, + flags & PFR_FLAG_USERIOCTL)) + return (EINVAL); + rs = pf_find_ruleset(tbl->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + tbl->pfrt_flags |= PFR_TFLAG_INACTIVE; + SLIST_INIT(&tableq); + kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl); + if (kt == NULL) { + kt = pfr_create_ktable(tbl, 0, 1); + if (kt == NULL) + return (ENOMEM); + SLIST_INSERT_HEAD(&tableq, kt, pfrkt_workq); + xadd++; + if (!tbl->pfrt_anchor[0]) + goto _skip; + + /* find or create root table */ + bzero(&key, sizeof(key)); + strlcpy(key.pfrkt_name, tbl->pfrt_name, sizeof(key.pfrkt_name)); + rt = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); + if (rt != NULL) { + kt->pfrkt_root = rt; + goto _skip; + } + rt = pfr_create_ktable(&key.pfrkt_t, 0, 1); + if (rt == NULL) { + pfr_destroy_ktables(&tableq, 0); + return (ENOMEM); + } + SLIST_INSERT_HEAD(&tableq, rt, pfrkt_workq); + kt->pfrkt_root = rt; + } else if (!(kt->pfrkt_flags & PFR_TFLAG_INACTIVE)) + xadd++; +_skip: + shadow = pfr_create_ktable(tbl, 0, 0); + if (shadow == NULL) { + pfr_destroy_ktables(&tableq, 0); + return (ENOMEM); + } + SLIST_INIT(&addrq); + for (i = 0, ad = addr; i < size; i++, ad++) { + if (pfr_validate_addr(ad)) + senderr(EINVAL); + if (pfr_lookup_addr(shadow, ad, 1) != NULL) + continue; + p = pfr_create_kentry(ad); + if (p == NULL) + senderr(ENOMEM); + if (pfr_route_kentry(shadow, p)) { + pfr_destroy_kentry(p); + continue; + } + SLIST_INSERT_HEAD(&addrq, p, pfrke_workq); + xaddr++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_flags |= PFR_TFLAG_INACTIVE; + pfr_insert_ktables(&tableq); + shadow->pfrkt_cnt = (flags & PFR_FLAG_ADDRSTOO) ? 
+ xaddr : NO_ADDRESSES; + kt->pfrkt_shadow = shadow; + } else { + pfr_clean_node_mask(shadow, &addrq); + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + } + if (nadd != NULL) + *nadd = xadd; + if (naddr != NULL) + *naddr = xaddr; + return (0); +_bad: + pfr_destroy_ktable(shadow, 0); + pfr_destroy_ktables(&tableq, 0); + pfr_destroy_kentries(&addrq); + return (rv); +} + +int +pfr_ina_rollback(struct pfr_table *trs, u_int32_t ticket, int *ndel, int flags) +{ + struct pfr_ktableworkq workq; + struct pfr_ktable *p; + struct pf_ruleset *rs; + int xdel = 0; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (0); + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + xdel++; + } + if (!(flags & PFR_FLAG_DUMMY)) { + pfr_setflags_ktables(&workq); + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (ndel != NULL) + *ndel = xdel; + return (0); +} + +int +pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd, + int *nchange, int flags) +{ + struct pfr_ktable *p, *q; + struct pfr_ktableworkq workq; + struct pf_ruleset *rs; + int xadd = 0, xchange = 0; + long tzero = time_second; + + PF_RULES_WASSERT(); + + ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY); + rs = pf_find_ruleset(trs->pfrt_anchor); + if (rs == NULL || !rs->topen || ticket != rs->tticket) + return (EBUSY); + + SLIST_INIT(&workq); + RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) { + if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) || + pfr_skip_table(trs, p, 0)) + continue; + SLIST_INSERT_HEAD(&workq, p, pfrkt_workq); + if (p->pfrkt_flags & PFR_TFLAG_ACTIVE) + xchange++; + else + xadd++; + } + + if (!(flags & PFR_FLAG_DUMMY)) { + for (p = SLIST_FIRST(&workq); p != NULL; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_commit_ktable(p, tzero); + } + rs->topen = 0; + pf_remove_if_empty_ruleset(rs); + } + if (nadd != NULL) + *nadd = xadd; + if (nchange != NULL) + *nchange = xchange; + + return (0); +} + +static void +pfr_commit_ktable(struct pfr_ktable *kt, long tzero) +{ + struct pfr_ktable *shadow = kt->pfrkt_shadow; + int nflags; + + PF_RULES_WASSERT(); + + if (shadow->pfrkt_cnt == NO_ADDRESSES) { + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + pfr_clstats_ktable(kt, tzero, 1); + } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) { + /* kt might contain addresses */ + struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq; + struct pfr_kentry *p, *q, *next; + struct pfr_addr ad; + + pfr_enqueue_addrs(shadow, &addrq, NULL, 0); + pfr_mark_addrs(kt); + SLIST_INIT(&addq); + SLIST_INIT(&changeq); + SLIST_INIT(&delq); + SLIST_INIT(&garbageq); + pfr_clean_node_mask(shadow, &addrq); + for (p = SLIST_FIRST(&addrq); p != NULL; p = next) { + next = SLIST_NEXT(p, pfrke_workq); /* XXX */ + pfr_copyout_addr(&ad, p); + q = pfr_lookup_addr(kt, &ad, 1); + if (q != NULL) { + if (q->pfrke_not != p->pfrke_not) + SLIST_INSERT_HEAD(&changeq, q, + pfrke_workq); + q->pfrke_mark = 1; + SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq); + } else { + p->pfrke_tzero = tzero; + SLIST_INSERT_HEAD(&addq, p, pfrke_workq); + } + } + pfr_enqueue_addrs(kt, &delq, NULL, ENQUEUE_UNMARKED_ONLY); + pfr_insert_kentries(kt, &addq, tzero); + pfr_remove_kentries(kt, &delq); + pfr_clstats_kentries(&changeq, tzero, 
INVERT_NEG_FLAG); + pfr_destroy_kentries(&garbageq); + } else { + /* kt cannot contain addresses */ + SWAP(struct radix_node_head *, kt->pfrkt_ip4, + shadow->pfrkt_ip4); + SWAP(struct radix_node_head *, kt->pfrkt_ip6, + shadow->pfrkt_ip6); + SWAP(int, kt->pfrkt_cnt, shadow->pfrkt_cnt); + pfr_clstats_ktable(kt, tzero, 1); + } + nflags = ((shadow->pfrkt_flags & PFR_TFLAG_USRMASK) | + (kt->pfrkt_flags & PFR_TFLAG_SETMASK) | PFR_TFLAG_ACTIVE) + & ~PFR_TFLAG_INACTIVE; + pfr_destroy_ktable(shadow, 0); + kt->pfrkt_shadow = NULL; + pfr_setflags_ktable(kt, nflags); +} + +static int +pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved) +{ + int i; + + if (!tbl->pfrt_name[0]) + return (-1); + if (no_reserved && !strcmp(tbl->pfrt_anchor, PF_RESERVED_ANCHOR)) + return (-1); + if (tbl->pfrt_name[PF_TABLE_NAME_SIZE-1]) + return (-1); + for (i = strlen(tbl->pfrt_name); i < PF_TABLE_NAME_SIZE; i++) + if (tbl->pfrt_name[i]) + return (-1); + if (pfr_fix_anchor(tbl->pfrt_anchor)) + return (-1); + if (tbl->pfrt_flags & ~allowedflags) + return (-1); + return (0); +} + +/* + * Rewrite anchors referenced by tables to remove slashes + * and check for validity. + */ +static int +pfr_fix_anchor(char *anchor) +{ + size_t siz = MAXPATHLEN; + int i; + + if (anchor[0] == '/') { + char *path; + int off; + + path = anchor; + off = 1; + while (*++path == '/') + off++; + bcopy(path, anchor, siz - off); + memset(anchor + siz - off, 0, off); + } + if (anchor[siz - 1]) + return (-1); + for (i = strlen(anchor); i < siz; i++) + if (anchor[i]) + return (-1); + return (0); +} + +static int +pfr_table_count(struct pfr_table *filter, int flags) +{ + struct pf_ruleset *rs; + + PF_RULES_ASSERT(); + + if (flags & PFR_FLAG_ALLRSETS) + return (pfr_ktable_cnt); + if (filter->pfrt_anchor[0]) { + rs = pf_find_ruleset(filter->pfrt_anchor); + return ((rs != NULL) ? 
rs->tables : -1); + } + return (pf_main_ruleset.tables); +} + +static int +pfr_skip_table(struct pfr_table *filter, struct pfr_ktable *kt, int flags) +{ + if (flags & PFR_FLAG_ALLRSETS) + return (0); + if (strcmp(filter->pfrt_anchor, kt->pfrkt_anchor)) + return (1); + return (0); +} + +static void +pfr_insert_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_insert_ktable(p); +} + +static void +pfr_insert_ktable(struct pfr_ktable *kt) +{ + + PF_RULES_WASSERT(); + + RB_INSERT(pfr_ktablehead, &pfr_ktables, kt); + pfr_ktable_cnt++; + if (kt->pfrkt_root != NULL) + if (!kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]++) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags|PFR_TFLAG_REFDANCHOR); +} + +static void +pfr_setflags_ktables(struct pfr_ktableworkq *workq) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_setflags_ktable(p, p->pfrkt_nflags); + } +} + +static void +pfr_setflags_ktable(struct pfr_ktable *kt, int newf) +{ + struct pfr_kentryworkq addrq; + + PF_RULES_WASSERT(); + + if (!(newf & PFR_TFLAG_REFERENCED) && + !(newf & PFR_TFLAG_PERSIST)) + newf &= ~PFR_TFLAG_ACTIVE; + if (!(newf & PFR_TFLAG_ACTIVE)) + newf &= ~PFR_TFLAG_USRMASK; + if (!(newf & PFR_TFLAG_SETMASK)) { + RB_REMOVE(pfr_ktablehead, &pfr_ktables, kt); + if (kt->pfrkt_root != NULL) + if (!--kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]) + pfr_setflags_ktable(kt->pfrkt_root, + kt->pfrkt_root->pfrkt_flags & + ~PFR_TFLAG_REFDANCHOR); + pfr_destroy_ktable(kt, 1); + pfr_ktable_cnt--; + return; + } + if (!(newf & PFR_TFLAG_ACTIVE) && kt->pfrkt_cnt) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_remove_kentries(kt, &addrq); + } + if (!(newf & PFR_TFLAG_INACTIVE) && kt->pfrkt_shadow != NULL) { + pfr_destroy_ktable(kt->pfrkt_shadow, 1); + kt->pfrkt_shadow = NULL; + } + kt->pfrkt_flags = newf; +} + +static void +pfr_clstats_ktables(struct pfr_ktableworkq *workq, long tzero, int recurse) +{ + struct pfr_ktable *p; + + SLIST_FOREACH(p, workq, pfrkt_workq) + pfr_clstats_ktable(p, tzero, recurse); +} + +static void +pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse) +{ + struct pfr_kentryworkq addrq; + + if (recurse) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clstats_kentries(&addrq, tzero, 0); + } + bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets)); + bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes)); + kt->pfrkt_match = kt->pfrkt_nomatch = 0; + kt->pfrkt_tzero = tzero; +} + +static struct pfr_ktable * +pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset) +{ + struct pfr_ktable *kt; + struct pf_ruleset *rs; + + PF_RULES_WASSERT(); + + kt = malloc(sizeof(*kt), M_PFTABLE, M_NOWAIT|M_ZERO); + if (kt == NULL) + return (NULL); + kt->pfrkt_t = *tbl; + + if (attachruleset) { + rs = pf_find_or_create_ruleset(tbl->pfrt_anchor); + if (!rs) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_rs = rs; + rs->tables++; + } + + if (!rn_inithead((void **)&kt->pfrkt_ip4, + offsetof(struct sockaddr_in, sin_addr) * 8) || + !rn_inithead((void **)&kt->pfrkt_ip6, + offsetof(struct sockaddr_in6, sin6_addr) * 8)) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + kt->pfrkt_tzero = tzero; + + return (kt); +} + +static void +pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr) +{ + struct pfr_ktable *p, *q; + + for (p = SLIST_FIRST(workq); p; p = q) { + q = SLIST_NEXT(p, pfrkt_workq); + pfr_destroy_ktable(p, flushaddr); + } +} + +static void 
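+/*
+ * Tear down a ktable: optionally flush its entries, free both radix
+ * heads, recurse into any shadow table and drop the ruleset reference.
+ */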
+pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr) +{ + struct pfr_kentryworkq addrq; + + if (flushaddr) { + pfr_enqueue_addrs(kt, &addrq, NULL, 0); + pfr_clean_node_mask(kt, &addrq); + pfr_destroy_kentries(&addrq); + } + if (kt->pfrkt_ip4 != NULL) { + RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip4); + free((caddr_t)kt->pfrkt_ip4, M_RTABLE); + } + if (kt->pfrkt_ip6 != NULL) { + RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip6); + free((caddr_t)kt->pfrkt_ip6, M_RTABLE); + } + if (kt->pfrkt_shadow != NULL) + pfr_destroy_ktable(kt->pfrkt_shadow, flushaddr); + if (kt->pfrkt_rs != NULL) { + kt->pfrkt_rs->tables--; + pf_remove_if_empty_ruleset(kt->pfrkt_rs); + } + free(kt, M_PFTABLE); +} + +static int +pfr_ktable_compare(struct pfr_ktable *p, struct pfr_ktable *q) +{ + int d; + + if ((d = strncmp(p->pfrkt_name, q->pfrkt_name, PF_TABLE_NAME_SIZE))) + return (d); + return (strcmp(p->pfrkt_anchor, q->pfrkt_anchor)); +} + +static struct pfr_ktable * +pfr_lookup_table(struct pfr_table *tbl) +{ + /* struct pfr_ktable start like a struct pfr_table */ + return (RB_FIND(pfr_ktablehead, &pfr_ktables, + (struct pfr_ktable *)tbl)); +} + +int +pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af) +{ + struct pfr_kentry *ke = NULL; + int match; + + PF_RULES_RASSERT(); + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (0); + + switch (af) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in sin; + + bzero(&sin, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&sin, kt->pfrkt_ip4); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 sin6; + + bzero(&sin6, sizeof(sin6)); + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&sin6, kt->pfrkt_ip6); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET6 */ + } + match = (ke && !ke->pfrke_not); + if (match) + kt->pfrkt_match++; + else + kt->pfrkt_nomatch++; + return (match); +} + +void +pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af, + u_int64_t len, int dir_out, int op_pass, int notrule) +{ + struct pfr_kentry *ke = NULL; + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return; + + switch (af) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in sin; + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = a->addr32[0]; + ke = (struct pfr_kentry *)rn_match(&sin, kt->pfrkt_ip4); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 sin6; + + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); + ke = (struct pfr_kentry *)rn_match(&sin6, kt->pfrkt_ip6); + if (ke && KENTRY_RNF_ROOT(ke)) + ke = NULL; + break; + } +#endif /* INET6 */ + default: + ; + } + if ((ke == NULL || ke->pfrke_not) != notrule) { + if (op_pass != PFR_OP_PASS) + printf("pfr_update_stats: assertion failed.\n"); + op_pass = PFR_OP_XPASS; + } + kt->pfrkt_packets[dir_out][op_pass]++; + kt->pfrkt_bytes[dir_out][op_pass] += len; + if (ke != NULL && op_pass != PFR_OP_XPASS && + (kt->pfrkt_flags & PFR_TFLAG_COUNTERS)) { 
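+		/* per-entry counters are allocated lazily on first use */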
+ if (ke->pfrke_counters == NULL) + ke->pfrke_counters = uma_zalloc(V_pfr_kcounters_z, + M_NOWAIT | M_ZERO); + if (ke->pfrke_counters != NULL) { + ke->pfrke_counters->pfrkc_packets[dir_out][op_pass]++; + ke->pfrke_counters->pfrkc_bytes[dir_out][op_pass] += len; + } + } +} + +struct pfr_ktable * +pfr_attach_table(struct pf_ruleset *rs, char *name) +{ + struct pfr_ktable *kt, *rt; + struct pfr_table tbl; + struct pf_anchor *ac = rs->anchor; + + PF_RULES_WASSERT(); + + bzero(&tbl, sizeof(tbl)); + strlcpy(tbl.pfrt_name, name, sizeof(tbl.pfrt_name)); + if (ac != NULL) + strlcpy(tbl.pfrt_anchor, ac->path, sizeof(tbl.pfrt_anchor)); + kt = pfr_lookup_table(&tbl); + if (kt == NULL) { + kt = pfr_create_ktable(&tbl, time_second, 1); + if (kt == NULL) + return (NULL); + if (ac != NULL) { + bzero(tbl.pfrt_anchor, sizeof(tbl.pfrt_anchor)); + rt = pfr_lookup_table(&tbl); + if (rt == NULL) { + rt = pfr_create_ktable(&tbl, 0, 1); + if (rt == NULL) { + pfr_destroy_ktable(kt, 0); + return (NULL); + } + pfr_insert_ktable(rt); + } + kt->pfrkt_root = rt; + } + pfr_insert_ktable(kt); + } + if (!kt->pfrkt_refcnt[PFR_REFCNT_RULE]++) + pfr_setflags_ktable(kt, kt->pfrkt_flags|PFR_TFLAG_REFERENCED); + return (kt); +} + +void +pfr_detach_table(struct pfr_ktable *kt) +{ + + PF_RULES_WASSERT(); + KASSERT(kt->pfrkt_refcnt[PFR_REFCNT_RULE] > 0, ("%s: refcount %d\n", + __func__, kt->pfrkt_refcnt[PFR_REFCNT_RULE])); + + if (!--kt->pfrkt_refcnt[PFR_REFCNT_RULE]) + pfr_setflags_ktable(kt, kt->pfrkt_flags&~PFR_TFLAG_REFERENCED); +} + +int +pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter, + sa_family_t af) +{ + struct pf_addr *addr, *cur, *mask; + union sockaddr_union uaddr, umask; + struct pfr_kentry *ke, *ke2 = NULL; + int idx = -1, use_counter = 0; + + switch (af) { + case AF_INET: + uaddr.sin.sin_len = sizeof(struct sockaddr_in); + uaddr.sin.sin_family = AF_INET; + break; + case AF_INET6: + uaddr.sin6.sin6_len = sizeof(struct sockaddr_in6); + uaddr.sin6.sin6_family = AF_INET6; + break; + } + addr = SUNION2PF(&uaddr, af); + + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL) + kt = kt->pfrkt_root; + if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE)) + return (-1); + + if (pidx != NULL) + idx = *pidx; + if (counter != NULL && idx >= 0) + use_counter = 1; + if (idx < 0) + idx = 0; + +_next_block: + ke = pfr_kentry_byidx(kt, idx, af); + if (ke == NULL) { + kt->pfrkt_nomatch++; + return (1); + } + pfr_prepare_network(&umask, af, ke->pfrke_net); + cur = SUNION2PF(&ke->pfrke_sa, af); + mask = SUNION2PF(&umask, af); + + if (use_counter) { + /* is supplied address within block? 
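+		 * (if so, resume the scan from the stored counter instead
+		 * of restarting at the block's first address)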
+		 */
+		if (!PF_MATCHA(0, cur, mask, counter, af)) {
+			/* no, go to next block in table */
+			idx++;
+			use_counter = 0;
+			goto _next_block;
+		}
+		PF_ACPY(addr, counter, af);
+	} else {
+		/* use first address of block */
+		PF_ACPY(addr, cur, af);
+	}
+
+	if (!KENTRY_NETWORK(ke)) {
+		/* this is a single IP address - no possible nested block */
+		PF_ACPY(counter, addr, af);
+		*pidx = idx;
+		kt->pfrkt_match++;
+		return (0);
+	}
+	for (;;) {
+		/* we don't want to use a nested block */
+		switch (af) {
+		case AF_INET:
+			ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+			    kt->pfrkt_ip4);
+			break;
+		case AF_INET6:
+			ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+			    kt->pfrkt_ip6);
+			break;
+		}
+		/* no need to check KENTRY_RNF_ROOT() here */
+		if (ke2 == ke) {
+			/* lookup returned the same block - perfect */
+			PF_ACPY(counter, addr, af);
+			*pidx = idx;
+			kt->pfrkt_match++;
+			return (0);
+		}
+
+		/* we need to increase the counter past the nested block */
+		pfr_prepare_network(&umask, af, ke2->pfrke_net);
+		PF_POOLMASK(addr, addr, SUNION2PF(&umask, af), &pfr_ffaddr, af);
+		PF_AINC(addr, af);
+		if (!PF_MATCHA(0, cur, mask, addr, af)) {
+			/* ok, we reached the end of our main block */
+			/* go to next block in table */
+			idx++;
+			use_counter = 0;
+			goto _next_block;
+		}
+	}
+}
+
+static struct pfr_kentry *
+pfr_kentry_byidx(struct pfr_ktable *kt, int idx, int af)
+{
+	struct pfr_walktree w;
+
+	bzero(&w, sizeof(w));
+	w.pfrw_op = PFRW_POOL_GET;
+	w.pfrw_cnt = idx;
+
+	switch (af) {
+#ifdef INET
+	case AF_INET:
+		kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+		return (w.pfrw_kentry);
+#endif /* INET */
+#ifdef INET6
+	case AF_INET6:
+		kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w);
+		return (w.pfrw_kentry);
+#endif /* INET6 */
+	default:
+		return (NULL);
+	}
+}
+
+void
+pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn)
+{
+	struct pfr_walktree w;
+
+	bzero(&w, sizeof(w));
+	w.pfrw_op = PFRW_DYNADDR_UPDATE;
+	w.pfrw_dyn = dyn;
+
+	dyn->pfid_acnt4 = 0;
+	dyn->pfid_acnt6 = 0;
+	if (!dyn->pfid_af || dyn->pfid_af == AF_INET)
+		kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+	if (!dyn->pfid_af || dyn->pfid_af == AF_INET6)
+		kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w);
+}