diff options
Diffstat (limited to 'sys/net')
-rw-r--r-- | sys/net/if.h | 4 | ||||
-rw-r--r-- | sys/net/if_arcsubr.c | 4 | ||||
-rw-r--r-- | sys/net/if_bridge.c | 49 | ||||
-rw-r--r-- | sys/net/if_ef.c | 2 | ||||
-rw-r--r-- | sys/net/if_ethersubr.c | 19 | ||||
-rw-r--r-- | sys/net/if_fddisubr.c | 2 | ||||
-rw-r--r-- | sys/net/if_fwsubr.c | 2 | ||||
-rw-r--r-- | sys/net/if_iso88025subr.c | 2 | ||||
-rw-r--r-- | sys/net/if_lagg.c | 38 | ||||
-rw-r--r-- | sys/net/if_pflog.h | 4 | ||||
-rw-r--r-- | sys/net/if_pfsync.h | 3 | ||||
-rw-r--r-- | sys/net/if_stf.c | 887 | ||||
-rw-r--r-- | sys/net/if_stf.h | 9 | ||||
-rw-r--r-- | sys/net/if_tun.c | 79 | ||||
-rw-r--r-- | sys/net/if_vlan.c | 270 | ||||
-rw-r--r-- | sys/net/if_vlan_var.h | 27 | ||||
-rw-r--r-- | sys/net/netisr.c | 25 | ||||
-rw-r--r-- | sys/net/pfil.c | 147 | ||||
-rw-r--r-- | sys/net/pfil.h | 7 | ||||
-rw-r--r-- | sys/net/pfkeyv2.h | 43 | ||||
-rw-r--r-- | sys/net/pfvar.h | 94 |
21 files changed, 1375 insertions, 342 deletions
diff --git a/sys/net/if.h b/sys/net/if.h index 04bd1f1..ec6287d 100644 --- a/sys/net/if.h +++ b/sys/net/if.h @@ -245,7 +245,7 @@ struct if_data { #define IFCAP_CANTCHANGE (IFCAP_NETMAP) -#define IFQ_MAXLEN 50 +#define IFQ_MAXLEN 128 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* @@ -392,6 +392,7 @@ struct ifreq { caddr_t ifru_data; int ifru_cap[2]; u_int ifru_fib; + u_char ifru_vlan_pcp; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -409,6 +410,7 @@ struct ifreq { #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ #define ifr_fib ifr_ifru.ifru_fib /* interface fib */ +#define ifr_vlan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ diff --git a/sys/net/if_arcsubr.c b/sys/net/if_arcsubr.c index 2f94785..1091ae4 100644 --- a/sys/net/if_arcsubr.c +++ b/sys/net/if_arcsubr.c @@ -557,15 +557,11 @@ arc_input(struct ifnet *ifp, struct mbuf *m) #ifdef INET case ARCTYPE_IP: m_adj(m, ARC_HDRNEWLEN); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; case ARCTYPE_IP_OLD: m_adj(m, ARC_HDRLEN); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index f7c6365..14d9967 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -243,6 +243,7 @@ static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); +static void bridge_start(struct ifnet *); static int bridge_transmit(struct ifnet *, struct mbuf *); static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); @@ -607,10 +608,13 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) if_initname(ifp, bridge_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = bridge_ioctl; + ifp->if_start = bridge_start; ifp->if_transmit = bridge_transmit; ifp->if_qflush = bridge_qflush; ifp->if_init = bridge_init; ifp->if_type = IFT_BRIDGE; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + IFQ_SET_READY(&ifp->if_snd); /* * Generate an ethernet address with a locally administered address. @@ -815,6 +819,8 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp->if_type == IFT_GIF) + continue; if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) { log(LOG_NOTICE, "%s: invalid MTU: %lu(%s)" " != %d\n", sc->sc_ifp->if_xname, @@ -1106,6 +1112,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) } #endif /* Allow the first Ethernet member to define the MTU */ + if (ifs->if_type != IFT_GIF) { if (LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { @@ -1113,6 +1120,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu); return (EINVAL); } + } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) @@ -2071,6 +2079,47 @@ bridge_qflush(struct ifnet *ifp __unused) } /* + * bridge_start: + * + * Start output on a bridge. + * + */ +static void +bridge_start(struct ifnet *ifp) +{ + struct bridge_softc *sc; + struct mbuf *m; + struct ether_header *eh; + struct ifnet *dst_if; + + sc = ifp->if_softc; + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == 0) + break; + ETHER_BPF_MTAP(ifp, m); + + eh = mtod(m, struct ether_header *); + dst_if = NULL; + + BRIDGE_LOCK(sc); + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { + dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1); + } + + if (dst_if == NULL) + bridge_broadcast(sc, ifp, m, 0); + else { + BRIDGE_UNLOCK(sc); + bridge_enqueue(sc, dst_if, m); + } + } + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; +} + +/* * bridge_forward: * * The forwarding function of the bridge. diff --git a/sys/net/if_ef.c b/sys/net/if_ef.c index fc6402c..478dfb4 100644 --- a/sys/net/if_ef.c +++ b/sys/net/if_ef.c @@ -240,8 +240,6 @@ ef_inputEII(struct mbuf *m, struct ether_header *eh, u_short ether_type) #endif #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return (0); isr = NETISR_IP; break; diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index ea22d33..773918f 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -108,6 +108,8 @@ CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ +SYSCTL_DECL(_net_link); + /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); @@ -610,7 +612,7 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) return; } eh = mtod(m, struct ether_header *); - } + } else { #if defined(INET) || defined(INET6) /* @@ -639,6 +641,7 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } + } if (harvest.ethernet) random_harvest(&(m->m_data), 12, 2, RANDOM_NET_ETHER); @@ -685,6 +688,9 @@ vnet_ether_init(__unused void *arg) if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil link hook, " "error %d\n", __func__, i); + else + pfil_head_export_sysctl(&V_link_pfil_hook, + SYSCTL_STATIC_CHILDREN(_net_link)); } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); @@ -737,7 +743,11 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) if (i != 0 || m == NULL) return; - } + + /* M_PROTO2 is for M_IP[6]_NEXTHOP */ + i = m->m_flags & (M_FASTFWD_OURS|M_PROTO2); + } else + i = 0; eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); @@ -776,6 +786,8 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); + if (i) + m->m_flags |= M_FASTFWD_OURS|M_PROTO2; m_adj(m, ETHER_HDR_LEN); /* @@ -784,8 +796,6 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) switch (ether_type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -972,7 +982,6 @@ ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) } #endif -SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 diff --git a/sys/net/if_fddisubr.c b/sys/net/if_fddisubr.c index 4f54dcb..271f72c 100644 --- a/sys/net/if_fddisubr.c +++ b/sys/net/if_fddisubr.c @@ -501,8 +501,6 @@ fddi_input(ifp, m) switch (type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; diff --git a/sys/net/if_fwsubr.c b/sys/net/if_fwsubr.c index 31fc2a9..acac423 100644 --- a/sys/net/if_fwsubr.c +++ b/sys/net/if_fwsubr.c @@ -595,8 +595,6 @@ firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) switch (type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; diff --git a/sys/net/if_iso88025subr.c b/sys/net/if_iso88025subr.c index 5975b28..f96df4e 100644 --- a/sys/net/if_iso88025subr.c +++ b/sys/net/if_iso88025subr.c @@ -579,8 +579,6 @@ iso88025_input(ifp, m) #ifdef INET case ETHERTYPE_IP: th->iso88025_shost[0] &= ~(TR_RII); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; diff --git a/sys/net/if_lagg.c b/sys/net/if_lagg.c index 87f358e..93ec82e 100644 --- a/sys/net/if_lagg.c +++ b/sys/net/if_lagg.c @@ -125,6 +125,7 @@ static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static int lagg_transmit(struct ifnet *, struct mbuf *); +static void lagg_start(struct ifnet *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); @@ -358,11 +359,14 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; + ifp->if_start = lagg_start; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + IFQ_SET_READY(&ifp->if_snd); /* * Attach as an ordinary ethernet device, children will be attached @@ -1494,6 +1498,40 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m) return (error); } +static void +lagg_start(struct ifnet *ifp) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct rm_priotracker tracker; + struct mbuf *m; + int error = 0, len; + + LAGG_RLOCK(sc, &tracker); + /* We need a Tx algorithm and at least one port */ + if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { + IF_DRAIN(&ifp->if_snd); + LAGG_RUNLOCK(sc, &tracker); + return; + } + + for (;; error = 0) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + ETHER_BPF_MTAP(ifp, m); + + len = m->m_pkthdr.len; + error = (*sc->sc_start)(sc, m); + if (error == 0) { + counter_u64_add(sc->sc_opackets, 1); + counter_u64_add(sc->sc_obytes, len); + } else + ifp->if_oerrors++; + } + LAGG_RUNLOCK(sc, &tracker); +} + /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ diff --git a/sys/net/if_pflog.h b/sys/net/if_pflog.h index 0faeb7d..326b551 100644 --- a/sys/net/if_pflog.h +++ b/sys/net/if_pflog.h @@ -40,10 +40,14 @@ struct pfloghdr { char ruleset[PFLOG_RULESET_NAME_SIZE]; u_int32_t rulenr; u_int32_t subrulenr; +#ifdef PF_USER_INFO uid_t uid; pid_t pid; uid_t rule_uid; pid_t rule_pid; +#else + u_int32_t ridentifier; +#endif u_int8_t dir; u_int8_t pad[3]; }; diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index 7a72bbb..ef8ba1f 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -241,6 +241,9 @@ struct pfsyncreq { char pfsyncr_syncdev[IFNAMSIZ]; struct in_addr pfsyncr_syncpeer; int pfsyncr_maxupdates; +#define PFSYNCF_OK 0x00000001 +#define PFSYNCF_DEFER 0x00000002 +#define PFSYNCF_PUSH 0x00000004 int pfsyncr_defer; }; diff --git a/sys/net/if_stf.c b/sys/net/if_stf.c index 15d7d64..9d923ad 100644 --- a/sys/net/if_stf.c +++ b/sys/net/if_stf.c @@ -3,6 +3,8 @@ /*- * Copyright (C) 2000 WIDE Project. + * Copyright (c) 2010 Hiroki Sato <hrs@FreeBSD.org> + * Copyright (c) 2013 Ermal Luçi <eri@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,7 +33,7 @@ */ /* - * 6to4 interface, based on RFC3056. + * 6to4 interface, based on RFC3056 + 6rd (RFC5569) support. * * 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting. * There is no address mapping defined from IPv6 multicast address to IPv4 @@ -60,7 +62,7 @@ * ICMPv6: * - Redirects cannot be used due to the lack of link-local address. * - * stf interface does not have, and will not need, a link-local address. + * stf interface does not have, and will not need, a link-local address. * It seems to have no real benefit and does not help the above symptoms much. * Even if we assign link-locals to interface, we cannot really * use link-local unicast/multicast on top of 6to4 cloud (since there's no @@ -72,6 +74,12 @@ * http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt * for details. The code tries to filter out some of malicious packets. * Note that there is no way to be 100% secure. + * + * 6rd (RFC5569 & RFC5969) extension is enabled when an IPv6 GUA other than + * 2002::/16 is assigned. The stf(4) recognizes a 32-bit just after + * prefixlen as the IPv4 address of the 6rd customer site. The + * prefixlen must be shorter than 32. + * */ #include "opt_inet.h" @@ -92,13 +100,14 @@ #include <machine/cpu.h> #include <sys/malloc.h> +#include <sys/priv.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/route.h> #include <net/netisr.h> #include <net/if_types.h> -#include <net/if_stf.h> #include <net/vnet.h> #include <netinet/in.h> @@ -106,6 +115,7 @@ #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/in_var.h> +#include <net/if_stf.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> @@ -120,20 +130,48 @@ #include <security/mac/mac_framework.h> +#define STF_DEBUG 1 +#if STF_DEBUG > 3 +#define ip_sprintf(buf, a) \ + sprintf(buf, "%u.%u.%u.%u", \ + (ntohl((a)->s_addr)>>24)&0xFF, \ + (ntohl((a)->s_addr)>>16)&0xFF, \ + (ntohl((a)->s_addr)>>8)&0xFF, \ + (ntohl((a)->s_addr))&0xFF); +#endif + +#if STF_DEBUG +#define DEBUG_PRINTF(a, ...) \ + do { \ + if (V_stf_debug >= a) \ + printf(__VA_ARGS__); \ + } while (0) +#else +#define DEBUG_PRINTF(a, ...) +#endif + SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); -static int stf_route_cache = 1; -SYSCTL_INT(_net_link_stf, OID_AUTO, route_cache, CTLFLAG_RW, - &stf_route_cache, 0, "Caching of IPv4 routes for 6to4 Output"); +static VNET_DEFINE(int, stf_route_cache) = 0; +#define V_stf_route_cache VNET(stf_route_cache) +SYSCTL_VNET_INT(_net_link_stf, OID_AUTO, route_cache, CTLFLAG_RW, + &VNET_NAME(stf_route_cache), 0, + "Enable caching of IPv4 routes for 6to4 output."); + +#if STF_DEBUG +static VNET_DEFINE(int, stf_debug) = 0; +#define V_stf_debug VNET(stf_debug) +SYSCTL_VNET_INT(_net_link_stf, OID_AUTO, stf_debug, CTLFLAG_RW, + &VNET_NAME(stf_debug), 0, + "Enable displaying verbose debug message of stf interfaces"); +#endif -static int stf_permit_rfc1918 = 0; +static int stf_permit_rfc1918 = 1; TUNABLE_INT("net.link.stf.permit_rfc1918", &stf_permit_rfc1918); SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RW | CTLFLAG_TUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); -#define STFUNIT 0 - #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) /* @@ -144,24 +182,37 @@ SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RW | CTLFLAG_TUN, struct stf_softc { struct ifnet *sc_ifp; + in_addr_t dstv4_addr; + in_addr_t srcv4_addr; + in_addr_t inaddr; + u_int v4prefixlen; union { struct route __sc_ro4; struct route_in6 __sc_ro6; /* just for safety */ } __sc_ro46; #define sc_ro __sc_ro46.__sc_ro4 - struct mtx sc_ro_mtx; + struct mtx sc_mtx; u_int sc_fibnum; const struct encaptab *encap_cookie; + u_int sc_flags; + LIST_ENTRY(stf_softc) stf_list; }; #define STF2IFP(sc) ((sc)->sc_ifp) static const char stfname[] = "stf"; -/* - * Note that mutable fields in the softc are not currently locked. - * We do lock sc_ro in stf_output though. - */ +static struct mtx stf_mtx; static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); +static VNET_DEFINE(LIST_HEAD(, stf_softc), stf_softc_list); +#define V_stf_softc_list VNET(stf_softc_list) + +#define STF_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "stf softc", \ + NULL, MTX_DEF); +#define STF_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) +#define STF_LOCK(sc) mtx_lock(&(sc)->sc_mtx) +#define STF_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) +#define STF_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) + static const int ip_stf_ttl = 40; extern struct domain inetdomain; @@ -176,8 +227,6 @@ struct protosw in_stf_protosw = { .pr_usrreqs = &rip_usrreqs }; -static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; - static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *); @@ -191,66 +240,42 @@ static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); -static int stf_clone_match(struct if_clone *, const char *); -static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); -static int stf_clone_destroy(struct if_clone *, struct ifnet *); -static struct if_clone *stf_cloner; +#define STF_GETIN4_USE_CACHE 1 +static struct sockaddr_in *stf_getin4addr(struct stf_softc *, struct sockaddr_in *, + struct ifaddr *, int); +static struct sockaddr_in *stf_getin4addr_in6(struct stf_softc *, struct sockaddr_in *, + struct ifaddr *, const struct in6_addr *); +static struct sockaddr_in *stf_getin4addr_sin6(struct stf_softc *, struct sockaddr_in *, + struct ifaddr *, struct sockaddr_in6 *); +static int stf_clone_create(struct if_clone *, int, caddr_t); +static void stf_clone_destroy(struct ifnet *); -static int -stf_clone_match(struct if_clone *ifc, const char *name) -{ - int i; - - for(i = 0; stfnames[i] != NULL; i++) { - if (strcmp(stfnames[i], name) == 0) - return (1); - } - - return (0); -} +static struct if_clone *stf_cloner; static int -stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +stf_clone_create(struct if_clone *ifc, int unit, caddr_t params) { - int err, unit; struct stf_softc *sc; struct ifnet *ifp; - /* - * We can only have one unit, but since unit allocation is - * already locked, we use it to keep from allocating extra - * interfaces. - */ - unit = STFUNIT; - err = ifc_alloc_unit(ifc, &unit); - if (err != 0) - return (err); - sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO); + sc->sc_fibnum = curthread->td_proc->p_fibnum; ifp = STF2IFP(sc) = if_alloc(IFT_STF); - if (ifp == NULL) { + if (sc->sc_ifp == NULL) { free(sc, M_STF); - ifc_free_unit(ifc, unit); - return (ENOSPC); + return (ENOMEM); } + STF_LOCK_INIT(sc); ifp->if_softc = sc; - sc->sc_fibnum = curthread->td_proc->p_fibnum; - /* - * Set the name manually rather then using if_initname because - * we don't conform to the default naming convention for interfaces. - */ - strlcpy(ifp->if_xname, name, IFNAMSIZ); - ifp->if_dname = stfname; - ifp->if_dunit = IF_DUNIT_NONE; + if_initname(ifp, stfname, unit); - mtx_init(&(sc)->sc_ro_mtx, "stf ro", NULL, MTX_DEF); sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6, stf_encapcheck, &in_stf_protosw, sc); if (sc->encap_cookie == NULL) { if_printf(ifp, "attach failed\n"); + if_free(ifp); free(sc, M_STF); - ifc_free_unit(ifc, unit); return (ENOMEM); } @@ -260,42 +285,56 @@ stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifp->if_snd.ifq_maxlen = ifqmaxlen; if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + + mtx_lock(&stf_mtx); + LIST_INSERT_HEAD(&V_stf_softc_list, sc, stf_list); + mtx_unlock(&stf_mtx); + return (0); } -static int -stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) +static void +stf_clone_destroy(struct ifnet *ifp) { struct stf_softc *sc = ifp->if_softc; int err; + mtx_lock(&stf_mtx); + LIST_REMOVE(sc, stf_list); + mtx_unlock(&stf_mtx); + err = encap_detach(sc->encap_cookie); KASSERT(err == 0, ("Unexpected error detaching encap_cookie")); - mtx_destroy(&(sc)->sc_ro_mtx); bpfdetach(ifp); if_detach(ifp); if_free(ifp); + STF_LOCK_DESTROY(sc); free(sc, M_STF); - ifc_free_unit(ifc, STFUNIT); +} - return (0); +static void +vnet_stf_init(const void *unused __unused) +{ + + LIST_INIT(&V_stf_softc_list); } +VNET_SYSINIT(vnet_stf_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_stf_init, + NULL); static int -stfmodevent(mod, type, data) - module_t mod; - int type; - void *data; +stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, - stf_clone_create, stf_clone_destroy); + mtx_init(&stf_mtx, "stf_mtx", NULL, MTX_DEF); + stf_cloner = if_clone_simple(stfname, + stf_clone_create, stf_clone_destroy, 0); break; case MOD_UNLOAD: if_clone_detach(stf_cloner); + mtx_destroy(&stf_mtx); break; default: return (EOPNOTSUPP); @@ -311,28 +350,31 @@ static moduledata_t stf_mod = { }; DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_stf, 1); static int -stf_encapcheck(m, off, proto, arg) - const struct mbuf *m; - int off; - int proto; - void *arg; +stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct in6_ifaddr *ia6; + struct sockaddr_in ia6_in4addr; + struct sockaddr_in ia6_in4mask; + struct sockaddr_in *sin; struct stf_softc *sc; - struct in_addr a, b, mask; + struct ifnet *ifp; + int ret = 0; + DEBUG_PRINTF(1, "%s: enter\n", __func__); sc = (struct stf_softc *)arg; if (sc == NULL) return 0; + ifp = STF2IFP(sc); - if ((STF2IFP(sc)->if_flags & IFF_UP) == 0) + if ((ifp->if_flags & IFF_UP) == 0) return 0; /* IFF_LINK0 means "no decapsulation" */ - if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0) + if ((ifp->if_flags & IFF_LINK0) != 0) return 0; if (proto != IPPROTO_IPV6) @@ -344,72 +386,169 @@ stf_encapcheck(m, off, proto, arg) if (ip.ip_v != 4) return 0; - ia6 = stf_getsrcifa6(STF2IFP(sc)); + /* Lookup an ia6 whose IPv4 addr encoded in the IPv6 addr is valid. */ + ia6 = stf_getsrcifa6(ifp); if (ia6 == NULL) return 0; + if (sc->srcv4_addr != INADDR_ANY) { + sin = &ia6_in4addr; + sin->sin_addr.s_addr = sc->srcv4_addr; + sin->sin_family = AF_INET; + } else { + sin = stf_getin4addr(sc, &ia6_in4addr, &ia6->ia_ifa, STF_GETIN4_USE_CACHE); + if (sin == NULL) + return (0); + } + +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip6_sprintf(buf, &satosin6(ia6->ia_ifa.ifa_addr)->sin6_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_ifa.ifa_addr = %s\n", __func__, buf); + ip6_sprintf(buf, &ia6->ia_addr.sin6_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_addr = %s\n", __func__, buf); + ip6_sprintf(buf, &satosin6(ia6->ia_ifa.ifa_netmask)->sin6_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_ifa.ifa_netmask = %s\n", __func__, buf); + ip6_sprintf(buf, &ia6->ia_prefixmask.sin6_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_prefixmask = %s\n", __func__, buf); + + ip_sprintf(buf, &ia6_in4addr.sin_addr); + DEBUG_PRINTF(1, "%s: ia6_in4addr.sin_addr = %s\n", __func__, buf); + ip_sprintf(buf, &ip.ip_src); + DEBUG_PRINTF(1, "%s: ip.ip_src = %s\n", __func__, buf); + ip_sprintf(buf, &ip.ip_dst); + DEBUG_PRINTF(1, "%s: ip.ip_dst = %s\n", __func__, buf); + } +#endif /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ - if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, - sizeof(ip.ip_dst)) != 0) { - ifa_free(&ia6->ia_ifa); - return 0; + DEBUG_PRINTF(1, "%s: check1: ia6_in4addr.sin_addr == ip.ip_dst?\n", __func__); + if (ia6_in4addr.sin_addr.s_addr != ip.ip_dst.s_addr) { + DEBUG_PRINTF(1, "%s: check1: false. Ignore this packet.\n", __func__); + goto freeit; } - /* - * check if IPv4 src matches the IPv4 address derived from the - * local 6to4 address masked by prefixmask. - * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 - * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 - */ - bzero(&a, sizeof(a)); - bcopy(GET_V4(&ia6->ia_addr.sin6_addr), &a, sizeof(a)); - bcopy(GET_V4(&ia6->ia_prefixmask.sin6_addr), &mask, sizeof(mask)); - ifa_free(&ia6->ia_ifa); - a.s_addr &= mask.s_addr; - b = ip.ip_src; - b.s_addr &= mask.s_addr; - if (a.s_addr != b.s_addr) - return 0; + DEBUG_PRINTF(1, "%s: check2: ia6->ia_addr is 2002::/16?\n", __func__); + + if (IN6_IS_ADDR_6TO4(&ia6->ia_addr.sin6_addr)) { + /* 6to4 (RFC 3056) */ + /* + * check if IPv4 src matches the IPv4 address derived + * from the local 6to4 address masked by prefixmask. + * success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24 + * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 + */ + DEBUG_PRINTF(1, "%s: check2: true.\n", __func__); + + memcpy(&ia6_in4mask.sin_addr, + GET_V4(&ia6->ia_prefixmask.sin6_addr), + sizeof(ia6_in4mask)); +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip_sprintf(buf, &ia6_in4addr.sin_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_addr = %s\n", + __func__, buf); + ip_sprintf(buf, &ip.ip_src); + DEBUG_PRINTF(1, "%s: ip.ip_src = %s\n", + __func__, buf); + ip_sprintf(buf, &ia6_in4mask.sin_addr); + DEBUG_PRINTF(1, "%s: ia6->ia_prefixmask = %s\n", + __func__, buf); + + DEBUG_PRINTF(1, "%s: check3: ia6_in4addr.sin_addr & mask == ip.ip_src & mask\n", + __func__); + } +#endif + + if ((ia6_in4addr.sin_addr.s_addr & ia6_in4mask.sin_addr.s_addr) != + (ip.ip_src.s_addr & ia6_in4mask.sin_addr.s_addr)) { + DEBUG_PRINTF(1, "%s: check3: false. Ignore this packet.\n", + __func__); + goto freeit; + } + } else { + /* 6rd (RFC 5569) */ + DEBUG_PRINTF(1, "%s: check2: false. 6rd.\n", __func__); + /* + * No restriction on the src address in the case of + * 6rd because the stf(4) interface always has a + * prefix which covers whole of IPv4 src address + * range. So, stf_output() will catch all of + * 6rd-capsuled IPv4 traffic with suspicious inner dst + * IPv4 address (i.e. the IPv6 destination address is + * one the admin does not like to route to outside), + * and then it discard them silently. + */ + } + DEBUG_PRINTF(1, "%s: all clear!\n", __func__); /* stf interface makes single side match only */ - return 32; + ret = 32; +freeit: + ifa_free(&ia6->ia_ifa); + + return (ret); } static struct in6_ifaddr * -stf_getsrcifa6(ifp) - struct ifnet *ifp; +stf_getsrcifa6(struct ifnet *ifp) { - struct ifaddr *ia; + struct ifaddr *ifa; struct in_ifaddr *ia4; - struct sockaddr_in6 *sin6; - struct in_addr in; + struct sockaddr_in *sin; + struct sockaddr_in in4; + struct stf_softc *sc; + + sc = ifp->if_softc; if_addr_rlock(ifp); - TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr->sa_family != AF_INET6) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) continue; - sin6 = (struct sockaddr_in6 *)ia->ifa_addr; - if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) + + if (sc->srcv4_addr != INADDR_ANY) { + in4.sin_addr.s_addr = sc->srcv4_addr; + sin = &in4; + } else if ((sin = stf_getin4addr(ifp->if_softc, &in4, ifa, + STF_GETIN4_USE_CACHE)) == NULL) continue; - bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); - LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash) - if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) + LIST_FOREACH(ia4, INADDR_HASH(sin->sin_addr.s_addr), ia_hash) + if (ia4->ia_addr.sin_addr.s_addr == sin->sin_addr.s_addr) break; if (ia4 == NULL) continue; - ifa_ref(ia); +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip6_sprintf(buf, &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr); + DEBUG_PRINTF(1, "%s: ifa->ifa_addr->sin6_addr = %s\n", + __func__, buf); + ip_sprintf(buf, &ia4->ia_addr.sin_addr); + DEBUG_PRINTF(1, "%s: ia4->ia_addr.sin_addr = %s\n", + __func__, buf); + } +#endif + + ifa_ref(ifa); if_addr_runlock(ifp); - return (struct in6_ifaddr *)ia; + return (ifatoia6(ifa)); } if_addr_runlock(ifp); - return NULL; + return (NULL); } static int @@ -419,8 +558,8 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct stf_softc *sc; const struct sockaddr_in6 *dst6; struct route *cached_route; - struct in_addr in4; - const void *ptr; + struct sockaddr_in *sin; + struct sockaddr_in in4; struct sockaddr_in *dst4; u_int8_t tos; struct ip *ip; @@ -472,20 +611,33 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. + * ip6_dst: destination addr in the packet header. + * dst6: destination addr specified in function argument. */ - ptr = NULL; - if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst)) - ptr = GET_V4(&ip6->ip6_dst); - else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) - ptr = GET_V4(&dst6->sin6_addr); - else { - ifa_free(&ia6->ia_ifa); - m_freem(m); - ifp->if_oerrors++; - return ENETUNREACH; + DEBUG_PRINTF(1, "%s: dst addr selection\n", __func__); + sin = stf_getin4addr_in6(sc, &in4, &ia6->ia_ifa, &ip6->ip6_dst); + if (sin == NULL) { + if (sc->dstv4_addr != INADDR_ANY) + in4.sin_addr.s_addr = sc->dstv4_addr; + else { + sin = stf_getin4addr_in6(sc, &in4, &ia6->ia_ifa, &dst6->sin6_addr); + if (sin == NULL) { + ifa_free(&ia6->ia_ifa); + m_freem(m); + ifp->if_oerrors++; + return ENETUNREACH; + } + } } - bcopy(ptr, &in4, sizeof(in4)); +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + ip_sprintf(buf, &in4.sin_addr); + DEBUG_PRINTF(1, "%s: ip_dst = %s\n", __func__, buf); + } +#endif if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as @@ -507,11 +659,30 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); + bcopy(&in4.sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)); - bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), - &ip->ip_src, sizeof(ip->ip_src)); + if (sc->srcv4_addr != INADDR_ANY) + in4.sin_addr.s_addr = sc->srcv4_addr; + else { + sin = stf_getin4addr_sin6(sc, &in4, &ia6->ia_ifa, &ia6->ia_addr); + if (sin == NULL) { + ifa_free(&ia6->ia_ifa); + m_freem(m); + ifp->if_oerrors++; + return ENETUNREACH; + } + } + bcopy(&in4.sin_addr, &ip->ip_src, sizeof(ip->ip_src)); +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip_sprintf(buf, &ip->ip_src); + DEBUG_PRINTF(1, "%s: ip_src = %s\n", __func__, buf); + } +#endif ifa_free(&ia6->ia_ifa); - bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; ip->ip_len = htons(m->m_pkthdr.len); @@ -520,7 +691,7 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); - if (!stf_route_cache) { + if (!V_stf_route_cache) { cached_route = NULL; goto sendit; } @@ -528,7 +699,7 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, /* * Do we have a cached route? */ - mtx_lock(&(sc)->sc_ro_mtx); + STF_LOCK(sc); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { @@ -546,8 +717,15 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, rtalloc_fib(&sc->sc_ro, sc->sc_fibnum); if (sc->sc_ro.ro_rt == NULL) { m_freem(m); - mtx_unlock(&(sc)->sc_ro_mtx); ifp->if_oerrors++; + STF_UNLOCK(sc); + return ENETUNREACH; + } + if (sc->sc_ro.ro_rt->rt_ifp == ifp) { + /* infinite loop detection */ + m_free(m); + ifp->if_oerrors++; + STF_UNLOCK(sc); return ENETUNREACH; } } @@ -556,35 +734,33 @@ stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, sendit: M_SETFIB(m, sc->sc_fibnum); ifp->if_opackets++; + DEBUG_PRINTF(1, "%s: ip_output dispatch.\n", __func__); error = ip_output(m, NULL, cached_route, 0, NULL, NULL); if (cached_route != NULL) - mtx_unlock(&(sc)->sc_ro_mtx); - return error; + STF_UNLOCK(sc); + + return (error); } static int -isrfc1918addr(in) - struct in_addr *in; +isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: * 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 */ if (stf_permit_rfc1918 == 0 && ( - (ntohl(in->s_addr) & 0xff000000) >> 24 == 10 || - (ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 || - (ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)) + (ntohl(in->s_addr) & 0xff000000) == 10 << 24 || + (ntohl(in->s_addr) & 0xfff00000) == (172 * 256 + 16) << 16 || + (ntohl(in->s_addr) & 0xffff0000) == (192 * 256 + 168) << 16 )) return 1; return 0; } static int -stf_checkaddr4(sc, in, inifp) - struct stf_softc *sc; - struct in_addr *in; - struct ifnet *inifp; /* incoming interface */ +stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { struct in_ifaddr *ia4; @@ -600,13 +776,6 @@ stf_checkaddr4(sc, in, inifp) } /* - * reject packets with private address range. - * (requirement from RFC3056 section 2 1st paragraph) - */ - if (isrfc1918addr(in)) - return -1; - - /* * reject packets with broadcast */ IN_IFADDR_RLOCK(); @@ -629,7 +798,7 @@ stf_checkaddr4(sc, in, inifp) bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_len = sizeof(sin); sin.sin_addr = *in; rt = rtalloc1_fib((struct sockaddr *)&sin, 0, 0UL, sc->sc_fibnum); @@ -650,10 +819,7 @@ stf_checkaddr4(sc, in, inifp) } static int -stf_checkaddr6(sc, in6, inifp) - struct stf_softc *sc; - struct in6_addr *in6; - struct ifnet *inifp; /* incoming interface */ +stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses @@ -677,9 +843,7 @@ stf_checkaddr6(sc, in6, inifp) } void -in_stf_input(m, off) - struct mbuf *m; - int off; +in_stf_input(struct mbuf *m, int off) { int proto; struct stf_softc *sc; @@ -687,6 +851,7 @@ in_stf_input(m, off) struct ip6_hdr *ip6; u_int8_t otos, itos; struct ifnet *ifp; + struct route_in6 rin6; proto = mtod(m, struct ip *)->ip_p; @@ -710,6 +875,17 @@ in_stf_input(m, off) mac_ifnet_create_mbuf(ifp, m); #endif +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip_sprintf(buf, &ip->ip_dst); + DEBUG_PRINTF(1, "%s: ip->ip_dst = %s\n", __func__, buf); + ip_sprintf(buf, &ip->ip_src); + DEBUG_PRINTF(1, "%s: ip->ip_src = %s\n", __func__, buf); + } +#endif /* * perform sanity check against outer src/dst. * for source, perform ingress filter as well. @@ -730,6 +906,17 @@ in_stf_input(m, off) } ip6 = mtod(m, struct ip6_hdr *); +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip6_sprintf(buf, &ip6->ip6_dst); + DEBUG_PRINTF(1, "%s: ip6->ip6_dst = %s\n", __func__, buf); + ip6_sprintf(buf, &ip6->ip6_src); + DEBUG_PRINTF(1, "%s: ip6->ip6_src = %s\n", __func__, buf); + } +#endif /* * perform sanity check against inner src/dst. * for source, perform ingress filter as well. @@ -740,6 +927,41 @@ in_stf_input(m, off) return; } + /* + * reject packets with private address range. + * (requirement from RFC3056 section 2 1st paragraph) + */ + if ((IN6_IS_ADDR_6TO4(&ip6->ip6_src) && isrfc1918addr(&ip->ip_src)) || + (IN6_IS_ADDR_6TO4(&ip6->ip6_dst) && isrfc1918addr(&ip->ip_dst))) { + m_freem(m); + return; + } + + /* + * Ignore if the destination is the same stf interface because + * all of valid IPv6 outgoing traffic should go interfaces + * except for it. + */ + memset(&rin6, 0, sizeof(rin6)); + rin6.ro_dst.sin6_len = sizeof(rin6.ro_dst); + rin6.ro_dst.sin6_family = AF_INET6; + memcpy(&rin6.ro_dst.sin6_addr, &ip6->ip6_dst, + sizeof(rin6.ro_dst.sin6_addr)); + rtalloc((struct route *)&rin6); + if (rin6.ro_rt == NULL) { + DEBUG_PRINTF(1, "%s: no IPv6 dst. Ignored.\n", __func__); + m_free(m); + return; + } + if ((rin6.ro_rt->rt_ifp == ifp) && + (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &rin6.ro_dst.sin6_addr))) { + DEBUG_PRINTF(1, "%s: IPv6 dst is the same stf. Ignored.\n", __func__); + RTFREE(rin6.ro_rt); + m_free(m); + return; + } + RTFREE(rin6.ro_rt); + itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; if ((ifp->if_flags & IFF_LINK1) != 0) ip_ecn_egress(ECN_ALLOWED, &otos, &itos); @@ -749,7 +971,7 @@ in_stf_input(m, off) ip6->ip6_flow |= htonl((u_int32_t)itos << 20); m->m_pkthdr.rcvif = ifp; - + if (bpf_peers_present(ifp->if_bpf)) { /* * We need to prepend the address family as @@ -762,6 +984,7 @@ in_stf_input(m, off) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } + DEBUG_PRINTF(1, "%s: netisr_dispatch(NETISR_IPV6)\n", __func__); /* * Put the packet to the network layer input queue according to the * specified address family. @@ -776,48 +999,363 @@ in_stf_input(m, off) /* ARGSUSED */ static void -stf_rtrequest(cmd, rt, info) - int cmd; - struct rtentry *rt; - struct rt_addrinfo *info; +stf_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { + RT_LOCK_ASSERT(rt); rt->rt_mtu = rt->rt_ifp->if_mtu; } +static struct sockaddr_in * +stf_getin4addr_in6(struct stf_softc *sc, struct sockaddr_in *sin, + struct ifaddr *ifa, + const struct in6_addr *in6) +{ + struct sockaddr_in6 sin6; + + DEBUG_PRINTF(1, "%s: enter.\n", __func__); + if (ifa == NULL || in6 == NULL) + return NULL; + + memset(&sin6, 0, sizeof(sin6)); + memcpy(&sin6.sin6_addr, in6, sizeof(sin6.sin6_addr)); + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + + return(stf_getin4addr_sin6(sc, sin, ifa, &sin6)); +} + +static struct sockaddr_in * +stf_getin4addr_sin6(struct stf_softc *sc, struct sockaddr_in *sin, + struct ifaddr *ifa, + struct sockaddr_in6 *sin6) +{ + struct in6_ifaddr ia6; + int i; + + DEBUG_PRINTF(1, "%s: enter.\n", __func__); + if (ifa == NULL || sin6 == NULL) + return NULL; + + memset(&ia6, 0, sizeof(ia6)); + memcpy(&ia6, ifatoia6(ifa), sizeof(ia6)); + + /* + * Use prefixmask information from ifa, and + * address information from sin6. + */ + ia6.ia_addr.sin6_family = AF_INET6; + ia6.ia_ifa.ifa_addr = (struct sockaddr *)&ia6.ia_addr; + ia6.ia_ifa.ifa_dstaddr = NULL; + ia6.ia_ifa.ifa_netmask = (struct sockaddr *)&ia6.ia_prefixmask; + +#if STF_DEBUG > 3 + { + char buf[INET6_ADDRSTRLEN + 1]; + memset(&buf, 0, sizeof(buf)); + + ip6_sprintf(buf, &sin6->sin6_addr); + DEBUG_PRINTF(1, "%s: sin6->sin6_addr = %s\n", __func__, buf); + ip6_sprintf(buf, &ia6.ia_addr.sin6_addr); + DEBUG_PRINTF(1, "%s: ia6.ia_addr.sin6_addr = %s\n", __func__, buf); + ip6_sprintf(buf, &ia6.ia_prefixmask.sin6_addr); + DEBUG_PRINTF(1, "%s: ia6.ia_prefixmask.sin6_addr = %s\n", __func__, buf); + } +#endif + + /* + * When (src addr & src mask) != (dst (sin6) addr & src mask), + * the dst is not in the 6rd domain. The IPv4 address must + * not be used. + */ + for (i = 0; i < sizeof(ia6.ia_addr.sin6_addr); i++) { + if ((((u_char *)&ia6.ia_addr.sin6_addr)[i] & + ((u_char *)&ia6.ia_prefixmask.sin6_addr)[i]) + != + (((u_char *)&sin6->sin6_addr)[i] & + ((u_char *)&ia6.ia_prefixmask.sin6_addr)[i])) + return NULL; + } + + /* After the mask check, overwrite ia6.ia_addr with sin6. */ + memcpy(&ia6.ia_addr, sin6, sizeof(ia6.ia_addr)); + return(stf_getin4addr(sc, sin, (struct ifaddr *)&ia6, 0)); +} + +static struct sockaddr_in * +stf_getin4addr(struct stf_softc *sc, struct sockaddr_in *sin, + struct ifaddr *ifa, + int flags) +{ + struct in_addr *in; + struct sockaddr_in6 *sin6; + struct in6_ifaddr *ia6; + + DEBUG_PRINTF(1, "%s: enter.\n", __func__); + if (ifa == NULL || + ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET6) + return NULL; + + sin6 = satosin6(ifa->ifa_addr); + ia6 = ifatoia6(ifa); + + if ((flags & STF_GETIN4_USE_CACHE) && + (ifa->ifa_dstaddr != NULL) && + (ifa->ifa_dstaddr->sa_family == AF_INET)) { + /* + * XXX: ifa_dstaddr is used as a cache of the + * extracted IPv4 address. + */ + memcpy(sin, satosin(ifa->ifa_dstaddr), sizeof(*sin)); + +#if STF_DEBUG > 3 + { + char tmpbuf[INET6_ADDRSTRLEN + 1]; + memset(&tmpbuf, 0, INET6_ADDRSTRLEN); + + ip_sprintf(tmpbuf, &sin->sin_addr); + DEBUG_PRINTF(1, "%s: cached address was used = %s\n", __func__, tmpbuf); + } +#endif + return (sin); + } + + memset(sin, 0, sizeof(*sin)); + in = &sin->sin_addr; + +#if STF_DEBUG > 3 + { + char tmpbuf[INET6_ADDRSTRLEN + 1]; + memset(&tmpbuf, 0, INET6_ADDRSTRLEN); + + ip6_sprintf(tmpbuf, &sin6->sin6_addr); + DEBUG_PRINTF(1, "%s: sin6->sin6_addr = %s\n", __func__, tmpbuf); + } +#endif + + if (IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { + /* 6to4 (RFC 3056) */ + bcopy(GET_V4(&sin6->sin6_addr), in, sizeof(*in)); + if (isrfc1918addr(in)) + return NULL; + } else { + /* 6rd (RFC 5569) */ + struct in6_addr buf; + u_char *p = (u_char *)&buf; + u_char *q = (u_char *)&in->s_addr; + u_int residue = 0, v4residue = 0; + u_char mask, v4mask = 0; + int i, j; + u_int plen, loop; + + /* + * 6rd-relays IPv6 prefix is located at a 32-bit just + * after the prefix edge. + */ + plen = in6_mask2len(&satosin6(ifa->ifa_netmask)->sin6_addr, NULL); + if (64 < plen) { + DEBUG_PRINTF(1, "prefixlen is %d\n", plen); + return NULL; + } + + loop = 4; /* Normal 6rd operation */ + memcpy(&buf, &sin6->sin6_addr, sizeof(buf)); + if (sc->v4prefixlen != 0 && sc->v4prefixlen != 32) { + v4residue = sc->v4prefixlen % 8; + } + p += plen / 8; + //plen -= 32; + + residue = plen % 8; + mask = ((u_char)(-1) >> (8 - residue)); + if (v4residue) { + loop++; + v4mask = ((u_char)(-1) << v4residue); + } + /* + * The p points head of the IPv4 address part in + * bytes. The residue is a bit-shift factor when + * prefixlen is not a multiple of 8. + */ + DEBUG_PRINTF(2, "residue = %d 0x%x\n", residue, mask); + for (j = 0, i = (loop - (sc->v4prefixlen / 8)); i < loop; j++, i++) { + if (residue) { + q[i] = ((p[j] & mask) << (8 - residue)); + q[i] |= ((p[j + 1] >> residue) & mask); + DEBUG_PRINTF(2, "FINAL i = %d q[%d] - p[%d/%d] %x\n", + i, q[i], p[j], p[j + 1] >> residue, q[i]); + } else { + q[i] = p[j]; + DEBUG_PRINTF(2, "FINAL q[%d] - p[%d] %x\n", + q[i], p[j], q[i]); + } + } + if (v4residue) { + q[loop - (sc->v4prefixlen / 8)] &= v4mask; + + if (sc->v4prefixlen > 0 && sc->v4prefixlen < 32) + in->s_addr |= sc->inaddr; + } + + //if (in->s_addr != sc->srcv4_addr) + // printf("Wrong decoded address %x/%x!!!!\n", in->s_addr, sc->srcv4_addr); + } + +#if STF_DEBUG > 3 + { + char tmpbuf[INET6_ADDRSTRLEN + 1]; + memset(&tmpbuf, 0, INET_ADDRSTRLEN); + + ip_sprintf(tmpbuf, in); + DEBUG_PRINTF(1, "%s: in->in_addr = %s\n", __func__, tmpbuf); + DEBUG_PRINTF(1, "%s: leave\n", __func__); + } +#endif + + if (flags & STF_GETIN4_USE_CACHE) { + DEBUG_PRINTF(1, "%s: try to access ifa->ifa_dstaddr.\n", __func__); + ifa->ifa_dstaddr = (struct sockaddr *)&ia6->ia_dstaddr; + DEBUG_PRINTF(1, "%s: try to memset 0 to ia_dstaddr.\n", __func__); + memset(&ia6->ia_dstaddr, 0, sizeof(ia6->ia_dstaddr)); + DEBUG_PRINTF(1, "%s: try to memcpy ifa->ifa_dstaddr.\n", __func__); + memcpy((struct sockaddr_in *)ifa->ifa_dstaddr, + sin, sizeof(struct sockaddr_in)); + DEBUG_PRINTF(1, "%s: try to set sa_family.\n", __func__); + ifa->ifa_dstaddr->sa_family = AF_INET; + DEBUG_PRINTF(1, "%s: in->in_addr is stored in ifa_dstaddr.\n", + __func__); + } + + return (sin); +} + + static int -stf_ioctl(ifp, cmd, data) - struct ifnet *ifp; - u_long cmd; - caddr_t data; +stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { + struct stf_softc *sc, *sc_cur; struct ifaddr *ifa; struct ifreq *ifr; - struct sockaddr_in6 *sin6; - struct in_addr addr; + struct sockaddr_in in4; + struct stfv4args args; + struct in6_ifaddr *ia6; + struct ifdrv *ifd; int error, mtu; error = 0; + sc_cur = ifp->if_softc; + switch (cmd) { + case SIOCSDRVSPEC: + ifd = (struct ifdrv *) data; + error = priv_check(curthread, PRIV_NET_ADDIFADDR); + if (error) + break; + if (ifd->ifd_cmd == STF_SV4NET) { + if (ifd->ifd_len != sizeof(args)) { + error = EINVAL; + break; + } + mtx_lock(&stf_mtx); + LIST_FOREACH(sc, &V_stf_softc_list, stf_list) { + if (sc == sc_cur) + continue; + if (sc->inaddr == 0 || sc->v4prefixlen == 0) + continue; + + if ((ntohl(sc->inaddr) & ((uint32_t)(-1) << sc_cur->v4prefixlen)) == ntohl(sc_cur->inaddr)) { + error = EEXIST; + mtx_unlock(&stf_mtx); + return (error); + } + if ((ntohl(sc_cur->inaddr) & ((uint32_t)(-1) << sc->v4prefixlen)) == ntohl(sc->inaddr)) { + error = EEXIST; + mtx_unlock(&stf_mtx); + return (error); + } + } + mtx_unlock(&stf_mtx); + bzero(&args, sizeof args); + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); + if (error) + break; + + sc_cur->srcv4_addr = args.inaddr.s_addr; + sc_cur->inaddr = ntohl(args.inaddr.s_addr); + sc_cur->inaddr &= ((uint32_t)(-1) << args.prefix); + sc_cur->inaddr = htonl(sc_cur->inaddr); + sc_cur->v4prefixlen = args.prefix; + if (sc_cur->v4prefixlen == 0) + sc_cur->v4prefixlen = 32; + } else if (ifd->ifd_cmd == STF_SDSTV4) { + if (ifd->ifd_len != sizeof(args)) { + error = EINVAL; + break; + } + bzero(&args, sizeof args); + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); + if (error) + break; + sc_cur->dstv4_addr = args.dstv4_addr.s_addr; + } else + error = EINVAL; + break; + case SIOCGDRVSPEC: + ifd = (struct ifdrv *) data; + if (ifd->ifd_len != sizeof(args)) { + error = EINVAL; + break; + } + if (ifd->ifd_cmd != STF_GV4NET) { + error = EINVAL; + break; + } + bzero(&args, sizeof args); + args.inaddr.s_addr = sc_cur->srcv4_addr; + args.dstv4_addr.s_addr = sc_cur->dstv4_addr; + args.prefix = sc_cur->v4prefixlen; + error = copyout(&args, ifd->ifd_data, ifd->ifd_len); + + break; case SIOCSIFADDR: ifa = (struct ifaddr *)data; if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; break; } - sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; - if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { + if (stf_getin4addr(sc_cur, &in4, ifa, 0) == NULL) { error = EINVAL; break; } - bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr)); - if (isrfc1918addr(&addr)) { - error = EINVAL; - break; + /* + * Sanity check: if more than two interfaces have IFF_UP, do + * if_down() for all of them except for the specified one. + */ + mtx_lock(&stf_mtx); + LIST_FOREACH(sc, &V_stf_softc_list, stf_list) { + if (sc == sc_cur) + continue; + if ((ia6 = stf_getsrcifa6(sc->sc_ifp)) == NULL) + continue; + if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &ifatoia6(ifa)->ia_addr.sin6_addr)) { + error = EEXIST; + ifa_free(&ia6->ia_ifa); + break; + } + ifa_free(&ia6->ia_ifa); } + mtx_unlock(&stf_mtx); + /* + * XXX: ifa_dstaddr is used as a cache of the + * extracted IPv4 address. + */ + if (ifa->ifa_dstaddr != NULL) + ifa->ifa_dstaddr->sa_family = AF_UNSPEC; ifa->ifa_rtrequest = stf_rtrequest; ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: @@ -847,4 +1385,5 @@ stf_ioctl(ifp, cmd, data) } return error; + } diff --git a/sys/net/if_stf.h b/sys/net/if_stf.h index cbaf670..e6ff29e 100644 --- a/sys/net/if_stf.h +++ b/sys/net/if_stf.h @@ -33,6 +33,15 @@ #ifndef _NET_IF_STF_H_ #define _NET_IF_STF_H_ +struct stfv4args { + struct in_addr inaddr; + struct in_addr dstv4_addr; + int prefix; +}; + +#define STF_SV4NET 1 +#define STF_GV4NET 2 +#define STF_SDSTV4 3 void in_stf_input(struct mbuf *, int); #endif /* _NET_IF_STF_H_ */ diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 262d6d2..9864278 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -52,6 +52,7 @@ #include <net/vnet.h> #ifdef INET #include <netinet/in.h> +#include <netinet/ip.h> #endif #include <net/bpf.h> #include <net/if_tun.h> @@ -110,6 +111,7 @@ static const char tunname[] = "tun"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; +static int tundispatch = 1; static struct clonedevs *tunclones; static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); @@ -119,6 +121,8 @@ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface."); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tundclone, 0, "Enable legacy devfs interface creation."); +SYSCTL_INT(_net_link_tun, OID_AUTO, tun_dispatching, CTLFLAG_RW, &tundispatch, 0, + "Queue rather than direct dispatch on write."); TUNABLE_INT("net.link.tun.devfs_cloning", &tundclone); @@ -328,31 +332,28 @@ static void tunstart(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; - struct mbuf *m; TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m == NULL) { - IFQ_UNLOCK(&ifp->if_snd); - return; - } - IFQ_UNLOCK(&ifp->if_snd); - } + if (IFQ_IS_EMPTY(&ifp->if_snd)) + return; + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; - mtx_lock(&tp->tun_mtx); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { + if (!TAILQ_EMPTY(&tp->tun_rsel.si_tdlist)) + selwakeuppri(&tp->tun_rsel, PZERO + 1); + if (!KNLIST_EMPTY(&tp->tun_rsel.si_note)) { + mtx_lock(&tp->tun_mtx); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); mtx_unlock(&tp->tun_mtx); + } + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) pgsigio(&tp->tun_sigio, SIGIO, 0); - } else - mtx_unlock(&tp->tun_mtx); + + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } /* XXX: should return an error code so it can fail. */ @@ -590,9 +591,7 @@ tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, #endif /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); cached_tun_flags = tp->tun_flags; - mtx_unlock(&tp->tun_mtx); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); @@ -799,9 +798,7 @@ tunread(struct cdev *dev, struct uio *uio, int flag) int error=0, len; TUNDEBUG (ifp, "read\n"); - mtx_lock(&tp->tun_mtx); if ((tp->tun_flags & TUN_READY) != TUN_READY) { - mtx_unlock(&tp->tun_mtx); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } @@ -812,19 +809,19 @@ tunread(struct cdev *dev, struct uio *uio, int flag) IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tun_mtx); return (EWOULDBLOCK); } + mtx_lock(&tp->tun_mtx); tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); + tp->tun_flags &= ~TUN_RWAIT; + mtx_unlock(&tp->tun_mtx); if (error != 0) { - mtx_unlock(&tp->tun_mtx); return (error); } } } while (m == NULL); - mtx_unlock(&tp->tun_mtx); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); @@ -849,6 +846,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; + struct ip *ip; uint32_t family; int isr; @@ -876,18 +874,26 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) mac_ifnet_create_mbuf(ifp, m); #endif - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); + /* XXX: unlocked read? */ if (tp->tun_flags & TUN_IFHEAD) { - mtx_unlock(&tp->tun_mtx); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { - mtx_unlock(&tp->tun_mtx); - family = AF_INET; + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == NULL) + return (ENOBUFS); + ip = mtod(m, struct ip *); + if (ip->ip_v == IPVERSION) + family = AF_INET; + else if (ip->ip_v == AF_INET6) + family = AF_INET6; + else { + m_freem(m); + return (EINVAL); + } } BPF_MTAP2(ifp, &family, sizeof(family), m); @@ -923,7 +929,10 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) ifp->if_ipackets++; CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); - netisr_dispatch(isr, m); + if (tundispatch) + netisr_queue(isr, m); + else + netisr_dispatch(isr, m); CURVNET_RESTORE(); return (0); } @@ -939,21 +948,17 @@ tunpoll(struct cdev *dev, int events, struct thread *td) struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; - struct mbuf *m; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m != NULL) { - TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); - revents |= events & (POLLIN | POLLRDNORM); - } else { + if (IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); + } else { + TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); + revents |= events & (POLLIN | POLLRDNORM); } - IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 6a43e37..2de6829 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1,5 +1,9 @@ /*- * Copyright 1998 Massachusetts Institute of Technology + * Copyright 2012 ADARA Networks, Inc. + * + * Portions of this software were developed by Robert N. M. Watson under + * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby @@ -51,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/module.h> #include <sys/rwlock.h> +#include <sys/priv.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/sockio.h> @@ -112,6 +117,8 @@ struct ifvlan { int ifvm_mintu; /* min transmission unit */ uint16_t ifvm_proto; /* encapsulation ethertype */ uint16_t ifvm_tag; /* tag to apply on packets leaving if */ + uint16_t ifvm_vid; /* VLAN ID */ + uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ } ifv_mib; SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY @@ -119,7 +126,9 @@ struct ifvlan { #endif }; #define ifv_proto ifv_mib.ifvm_proto -#define ifv_vid ifv_mib.ifvm_tag +#define ifv_tag ifv_mib.ifvm_tag +#define ifv_vid ifv_mib.ifvm_vid +#define ifv_pcp ifv_mib.ifvm_pcp #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge #define ifv_mintu ifv_mib.ifvm_mintu @@ -144,6 +153,15 @@ static int soft_pad = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW, &soft_pad, 0, "pad short frames before tagging"); +/* + * For now, make preserving PCP via an mbuf tag optional, as it increases + * per-packet memory allocations and frees. In the future, it would be + * preferable to reuse ether_vtag for this, or similar. + */ +static int vlan_mtag_pcp = 0; +SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, + "Retain VLAN PCP information as packets are passed up the stack"); + static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); @@ -188,15 +206,14 @@ static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, #endif static void trunk_destroy(struct ifvlantrunk *trunk); +static void vlan_start(struct ifnet *ifp); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); -static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); -static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); @@ -693,6 +710,16 @@ vlan_devat(struct ifnet *ifp, uint16_t vid) } /* + * Recalculate the cached VLAN tag exposed via the MIB. + */ +static void +vlan_tag_recalculate(struct ifvlan *ifv) +{ + + ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0); +} + +/* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and @@ -958,9 +985,12 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) /* NB: mtu is not set here */ ifp->if_init = vlan_init; - ifp->if_transmit = vlan_transmit; - ifp->if_qflush = vlan_qflush; + ifp->if_start = vlan_start; ifp->if_ioctl = vlan_ioctl; + + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + ifp->if_snd.ifq_drv_maxlen = 0; + IFQ_SET_READY(&ifp->if_snd); ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ @@ -1001,6 +1031,8 @@ vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; + IFQ_PURGE(&ifp->if_snd); + ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ if_free(ifp); @@ -1021,97 +1053,114 @@ vlan_init(void *foo __unused) /* * The if_transmit method for vlan(4) interface. */ -static int -vlan_transmit(struct ifnet *ifp, struct mbuf *m) +static void +vlan_start(struct ifnet *ifp) { struct ifvlan *ifv; struct ifnet *p; + struct m_tag *mtag; + struct mbuf *m; + uint16_t tag; int error, len, mcast; + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m == NULL) { + IFQ_UNLOCK(&ifp->if_snd); + return; + } + IFQ_UNLOCK(&ifp->if_snd); + } + ifv = ifp->if_softc; p = PARENT(ifv); - len = m->m_pkthdr.len; - mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; - - BPF_MTAP(ifp, m); - /* - * Do not run parent's if_transmit() if the parent is not up, - * or parent's driver will cause a system crash. - */ - if (!UP_AND_RUNNING(p)) { - m_freem(m); - ifp->if_oerrors++; - return (ENETDOWN); - } + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; - /* - * Pad the frame to the minimum size allowed if told to. - * This option is in accord with IEEE Std 802.1Q, 2003 Ed., - * paragraph C.4.4.3.b. It can help to work around buggy - * bridges that violate paragraph C.4.4.3.a from the same - * document, i.e., fail to pad short frames after untagging. - * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but - * untagging it will produce a 62-byte frame, which is a runt - * and requires padding. There are VLAN-enabled network - * devices that just discard such runts instead or mishandle - * them somehow. - */ - if (soft_pad && p->if_type == IFT_ETHER) { - static char pad[8]; /* just zeros */ - int n; + len = m->m_pkthdr.len; + mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; - for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; - n > 0; n -= sizeof(pad)) - if (!m_append(m, min(n, sizeof(pad)), pad)) - break; + BPF_MTAP(ifp, m); - if (n > 0) { - if_printf(ifp, "cannot pad short frame\n"); + /* + * Do not run parent's if_transmit() if the parent is not up, + * or parent's driver will cause a system crash. + */ + if (!UP_AND_RUNNING(p)) { ifp->if_oerrors++; - m_freem(m); - return (0); + return; } - } - /* - * If underlying interface can do VLAN tag insertion itself, - * just pass the packet along. However, we need some way to - * tell the interface where the packet came from so that it - * knows how to find the VLAN tag to use, so we attach a - * packet tag that holds it. - */ - if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { - m->m_pkthdr.ether_vtag = ifv->ifv_vid; - m->m_flags |= M_VLANTAG; - } else { - m = ether_vlanencap(m, ifv->ifv_vid); - if (m == NULL) { - if_printf(ifp, "unable to prepend VLAN header\n"); - ifp->if_oerrors++; - return (0); + /* + * Pad the frame to the minimum size allowed if told to. + * This option is in accord with IEEE Std 802.1Q, 2003 Ed., + * paragraph C.4.4.3.b. It can help to work around buggy + * bridges that violate paragraph C.4.4.3.a from the same + * document, i.e., fail to pad short frames after untagging. + * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but + * untagging it will produce a 62-byte frame, which is a runt + * and requires padding. There are VLAN-enabled network + * devices that just discard such runts instead or mishandle + * them somehow. + */ + if (soft_pad && p->if_type == IFT_ETHER) { + static char pad[8]; /* just zeros */ + int n; + + for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; + n > 0; n -= sizeof(pad)) + if (!m_append(m, min(n, sizeof(pad)), pad)) + break; + + if (n > 0) { + if_printf(ifp, "cannot pad short frame\n"); + ifp->if_oerrors++; + m_freem(m); + return; + } } - } - /* - * Send it, precisely as ether_output() would have. - */ - error = (p->if_transmit)(p, m); - if (!error) { - ifp->if_opackets++; - ifp->if_omcasts += mcast; - ifp->if_obytes += len; - } else - ifp->if_oerrors++; - return (error); -} + /* + * If underlying interface can do VLAN tag insertion itself, + * just pass the packet along. However, we need some way to + * tell the interface where the packet came from so that it + * knows how to find the VLAN tag to use, so we attach a + * packet tag that holds it. + */ + if (vlan_mtag_pcp && (mtag = m_tag_locate(m, MTAG_8021Q, + MTAG_8021Q_PCP_OUT, NULL)) != NULL) + tag = EVL_MAKETAG(ifv->ifv_vid, *(uint8_t *)(mtag + 1), 0); + else + tag = ifv->ifv_tag; + if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { + m->m_pkthdr.ether_vtag = tag; + m->m_flags |= M_VLANTAG; + } else { + m = ether_vlanencap(m, tag); + if (m == NULL) { + if_printf(ifp, "unable to prepend VLAN header\n"); + ifp->if_oerrors++; + return; + } + } -/* - * The ifp->if_qflush entry point for vlan(4) is a no-op. - */ -static void -vlan_qflush(struct ifnet *ifp __unused) -{ + /* + * Send it, precisely as ether_output() would have. + */ + error = (p->if_transmit)(p, m); + if (!error) { + ifp->if_opackets++; + //ifp->if_omcasts += mcast; + //ifp->if_obytes += len; + } else + ifp->if_oerrors++; + } + + return; } static void @@ -1119,7 +1168,8 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk = ifp->if_vlantrunk; struct ifvlan *ifv; - uint16_t vid; + struct m_tag *mtag; + uint16_t vid, tag; KASSERT(trunk != NULL, ("%s: no trunk", __func__)); @@ -1128,7 +1178,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ - vid = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); + tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; @@ -1144,7 +1194,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) return; } evl = mtod(m, struct ether_vlan_header *); - vid = EVL_VLANOFTAG(ntohs(evl->evl_tag)); + tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet @@ -1168,6 +1218,8 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) } } + vid = EVL_VLANOFTAG(tag); + TRUNK_RLOCK(trunk); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { @@ -1178,6 +1230,28 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) } TRUNK_RUNLOCK(trunk); + if (vlan_mtag_pcp) { + /* + * While uncommon, it is possible that we will find a 802.1q + * packet encapsulated inside another packet that also had an + * 802.1q header. For example, ethernet tunneled over IPSEC + * arriving over ethernet. In that case, we replace the + * existing 802.1q PCP m_tag value. + */ + mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); + if (mtag == NULL) { + mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, + sizeof(uint8_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + ifp->if_ierrors++; + return; + } + m_tag_prepend(m, mtag); + } + *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); + } + m->m_pkthdr.rcvif = ifv->ifv_ifp; ifv->ifv_ifp->if_ipackets++; @@ -1195,7 +1269,7 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) /* VID numbers 0x0 and 0xFFF are reserved */ if (vid == 0 || vid == 0xFFF) return (EINVAL); - if (p->if_type != IFT_ETHER && + if (p->if_type != IFT_ETHER && p->if_type != IFT_BRIDGE && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) @@ -1226,6 +1300,8 @@ exists: } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ + ifv->ifv_pcp = 0; /* Default: best effort delivery. */ + vlan_tag_recalculate(ifv); error = vlan_inshash(trunk, ifv); if (error) goto done; @@ -1721,6 +1797,34 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } break; + case SIOCGVLANPCP: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + ifr->ifr_vlan_pcp = ifv->ifv_pcp; + break; + + case SIOCSVLANPCP: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + error = priv_check(curthread, PRIV_NET_SETVLANPCP); + if (error) + break; + if (ifr->ifr_vlan_pcp > 7) { + error = EINVAL; + break; + } + ifv->ifv_pcp = ifr->ifr_vlan_pcp; + vlan_tag_recalculate(ifv); + break; + default: error = EINVAL; break; diff --git a/sys/net/if_vlan_var.h b/sys/net/if_vlan_var.h index 4eb3b09..b1950e1 100644 --- a/sys/net/if_vlan_var.h +++ b/sys/net/if_vlan_var.h @@ -89,6 +89,23 @@ struct vlanreq { #define SIOCSETVLAN SIOCSIFGENERIC #define SIOCGETVLAN SIOCGIFGENERIC +#define SIOCGVLANPCP _IOWR('i', 152, struct ifreq) /* Get VLAN PCP */ +#define SIOCSVLANPCP _IOW('i', 153, struct ifreq) /* Set VLAN PCP */ + +/* + * Names for 802.1q priorities ("802.1p"). Notice that in this scheme, + * (0 < 1), allowing default 0-tagged traffic to take priority over background + * tagged traffic. + */ +#define IEEE8021Q_PCP_BK 1 /* Background (lowest) */ +#define IEEE8021Q_PCP_BE 0 /* Best effort (default) */ +#define IEEE8021Q_PCP_EE 2 /* Excellent effort */ +#define IEEE8021Q_PCP_CA 3 /* Critical applications */ +#define IEEE8021Q_PCP_VI 4 /* Video, < 100ms latency */ +#define IEEE8021Q_PCP_VO 5 /* Video, < 10ms latency */ +#define IEEE8021Q_PCP_IC 6 /* Internetwork control */ +#define IEEE8021Q_PCP_NC 7 /* Network control (highest) */ + #ifdef _KERNEL /* * Drivers that are capable of adding and removing the VLAN header @@ -126,6 +143,16 @@ struct vlanreq { * if_capabilities. */ +/* + * The 802.1q code may also tag mbufs with the PCP (priority) field for use in + * other layers of the stack, in which case an m_tag will be used. This is + * semantically quite different from use of the ether_vtag field, which is + * defined only between the device driver and VLAN layer. + */ +#define MTAG_8021Q 1326104895 +#define MTAG_8021Q_PCP_IN 0 /* Input priority. */ +#define MTAG_8021Q_PCP_OUT 1 /* Output priority. */ + #define VLAN_CAPABILITIES(_ifp) do { \ if ((_ifp)->if_vlantrunk != NULL) \ (*vlan_trunk_cap_p)(_ifp); \ diff --git a/sys/net/netisr.c b/sys/net/netisr.c index 0cc6bb8..ef34600 100644 --- a/sys/net/netisr.c +++ b/sys/net/netisr.c @@ -126,6 +126,13 @@ static struct rmlock netisr_rmlock; static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); +#ifdef DEVICE_POLLING +static int netisr_polling = 0; /* Enable Polling. */ +TUNABLE_INT("net.isr.polling_enable", &netisr_polling); +SYSCTL_INT(_net_isr, OID_AUTO, polling_enable, CTLFLAG_RW, + &netisr_polling, 0, "Enable polling"); +#endif + /*- * Three global direct dispatch policies are supported: * @@ -168,7 +175,7 @@ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); -static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ +static int netisr_bindthreads = 1; /* Bind threads to CPUs. */ TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); @@ -796,9 +803,11 @@ swi_net(void *arg) nwsp = arg; #ifdef DEVICE_POLLING - KASSERT(nws_count == 1, - ("%s: device_polling but nws_count != 1", __func__)); - netisr_poll(); + if (netisr_polling) { + KASSERT(nws_count == 1, + ("%s: device_polling but nws_count != 1", __func__)); + netisr_poll(); + } #endif #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); @@ -823,7 +832,8 @@ out: NETISR_RUNLOCK(&tracker); #endif #ifdef DEVICE_POLLING - netisr_pollmore(); + if (netisr_polling) + netisr_pollmore(); #endif } @@ -1078,6 +1088,9 @@ netisr_sched_poll(void) { struct netisr_workstream *nwsp; + if (!netisr_polling) + return; + nwsp = DPCPU_ID_PTR(nws_array[0], nws); NWS_SIGNAL(nwsp); } @@ -1151,7 +1164,7 @@ netisr_init(void *arg) * multiple netisr threads, so for the time being compiling in device * polling disables parallel netisr workers. */ - if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { + if (netisr_polling && (netisr_maxthreads != 1 || netisr_bindthreads != 0)) { printf("netisr_init: forcing maxthreads to 1 and " "bindthreads to 0 for device polling\n"); netisr_maxthreads = 1; diff --git a/sys/net/pfil.c b/sys/net/pfil.c index 44373ee..5d5d346 100644 --- a/sys/net/pfil.c +++ b/sys/net/pfil.c @@ -34,6 +34,7 @@ #include <sys/errno.h> #include <sys/lock.h> #include <sys/malloc.h> +#include <sys/sbuf.h> #include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -78,7 +79,7 @@ pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); for (pfh = pfil_chain_get(dir, ph); pfh != NULL; pfh = TAILQ_NEXT(pfh, pfil_chain)) { - if (pfh->pfil_func != NULL) { + if (!(pfh->pfil_flags & PFIL_DISABLED) && pfh->pfil_func != NULL) { rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, inp); if (rv != 0 || m == NULL) @@ -211,6 +212,140 @@ pfil_head_unregister(struct pfil_head *ph) return (0); } +static int +pfil_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker rmpt; + struct pfil_head *ph; + struct packet_filter_hook *pfh, *pfhtmp; + struct sbuf *sb; + pfil_chain_t npfl, *pfl; + char *new_order, *elm, *parse; + int i = 0, err = 0, hintlen, reqlen; + + hintlen = 0; + + ph = (struct pfil_head *)arg1; + if (ph == NULL || !PFIL_HOOKED(ph)) { + err = SYSCTL_OUT(req, "", 2); + return (err); + } + + if (arg2 == PFIL_IN) + pfl = &ph->ph_in; + else + pfl = &ph->ph_out; + + if (TAILQ_EMPTY(pfl)) { + err = SYSCTL_OUT(req, "", 2); + return (err); + } + + /* + * NOTE: This is needed to avoid witness(4) warnings. + */ + PFIL_RLOCK(ph, &rmpt); + TAILQ_FOREACH(pfh, pfl, pfil_chain) { + if (pfh->pfil_name != NULL) + hintlen = strlen(pfh->pfil_name); + else + hintlen += 2; + } + PFIL_RUNLOCK(ph, &rmpt); + + sb = sbuf_new(NULL, NULL, hintlen + 1, SBUF_AUTOEXTEND); + if (sb == NULL) + return (EINVAL); + + PFIL_RLOCK(ph, &rmpt); + TAILQ_FOREACH(pfh, pfl, pfil_chain) { + if (i > 0) + sbuf_printf(sb, ", "); + if (pfh->pfil_name != NULL) + sbuf_printf(sb, "%s%s", pfh->pfil_name, + pfh->pfil_flags & PFIL_DISABLED ? "*" : ""); + else + sbuf_printf(sb, "%s%s", "NA", + pfh->pfil_flags & PFIL_DISABLED ? "*" : ""); + i++; + } + PFIL_RUNLOCK(ph, &rmpt); + + sbuf_finish(sb); + + /* hint for sensible write buffer sizes */ + hintlen = sbuf_len(sb) + i * 2; + err = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); + sbuf_delete(sb); + + if (err || !req->newptr) + return (err); + + if ((reqlen = req->newlen - req->newidx) > hintlen) + return (E2BIG); + new_order = malloc(reqlen + 1, M_TEMP, M_WAITOK|M_ZERO); + + err = SYSCTL_IN(req, new_order, reqlen); + if (err) + goto error; + new_order[reqlen] = '\0'; /* Just in case */ + parse = new_order; + + TAILQ_INIT(&npfl); + PFIL_WLOCK(ph); + while ((elm = strsep(&parse, " \t,")) != NULL) { + if (*elm == '\0') + continue; + TAILQ_FOREACH_SAFE(pfh, pfl, pfil_chain, pfhtmp) { + if (pfh->pfil_name != NULL) { + if (!strcmp(pfh->pfil_name, elm)) { + TAILQ_REMOVE(pfl, pfh, pfil_chain); + TAILQ_INSERT_TAIL(&npfl, pfh, pfil_chain); + pfh->pfil_flags &= ~PFIL_DISABLED; + break; + } + } else { + if (!strcmp(elm, "NA")) { + TAILQ_REMOVE(pfl, pfh, pfil_chain); + TAILQ_INSERT_TAIL(&npfl, pfh, pfil_chain); + pfh->pfil_flags &= ~PFIL_DISABLED; + break; + } + } + } + } + + TAILQ_FOREACH_SAFE(pfh, pfl, pfil_chain, pfhtmp) { + pfh->pfil_flags |= PFIL_DISABLED; + TAILQ_REMOVE(pfl, pfh, pfil_chain); + TAILQ_INSERT_TAIL(&npfl, pfh, pfil_chain); + } + + TAILQ_CONCAT(pfl, &npfl, pfil_chain); + +error: + PFIL_WUNLOCK(ph); + free(new_order, M_TEMP); + return (err); +} + +void +pfil_head_export_sysctl(struct pfil_head *ph, struct sysctl_oid_list *parent) +{ + struct sysctl_oid *root; + + root = SYSCTL_ADD_NODE(&ph->ph_clist, parent, OID_AUTO, "pfil", + CTLFLAG_RW, 0, "pfil(9) management"); + SYSCTL_ADD_PROC((void *)&ph->ph_clist, SYSCTL_CHILDREN(root), OID_AUTO, + "inbound", CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_SECURE3, + (void *)ph, PFIL_IN, pfil_sysctl_handler, "A", + "Inbound filter hooks"); + SYSCTL_ADD_PROC((void *)&ph->ph_clist, SYSCTL_CHILDREN(root), OID_AUTO, + "outbound", CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_SECURE3, + (void *)ph, PFIL_OUT, pfil_sysctl_handler, "A", + "Outbound filter hooks"); +} + /* * pfil_head_get() returns the pfil_head for a given key/dlt. */ @@ -238,6 +373,12 @@ pfil_head_get(int type, u_long val) int pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) { + return (pfil_add_named_hook(func, arg, NULL, flags, ph)); +} + +int +pfil_add_named_hook(pfil_func_t func, void *arg, char *name, int flags, struct pfil_head *ph) +{ struct packet_filter_hook *pfh1 = NULL; struct packet_filter_hook *pfh2 = NULL; int err; @@ -262,6 +403,8 @@ pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) if (flags & PFIL_IN) { pfh1->pfil_func = func; pfh1->pfil_arg = arg; + pfh1->pfil_name = name; + pfh1->pfil_flags &= ~PFIL_DISABLED; err = pfil_chain_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); if (err) goto locked_error; @@ -270,6 +413,8 @@ pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) if (flags & PFIL_OUT) { pfh2->pfil_func = func; pfh2->pfil_arg = arg; + pfh2->pfil_name = name; + pfh2->pfil_flags &= ~PFIL_DISABLED; err = pfil_chain_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); if (err) { if (flags & PFIL_IN) diff --git a/sys/net/pfil.h b/sys/net/pfil.h index c9a1b65..ff260ce 100644 --- a/sys/net/pfil.h +++ b/sys/net/pfil.h @@ -38,6 +38,7 @@ #include <sys/_mutex.h> #include <sys/lock.h> #include <sys/rmlock.h> +#include <sys/sysctl.h> struct mbuf; struct ifnet; @@ -55,11 +56,14 @@ struct packet_filter_hook { TAILQ_ENTRY(packet_filter_hook) pfil_chain; pfil_func_t pfil_func; void *pfil_arg; + int pfil_flags; + char *pfil_name; }; #define PFIL_IN 0x00000001 #define PFIL_OUT 0x00000002 #define PFIL_WAITOK 0x00000004 +#define PFIL_DISABLED 0x00000008 #define PFIL_ALL (PFIL_IN|PFIL_OUT) typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; @@ -85,6 +89,7 @@ struct pfil_head { struct rmlock ph_lock; /* Private lock storage */ int flags; #endif + struct sysctl_ctx_list ph_clist; union { u_long phu_val; void *phu_ptr; @@ -96,7 +101,9 @@ struct pfil_head { /* Public functions for pfil hook management by packet filters. */ struct pfil_head *pfil_head_get(int, u_long); +void pfil_head_export_sysctl(struct pfil_head *, struct sysctl_oid_list *); int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); +int pfil_add_named_hook(pfil_func_t, void *, char *, int, struct pfil_head *); int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); #define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) diff --git a/sys/net/pfkeyv2.h b/sys/net/pfkeyv2.h index c45f8b0..bab26fe 100644 --- a/sys/net/pfkeyv2.h +++ b/sys/net/pfkeyv2.h @@ -218,7 +218,6 @@ struct sadb_x_sa2 { }; /* XXX Policy Extension */ -/* sizeof(struct sadb_x_policy) == 16 */ struct sadb_x_policy { u_int16_t sadb_x_policy_len; u_int16_t sadb_x_policy_exttype; @@ -228,6 +227,8 @@ struct sadb_x_policy { u_int32_t sadb_x_policy_id; u_int32_t sadb_x_policy_reserved2; }; +_Static_assert(sizeof(struct sadb_x_policy) == 16, "struct size mismatch"); + /* * When policy_type == IPSEC, it is followed by some of * the ipsec policy request. @@ -256,31 +257,31 @@ struct sadb_x_ipsecrequest { }; /* NAT-Traversal type, see RFC 3948 (and drafts). */ -/* sizeof(struct sadb_x_nat_t_type) == 8 */ struct sadb_x_nat_t_type { u_int16_t sadb_x_nat_t_type_len; u_int16_t sadb_x_nat_t_type_exttype; u_int8_t sadb_x_nat_t_type_type; u_int8_t sadb_x_nat_t_type_reserved[3]; }; +_Static_assert(sizeof(struct sadb_x_nat_t_type) == 8, "struct size mismatch"); /* NAT-Traversal source or destination port. */ -/* sizeof(struct sadb_x_nat_t_port) == 8 */ struct sadb_x_nat_t_port { u_int16_t sadb_x_nat_t_port_len; u_int16_t sadb_x_nat_t_port_exttype; u_int16_t sadb_x_nat_t_port_port; u_int16_t sadb_x_nat_t_port_reserved; }; +_Static_assert(sizeof(struct sadb_x_nat_t_port) == 8, "struct size mismatch"); /* ESP fragmentation size. */ -/* sizeof(struct sadb_x_nat_t_frag) == 8 */ struct sadb_x_nat_t_frag { u_int16_t sadb_x_nat_t_frag_len; u_int16_t sadb_x_nat_t_frag_exttype; u_int16_t sadb_x_nat_t_frag_fraglen; u_int16_t sadb_x_nat_t_frag_reserved; }; +_Static_assert(sizeof(struct sadb_x_nat_t_frag) == 8, "struct size mismatch"); #define SADB_EXT_RESERVED 0 @@ -332,39 +333,47 @@ struct sadb_x_nat_t_frag { #define SADB_SAFLAGS_PFS 1 -/* RFC2367 numbers - meets RFC2407 */ +/* + * Though some of these numbers (both _AALG and _EALG) appear to be + * IKEv2 numbers and others original IKE numbers, they have no meaning. + * These are constants that the various IKE daemons use to tell the kernel + * what cipher to use. + * + * Do not use these constants directly to decide which Transformation ID + * to send. You are responsible for mapping them yourself. + */ #define SADB_AALG_NONE 0 #define SADB_AALG_MD5HMAC 2 #define SADB_AALG_SHA1HMAC 3 #define SADB_AALG_MAX 252 -/* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_AALG_SHA2_256 5 #define SADB_X_AALG_SHA2_384 6 #define SADB_X_AALG_SHA2_512 7 #define SADB_X_AALG_RIPEMD160HMAC 8 -#define SADB_X_AALG_AES_XCBC_MAC 9 /* draft-ietf-ipsec-ciph-aes-xcbc-mac-04 */ -/* private allocations should use 249-255 (RFC2407) */ +#define SADB_X_AALG_AES_XCBC_MAC 9 /* RFC3566 */ +#define SADB_X_AALG_AES128GMAC 11 /* RFC4543 + Errata1821 */ +#define SADB_X_AALG_AES192GMAC 12 +#define SADB_X_AALG_AES256GMAC 13 #define SADB_X_AALG_MD5 249 /* Keyed MD5 */ #define SADB_X_AALG_SHA 250 /* Keyed SHA */ #define SADB_X_AALG_NULL 251 /* null authentication */ #define SADB_X_AALG_TCP_MD5 252 /* Keyed TCP-MD5 (RFC2385) */ -/* RFC2367 numbers - meets RFC2407 */ #define SADB_EALG_NONE 0 #define SADB_EALG_DESCBC 2 #define SADB_EALG_3DESCBC 3 -#define SADB_EALG_NULL 11 -#define SADB_EALG_MAX 250 -/* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_EALG_CAST128CBC 6 #define SADB_X_EALG_BLOWFISHCBC 7 +#define SADB_EALG_NULL 11 #define SADB_X_EALG_RIJNDAELCBC 12 #define SADB_X_EALG_AES 12 -/* private allocations - based on RFC4312/IANA assignment */ -#define SADB_X_EALG_CAMELLIACBC 22 -/* private allocations should use 249-255 (RFC2407) */ -#define SADB_X_EALG_SKIPJACK 249 /*250*/ /* for IPSEC */ -#define SADB_X_EALG_AESCTR 250 /*249*/ /* draft-ietf-ipsec-ciph-aes-ctr-03 */ +#define SADB_X_EALG_AESCTR 13 +#define SADB_X_EALG_AESGCM8 18 /* RFC4106 */ +#define SADB_X_EALG_AESGCM12 19 +#define SADB_X_EALG_AESGCM16 20 +#define SADB_X_EALG_CAMELLIACBC 22 +#define SADB_X_EALG_AESGMAC 23 /* RFC4543 + Errata1821 */ +#define SADB_EALG_MAX 23 /* !!! keep updated !!! */ /* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_CALG_NONE 0 diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index ae5ecb9..e46bb69 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -198,10 +198,11 @@ extern struct rwlock pf_rules_lock; (a)->addr32[0] == (b)->addr32[0])) \ #define PF_ANEQ(a, b, c) \ - ((a)->addr32[0] != (b)->addr32[0] || \ + ((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \ + (c == AF_INET6 && ((a)->addr32[0] != (b)->addr32[0] || \ (a)->addr32[1] != (b)->addr32[1] || \ (a)->addr32[2] != (b)->addr32[2] || \ - (a)->addr32[3] != (b)->addr32[3]) \ + (a)->addr32[3] != (b)->addr32[3]))) \ #define PF_AZERO(a, c) \ ((c == AF_INET && !(a)->addr32[0]) || \ @@ -326,6 +327,14 @@ struct pf_rule_gid { u_int8_t op; }; +struct pf_rule_ieee8021q_pcp { + u_int8_t pcp[2]; + u_int8_t op; +#define SETPCP_VALID 0x80 /* Set if PCP value in field is valid. */ +#define SETPCP_PCP_MASK 0x07 /* Mask to retrieve pcp if SETPCP_VALID. */ + u_int8_t setpcp; +}; + struct pf_rule_addr { struct pf_addr_wrap addr; u_int16_t port[2]; @@ -465,6 +474,13 @@ struct pf_osfp_ioctl { int fp_getnum; /* DIOCOSFPGET number */ }; +struct pf_rule_actions { + u_int16_t qid; + u_int16_t pqid; + u_int32_t pdnpipe; + u_int32_t dnpipe; + u_int8_t flags; +}; union pf_rule_ptr { struct pf_rule *ptr; @@ -488,6 +504,7 @@ struct pf_rule { union pf_rule_ptr skip[PF_SKIP_COUNT]; #define PF_RULE_LABEL_SIZE 64 char label[PF_RULE_LABEL_SIZE]; + char schedule[PF_RULE_LABEL_SIZE]; char ifname[IFNAMSIZ]; char qname[PF_QNAME_SIZE]; char pqname[PF_QNAME_SIZE]; @@ -520,12 +537,21 @@ struct pf_rule { u_int32_t limit; u_int32_t seconds; } max_src_conn_rate; - u_int32_t qid; - u_int32_t pqid; + u_int16_t qid; + u_int16_t pqid; + u_int32_t dnpipe; + u_int32_t pdnpipe; +#define PFRULE_DN_IS_PIPE 0x00000010 +#define PFRULE_DN_IS_QUEUE 0x00000020 + u_int32_t free_flags; u_int32_t rt_listid; u_int32_t nr; u_int32_t prob; +#ifdef PF_USER_INFO uid_t cuid; +#else + u_int32_t cuid; +#endif pid_t cpid; counter_u64_t states_cur; @@ -566,6 +592,29 @@ struct pf_rule { u_int8_t allow_opts; u_int8_t rt; u_int8_t return_ttl; + +#ifndef DSCP_EF +/* Copied from altq_cdnr.h */ +/* diffserve code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_VA 0xb0 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 +#endif u_int8_t tos; u_int8_t set_tos; u_int8_t anchor_relative; @@ -580,6 +629,8 @@ struct pf_rule { u_int16_t port; } divert; + struct pf_rule_ieee8021q_pcp ieee8021q_pcp; + uint64_t u_states_cur; uint64_t u_states_tot; uint64_t u_src_nodes; @@ -604,6 +655,13 @@ struct pf_rule { #define PFRULE_REASSEMBLE_TCP 0x1000 #define PFRULE_SET_TOS 0x2000 +/* rule flags for TOS or DSCP differentiation */ +#define PFRULE_TOS 0x2000 +#define PFRULE_DSCP 0x4000 + +/* rule flags for handling ALTQ hashing required by certain disciplines */ +#define PFRULE_ALTQ_HASH 0x8000 + /* rule flags again */ #define PFRULE_IFBOUND 0x00010000 /* if-bound */ #define PFRULE_STATESLOPPY 0x00020000 /* sloppy state tracking */ @@ -708,7 +766,13 @@ struct pf_state { u_int64_t id; u_int32_t creatorid; u_int8_t direction; - u_int8_t pad[3]; + u_int8_t pad[2]; + u_int8_t local_flags; +#define PFSTATE_DIVERT_ALTQ 0x10 +#define PFSTATE_DIVERT_DNCOOKIE 0x20 +#define PFSTATE_DIVERT_ACTION 0x40 +#define PFSTATE_DIVERT_TAG 0x80 +#define PFSTATE_DIVERT_MASK 0xFF00 u_int refs; TAILQ_ENTRY(pf_state) sync_list; @@ -730,7 +794,12 @@ struct pf_state { u_int32_t creation; u_int32_t expire; u_int32_t pfsync_time; + u_int16_t qid; + u_int16_t pqid; + u_int32_t pdnpipe; + u_int32_t dnpipe; u_int16_t tag; + u_int16_t divert_cookie; u_int8_t log; u_int8_t state_flags; #define PFSTATE_ALLOWOPTS 0x01 @@ -743,7 +812,7 @@ struct pf_state { /* XXX */ u_int8_t sync_updates; - u_int8_t _tail[3]; + u_int8_t _tail; }; /* @@ -1079,11 +1148,13 @@ struct pfi_kif { #define PFI_IFLAG_SKIP 0x0100 /* skip filtering on interface */ struct pf_pdesc { +#ifdef PF_USER_INFO struct { int done; uid_t uid; gid_t gid; } lookup; +#endif u_int64_t tot_len; /* Make Mickey money */ union { struct tcphdr *tcp; @@ -1101,6 +1172,7 @@ struct pf_pdesc { u_int16_t *sport; u_int16_t *dport; struct pf_mtag *pf_mtag; + struct pf_rule_actions act; u_int32_t p_len; /* total length of payload */ @@ -1252,6 +1324,11 @@ struct pfioc_state_kill { u_int psk_killed; }; +struct pfioc_schedule_kill { + int numberkilled; + char schedule[PF_RULE_LABEL_SIZE]; +}; + struct pfioc_states { int ps_len; union { @@ -1330,6 +1407,9 @@ struct pfioc_trans { #ifdef _KERNEL #define PFR_FLAG_USERIOCTL 0x10000000 #endif +#define DIOCGETNAMEDALTQ _IOWR('D', 94, struct pfioc_ruleset) +#define DIOCGETNAMEDTAG _IOR('D', 95, u_int32_t) +#define DIOCKILLSCHEDULE _IOWR('D', 96, struct pfioc_schedule_kill) struct pfioc_table { struct pfr_table pfrio_table; @@ -1594,6 +1674,8 @@ int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *, int pf_match_addr_range(struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); +int pf_match_ieee8021q_pcp(u_int8_t, u_int8_t, u_int8_t, struct mbuf *); +int pf_ieee8021q_setpcp(struct mbuf *m, struct pf_rule *r); void pf_normalize_init(void); void pf_normalize_cleanup(void); |