From 93152c67c93acd0eca913cc1939a3393129c2c4d Mon Sep 17 00:00:00 2001 From: melifaro Date: Thu, 31 Dec 2015 05:03:27 +0000 Subject: Implement interface link header precomputation API. Add if_requestencap() interface method which is capable of calculating various link headers for given interface. Right now there is support for INET/INET6/ARP llheader calculation (IFENCAP_LL type request). Other types are planned to support more complex calculation (L2 multipath lagg nexthops, tunnel encap nexthops, etc..). Reshape 'struct route' to be able to pass additional data (with is length) to prepend to mbuf. These two changes permits routing code to pass pre-calculated nexthop data (like L2 header for route w/gateway) down to the stack eliminating the need for other lookups. It also brings us closer to more complex scenarios like transparently handling MPLS nexthops and tunnel interfaces. Last, but not least, it removes layering violation introduced by flowtable code (ro_lle) and simplifies handling of existing if_output consumers. ARP/ND changes: Make arp/ndp stack pre-calculate link header upon installing/updating lle record. Interface link address change are handled by re-calculating headers for all lles based on if_lladdr event. After these changes, arpresolve()/nd6_resolve() returns full pre-calculated header for supported interfaces thus simplifying if_output(). Move these lookups to separate ether_resolve_addr() function which ether returs error or fully-prepared link header. Add resolve_addr() compat versions to return link addresses instead of pre-calculated data. BPF changes: Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT. Despite the naming, both of there have ther header "complete". The only difference is that interface source mac has to be filled by OS for AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside BPF and not pollute if_output() routines. Convert BPF to pass prepend data via new 'struct route' mechanism. Note that it does not change non-optimized if_output(): ro_prepend handling is purely optional. Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI. It is not needed for ethernet anymore. The only remaining FDDI user is dev/pdq mostly untouched since 2007. FDDI support was eliminated from OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65). Flowtable changes: Flowtable violates layering by saving (and not correctly managing) rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated header data from that lle. Differential Revision: https://reviews.freebsd.org/D4102 --- sys/netinet/if_ether.c | 178 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 142 insertions(+), 36 deletions(-) (limited to 'sys/netinet/if_ether.c') diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 4c0c586b..0bf5cec 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -282,6 +282,37 @@ arptimer(void *arg) } /* + * Stores link-layer header for @ifp in format suitable for if_output() + * into buffer @buf. Resulting header length is stored in @bufsize. + * + * Returns 0 on success. + */ +static int +arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, + size_t *bufsize) +{ + struct if_encap_req ereq; + int error; + + bzero(buf, *bufsize); + bzero(&ereq, sizeof(ereq)); + ereq.buf = buf; + ereq.bufsize = *bufsize; + ereq.rtype = IFENCAP_LL; + ereq.family = AF_ARP; + ereq.lladdr = ar_tha(ah); + ereq.hdata = (u_char *)ah; + if (bcast) + ereq.flags = IFENCAP_FLAG_BROADCAST; + error = ifp->if_requestencap(ifp, &ereq); + if (error == 0) + *bufsize = ereq.bufsize; + + return (error); +} + + +/* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address @@ -295,6 +326,10 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + struct route ro; + int error; if (sip == NULL) { /* @@ -350,12 +385,28 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); + if (error != 0 && error != EAFNOSUPPORT) { + ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", + if_name(ifp), error); + return; + } + + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ - (*ifp->if_output)(ifp, m, &sa, NULL); + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); } + /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). @@ -368,18 +419,20 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, * Note that m_freem() handles NULL. */ static int -arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, +arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; + char *lladdr; + int ll_len; if (pflags != NULL) *pflags = 0; - if (create == 0) { + if ((flags & LLE_CREATE) == 0) { IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); @@ -413,7 +466,14 @@ arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { - bcopy(&la->ll_addr, desten, ifp->if_addrlen); + if (flags & LLE_ADDRONLY) { + lladdr = la->ll_addr; + ll_len = ifp->if_addrlen; + } else { + lladdr = la->r_linkdata; + ll_len = la->r_hdrlen; + } + bcopy(lladdr, desten, ll_len); /* Check if we have feedback request from arptimer() */ if (la->r_skip_req != 0) { @@ -485,15 +545,31 @@ arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, /* * Resolve an IP address into an ethernet address. + */ +int +arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, + char *desten, uint32_t *pflags) +{ + int error; + + flags |= LLE_ADDRONLY; + error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags); + return (error); +} + + +/* + * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, - * desten is the storage to put LL address. + * desten is the storage to put LL header. * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR * - * On success, desten and flags are filled in and the function returns 0; + * On success, full/partial link header and flags are filled in and + * the function returns 0. * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. @@ -525,7 +601,7 @@ arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, la = lla_lookup(LLTABLE(ifp), LLE_UNLOCKED, dst); if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ - bcopy(&la->ll_addr, desten, ifp->if_addrlen); + bcopy(la->r_linkdata, desten, la->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); /* Check if we have feedback request from arptimer() */ @@ -539,7 +615,8 @@ arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, } IF_AFDATA_RUNLOCK(ifp); - return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags)); + return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, + desten, pflags)); } /* @@ -683,6 +760,11 @@ in_arpinput(struct mbuf *m) struct sockaddr_in sin; struct sockaddr *dst; struct nhop4_basic nh4; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + struct route ro; + size_t linkhdrsize; + int lladdr_off; + int error; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; @@ -850,8 +932,14 @@ match: else if (itaddr.s_addr == myaddr.s_addr) { /* * Request/reply to our address, but no lle exists yet. - * Try to create new llentry. + * Calculate full link prepend to use in lle. */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) + goto reply; + + /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) { @@ -863,7 +951,8 @@ match: */ goto reply; } - lltable_set_entry_addr(ifp, la, ar_sha(ah)); + lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); @@ -921,7 +1010,7 @@ reply: if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); - (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); + (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { @@ -991,8 +1080,29 @@ reply: m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); + + /* + * arp_fillheader() may fail due to lack of support inside encap request + * routing. This is not necessary an error, AF_ARP can/should be handled + * by if_output(). + */ + if (error != 0 && error != EAFNOSUPPORT) { + ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", + if_name(ifp), error); + return; + } + + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + m_clrprotoflags(m); /* Avoid confusing lower layers. */ - (*ifp->if_output)(ifp, m, &sa, NULL); + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; @@ -1011,6 +1121,9 @@ arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; LLE_WLOCK_ASSERT(la); @@ -1027,7 +1140,7 @@ arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp return; } if ((la->la_flags & LLE_VALID) && - bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { + bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) @@ -1050,31 +1163,19 @@ arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp } } + /* Calculate full link prepend to use in lle */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) + return; + /* Check if something has changed */ - if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 || + if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { - /* Perform real LLE update */ - /* use afdata WLOCK to update fields */ - LLE_ADDREF(la); - LLE_WUNLOCK(la); - IF_AFDATA_WLOCK(ifp); - LLE_WLOCK(la); - - /* - * Since we droppped LLE lock, other thread might have deleted - * this lle. Check and return - */ - if ((la->la_flags & LLE_DELETED) != 0) { - IF_AFDATA_WUNLOCK(ifp); - LLE_FREE_LOCKED(la); + /* Try to perform LLE update */ + if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off) == 0) return; - } - - /* Update data */ - lltable_set_entry_addr(ifp, la, ar_sha(ah)); - - IF_AFDATA_WUNLOCK(ifp); - LLE_REMREF(la); /* Clear fast path feedback request if set */ la->r_skip_req = 0; @@ -1215,10 +1316,12 @@ arp_handle_ifllchange(struct ifnet *ifp) /* * A handler for interface link layer address change event. */ -static __noinline void +static void arp_iflladdr(void *arg __unused, struct ifnet *ifp) { + lltable_update_ifaddr(LLTABLE(ifp)); + if ((ifp->if_flags & IFF_UP) != 0) arp_handle_ifllchange(ifp); } @@ -1231,5 +1334,8 @@ arp_init(void) if (IS_DEFAULT_VNET(curvnet)) iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + if (IS_DEFAULT_VNET(curvnet)) + iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, + arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); -- cgit v1.1