diff options
author | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
---|---|---|
committer | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
commit | 67d5f1a727273d8e141e96c429114dff9fb06ec3 (patch) | |
tree | 9255a545bbd49a0458ed8850371b4fe6ed2cd01f /sys/netinet | |
parent | 27063437e23a5e5e7debf9144ee974d21b6a6774 (diff) | |
download | FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.zip FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.tar.gz |
- Updated TOE support in the kernel.
- Stateful TCP offload drivers for Terminator 3 and 4 (T3 and T4) ASICs.
These are available as t3_tom and t4_tom modules that augment cxgb(4)
and cxgbe(4) respectively. The cxgb/cxgbe drivers continue to work as
usual with or without these extra features.
- iWARP driver for Terminator 3 ASIC (kernel verbs). T4 iWARP is in the
works and will follow soon.
Build-tested with make universe.
30-second overview
==================
Which interfaces support TCP offload? Look for TOE4 and/or TOE6 in the
capabilities of an interface:
# ifconfig -m | grep TOE
Enable/disable TCP offload on an interface (just like any other ifnet
capability):
# ifconfig cxgbe0 toe
# ifconfig cxgbe0 -toe
Which connections are offloaded? Look for toe4 and/or toe6 in the
output of netstat and sockstat:
# netstat -np tcp | grep toe
# sockstat -46c | grep toe
Reviewed by: bz, gnn
Sponsored by: Chelsio Communications.
MFC after: ~3 months (after 9.1, and after ensuring MFC is feasible)
Diffstat (limited to 'sys/netinet')
-rw-r--r-- | sys/netinet/if_ether.c | 13 | ||||
-rw-r--r-- | sys/netinet/if_ether.h | 10 | ||||
-rw-r--r-- | sys/netinet/in.c | 2 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 13 | ||||
-rw-r--r-- | sys/netinet/tcp_offload.c | 209 | ||||
-rw-r--r-- | sys/netinet/tcp_offload.h | 364 | ||||
-rw-r--r-- | sys/netinet/tcp_output.c | 8 | ||||
-rw-r--r-- | sys/netinet/tcp_subr.c | 19 | ||||
-rw-r--r-- | sys/netinet/tcp_syncache.c | 135 | ||||
-rw-r--r-- | sys/netinet/tcp_syncache.h | 19 | ||||
-rw-r--r-- | sys/netinet/tcp_timer.c | 5 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 75 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 2 | ||||
-rw-r--r-- | sys/netinet/toecore.c | 575 | ||||
-rw-r--r-- | sys/netinet/toecore.h | 130 | ||||
-rw-r--r-- | sys/netinet/toedev.h | 162 |
16 files changed, 1035 insertions, 706 deletions
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index bdb4efc..d6a7fd1 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -180,6 +180,17 @@ arptimer(void *arg) callout_active(&lle->la_timer)) { callout_stop(&lle->la_timer); LLE_REMREF(lle); + + if (lle->la_flags != LLE_DELETED) { + int evt; + + if (lle->la_flags & LLE_VALID) + evt = LLENTRY_EXPIRED; + else + evt = LLENTRY_TIMEDOUT; + EVENTHANDLER_INVOKE(lle_event, lle, evt); + } + pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); @@ -726,7 +737,7 @@ match: (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; - EVENTHANDLER_INVOKE(arp_update_event, la); + EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { int canceled; diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h index 8d44d35..e37a964 100644 --- a/sys/netinet/if_ether.h +++ b/sys/netinet/if_ether.h @@ -122,8 +122,14 @@ void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); void arp_ifscrub(struct ifnet *, uint32_t); #include <sys/eventhandler.h> -typedef void (*llevent_arp_update_fn)(void *, struct llentry *); -EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn); +enum { + LLENTRY_RESOLVED, + LLENTRY_TIMEDOUT, + LLENTRY_DELETED, + LLENTRY_EXPIRED, +}; +typedef void (*lle_event_fn)(void *, struct llentry *, int); +EVENTHANDLER_DECLARE(lle_event, lle_event_fn); #endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c index c1cbcb1..1beddd6 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1469,7 +1469,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags = LLE_DELETED; - EVENTHANDLER_INVOKE(arp_update_event, lle); + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); LLE_WUNLOCK(lle); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); diff --git 
a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 4d3234f..6d8ebee 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -105,6 +105,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -958,6 +961,14 @@ relocked: goto dropwithreset; } +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_input(tp, m); + m = NULL; /* consumed by the TOE driver */ + goto dropunlock; + } +#endif + /* * We've identified a valid inpcb, but it could be that we need an * inpcbinfo write lock but don't hold it. In this case, attempt to @@ -1320,7 +1331,7 @@ relocked: (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); - syncache_add(&inc, &to, th, inp, &so, m); + syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); /* * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). diff --git a/sys/netinet/tcp_offload.c b/sys/netinet/tcp_offload.c index 899a37c..b772418 100644 --- a/sys/netinet/tcp_offload.c +++ b/sys/netinet/tcp_offload.c @@ -1,145 +1,176 @@ /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/types.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> - +#include <sys/sockopt.h> #include <net/if.h> -#include <net/if_types.h> -#include <net/if_var.h> #include <net/route.h> -#include <net/vnet.h> - #include <netinet/in.h> -#include <netinet/in_systm.h> #include <netinet/in_pcb.h> #include <netinet/tcp.h> #include <netinet/tcp_var.h> #include <netinet/tcp_offload.h> -#include <netinet/toedev.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> -uint32_t toedev_registration_count; +int registered_toedevs; +/* + * Provide an opportunity for a TOE driver to offload. + */ int tcp_offload_connect(struct socket *so, struct sockaddr *nam) { struct ifnet *ifp; - struct toedev *tdev; + struct toedev *tod; struct rtentry *rt; - int error; - - if (toedev_registration_count == 0) - return (EINVAL); - - /* - * Look up the route used for the connection to - * determine if it uses an interface capable of - * offloading the connection. 
- */ - rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/); - if (rt) + int error = EOPNOTSUPP; + + INP_WLOCK_ASSERT(sotoinpcb(so)); + KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, + ("%s: called with sa_family %d", __func__, nam->sa_family)); + + if (registered_toedevs == 0) + return (error); + + rt = rtalloc1(nam, 0, 0); + if (rt) RT_UNLOCK(rt); - else + else return (EHOSTUNREACH); ifp = rt->rt_ifp; - if ((ifp->if_capenable & IFCAP_TOE) == 0) { - error = EINVAL; - goto fail; - } - - tdev = TOEDEV(ifp); - if (tdev == NULL) { - error = EPERM; - goto fail; - } - - if (tdev->tod_can_offload(tdev, so) == 0) { - error = EPERM; - goto fail; - } - - return (tdev->tod_connect(tdev, so, rt, nam)); -fail: + + if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) + goto done; + if (nam->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)) + goto done; + + tod = TOEDEV(ifp); + if (tod != NULL) + error = tod->tod_connect(tod, so, rt, nam); +done: RTFREE(rt); return (error); } +void +tcp_offload_listen_start(struct tcpcb *tp) +{ -/* - * This file contains code as a short-term staging area before it is moved in - * to sys/netinet/tcp_offload.c - */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); +} void -tcp_offload_twstart(struct tcpcb *tp) +tcp_offload_listen_stop(struct tcpcb *tp) { - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); } -struct tcpcb * -tcp_offload_close(struct tcpcb *tp) +void +tcp_offload_input(struct tcpcb *tp, struct mbuf *m) { + struct toedev *tod = tp->tod; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_close(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + tod->tod_input(tod, 
tp, m); } -struct tcpcb * -tcp_offload_drop(struct tcpcb *tp, int error) +int +tcp_offload_output(struct tcpcb *tp) { + struct toedev *tod = tp->tod; + int error, flags; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_drop(tp, error); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + flags = tcp_outflags[tp->t_state]; + + if (flags & TH_RST) { + /* XXX: avoid repeated calls like we do for FIN */ + error = tod->tod_send_rst(tod, tp); + } else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) && + (tp->t_flags & TF_SENTFIN) == 0) { + error = tod->tod_send_fin(tod, tp); + if (error == 0) + tp->t_flags |= TF_SENTFIN; + } else + error = tod->tod_output(tod, tp); + + return (error); +} + +void +tcp_offload_rcvd(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_rcvd(tod, tp); +} + +void +tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name); } +void +tcp_offload_detach(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_pcb_detach(tod, tp); +} diff --git a/sys/netinet/tcp_offload.h b/sys/netinet/tcp_offload.h index 313185f..a052366 100644 --- a/sys/netinet/tcp_offload.h +++ b/sys/netinet/tcp_offload.h @@ -1,30 +1,30 @@ /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ + * */ #ifndef _NETINET_TCP_OFFLOAD_H_ @@ -34,321 +34,15 @@ #error "no user-serviceable parts inside" #endif -/* - * A driver publishes that it provides offload services - * by setting IFCAP_TOE in the ifnet. The offload connect - * will bypass any further work if the interface that a - * connection would use does not support TCP offload. - * - * The TOE API assumes that the tcp offload engine can offload the - * the entire connection from set up to teardown, with some provision - * being made to allowing the software stack to handle time wait. If - * the device does not meet these criteria, it is the driver's responsibility - * to overload the functions that it needs to in tcp_usrreqs and make - * its own calls to tcp_output if it needs to do so. - * - * There is currently no provision for the device advertising the congestion - * control algorithms it supports as there is currently no API for querying - * an operating system for the protocols that it has loaded. This is a desirable - * future extension. - * - * - * - * It is assumed that individuals deploying TOE will want connections - * to be offloaded without software changes so all connections on an - * interface providing TOE are offloaded unless the SO_NO_OFFLOAD - * flag is set on the socket. - * - * - * The toe_usrreqs structure constitutes the TOE driver's - * interface to the TCP stack for functionality that doesn't - * interact directly with userspace. 
If one wants to provide - * (optional) functionality to do zero-copy to/from - * userspace one still needs to override soreceive/sosend - * with functions that fault in and pin the user buffers. - * - * + tu_send - * - tells the driver that new data may have been added to the - * socket's send buffer - the driver should not fail if the - * buffer is in fact unchanged - * - the driver is responsible for providing credits (bytes in the send window) - * back to the socket by calling sbdrop() as segments are acknowledged. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_rcvd - * - returns credits to the driver and triggers window updates - * to the peer (a credit as used here is a byte in the peer's receive window) - * - the driver is expected to determine how many bytes have been - * consumed and credit that back to the card so that it can grow - * the window again by maintaining its own state between invocations. - * - In principle this could be used to shrink the window as well as - * grow the window, although it is not used for that now. - * - this function needs to correctly handle being called any number of - * times without any bytes being consumed from the receive buffer. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_disconnect - * - tells the driver to send FIN to peer - * - driver is expected to send the remaining data and then do a clean half close - * - disconnect implies at least half-close so only send, reset, and detach - * are legal - * - the driver is expected to handle transition through the shutdown - * state machine and allow the stack to support SO_LINGER. 
- * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_reset - * - closes the connection and sends a RST to peer - * - driver is expectd to trigger an RST and detach the toepcb - * - no further calls are legal after reset - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * The following fields in the tcpcb are expected to be referenced by the driver: - * + iss - * + rcv_nxt - * + rcv_wnd - * + snd_isn - * + snd_max - * + snd_nxt - * + snd_una - * + t_flags - * + t_inpcb - * + t_maxseg - * + t_toe - * - * The following fields in the inpcb are expected to be referenced by the driver: - * + inp_lport - * + inp_fport - * + inp_laddr - * + inp_fport - * + inp_socket - * + inp_ip_tos - * - * The following fields in the socket are expected to be referenced by the - * driver: - * + so_comp - * + so_error - * + so_linger - * + so_options - * + so_rcv - * + so_snd - * + so_state - * + so_timeo - * - * These functions all return 0 on success and can return the following errors - * as appropriate: - * + EPERM: - * + ENOBUFS: memory allocation failed - * + EMSGSIZE: MTU changed during the call - * + EHOSTDOWN: - * + EHOSTUNREACH: - * + ENETDOWN: - * * ENETUNREACH: the peer is no longer reachable - * - * + tu_detach - * - tells driver that the socket is going away so disconnect - * the toepcb and free appropriate resources - * - allows the driver to cleanly handle the case of connection state - * outliving the socket - * - no further calls are legal after detach - * - the driver is expected to provide its own synchronization between - * detach and receiving new data. 
- * - * + tu_syncache_event - * - even if it is not actually needed, the driver is expected to - * call syncache_add for the initial SYN and then syncache_expand - * for the SYN,ACK - * - tells driver that a connection either has not been added or has - * been dropped from the syncache - * - the driver is expected to maintain state that lives outside the - * software stack so the syncache needs to be able to notify the - * toe driver that the software stack is not going to create a connection - * for a received SYN - * - The driver is responsible for any synchronization required between - * the syncache dropping an entry and the driver processing the SYN,ACK. - * - */ -struct toe_usrreqs { - int (*tu_send)(struct tcpcb *tp); - int (*tu_rcvd)(struct tcpcb *tp); - int (*tu_disconnect)(struct tcpcb *tp); - int (*tu_reset)(struct tcpcb *tp); - void (*tu_detach)(struct tcpcb *tp); - void (*tu_syncache_event)(int event, void *toep); -}; - -/* - * Proxy for struct tcpopt between TOE drivers and TCP functions. 
- */ -struct toeopt { - u_int64_t to_flags; /* see tcpopt in tcp_var.h */ - u_int16_t to_mss; /* maximum segment size */ - u_int8_t to_wscale; /* window scaling */ +extern int registered_toedevs; - u_int8_t _pad1; /* explicit pad for 64bit alignment */ - u_int32_t _pad2; /* explicit pad for 64bit alignment */ - u_int64_t _pad3[4]; /* TBD */ -}; +int tcp_offload_connect(struct socket *, struct sockaddr *); +void tcp_offload_listen_start(struct tcpcb *); +void tcp_offload_listen_stop(struct tcpcb *); +void tcp_offload_input(struct tcpcb *, struct mbuf *); +int tcp_offload_output(struct tcpcb *); +void tcp_offload_rcvd(struct tcpcb *); +void tcp_offload_ctloutput(struct tcpcb *, int, int); +void tcp_offload_detach(struct tcpcb *); -#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ -#define TOE_SC_DROP 2 /* connection was timed out */ - -/* - * Because listen is a one-to-many relationship (a socket can be listening - * on all interfaces on a machine some of which may be using different TCP - * offload devices), listen uses a publish/subscribe mechanism. The TCP - * offload driver registers a listen notification function with the stack. - * When a listen socket is created all TCP offload devices are notified - * so that they can do the appropriate set up to offload connections on the - * port to which the socket is bound. When the listen socket is closed, - * the offload devices are notified so that they will stop listening on that - * port and free any associated resources as well as sending RSTs on any - * connections in the SYN_RCVD state. 
- * - */ - -typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); -typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); - -EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); -EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); - -/* - * Check if the socket can be offloaded by the following steps: - * - determine the egress interface - * - check the interface for TOE capability and TOE is enabled - * - check if the device has resources to offload the connection - */ -int tcp_offload_connect(struct socket *so, struct sockaddr *nam); - -/* - * The tcp_output_* routines are wrappers around the toe_usrreqs calls - * which trigger packet transmission. In the non-offloaded case they - * translate to tcp_output. The tcp_offload_* routines notify TOE - * of specific events. I the non-offloaded case they are no-ops. - * - * Listen is a special case because it is a 1 to many relationship - * and there can be more than one offload driver in the system. - */ - -/* - * Connection is offloaded - */ -#define tp_offload(tp) ((tp)->t_flags & TF_TOE) - -/* - * hackish way of allowing this file to also be included by TOE - * which needs to be kept ignorant of socket implementation details - */ -#ifdef _SYS_SOCKETVAR_H_ -/* - * The socket has not been marked as "do not offload" - */ -#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) - -static __inline int -tcp_output_connect(struct socket *so, struct sockaddr *nam) -{ - struct tcpcb *tp = sototcpcb(so); - int error; - - /* - * If offload has been disabled for this socket or the - * connection cannot be offloaded just call tcp_output - * to start the TCP state machine. 
- */ -#ifndef TCP_OFFLOAD_DISABLE - if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) -#endif - error = tcp_output(tp); - return (error); -} - -static __inline int -tcp_output_send(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_send(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_rcvd(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_rcvd(tp)); #endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_disconnect(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_disconnect(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_reset(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_reset(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline void -tcp_offload_detach(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - tp->t_tu->tu_detach(tp); -#endif -} - -static __inline void -tcp_offload_listen_open(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) - EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); -#endif -} - -static __inline void -tcp_offload_listen_close(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); -#endif -} -#undef SO_OFFLOADABLE -#endif /* _SYS_SOCKETVAR_H_ */ -#undef tp_offload - -void tcp_offload_twstart(struct tcpcb *tp); -struct tcpcb *tcp_offload_close(struct tcpcb *tp); -struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error); - -#endif /* _NETINET_TCP_OFFLOAD_H_ */ diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 8e0f369..1881c54 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -75,6 +75,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef TCP_OFFLOAD 
+#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -191,6 +194,11 @@ tcp_output(struct tcpcb *tp) INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return (tcp_offload_output(tp)); +#endif + /* * Determine length of data that should be transmitted, * and flags that will be used. diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 05da82e..9d35e0a 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -85,7 +85,6 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif @@ -96,6 +95,9 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include <netinet6/ip6protosw.h> #endif +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -824,7 +826,7 @@ tcp_drop(struct tcpcb *tp, int errno) if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; - (void) tcp_output_reset(tp); + (void) tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -924,8 +926,12 @@ tcp_discardcb(struct tcpcb *tp) /* free the reassembly queue, if any */ tcp_reass_flush(tp); + +#ifdef TCP_OFFLOAD /* Disconnect offload device, if any. 
*/ - tcp_offload_detach(tp); + if (tp->t_flags & TF_TOE) + tcp_offload_detach(tp); +#endif tcp_free_sackholes(tp); @@ -954,9 +960,10 @@ tcp_close(struct tcpcb *tp) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); - /* Notify any offload devices of listener close */ +#ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) - tcp_offload_listen_close(tp); + tcp_offload_listen_stop(tp); +#endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); @@ -1695,7 +1702,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output_send(tp); + tcp_output(tp); return (inp); } diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 21a72f4..0f7637d 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -81,10 +81,12 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif +#ifdef TCP_OFFLOAD +#include <netinet/toecore.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -110,10 +112,8 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); -#ifdef TCP_OFFLOAD_DISABLE -#define TOEPCB_ISSET(sc) (0) -#else -#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#ifdef TCP_OFFLOAD +#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); @@ -332,6 +332,14 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_added(tod, sc->sc_todctx); + } +#endif + /* Reinitialize the bucket row's timer. 
*/ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; @@ -356,10 +364,14 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch) TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); -#endif +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif + syncache_free(sc); V_tcp_syncache.cache_count--; } @@ -846,6 +858,18 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) if (sc->sc_rxmits > 1) tp->snd_cwnd = tp->t_maxseg; +#ifdef TCP_OFFLOAD + /* + * Allow a TOE driver to install its hooks. Note that we hold the + * pcbinfo lock too and that prevents tcp_usr_accept from accepting a + * new connection before the TOE driver has done its thing. + */ + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_offload_socket(tod, sc->sc_todctx, so); + } +#endif /* * Copy and activate timers. */ @@ -926,6 +950,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif V_tcp_syncache.cache_count--; SCH_UNLOCK(sch); } @@ -934,7 +965,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). 
*/ - if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { + if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); @@ -945,9 +976,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ - if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || - SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && - !TOEPCB_ISSET(sc)) { + if (SEQ_LEQ(th->th_seq, sc->sc_irs) || + SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); @@ -964,8 +994,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. 
*/ - if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && - !TOEPCB_ISSET(sc)) { + if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", @@ -993,25 +1022,6 @@ failed: return (0); } -int -tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m) -{ - struct tcpopt to; - int rc; - - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - rc = syncache_expand(inc, &to, th, lsop, m); - INP_INFO_WUNLOCK(&V_tcbinfo); - - return (rc); -} - /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: @@ -1025,10 +1035,10 @@ tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. 
*/ -static void -_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, - struct toe_usrreqs *tu, void *toepcb) +void +syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + void *todctx) { struct tcpcb *tp; struct socket *so; @@ -1114,11 +1124,6 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, - sc->sc_toepcb); -#endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* @@ -1151,7 +1156,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1202,9 +1207,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } -#ifndef TCP_OFFLOAD_DISABLE - sc->sc_tu = tu; - sc->sc_toepcb = toepcb; +#ifdef TCP_OFFLOAD + sc->sc_tod = tod; + sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); @@ -1299,7 +1304,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* * Do a standard 3-way handshake. 
*/ - if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1491,37 +1496,21 @@ syncache_respond(struct syncache *sc) m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); + + return (error); + } +#endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } -void -syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m) -{ - _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); -} - -void -tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct inpcb *inp, struct socket **lsop, - struct toe_usrreqs *tu, void *toepcb) -{ - struct tcpopt to; - - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - - _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb); -} - /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 5783b6d..d18ee07 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -34,8 +34,6 @@ #define _NETINET_TCP_SYNCACHE_H_ #ifdef _KERNEL -struct toeopt; - void syncache_init(void); #ifdef VIMAGE void syncache_destroy(void); @@ -43,14 +41,9 @@ void syncache_destroy(void); void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); 
-int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m); void syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); -void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *, - struct tcphdr *, struct inpcb *, struct socket **, - struct toe_usrreqs *tu, void *toepcb); - + struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); int syncache_pcbcount(void); @@ -75,10 +68,10 @@ struct syncache { u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; u_int16_t sc_flags; -#ifndef TCP_OFFLOAD_DISABLE - struct toe_usrreqs *sc_tu; /* TOE operations */ - void *sc_toepcb; /* TOE protocol block */ -#endif +#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE) + struct toedev *sc_tod; /* entry added by this TOE */ + void *sc_todctx; /* TOE driver context */ +#endif struct label *sc_label; /* MAC label reference */ struct ucred *sc_cred; /* cred cache for jail checks */ diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 9c3c749..b3ddacc 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -602,6 +602,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) struct inpcb *inp = tp->t_inpcb; int cpu = INP_CPU(inp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return; +#endif + switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index a9045f3..b69961e 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -87,7 +87,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> +#endif /* * TCP protocol interface to socket abstraction. 
@@ -367,7 +369,9 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); - tcp_offload_listen_open(tp); +#ifdef TCP_OFFLOAD + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); @@ -409,6 +413,9 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); +#ifdef TCP_OFFLOAD + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); @@ -459,7 +466,13 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); @@ -519,7 +532,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + error = tcp_output(tp); goto out; } #endif @@ -530,7 +548,13 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); @@ -709,7 +733,7 @@ tcp_usr_shutdown(struct socket *so) socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output_disconnect(tp); + error = tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); 
@@ -739,7 +763,11 @@ tcp_usr_rcvd(struct socket *so, int flags) } tp = intotcpcb(inp); TCPDEBUG1(); - tcp_output_rcvd(tp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + tcp_offload_rcvd(tp); +#endif + tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); @@ -835,7 +863,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (!(inp->inp_flags & INP_DROPPED)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output_send(tp); + error = tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -884,7 +912,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; - error = tcp_output_send(tp); + error = tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } out: @@ -1119,7 +1147,6 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1192,7 +1219,6 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1323,9 +1349,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; #endif /* TCP_SIGNATURE */ + case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); @@ -1351,6 +1377,13 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= opt; else tp->t_flags &= ~opt; +unlock_and_done: +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_ctloutput(tp, sopt->sopt_dir, + sopt->sopt_name); + } +#endif INP_WUNLOCK(inp); break; @@ -1369,8 +1402,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if 
(TCPS_HAVEESTABLISHED(tp->t_state)) error = tcp_output(tp); } - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); @@ -1385,8 +1417,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_maxseg = optval; else error = EINVAL; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); @@ -1438,8 +1469,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } } CC_LIST_RUNLOCK(); - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_KEEPIDLE: case TCP_KEEPINTVL: @@ -1491,8 +1521,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) TP_KEEPINIT(tp)); break; } - INP_WUNLOCK(inp); - break; + goto unlock_and_done; default: INP_WUNLOCK(inp); @@ -1635,7 +1664,7 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output_disconnect(tp); + tcp_output(tp); } } @@ -1658,7 +1687,9 @@ tcp_usrclosed(struct tcpcb *tp) switch (tp->t_state) { case TCPS_LISTEN: - tcp_offload_listen_close(tp); +#ifdef TCP_OFFLOAD + tcp_offload_listen_stop(tp); +#endif /* FALLTHROUGH */ case TCPS_CLOSED: tp->t_state = TCPS_CLOSED; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 3b8bdf7..90ecca1 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -194,7 +194,7 @@ struct tcpcb { int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ - struct toe_usrreqs *t_tu; /* offload operations vector */ + struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c new file mode 100644 index 0000000..4b4efb7 --- /dev/null +++ b/sys/netinet/toecore.c @@ -0,0 +1,575 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. 
+ * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/types.h> +#include <sys/sockopt.h> +#include <sys/sysctl.h> +#include <sys/socket.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/if_llatbl.h> +#include <net/route.h> + +#include <netinet/if_ether.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet6/nd6.h> +#define TCPSTATES +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_syncache.h> +#include <netinet/tcp_offload.h> +#include <netinet/toecore.h> + +static struct mtx toedev_lock; +static TAILQ_HEAD(, toedev) toedev_list; +static eventhandler_tag listen_start_eh; +static eventhandler_tag listen_stop_eh; +static eventhandler_tag lle_event_eh; +static eventhandler_tag route_redirect_eh; + +static int +toedev_connect(struct toedev *tod __unused, struct socket *so __unused, + struct rtentry *rt __unused, struct sockaddr *nam __unused) +{ + + return (ENOTSUP); +} + +static int +toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static int +toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static void +toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused, + struct mbuf *m) +{ + + m_freem(m); + return; +} + +static void +toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return; +} + +static int +toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static void +toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return; +} + +static void 
+toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused, + struct sockaddr *sa __unused, uint8_t *lladdr __unused, + uint16_t vtag __unused) +{ + + return; +} + +static void +toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused, + struct rtentry *rt0 __unused, struct rtentry *rt1 __unused) +{ + + return; +} + +static void +toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused) +{ + + return; +} + +static void +toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused) +{ + + return; +} + +static int +toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused, + struct mbuf *m) +{ + + m_freem(m); + return (0); +} + +static void +toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused, + struct socket *so __unused) +{ + + return; +} + +static void +toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused, + int sopt_dir __unused, int sopt_name __unused) +{ + + return; +} + +/* + * Inform one or more TOE devices about a listening socket. 
+ */ +static void +toe_listen_start(struct inpcb *inp, void *arg) +{ + struct toedev *t, *tod; + struct tcpcb *tp; + + INP_WLOCK_ASSERT(inp); + KASSERT(inp->inp_pcbinfo == &V_tcbinfo, + ("%s: inp is not a TCP inp", __func__)); + + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) + return; + + tp = intotcpcb(inp); + if (tp->t_state != TCPS_LISTEN) + return; + + t = arg; + mtx_lock(&toedev_lock); + TAILQ_FOREACH(tod, &toedev_list, link) { + if (t == NULL || t == tod) + tod->tod_listen_start(tod, tp); + } + mtx_unlock(&toedev_lock); +} + +static void +toe_listen_start_event(void *arg __unused, struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_state == TCPS_LISTEN, + ("%s: t_state %s", __func__, tcpstates[tp->t_state])); + + toe_listen_start(inp, NULL); +} + +static void +toe_listen_stop_event(void *arg __unused, struct tcpcb *tp) +{ + struct toedev *tod; +#ifdef INVARIANTS + struct inpcb *inp = tp->t_inpcb; +#endif + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_state == TCPS_LISTEN, + ("%s: t_state %s", __func__, tcpstates[tp->t_state])); + + mtx_lock(&toedev_lock); + TAILQ_FOREACH(tod, &toedev_list, link) + tod->tod_listen_stop(tod, tp); + mtx_unlock(&toedev_lock); +} + +/* + * Fill up a freshly allocated toedev struct with reasonable defaults. + */ +void +init_toedev(struct toedev *tod) +{ + + tod->tod_softc = NULL; + + /* + * Provide no-op defaults so that the kernel can call any toedev + * function without having to check whether the TOE driver supplied one + * or not. 
+ */ + tod->tod_connect = toedev_connect; + tod->tod_listen_start = toedev_listen_start; + tod->tod_listen_stop = toedev_listen_stop; + tod->tod_input = toedev_input; + tod->tod_rcvd = toedev_rcvd; + tod->tod_output = toedev_output; + tod->tod_send_rst = toedev_output; + tod->tod_send_fin = toedev_output; + tod->tod_pcb_detach = toedev_pcb_detach; + tod->tod_l2_update = toedev_l2_update; + tod->tod_route_redirect = toedev_route_redirect; + tod->tod_syncache_added = toedev_syncache_added; + tod->tod_syncache_removed = toedev_syncache_removed; + tod->tod_syncache_respond = toedev_syncache_respond; + tod->tod_offload_socket = toedev_offload_socket; + tod->tod_ctloutput = toedev_ctloutput; +} + +/* + * Register an active TOE device with the system. This allows it to receive + * notifications from the kernel. + */ +int +register_toedev(struct toedev *tod) +{ + struct toedev *t; + + mtx_lock(&toedev_lock); + TAILQ_FOREACH(t, &toedev_list, link) { + if (t == tod) { + mtx_unlock(&toedev_lock); + return (EEXIST); + } + } + + TAILQ_INSERT_TAIL(&toedev_list, tod, link); + registered_toedevs++; + mtx_unlock(&toedev_lock); + + inp_apply_all(toe_listen_start, tod); + + return (0); +} + +/* + * Remove the TOE device from the global list of active TOE devices. It is the + * caller's responsibility to ensure that the TOE device is quiesced prior to + * this call. 
+ */ +int +unregister_toedev(struct toedev *tod) +{ + struct toedev *t, *t2; + int rc = ENODEV; + + mtx_lock(&toedev_lock); + TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) { + if (t == tod) { + TAILQ_REMOVE(&toedev_list, tod, link); + registered_toedevs--; + rc = 0; + break; + } + } + KASSERT(registered_toedevs >= 0, + ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs)); + mtx_unlock(&toedev_lock); + return (rc); +} + +void +toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, void *tod, void *todctx) +{ + struct socket *lso = inp->inp_socket; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx); +} + +int +toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, + struct tcphdr *th, struct socket **lsop) +{ + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + return (syncache_expand(inc, to, th, lsop, NULL)); +} + +/* + * General purpose check to see if a 4-tuple is in use by the kernel. If a TCP + * header (presumably for an incoming SYN) is also provided, an existing 4-tuple + * in TIME_WAIT may be assassinated freeing it up for re-use. + * + * Note that the TCP header must have been run through tcp_fields_to_host() or + * equivalent. 
+ */ +int +toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp) +{ + struct inpcb *inp; + + if (inc->inc_flags & INC_ISIPV6) + return (ENOSYS); /* XXX: implement */ + + inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport, + inc->inc_laddr, inc->inc_lport, INPLOOKUP_WLOCKPCB, ifp); + if (inp != NULL) { + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */ + if (!tcp_twcheck(inp, NULL, th, NULL, 0)) + return (EADDRINUSE); + } else { + INP_WUNLOCK(inp); + return (EADDRINUSE); + } + } + + return (0); +} + +static void +toe_lle_event(void *arg __unused, struct llentry *lle, int evt) +{ + struct toedev *tod; + struct ifnet *ifp; + struct sockaddr *sa; + uint8_t *lladdr; + uint16_t vtag; + + LLE_WLOCK_ASSERT(lle); + + ifp = lle->lle_tbl->llt_ifp; + sa = L3_ADDR(lle); + + KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, + ("%s: lle_event %d for lle %p but sa %p !INET && !INET6", + __func__, evt, lle, sa)); + + /* + * Not interested if the interface's TOE capability is not enabled. + */ + if ((sa->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) || + (sa->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6))) + return; + + tod = TOEDEV(ifp); + if (tod == NULL) + return; + + vtag = 0xfff; + if (evt != LLENTRY_RESOLVED) { + + /* + * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean + * this entry is going to be deleted. + */ + + lladdr = NULL; + } else { + + KASSERT(lle->la_flags & LLE_VALID, + ("%s: %p resolved but not valid?", __func__, lle)); + + lladdr = (uint8_t *)&lle->ll_addr; +#ifdef VLAN_TAG + VLAN_TAG(ifp, &vtag); +#endif + } + + tod->tod_l2_update(tod, ifp, sa, lladdr, vtag); +} + +/* + * XXX: implement. 
+ */ +static void +toe_route_redirect_event(void *arg __unused, struct rtentry *rt0, + struct rtentry *rt1, struct sockaddr *sa) +{ + + return; +} + +/* + * Returns 0 or EWOULDBLOCK on success (any other value is an error). 0 means + * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's + * tod_l2_update will be called later, when the entry is resolved or times out. + */ +int +toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t *vtag) +{ + struct llentry *lle; + int rc; + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + rc = arpresolve(ifp, NULL, NULL, sa, lladdr, &lle); + break; +#endif +#ifdef INET6 + case AF_INET6: + rc = nd6_storelladdr(ifp, NULL, sa, lladdr, &lle); + break; +#endif + default: + return (EPROTONOSUPPORT); + } + + if (rc == 0) { +#ifdef VLAN_TAG + if (VLAN_TAG(ifp, vtag) != 0) +#endif + *vtag = 0xfff; + } + + return (rc); +} + +void +toe_connect_failed(struct toedev *tod, struct tcpcb *tp, int err) +{ + struct inpcb *inp = tp->t_inpcb; + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_flags & TF_TOE, + ("%s: tp %p not offloaded.", __func__, tp)); + + if (!(inp->inp_flags & INP_DROPPED)) { + if (err == EAGAIN) { + + /* + * Temporary failure during offload, take this PCB back. + * Detach from the TOE driver and do the rest of what + * TCP's pru_connect would have done if the connection + * wasn't offloaded. 
+ */ + + tod->tod_pcb_detach(tod, tp); + KASSERT(!(tp->t_flags & TF_TOE), + ("%s: tp %p still offloaded.", __func__, tp)); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + (void) tcp_output(tp); + } else { + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp = tcp_drop(tp, err); + if (tp == NULL) + INP_WLOCK(inp); /* re-acquire */ + } + } + INP_WLOCK_ASSERT(inp); +} + +static int +toecore_load(void) +{ + + mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF); + TAILQ_INIT(&toedev_list); + + listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start, + toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY); + listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, + toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY); + lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL, + EVENTHANDLER_PRI_ANY); + route_redirect_eh = EVENTHANDLER_REGISTER(route_redirect_event, + toe_route_redirect_event, NULL, EVENTHANDLER_PRI_ANY); + + return (0); +} + +static int +toecore_unload(void) +{ + + mtx_lock(&toedev_lock); + if (!TAILQ_EMPTY(&toedev_list)) { + mtx_unlock(&toedev_lock); + return (EBUSY); + } + + EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh); + EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh); + EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); + EVENTHANDLER_DEREGISTER(route_redirect_event, route_redirect_eh); + + mtx_unlock(&toedev_lock); + mtx_destroy(&toedev_lock); + + return (0); +} + +static int +toecore_mod_handler(module_t mod, int cmd, void *arg) +{ + + if (cmd == MOD_LOAD) + return (toecore_load()); + + if (cmd == MOD_UNLOAD) + return (toecore_unload()); + + return (EOPNOTSUPP); +} + +static moduledata_t mod_data= { + "toecore", + toecore_mod_handler, + 0 +}; + +MODULE_VERSION(toecore, 1); +DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/netinet/toecore.h b/sys/netinet/toecore.h new file mode 100644 index 0000000..a381825 --- /dev/null +++ b/sys/netinet/toecore.h @@ -0,0 
+1,130 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TOE_H_ +#define _NETINET_TOE_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct tcpopt; +struct tcphdr; +struct in_conninfo; + +struct toedev { + TAILQ_ENTRY(toedev) link; /* glue for toedev_list */ + void *tod_softc; /* TOE driver private data */ + + /* + * Active open. If a failure occurs, it is reported back by the driver + * via toe_connect_failed. + */ + int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *, + struct sockaddr *); + + /* Passive open. 
*/ + int (*tod_listen_start)(struct toedev *, struct tcpcb *); + int (*tod_listen_stop)(struct toedev *, struct tcpcb *); + + /* + * The kernel uses this routine to pass on any frame it receives for an + * offloaded connection to the TOE driver. This is an unusual event. + */ + void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *); + + /* + * This is called by the kernel during pru_rcvd for an offloaded TCP + * connection and provides an opportunity for the TOE driver to manage + * its rx window and credits. + */ + void (*tod_rcvd)(struct toedev *, struct tcpcb *); + + /* + * Transmit routine. The kernel calls this to have the TOE driver + * evaluate whether there is data to be transmitted, and transmit it. + */ + int (*tod_output)(struct toedev *, struct tcpcb *); + + /* Immediate teardown: send RST to peer. */ + int (*tod_send_rst)(struct toedev *, struct tcpcb *); + + /* Initiate orderly disconnect by sending FIN to the peer. */ + int (*tod_send_fin)(struct toedev *, struct tcpcb *); + + /* Called to indicate that the kernel is done with this TCP PCB. */ + void (*tod_pcb_detach)(struct toedev *, struct tcpcb *); + + /* + * The kernel calls this once it has information about an L2 entry that + * the TOE driver enquired about previously (via toe_l2_resolve). + */ + void (*tod_l2_update)(struct toedev *, struct ifnet *, + struct sockaddr *, uint8_t *, uint16_t); + + /* XXX. Route has been redirected. */ + void (*tod_route_redirect)(struct toedev *, struct ifnet *, + struct rtentry *, struct rtentry *); + + /* Syncache interaction. 
*/ + void (*tod_syncache_added)(struct toedev *, void *); + void (*tod_syncache_removed)(struct toedev *, void *); + int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *); + void (*tod_offload_socket)(struct toedev *, void *, struct socket *); + + /* TCP socket option */ + void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int); +}; + +#include <sys/eventhandler.h> +typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); +typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); +EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); +EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); + +void init_toedev(struct toedev *); +int register_toedev(struct toedev *); +int unregister_toedev(struct toedev *); + +/* + * General interface for looking up L2 information for an IP address. If an + * answer is not available right away then the TOE driver's tod_l2_update will + * be called later. + */ +int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *, + uint8_t *, uint16_t *); + +void toe_connect_failed(struct toedev *, struct tcpcb *, int); + +void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct inpcb *, void *, void *); +int toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct socket **); + +int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *); +#endif diff --git a/sys/netinet/toedev.h b/sys/netinet/toedev.h deleted file mode 100644 index 7edaca1..0000000 --- a/sys/netinet/toedev.h +++ /dev/null @@ -1,162 +0,0 @@ -/*- - * Copyright (c) 2007, Chelsio Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _NETINET_TOEDEV_H_ -#define _NETINET_TOEDEV_H_ - -#ifndef _KERNEL -#error "no user-serviceable parts inside" -#endif - -extern uint32_t toedev_registration_count; - -/* Parameter values for offload_get_phys_egress(). */ -enum { - TOE_OPEN, - TOE_FAILOVER, -}; - -/* Parameter values for toe_failover(). */ -enum { - TOE_ACTIVE_SLAVE, - TOE_LINK_DOWN, - TOE_LINK_UP, - TOE_RELEASE, - TOE_RELEASE_ALL, -}; - -#define TOENAMSIZ 16 - -/* Get the toedev associated with a ifnet. 
*/ -#define TOEDEV(ifp) ((ifp)->if_llsoftc) - -struct offload_id { - unsigned int id; - unsigned long data; -}; - -struct ifnet; -struct rt_entry; -struct tom_info; -struct sysctl_oid; -struct socket; -struct mbuf; - -struct toedev { - TAILQ_ENTRY(toedev) entry; - char tod_name[TOENAMSIZ]; /* TOE device name */ - unsigned int tod_ttid; /* TOE type id */ - unsigned long tod_flags; /* device flags */ - unsigned int tod_mtu; /* max TX offloaded data */ - unsigned int tod_nconn; /* max # of offloaded - * connections - */ - struct ifnet *tod_lldev; /* first interface */ - const struct tom_info *tod_offload_mod; /* TCP offload module */ - - /* - * This TOE device is capable of offloading the connection for socket so - */ - int (*tod_can_offload)(struct toedev *dev, struct socket *so); - - /* - * Establish a connection to nam using the TOE device dev - */ - int (*tod_connect)(struct toedev *dev, struct socket *so, - struct rtentry *rt, struct sockaddr *nam); - /* - * Send an mbuf down to the toe device - */ - int (*tod_send)(struct toedev *dev, struct mbuf *m); - /* - * Receive an array of mbufs from the TOE device dev - */ - int (*tod_recv)(struct toedev *dev, struct mbuf **m, int n); - /* - * Device specific ioctl interface - */ - int (*tod_ctl)(struct toedev *dev, unsigned int req, void *data); - /* - * Update L2 entry in toedev - */ - void (*tod_arp_update)(struct toedev *dev, struct rtentry *neigh); - /* - * Failover from one toe device to another - */ - void (*tod_failover)(struct toedev *dev, struct ifnet *bond_ifp, - struct ifnet *ndev, int event); - void *tod_priv; /* driver private data */ - void *tod_l2opt; /* optional layer 2 data */ - void *tod_l3opt; /* optional layer 3 data */ - void *tod_l4opt; /* optional layer 4 data */ - void *tod_ulp; /* upper lever protocol */ -}; - -struct tom_info { - TAILQ_ENTRY(tom_info) entry; - int (*ti_attach)(struct toedev *dev, - const struct offload_id *entry); - int (*ti_detach)(struct toedev *dev); - const char *ti_name; - 
const struct offload_id *ti_id_table; -}; - -static __inline void -init_offload_dev(struct toedev *dev) -{ -} - -int register_tom(struct tom_info *t); -int unregister_tom(struct tom_info *t); -int register_toedev(struct toedev *dev, const char *name); -int unregister_toedev(struct toedev *dev); -int activate_offload(struct toedev *dev); -int toe_send(struct toedev *dev, struct mbuf *m); -void toe_arp_update(struct rtentry *rt); -struct ifnet *offload_get_phys_egress(struct ifnet *ifp, - struct socket *so, int context); -int toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n); - -static __inline void -toe_neigh_update(struct ifnet *ifp) -{ -} - -static __inline void -toe_failover(struct ifnet *bond_ifp, struct ifnet *fail_ifp, int event) -{ -} - -static __inline int -toe_enslave(struct ifnet *bond_ifp, struct ifnet *slave_ifp) -{ - return (0); -} - -#endif /* _NETINET_TOEDEV_H_ */ |