author    kmacy <kmacy@FreeBSD.org>  2007-12-16 05:27:26 +0000
committer kmacy <kmacy@FreeBSD.org>  2007-12-16 05:27:26 +0000
commit    f96fe5e169e8cfe06b070663cdaf7a637dfde154 (patch)
tree      4227e68976ae5d008757c5ec68446ae18267d17a /sys/dev/cxgb/ulp
parent    f04336e4cbede2676f151b37d96aacb1b14cb9b2 (diff)
download  FreeBSD-src-f96fe5e169e8cfe06b070663cdaf7a637dfde154.zip
          FreeBSD-src-f96fe5e169e8cfe06b070663cdaf7a637dfde154.tar.gz
Add driver for TCP offload
Sponsored by: Chelsio Inc.
Diffstat (limited to 'sys/dev/cxgb/ulp')
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c       3378
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c    560
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_defs.h           79
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_listen.c        345
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h        185
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_toepcb.h        112
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom.c           500
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom.h           157
-rw-r--r--  sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c    106
9 files changed, 5422 insertions, 0 deletions
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
new file mode 100644
index 0000000..0c796b5
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -0,0 +1,3378 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/priv.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_syncache.h>
+
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <machine/bus.h>
+#include <dev/cxgb/sys/mvec.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+
+
+/*
+ * For ULP connections HW may add headers, e.g., for digests, that aren't part
+ * of the messages sent by the host but that are part of the TCP payload and
+ * therefore consume TCP sequence space. Tx connection parameters that
+ * operate in TCP sequence space are affected by the HW additions and need to
+ * compensate for them to accurately track TCP sequence numbers. This array
+ * contains the compensating extra lengths for ULP packets. It is indexed by
+ * a packet's ULP submode.
+ */
+const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
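+
+/*
+ * A minimal sketch, compiled out, of how the table above enters into
+ * sequence-space accounting: a ULP send occupies its payload length plus
+ * the extra length for its submode.  The helper name is hypothetical.
+ */
+#if 0
+static inline unsigned int
+ulp_seq_space(unsigned int payload_len, unsigned int ulp_submode)
+{
+	return (payload_len + t3_ulp_extra_len[ulp_submode & 3]);
+}
+#endif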
+
+#ifdef notyet
+/*
+ * This mbuf holds a fake header-only TCP segment that we use whenever we
+ * need to exploit SW TCP functionality that expects TCP headers, such as
+ * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
+ * CPUs without locking.
+ */
+static struct mbuf *tcphdr_mbuf __read_mostly;
+#endif
+
+/*
+ * Size of WRs in bytes. Note that we assume all devices we are handling have
+ * the same WR size.
+ */
+static unsigned int wrlen __read_mostly;
+
+/*
+ * The number of WRs needed for an mbuf chain depends on the number of page
+ * fragments it contains and whether it has any payload in its main body.
+ * This maps the length of the gather list represented by an mbuf chain into
+ * the # of necessary WRs.
+ */
+static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;
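+
+/*
+ * A hypothetical initialization fragment, compiled out: assuming each work
+ * request carries at most SGL_PER_WR gather-list entries (a placeholder
+ * constant, not the real limit), an i-entry gather list costs
+ * howmany(i, SGL_PER_WR) work requests.
+ */
+#if 0
+	mbuf_wrs[0] = 1;		/* header-only send */
+	for (i = 1; i < TX_MAX_SEGS; i++)
+		mbuf_wrs[i] = howmany(i, SGL_PER_WR);
+#endif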
+
+/*
+ * Max receive window supported by HW in bytes. Only a small part of it can
+ * be set through option0, the rest needs to be set through RX_DATA_ACK.
+ */
+#define MAX_RCV_WND ((1U << 27) - 1)
+
+/*
+ * Min receive window. We want it to be large enough to accommodate receive
+ * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
+ */
+#define MIN_RCV_WND (24 * 1024U)
+#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
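+/*
+ * Example: an inp_ip_tos of 0x10 (IPTOS_LOWDELAY) yields SO_TOS(so) == 0x4,
+ * assuming M_TOS is wide enough to pass the shifted value through.
+ */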
+
+#define VALIDATE_SEQ 0
+#define VALIDATE_SOCK(so)
+#define DEBUG_WR 0
+
+extern int tcp_do_autorcvbuf;
+extern int tcp_do_autosndbuf;
+extern int tcp_autorcvbuf_max;
+extern int tcp_autosndbuf_max;
+
+static void t3_send_reset(struct toepcb *toep);
+static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
+static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
+static void handle_syncache_event(int event, void *arg);
+
+
+static inline int
+is_t3a(const struct toedev *dev)
+{
+ return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
+}
+
+static void
+dump_toepcb(struct toepcb *toep)
+{
+ DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
+ toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
+ toep->tp_mtu_idx, toep->tp_tid);
+
+ DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
+ toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
+ toep->tp_mss_clamp, toep->tp_flags);
+}
+
+static struct rtentry *
+rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
+{
+ struct rtentry *rt = NULL;
+
+ if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
+ RT_UNLOCK(rt);
+
+ return (rt);
+}
+
+/*
+ * Determine whether to send a CPL message now or defer it. A message is
+ * deferred if the connection is in SYN_SENT since we don't know the TID yet.
+ * For connections in other states the message is sent immediately.
+ * If through_l2t is set the message is subject to ARP processing, otherwise
+ * it is sent directly.
+ */
+static inline void
+send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
+{
+ struct toepcb *toep = tp->t_toe;
+
+
+ if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
+ INP_LOCK(tp->t_inpcb);
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ INP_UNLOCK(tp->t_inpcb);
+ } else if (through_l2t)
+ l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T
+ else
+ cxgb_ofld_send(T3C_DEV(so), m); // send directly
+}
+
+static inline unsigned int
+mkprio(unsigned int cntrl, const struct socket *so)
+{
+ return cntrl;
+}
+
+/*
+ * Populate a TID_RELEASE WR.  The mbuf must already be properly sized.
+ */
+static inline void
+mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
+{
+ struct cpl_tid_release *req;
+
+ m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req = mtod(m, struct cpl_tid_release *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+static inline void
+make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct tx_data_wr *req;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ req = mtod(m, struct tx_data_wr *);
+ m->m_len = sizeof(*req);
+ req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+ req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
+ /* len includes the length of any HW ULP additions */
+ req->len = htonl(len);
+ req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
+ /* V_TX_ULP_SUBMODE sets both the mode and submode */
+ req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
+ V_TX_URG(/* skb_urgent(skb) */ 0 ) |
+ V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
+ (tail ? 0 : 1))));
+ req->sndseq = htonl(tp->snd_nxt);
+ if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
+ req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+ V_TX_CPU_IDX(toep->tp_qset));
+
+		/* Send buffer is in units of 32KB. */
+ if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE)
+ req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
+ else
+ req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
+ toep->tp_flags |= TP_DATASENT;
+ }
+}
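+
+/*
+ * Worked example for the send-buffer encoding above: a 256KB buffer
+ * (262144 bytes) shifted right by 15 gives V_TX_SNDBUF(8), i.e. eight
+ * 32KB units.
+ */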
+
+int
+t3_push_frames(struct socket *so, int req_completion)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ struct mbuf *tail, *m0, *last;
+ struct t3cdev *cdev;
+ struct tom_data *d;
+ int bytes, count, total_bytes;
+ bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
+ segp = segs;
+
+ if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
+ DPRINTF("tcp state=%d\n", tp->t_state);
+ return (0);
+ }
+
+ if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
+ DPRINTF("disconnecting\n");
+
+ return (0);
+ }
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ SOCKBUF_LOCK(&so->so_snd);
+
+ d = TOM_DATA(TOE_DEV(so));
+ cdev = d->cdev;
+ last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
+ total_bytes = 0;
+ DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
+ toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);
+
+ if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) {
+ KASSERT(tail, ("sbdrop error"));
+ last = tail = tail->m_next;
+ }
+
+ if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
+ DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+ }
+
+ toep->tp_m_last = NULL;
+ while (toep->tp_wr_avail && (tail != NULL)) {
+ count = bytes = 0;
+ if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+ }
+ while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
+ && (tail != NULL) && (count < TX_MAX_SEGS)) {
+ bytes += tail->m_len;
+ count++;
+ last = tail;
+ /*
+ * technically an abuse to be using this for a VA
+ * but less gross than defining my own structure
+ * or calling pmap_kextract from here :-|
+ */
+ segp->ds_addr = (bus_addr_t)tail->m_data;
+ segp->ds_len = tail->m_len;
+ DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
+ count, mbuf_wrs[count], tail->m_data, tail->m_len);
+
+ segp++;
+ tail = tail->m_next;
+ }
+ DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
+ toep->tp_wr_avail, count, mbuf_wrs[count], tail);
+ if (tail) {
+ so->so_snd.sb_sndptr = tail;
+ toep->tp_m_last = NULL;
+ } else
+ toep->tp_m_last = so->so_snd.sb_sndptr = last;
+
+ DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
+
+ so->so_snd.sb_sndptroff += bytes;
+ total_bytes += bytes;
+ toep->tp_write_seq += bytes;
+
+
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /*
+ * XXX can drop socket buffer lock here
+ */
+
+ toep->tp_wr_avail -= mbuf_wrs[count];
+ toep->tp_wr_unacked += mbuf_wrs[count];
+
+ make_tx_data_wr(so, m0, bytes, tail);
+ m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
+ m_set_sgl(m0, segs);
+ m_set_sgllen(m0, count);
+ /*
+ * remember credits used
+ */
+ m0->m_pkthdr.csum_data = mbuf_wrs[count];
+ m0->m_pkthdr.len = bytes;
+ if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
+ toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
+ struct work_request_hdr *wr = cplhdr(m0);
+
+ wr->wr_hi |= htonl(F_WR_COMPL);
+ toep->tp_wr_unacked = 0;
+ }
+
+ m0->m_type = MT_DONTFREE;
+ enqueue_wr(toep, m0);
+ DPRINTF("sending offload tx with %d bytes in %d segments\n",
+ bytes, count);
+
+ l2t_send(cdev, m0, toep->tp_l2t);
+ if (toep->tp_wr_avail && (tail != NULL))
+ SOCKBUF_LOCK(&so->so_snd);
+ }
+
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+ return (total_bytes);
+}
+
+/*
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
+ * under any circumstances. We take the easy way out and always queue the
+ * message to the write_queue. We can optimize the case where the queue is
+ * already empty though the optimization is probably not worth it.
+ */
+static void
+close_conn(struct socket *so)
+{
+ struct mbuf *m;
+ struct cpl_close_con_req *req;
+ struct tom_data *d;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp;
+ struct toepcb *toep;
+ unsigned int tid;
+
+
+ INP_LOCK(inp);
+ tp = sototcpcb(so);
+ toep = tp->t_toe;
+
+ if (tp->t_state != TCPS_SYN_SENT)
+ t3_push_frames(so, 1);
+
+ if (toep->tp_flags & TP_FIN_SENT) {
+ INP_UNLOCK(inp);
+ return;
+ }
+
+ tid = toep->tp_tid;
+
+ d = TOM_DATA(toep->tp_toedev);
+
+ m = m_gethdr_nofail(sizeof(*req));
+
+ toep->tp_flags |= TP_FIN_SENT;
+ req = mtod(m, struct cpl_close_con_req *);
+
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+ req->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+ req->rsvd = htonl(toep->tp_write_seq);
+ INP_UNLOCK(inp);
+ /*
+ * XXX - need to defer shutdown while there is still data in the queue
+ *
+ */
+ cxgb_ofld_send(d->cdev, m);
+
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
+ * and send it along.
+ */
+static void
+abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
+{
+ struct cpl_abort_req *req = cplhdr(m);
+
+ req->cmd = CPL_ABORT_NO_RST;
+ cxgb_ofld_send(cdev, m);
+}
+
+/*
+ * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
+ * permitted to return without sending the message in case we cannot allocate
+ * an mbuf.  Returns the number of credits sent.
+ */
+uint32_t
+t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
+{
+ struct mbuf *m;
+ struct cpl_rx_data_ack *req;
+ struct toepcb *toep = tp->t_toe;
+ struct toedev *tdev = toep->tp_toedev;
+
+ m = m_gethdr_nofail(sizeof(*req));
+
+ DPRINTF("returning %u credits to HW\n", credits);
+
+ req = mtod(m, struct cpl_rx_data_ack *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
+ req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
+ m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ return (credits);
+}
+
+
+/*
+ * Set of states for which we should return RX credits.
+ */
+#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
+
+/*
+ * Called after some received data has been read. It returns RX credits
+ * to the HW for the amount of data processed.
+ */
+void
+t3_cleanup_rbuf(struct tcpcb *tp)
+{
+ struct toepcb *toep = tp->t_toe;
+ struct socket *so;
+ struct toedev *dev;
+ int dack_mode, must_send, read;
+ u32 thres, credits, dack = 0;
+
+ if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
+ (tp->t_state == TCPS_FIN_WAIT_2)))
+ return;
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ so = tp->t_inpcb->inp_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
+ toep->tp_copied_seq += read;
+ toep->tp_enqueued_bytes -= read;
+ credits = toep->tp_copied_seq - toep->tp_rcv_wup;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (credits > so->so_rcv.sb_mbmax)
+ printf("copied_seq=%u rcv_wup=%u credits=%u\n",
+ toep->tp_copied_seq, toep->tp_rcv_wup, credits);
+ /*
+ * XXX this won't accurately reflect credit return - we need
+ * to look at the difference between the amount that has been
+ * put in the recv sockbuf and what is there now
+ */
+
+ if (__predict_false(!credits))
+ return;
+
+ dev = toep->tp_toedev;
+ thres = TOM_TUNABLE(dev, rx_credit_thres);
+
+ if (__predict_false(thres == 0))
+ return;
+
+ if (toep->tp_ulp_mode)
+ dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+ else {
+ dack_mode = TOM_TUNABLE(dev, delack);
+ if (__predict_false(dack_mode != toep->tp_delack_mode)) {
+ u32 r = tp->rcv_nxt - toep->tp_delack_seq;
+
+ if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
+ dack = F_RX_DACK_CHANGE |
+ V_RX_DACK_MODE(dack_mode);
+ }
+ }
+
+ /*
+ * For coalescing to work effectively ensure the receive window has
+ * at least 16KB left.
+ */
+ must_send = credits + 16384 >= tp->rcv_wnd;
+
+ if (must_send || credits >= thres)
+ toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
+}
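+
+/*
+ * Worked example for the must_send test above: with a 64KB receive window,
+ * a credit return is forced once 49152 bytes (65536 - 16384) or more are
+ * outstanding, keeping at least 16KB of window available for coalescing.
+ */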
+
+static int
+cxgb_toe_disconnect(struct tcpcb *tp)
+{
+ struct socket *so;
+
+ DPRINTF("cxgb_toe_disconnect\n");
+
+ so = tp->t_inpcb->inp_socket;
+ close_conn(so);
+ return (0);
+}
+
+static int
+cxgb_toe_abort(struct tcpcb *tp)
+{
+ struct toepcb *toep = tp->t_toe;
+
+
+ t3_send_reset(toep);
+
+ /*
+ * unhook from socket
+ */
+ tp->t_flags &= ~TF_TOE;
+ toep->tp_tp = NULL;
+ tp->t_toe = NULL;
+ return (0);
+}
+
+static int
+cxgb_toe_send(struct tcpcb *tp)
+{
+ struct socket *so;
+
+ DPRINTF("cxgb_toe_send\n");
+ dump_toepcb(tp->t_toe);
+
+ so = tp->t_inpcb->inp_socket;
+ t3_push_frames(so, 1);
+ return (0);
+}
+
+static int
+cxgb_toe_rcvd(struct tcpcb *tp)
+{
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ t3_cleanup_rbuf(tp);
+
+ return (0);
+}
+
+static void
+cxgb_toe_detach(struct tcpcb *tp)
+{
+ struct toepcb *toep;
+ /*
+ * XXX how do we handle teardown in the SYN_SENT state?
+ *
+ */
+ INP_INFO_WLOCK(&tcbinfo);
+ toep = tp->t_toe;
+ toep->tp_tp = NULL;
+
+ /*
+ * unhook from socket
+ */
+ tp->t_flags &= ~TF_TOE;
+ tp->t_toe = NULL;
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+
+static struct toe_usrreqs cxgb_toe_usrreqs = {
+ .tu_disconnect = cxgb_toe_disconnect,
+ .tu_abort = cxgb_toe_abort,
+ .tu_send = cxgb_toe_send,
+ .tu_rcvd = cxgb_toe_rcvd,
+ .tu_detach = cxgb_toe_detach,
+ .tu_syncache_event = handle_syncache_event,
+};
+
+
+static void
+__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
+ uint64_t mask, uint64_t val, int no_reply)
+{
+ struct cpl_set_tcb_field *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ req = mtod(m, struct cpl_set_tcb_field *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
+ req->reply = V_NO_REPLY(no_reply);
+ req->cpu_idx = 0;
+ req->word = htons(word);
+ req->mask = htobe64(mask);
+ req->val = htobe64(val);
+
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ send_or_defer(so, tp, m, 0);
+}
+
+static void
+t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
+{
+ struct mbuf *m;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ if (toep == NULL)
+ return;
+
+ if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
+ return;
+
+ m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
+
+ __set_tcb_field(so, m, word, mask, val, 1);
+}
+
+/*
+ * Set one of the t_flags bits in the TCB.
+ */
+static void
+set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
+{
+ t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
+ */
+static void
+t3_set_nagle(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
+ */
+void
+t3_set_keepalive(struct socket *so, int on_off)
+{
+ set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
+}
+
+void
+t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
+{
+ set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
+}
+
+/*
+ * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
+ */
+static void
+t3_set_tos(struct socket *so)
+{
+ t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
+ V_TCB_TOS(SO_TOS(so)));
+}
+
+
+/*
+ * In DDP mode, TP fails to schedule a timer to push RX data to the host when
+ * DDP is disabled (data is delivered to the freelist).  [Note that the peer
+ * should set the PSH bit in the last segment, which would trigger delivery.]
+ * We work around the issue by setting a DDP buffer in a partial placed state,
+ * which guarantees that TP will schedule a timer.
+ */
+#define TP_DDP_TIMER_WORKAROUND_MASK\
+ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
+ ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
+ V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
+#define TP_DDP_TIMER_WORKAROUND_VAL\
+ (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
+ ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
+ 32))
+
+static void
+t3_enable_ddp(struct socket *so, int on)
+{
+ if (on)
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
+ V_TF_DDP_OFF(0));
+ else
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_MASK,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_VAL);
+
+}
+
+
+void
+t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
+{
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
+ V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
+ tag_color);
+}
+
+void
+t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+ unsigned int len)
+{
+ if (buf_idx == 0)
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
+ V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+ V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+ V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
+ V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
+ else
+ t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
+ V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+ V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
+ V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
+ V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
+}
+
+static int
+t3_set_cong_control(struct socket *so, const char *name)
+{
+#ifdef notyet
+ int cong_algo;
+
+ for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
+ if (!strcmp(name, t3_cong_ops[cong_algo].name))
+ break;
+
+ if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
+ return -EINVAL;
+#endif
+ return 0;
+}
+
+int
+t3_get_tcb(struct socket *so)
+{
+ struct cpl_get_tcb *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!m)
+ return (ENOMEM);
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ req = mtod(m, struct cpl_get_tcb *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
+ req->cpuno = htons(toep->tp_qset);
+ if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ else
+ cxgb_ofld_send(T3C_DEV(so), m);
+ return 0;
+}
+
+static inline void
+so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
+{
+ struct toepcb *toep = sototoep(so);
+ toepcb_hold(toep);
+
+ cxgb_insert_tid(d->cdev, d->client, toep, tid);
+}
+
+/**
+ * find_best_mtu - find the entry in the MTU table closest to an MTU
+ * @d: TOM state
+ * @mtu: the target MTU
+ *
+ * Returns the index of the value in the MTU table that is closest to but
+ * does not exceed the target MTU.
+ */
+static unsigned int
+find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+ int i = 0;
+
+ while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+ ++i;
+ return (i);
+}
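+
+/*
+ * Example: with a hypothetical MTU table of {1500, 2000, 9000} and a target
+ * of 4000, the loop above stops at index 1 (2000), the largest entry that
+ * does not exceed the target.
+ */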
+
+static unsigned int
+select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
+{
+ unsigned int idx;
+
+#ifdef notyet
+ struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
+#endif
+ if (tp) {
+ tp->t_maxseg = pmtu - 40;
+ if (tp->t_maxseg < td->mtus[0] - 40)
+ tp->t_maxseg = td->mtus[0] - 40;
+ idx = find_best_mtu(td, tp->t_maxseg + 40);
+
+ tp->t_maxseg = td->mtus[idx] - 40;
+ } else
+ idx = find_best_mtu(td, pmtu);
+
+ return (idx);
+}
+
+void
+t3_release_ddp_resources(struct toepcb *toep)
+{
+ /*
+ * This is a no-op until we have DDP support
+ */
+}
+
+static inline void
+free_atid(struct t3cdev *cdev, unsigned int tid)
+{
+ struct toepcb *toep = cxgb_free_atid(cdev, tid);
+
+ if (toep)
+ toepcb_release(toep);
+}
+
+/*
+ * Release resources held by an offload connection (TID, L2T entry, etc.)
+ */
+static void
+t3_release_offload_resources(struct toepcb *toep)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct toedev *tdev = toep->tp_toedev;
+ struct t3cdev *cdev;
+ unsigned int tid = toep->tp_tid;
+
+ if (!tdev)
+ return;
+
+ cdev = TOEP_T3C_DEV(toep);
+ if (!cdev)
+ return;
+
+ toep->tp_qset = 0;
+ t3_release_ddp_resources(toep);
+
+#ifdef CTRL_SKB_CACHE
+ kfree_skb(CTRL_SKB_CACHE(tp));
+ CTRL_SKB_CACHE(tp) = NULL;
+#endif
+
+ if (toep->tp_wr_avail != toep->tp_wr_max) {
+ purge_wr_queue(toep);
+ reset_wr_list(toep);
+ }
+
+ if (toep->tp_l2t) {
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+ toep->tp_l2t = NULL;
+ }
+ printf("setting toep->tp_tp to NULL\n");
+
+ toep->tp_tp = NULL;
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ tp->t_toe = NULL;
+ tp->t_flags &= ~TF_TOE;
+ }
+
+ if (toep->tp_state == TCPS_SYN_SENT) {
+ free_atid(cdev, tid);
+#ifdef notyet
+ __skb_queue_purge(&tp->out_of_order_queue);
+#endif
+ } else { // we have TID
+ cxgb_remove_tid(cdev, toep, tid);
+ toepcb_release(toep);
+ }
+#if 0
+ log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
+#endif
+}
+
+static void
+install_offload_ops(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ KASSERT(tp->t_toe != NULL, ("toepcb not set"));
+
+ t3_install_socket_ops(so);
+ tp->t_flags |= TF_TOE;
+ tp->t_tu = &cxgb_toe_usrreqs;
+}
+
+/*
+ * Determine the receive window scaling factor given a target max
+ * receive window.
+ */
+static __inline int
+select_rcv_wscale(int space)
+{
+ int wscale = 0;
+
+ if (space > MAX_RCV_WND)
+ space = MAX_RCV_WND;
+
+ if (tcp_do_rfc1323)
+ for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
+ return wscale;
+}
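+
+/*
+ * Example: with RFC 1323 enabled, a 1MB window (1048576 bytes) needs five
+ * halvings before it fits in 16 bits (1048576 >> 5 == 32768), so the
+ * function returns a window scale of 5.
+ */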
+
+/*
+ * Determine the receive window size for a socket.
+ */
+static unsigned int
+select_rcv_wnd(struct socket *so)
+{
+ struct toedev *dev = TOE_DEV(so);
+ struct tom_data *d = TOM_DATA(dev);
+ unsigned int wnd;
+ unsigned int max_rcv_wnd;
+
+ if (tcp_do_autorcvbuf)
+ wnd = tcp_autorcvbuf_max;
+ else
+ wnd = sbspace(&so->so_rcv);
+
+ /* XXX
+ * For receive coalescing to work effectively we need a receive window
+	 * that can accommodate a coalesced segment.
+ */
+ if (wnd < MIN_RCV_WND)
+ wnd = MIN_RCV_WND;
+
+ /* PR 5138 */
+ max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
+ (uint32_t)d->rx_page_size * 23 :
+ MAX_RCV_WND);
+
+ return min(wnd, max_rcv_wnd);
+}
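+
+/*
+ * Example: assuming 4KB receive pages, the T3B clamp above works out to
+ * 23 * 4096 = 94208 bytes; on other revisions the window may grow up to
+ * MAX_RCV_WND.
+ */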
+
+/*
+ * Assign offload parameters to some socket fields. This code is used by
+ * both active and passive opens.
+ */
+static inline void
+init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
+ struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
+
+ SOCK_LOCK_ASSERT(so);
+
+ printf("initializing offload socket\n");
+#ifdef notyet
+ /*
+ * We either need to fix push frames to work with sbcompress
+ * or we need to add this
+ */
+ so->so_rcv.sb_flags |= SB_TOE;
+ so->so_snd.sb_flags |= SB_TOE;
+#endif
+ tp->t_toe = toep;
+ toep->tp_tp = tp;
+ toep->tp_toedev = dev;
+
+ toep->tp_tid = tid;
+ toep->tp_l2t = e;
+ toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
+ toep->tp_wr_unacked = 0;
+ toep->tp_delack_mode = 0;
+
+ toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
+ /*
+ * XXX broken
+ *
+ */
+ tp->rcv_wnd = select_rcv_wnd(so);
+ toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
+ tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
+ toep->tp_qset_idx = 0;
+
+ reset_wr_list(toep);
+ DPRINTF("initialization done\n");
+}
+
+/*
+ * The next two functions calculate the option 0 value for a socket.
+ */
+static inline unsigned int
+calc_opt0h(struct socket *so, int mtu_idx)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ int wscale = select_rcv_wscale(tp->rcv_wnd);
+
+ return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
+ V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
+ V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
+}
+
+static inline unsigned int
+calc_opt0l(struct socket *so, int ulp_mode)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ unsigned int val;
+
+ val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
+ V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
+
+ DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
+ return (val);
+}
+
+static inline unsigned int
+calc_opt2(const struct socket *so, struct toedev *dev)
+{
+ int flv_valid;
+
+ flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
+
+ return V_FLAVORS_VALID(flv_valid) |
+ V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
+}
+#if 0
+(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
+#endif
+
+static void
+mk_act_open_req(struct socket *so, struct mbuf *m,
+ unsigned int atid, const struct l2t_entry *e)
+{
+ struct cpl_act_open_req *req;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toepcb *toep = tp->t_toe;
+ struct toedev *tdev = TOE_DEV(so);
+
+ m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));
+
+ req = mtod(m, struct cpl_act_open_req *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
+ req->local_port = inp->inp_lport;
+ req->peer_port = inp->inp_fport;
+ memcpy(&req->local_ip, &inp->inp_laddr, 4);
+ memcpy(&req->peer_ip, &inp->inp_faddr, 4);
+ DPRINTF("connect smt_idx=%d\n", e->smt_idx);
+ req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
+ V_TX_CHANNEL(e->smt_idx));
+ req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
+ req->params = 0;
+ req->opt2 = htonl(calc_opt2(so, tdev));
+}
+
+
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
+ */
+static int
+act_open_rpl_status_to_errno(int status)
+{
+ switch (status) {
+ case CPL_ERR_CONN_RESET:
+ return (ECONNREFUSED);
+ case CPL_ERR_ARP_MISS:
+ return (EHOSTUNREACH);
+ case CPL_ERR_CONN_TIMEDOUT:
+ return (ETIMEDOUT);
+ case CPL_ERR_TCAM_FULL:
+ return (ENOMEM);
+ case CPL_ERR_CONN_EXIST:
+ log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+ return (EADDRINUSE);
+ default:
+ return (EIO);
+ }
+}
+
+static void
+fail_act_open(struct toepcb *toep, int errno)
+{
+ struct tcpcb *tp = toep->tp_tp;
+
+ t3_release_offload_resources(toep);
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ tcp_drop(tp, errno);
+ }
+
+#ifdef notyet
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+#endif
+}
+
+/*
+ * Handle active open failures.
+ */
+static void
+active_open_failed(struct toepcb *toep, struct mbuf *m)
+{
+ struct cpl_act_open_rpl *rpl = cplhdr(m);
+ struct inpcb *inp;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ if (toep->tp_tp == NULL)
+ goto done;
+
+ inp = toep->tp_tp->t_inpcb;
+ INP_LOCK(inp);
+
+	/*
+	 * Don't handle connection retry for now
+	 */
+#ifdef notyet
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (rpl->status == CPL_ERR_CONN_EXIST &&
+ icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
+ icsk->icsk_retransmit_timer.function = act_open_retry_timer;
+ sk_reset_timer(so, &icsk->icsk_retransmit_timer,
+ jiffies + HZ / 2);
+ } else
+#endif
+ fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
+ INP_UNLOCK(inp);
+done:
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ m_free(m);
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int
+act_open_has_tid(int status)
+{
+ return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+ status != CPL_ERR_ARP_MISS;
+}
+
+/*
+ * Process an ACT_OPEN_RPL CPL message.
+ */
+static int
+do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct cpl_act_open_rpl *rpl = cplhdr(m);
+
+ if (cdev->type != T3A && act_open_has_tid(rpl->status))
+ cxgb_queue_tid_release(cdev, GET_TID(rpl));
+
+ active_open_failed(toep, m);
+ return (0);
+}
+
+/*
+ * Handle an ARP failure for an active open. XXX purge ofo queue
+ *
+ * XXX badly broken for crossed SYNs as the ATID is no longer valid.
+ * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
+ * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
+ * free the atid. Hmm.
+ */
+#ifdef notyet
+static void
+act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
+{
+ struct toepcb *toep = m_get_toep(m);
+ struct tcpcb *tp = toep->tp_tp;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = toeptoso(toep);
+
+ INP_LOCK(inp);
+ if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
+		fail_act_open(toep, EHOSTUNREACH);
+ printf("freeing %p\n", m);
+
+ m_free(m);
+ }
+ INP_UNLOCK(inp);
+}
+#endif
+/*
+ * Send an active open request.
+ */
+int
+t3_connect(struct toedev *tdev, struct socket *so,
+ struct rtentry *rt, struct sockaddr *nam)
+{
+ struct mbuf *m;
+ struct l2t_entry *e;
+ struct tom_data *d = TOM_DATA(tdev);
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toepcb *toep; /* allocated by init_offload_socket */
+
+ int atid;
+
+ toep = toepcb_alloc();
+ if (toep == NULL)
+ goto out_err;
+
+ if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
+ goto out_err;
+
+ e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
+ if (!e)
+ goto free_tid;
+
+ INP_LOCK_ASSERT(inp);
+	m = m_gethdr(M_WAITOK, MT_DATA);
+
+#if 0
+ m->m_toe.mt_toepcb = tp->t_toe;
+ set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
+#endif
+ SOCK_LOCK(so);
+
+ init_offload_socket(so, tdev, atid, e, rt, toep);
+
+ install_offload_ops(so);
+
+ mk_act_open_req(so, m, atid, e);
+ SOCK_UNLOCK(so);
+
+ soisconnecting(so);
+ toep = tp->t_toe;
+ m_set_toep(m, tp->t_toe);
+
+ printf("sending off request\n");
+
+ toep->tp_state = TCPS_SYN_SENT;
+ l2t_send(d->cdev, (struct mbuf *)m, e);
+
+ if (toep->tp_ulp_mode)
+ t3_enable_ddp(so, 0);
+ return (0);
+
+free_tid:
+ printf("failing connect - free atid\n");
+
+ free_atid(d->cdev, atid);
+out_err:
+ printf("return ENOMEM\n");
+ return (ENOMEM);
+}
+
+/*
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
+ * not send multiple ABORT_REQs for the same connection and also that we do
+ * not try to send a message after the connection has closed.
+ */
+static void
+t3_send_reset(struct toepcb *toep)
+{
+
+ struct cpl_abort_req *req;
+ unsigned int tid = toep->tp_tid;
+ int mode = CPL_ABORT_SEND_RST;
+ struct tcpcb *tp = toep->tp_tp;
+ struct toedev *tdev = toep->tp_toedev;
+ struct socket *so = NULL;
+ struct mbuf *m;
+
+ if (tp) {
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ so = toeptoso(toep);
+ }
+
+ if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
+ tdev == NULL))
+ return;
+ toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
+
+ /* Purge the send queue so we don't send anything after an abort. */
+ if (so)
+ sbflush(&so->so_snd);
+ if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
+ mode |= CPL_ABORT_POST_CLOSE_REQ;
+
+ m = m_gethdr_nofail(sizeof(*req));
+ m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
+ set_arp_failure_handler(m, abort_arp_failure);
+
+ req = mtod(m, struct cpl_abort_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+ req->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
+ req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
+ req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
+ req->cmd = mode;
+ if (tp && (tp->t_state == TCPS_SYN_SENT))
+ mbufq_tail(&toep->out_of_order_queue, m); // defer
+ else
+ l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
+}
+
+static int
+t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct inpcb *inp;
+ int error, optval;
+
+ if (sopt->sopt_name == IP_OPTIONS)
+ return (ENOPROTOOPT);
+
+ if (sopt->sopt_name != IP_TOS)
+ return (EOPNOTSUPP);
+
+ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+
+ if (error)
+ return (error);
+
+	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
+ return (EPERM);
+
+ inp = sotoinpcb(so);
+ inp->inp_ip_tos = optval;
+
+ t3_set_tos(so);
+
+ return (0);
+}
+
+static int
+t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int err = 0;
+ size_t copied;
+
+ if (sopt->sopt_name != TCP_CONGESTION &&
+ sopt->sopt_name != TCP_NODELAY)
+ return (EOPNOTSUPP);
+
+ if (sopt->sopt_name == TCP_CONGESTION) {
+ char name[TCP_CA_NAME_MAX];
+ int optlen = sopt->sopt_valsize;
+ struct tcpcb *tp;
+
+ if (optlen < 1)
+ return (EINVAL);
+
+ err = copyinstr(sopt->sopt_val, name,
+ min(TCP_CA_NAME_MAX - 1, optlen), &copied);
+ if (err)
+ return (err);
+ if (copied < 1)
+ return (EINVAL);
+
+ tp = sototcpcb(so);
+ /*
+ * XXX I need to revisit this
+ */
+ if ((err = t3_set_cong_control(so, name)) == 0) {
+#ifdef notyet
+ tp->t_cong_control = strdup(name, M_CXGB);
+#endif
+ } else
+ return (err);
+ } else {
+ int optval, oldval;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ err = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+
+ if (err)
+ return (err);
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+
+ INP_LOCK(inp);
+
+ oldval = tp->t_flags;
+ if (optval)
+ tp->t_flags |= TF_NODELAY;
+ else
+ tp->t_flags &= ~TF_NODELAY;
+ INP_UNLOCK(inp);
+
+ if (oldval != tp->t_flags)
+ t3_set_nagle(so);
+
+ }
+
+ return (0);
+}
+
+static int
+t3_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int err;
+
+ if (sopt->sopt_level != IPPROTO_TCP)
+ err = t3_ip_ctloutput(so, sopt);
+ else
+ err = t3_tcp_ctloutput(so, sopt);
+
+ if (err != EOPNOTSUPP)
+ return (err);
+
+ return tcp_ctloutput(so, sopt);
+}
+
+/*
+ * Process new data received for a connection.
+ */
+static void
+new_rx_data(struct toepcb *toep, struct mbuf *m)
+{
+ struct cpl_rx_data *hdr = cplhdr(m);
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so = toeptoso(toep);
+ int len = be16toh(hdr->len);
+
+ INP_LOCK(tp->t_inpcb);
+
+#ifdef notyet
+ if (__predict_false(sk_no_receive(sk))) {
+ handle_excess_rx(so, skb);
+ return;
+ }
+
+ if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
+ handle_ddp_data(so, skb);
+
+ TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
+ TCP_SKB_CB(skb)->flags = 0;
+ skb_ulp_mode(skb) = 0; /* for iSCSI */
+#endif
+#if VALIDATE_SEQ
+ if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
+ printk(KERN_ERR
+ "%s: TID %u: Bad sequence number %u, expected %u\n",
+ TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt);
+ __kfree_skb(skb);
+ return;
+ }
+#endif
+ m_adj(m, sizeof(*hdr));
+
+#ifdef notyet
+ /*
+ * We don't handle urgent data yet
+ */
+ if (__predict_false(hdr->urg))
+ handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
+ if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
+ tp->urg_seq - tp->rcv_nxt < skb->len))
+ tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
+ tp->rcv_nxt];
+#endif
+ if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
+ toep->tp_delack_mode = hdr->dack_mode;
+ toep->tp_delack_seq = tp->rcv_nxt;
+ }
+
+ DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);
+
+ if (len < m->m_pkthdr.len)
+ m->m_pkthdr.len = m->m_len = len;
+
+ tp->rcv_nxt += m->m_pkthdr.len;
+ tp->t_rcvtime = ticks;
+ toep->tp_enqueued_bytes += m->m_pkthdr.len;
+#ifdef T3_TRACE
+ T3_TRACE2(TIDTB(sk),
+ "new_rx_data: seq 0x%x len %u",
+ TCP_SKB_CB(skb)->seq, skb->len);
+#endif
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sb_notify(&so->so_rcv))
+ DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
+
+ sbappend_locked(&so->so_rcv, m);
+	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
+	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
+	    so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
+
+ INP_UNLOCK(tp->t_inpcb);
+ DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
+ so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
+
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
+/*
+ * Handler for RX_DATA CPL messages.
+ */
+static int
+do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
+
+ new_rx_data(toep, m);
+
+ return (0);
+}
+
+static void
+new_rx_data_ddp(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_data_ddp *hdr;
+ unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(so, m);
+ return;
+ }
+#endif
+ tp = sototcpcb(so);
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->u.ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
+ "hdr seq 0x%x len %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
+ ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
+ T3_TRACE1(TIDTB(sk),
+ "new_rx_data_ddp: ddp_report 0x%x",
+ ddp_report);
+#endif
+
+ ddp_len = ntohs(hdr->len);
+ rcv_nxt = ntohl(hdr->seq) + ddp_len;
+
+ /*
+ * Overload to store old rcv_next
+ */
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+
+ /*
+	 * Store the length in m->m_len.  We are changing the meaning of
+	 * m->m_len here, so we need to be very careful that nothing from now
+	 * on interprets the length of this packet the usual way.
+ */
+ m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;
+
+ /*
+	 * Figure out where the new data was placed in the buffer and store it
+	 * in 'when'.  Assumes the buffer offset starts at 0; the consumer needs
+	 * to account for the page pod's pg_offset.
+ */
+ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
+#ifdef notyet
+ TCP_SKB_CB(skb)->when = end_offset - skb->len;
+
+ /*
+ * We store in mac.raw the address of the gather list where the
+ * placement happened.
+ */
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ bsp->cur_offset = end_offset;
+
+ /*
+ * Bit 0 of flags stores whether the DDP buffer is completed.
+ * Note that other parts of the code depend on this being in bit 0.
+ */
+ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
+#if 0
+ TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */
+#endif
+ panic("spurious ddp completion");
+ } else {
+ m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
+ if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+ }
+
+ if (bsp->flags & DDP_BF_NOCOPY) {
+ m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ }
+
+ if (ddp_report & F_DDP_PSH)
+ m->m_pkthdr.csum_flags |= DDP_BF_PSH;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
+#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
+ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
+ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
+ F_DDP_INVALID_PPOD)
+
+/*
+ * Handler for RX_DATA_DDP CPL messages.
+ */
+static int
+do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+ const struct cpl_rx_data_ddp *hdr = cplhdr(m);
+
+ VALIDATE_SOCK(so);
+
+ if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
+ log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
+ GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
+ return CPL_RET_BUF_DONE;
+ }
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ new_rx_data_ddp(so, m);
+ return (0);
+}
+
+static void
+process_ddp_complete(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_ddp_complete *hdr;
+ unsigned int ddp_report, buf_idx, when;
+
+#ifdef notyet
+ if (unlikely(sk_no_receive(sk))) {
+ handle_excess_rx(sk, skb);
+ return;
+ }
+#endif
+ q = &toep->tp_ddp_state;
+ hdr = cplhdr(m);
+ ddp_report = ntohl(hdr->ddp_report);
+ buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+ bsp = &q->buf_state[buf_idx];
+
+ when = bsp->cur_offset;
+ m->m_len = G_DDP_OFFSET(ddp_report) - when;
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report 0x%x offset %u, len %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report), skb->len);
+#endif
+
+ bsp->cur_offset += m->m_len;
+
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1; /* flip buffers */
+
+#ifdef T3_TRACE
+ T3_TRACE4(TIDTB(sk),
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report));
+#endif
+#if 0
+ skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+ m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
+ if (bsp->flags & DDP_BF_NOCOPY)
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+ tp->rcv_nxt += m->m_len;
+
+ tp->t_rcvtime = ticks;
+ sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk, 0);
+#endif
+}
+
+/*
+ * Handler for RX_DDP_COMPLETE CPL messages.
+ */
+static int
+do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+ process_ddp_complete(so, m);
+ return (0);
+}
+
+/*
+ * Move a socket to TIME_WAIT state. We need to make some adjustments to the
+ * socket state before calling tcp_time_wait to comply with its expectations.
+ */
+static void
+enter_timewait(struct socket *so)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Bump rcv_nxt for the peer FIN. We don't do this at the time we
+ * process peer_close because we don't want to carry the peer FIN in
+ * the socket's receive queue and if we increment rcv_nxt without
+ * having the FIN in the receive queue we'll confuse facilities such
+	 * as FIONREAD.
+ */
+ tp->rcv_nxt++;
+
+ tp->ts_recent_age = 0; /* defeat recycling */
+ tp->t_srtt = 0; /* defeat tcp_update_metrics */
+ tcp_twstart(tp);
+}
+
+/*
+ * Handle a peer FIN.
+ */
+static void
+do_peer_fin(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ int keep = 0, dead = (so->so_state & SS_NOFDREF);
+
+ DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);
+
+#ifdef T3_TRACE
+ T3_TRACE0(TIDTB(sk),"do_peer_fin:");
+#endif
+
+ if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
+ printf("abort_pending set\n");
+
+ goto out;
+ }
+
+#ifdef notyet
+ if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
+ keep = handle_peer_close_data(so, skb);
+ if (keep < 0)
+ return;
+ }
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(so, SOCK_DONE);
+#endif
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
+ socantrcvmore(so);
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+ case TCPS_FIN_WAIT_2:
+ /*
+ * If we've sent an abort_req we must have sent it too late,
+ * HW will send us a reply telling us so, and this peer_close
+ * is really the last message for this connection and needs to
+ * be treated as an abort_rpl, i.e., transition the connection
+		 * to TCPS_CLOSED (note that the host stack does this at the
+ * time of generating the RST but we must wait for HW).
+ * Otherwise we enter TIME_WAIT.
+ */
+ t3_release_offload_resources(toep);
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ tp = tcp_close(tp);
+ } else
+ enter_timewait(so);
+ break;
+ default:
+ log(LOG_ERR,
+ "%s: TID %u received PEER_CLOSE in bad state %d\n",
+ TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+
+ if (!dead) {
+ DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
+
+ sorwakeup(so);
+ sowwakeup(so);
+ wakeup(&so->so_timeo);
+#ifdef notyet
+ sk->sk_state_change(sk);
+
+ /* Do not send POLL_HUP for half duplex close. */
+ if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(so, 1, POLL_HUP);
+ else
+ sk_wake_async(so, 1, POLL_IN);
+#endif
+ }
+out:
+ if (!keep)
+ m_free(m);
+}
+
+/*
+ * Handler for PEER_CLOSE CPL messages.
+ */
+static int
+do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+
+ do_peer_fin(so, m);
+ return (0);
+}
+
+static void
+process_close_con_rpl(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct cpl_close_con_rpl *rpl = cplhdr(m);
+ struct toepcb *toep = tp->t_toe;
+
+ tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
+
+ DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
+ !!(so->so_state & SS_NOFDREF));
+ if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
+ goto out;
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ switch (tp->t_state) {
+ case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
+ t3_release_offload_resources(toep);
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ tp = tcp_close(tp);
+
+ } else
+ enter_timewait(so);
+ break;
+ case TCPS_LAST_ACK:
+ /*
+ * In this state we don't care about pending abort_rpl.
+ * If we've sent abort_req it was post-close and was sent too
+ * late, this close_con_rpl is the actual last message.
+ */
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ break;
+ case TCPS_FIN_WAIT_1:
+#ifdef notyet
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ soisdisconnecting(so);
+
+ if ((so->so_state & SS_NOFDREF) == 0) {
+ /*
+ * Wake up lingering close
+ */
+ sowwakeup(so);
+ sorwakeup(so);
+ wakeup(&so->so_timeo);
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
+ (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
+ tp = tcp_drop(tp, 0);
+ }
+
+ break;
+ default:
+ log(LOG_ERR,
+ "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
+ TOE_DEV(so)->tod_name, toep->tp_tid,
+ tp->t_state);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+out:
+ m_free(m);
+}
+
+/*
+ * Handler for CLOSE_CON_RPL CPL messages.
+ */
+static int
+do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
+ void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+
+ process_close_con_rpl(so, m);
+ return (0);
+}
+
+/*
+ * Process abort replies. We only process these messages if we anticipate
+ * them as the coordination between SW and HW in this area is somewhat lacking
+ * and sometimes we get ABORT_RPLs after we are done with the connection that
+ * originated the ABORT_REQ.
+ */
+static void
+process_abort_rpl(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+#ifdef T3_TRACE
+ T3_TRACE1(TIDTB(sk),
+ "process_abort_rpl: GTS rpl pending %d",
+ sock_flag(sk, ABORT_RPL_PENDING));
+#endif
+ INP_LOCK(tp->t_inpcb);
+
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ /*
+ * XXX panic on tcpdrop
+ */
+ if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
+ toep->tp_flags |= TP_ABORT_RPL_RCVD;
+ else {
+ toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
+ if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
+ !is_t3a(TOE_DEV(so))) {
+ if (toep->tp_flags & TP_ABORT_REQ_RCVD)
+ panic("TP_ABORT_REQ_RCVD set");
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(tp->t_inpcb);
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ }
+ }
+ }
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+
+ m_free(m);
+}
+
+/*
+ * Handle an ABORT_RPL_RSS CPL message.
+ */
+static int
+do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct socket *so;
+ struct cpl_abort_rpl_rss *rpl = cplhdr(m);
+ struct toepcb *toep;
+
+ /*
+ * Ignore replies to post-close aborts indicating that the abort was
+ * requested too late. These connections are terminated when we get
+ * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
+ * arrives the TID is either no longer used or it has been recycled.
+ */
+ if (rpl->status == CPL_ERR_ABORT_FAILED) {
+discard:
+ m_free(m);
+ return (0);
+ }
+
+ toep = (struct toepcb *)ctx;
+
+ /*
+ * Sometimes we've already closed the socket, e.g., a post-close
+ * abort races with ABORT_REQ_RSS, the latter frees the socket
+ * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
+ * but FW turns the ABORT_REQ into a regular one and so we get
+ * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
+ */
+ if (!toep)
+ goto discard;
+
+ if (toep->tp_tp == NULL) {
+ printf("removing tid for abort\n");
+ cxgb_remove_tid(cdev, toep, toep->tp_tid);
+ if (toep->tp_l2t)
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+
+ toepcb_release(toep);
+ goto discard;
+ }
+
+ printf("toep=%p\n", toep);
+ printf("tp=%p\n", toep->tp_tp);
+
+ so = toeptoso(toep); /* <- XXX panic */
+ toepcb_hold(toep);
+ process_abort_rpl(so, m);
+ toepcb_release(toep);
+ return (0);
+}
+
+/*
+ * Convert the status code of an ABORT_REQ into an errno value.  Also
+ * indicate whether RST should be sent in response.
+ */
+static int
+abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
+{
+ struct tcpcb *tp = sototcpcb(so);
+
+ switch (abort_reason) {
+ case CPL_ERR_BAD_SYN:
+#if 0
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
+#endif
+ case CPL_ERR_CONN_RESET:
+ // XXX need to handle SYN_RECV due to crossed SYNs
+ return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
+ case CPL_ERR_XMIT_TIMEDOUT:
+ case CPL_ERR_PERSIST_TIMEDOUT:
+ case CPL_ERR_FINWAIT2_TIMEDOUT:
+ case CPL_ERR_KEEPALIVE_TIMEDOUT:
+#if 0
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
+#endif
+ return (ETIMEDOUT);
+ default:
+ return (EIO);
+ }
+}
+
+static inline void
+set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
+{
+ struct cpl_abort_rpl *rpl = cplhdr(m);
+
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+ rpl->wr.wr_lo = htonl(V_WR_TID(tid));
+ m->m_len = m->m_pkthdr.len = sizeof(*rpl);
+
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+ rpl->cmd = cmd;
+}
+
+static void
+send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
+{
+ struct mbuf *reply_mbuf;
+ struct cpl_abort_req_rss *req = cplhdr(m);
+
+ reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
+	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
+ set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+ m_free(m);
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is negative advice.
+ */
+static inline int
+is_neg_adv_abort(unsigned int status)
+{
+ return status == CPL_ERR_RTX_NEG_ADVICE ||
+ status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static void
+send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
+{
+ struct mbuf *reply_mbuf;
+ struct cpl_abort_req_rss *req = cplhdr(m);
+
+ reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!reply_mbuf) {
+		/* Defer the reply.  Stick rst_status into req->status. */
+ req->status = rst_status;
+ t3_defer_reply(m, tdev, send_deferred_abort_rpl);
+ return;
+ }
+
+ m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
+ set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
+ m_free(m);
+
+ /*
+ * XXX need to sync with ARP as for SYN_RECV connections we can send
+ * these messages while ARP is pending. For other connection states
+ * it's not a problem.
+ */
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+}
+
+#ifdef notyet
+static void
+cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
+{
+ UNIMPLEMENTED();
+#ifdef notyet
+ struct request_sock *req = child->sk_user_data;
+
+ inet_csk_reqsk_queue_removed(parent, req);
+ synq_remove(tcp_sk(child));
+ __reqsk_free(req);
+ child->sk_user_data = NULL;
+#endif
+}
+
+
+/*
+ * Performs the actual work to abort a SYN_RECV connection.
+ */
+static void
+do_abort_syn_rcv(struct socket *child, struct socket *parent)
+{
+ struct tcpcb *parenttp = sototcpcb(parent);
+ struct tcpcb *childtp = sototcpcb(child);
+
+ /*
+ * If the server is still open we clean up the child connection,
+ * otherwise the server already did the clean up as it was purging
+ * its SYN queue and the skb was just sitting in its backlog.
+ */
+ if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
+ cleanup_syn_rcv_conn(child, parent);
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(childtp->t_inpcb);
+ t3_release_offload_resources(childtp->t_toe);
+ childtp = tcp_close(childtp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ if (childtp)
+ INP_UNLOCK(childtp->t_inpcb);
+ }
+}
+#endif
+
+/*
+ * Handle abort requests for a SYN_RECV connection. These need extra work
+ * because the socket is on its parent's SYN queue.
+ */
+static int
+abort_syn_rcv(struct socket *so, struct mbuf *m)
+{
+ UNIMPLEMENTED();
+#ifdef notyet
+ struct socket *parent;
+ struct toedev *tdev = TOE_DEV(so);
+ struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
+ struct socket *oreq = so->so_incomp;
+ struct t3c_tid_entry *t3c_stid;
+ struct tid_info *t;
+
+ if (!oreq)
+ return -1; /* somehow we are not on the SYN queue */
+
+ t = &(T3C_DATA(cdev))->tid_maps;
+ t3c_stid = lookup_stid(t, oreq->ts_recent);
+ parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
+
+ SOCK_LOCK(parent);
+ do_abort_syn_rcv(so, parent);
+ send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
+ SOCK_UNLOCK(parent);
+#endif
+ return (0);
+}
+
+/*
+ * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
+ * request except that we need to reply to it.
+ */
+static void
+process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
+{
+ int rst_status = CPL_ABORT_NO_RST;
+ const struct cpl_abort_req_rss *req = cplhdr(m);
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ INP_LOCK(tp->t_inpcb);
+ if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
+ toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
+ m_free(m);
+ goto skip;
+ }
+
+ toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
+ /*
+ * Three cases to consider:
+ * a) We haven't sent an abort_req; close the connection.
+ * b) We have sent a post-close abort_req that will get to TP too late
+ * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
+ * be ignored and the connection should be closed now.
+ * c) We have sent a regular abort_req that will get to TP too late.
+ * That will generate an abort_rpl with status 0, wait for it.
+ */
+ if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
+ (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
+ so->so_error = abort_status_to_errno(so, req->status,
+ &rst_status);
+#if 0
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+#endif
+ /*
+ * SYN_RECV needs special processing. If abort_syn_rcv()
+ * returns 0 it has taken care of the abort.
+ */
+ if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
+ goto skip;
+
+ t3_release_offload_resources(toep);
+ tp = tcp_close(tp);
+ }
+ if (tp)
+ INP_UNLOCK(tp->t_inpcb);
+ send_abort_rpl(m, tdev, rst_status);
+ return;
+
+skip:
+ INP_UNLOCK(tp->t_inpcb);
+}
+
+/*
+ * Handle an ABORT_REQ_RSS CPL message.
+ */
+static int
+do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ const struct cpl_abort_req_rss *req = cplhdr(m);
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct socket *so;
+ struct inpcb *inp;
+
+ if (is_neg_adv_abort(req->status)) {
+ m_free(m);
+ return (0);
+ }
+
+ printf("aborting tid=%d\n", toep->tp_tid);
+
+ if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
+ cxgb_remove_tid(cdev, toep, toep->tp_tid);
+ toep->tp_flags |= TP_ABORT_REQ_RCVD;
+ printf("sending abort rpl\n");
+
+ send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
+ printf("sent\n");
+ if (toep->tp_l2t)
+ l2t_release(L2DATA(cdev), toep->tp_l2t);
+
+ /*
+ * Unhook
+ */
+ toep->tp_tp->t_toe = NULL;
+ toep->tp_tp->t_flags &= ~TF_TOE;
+ toep->tp_tp = NULL;
+ /*
+ * XXX need to call syncache_chkrst - but we don't
+ * have a way of doing that yet
+ */
+ toepcb_release(toep);
+ printf("abort for unestablished connection :-(\n");
+ return (0);
+ }
+ if (toep->tp_tp == NULL) {
+ printf("disconnected toepcb\n");
+ /* should be freed momentarily */
+ return (0);
+ }
+
+ so = toeptoso(toep);
+ inp = sotoinpcb(so);
+
+ VALIDATE_SOCK(so);
+ toepcb_hold(toep);
+ INP_INFO_WLOCK(&tcbinfo);
+ process_abort_req(so, m, TOE_DEV(so));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ toepcb_release(toep);
+ return (0);
+}
+#ifdef notyet
+static void
+pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
+{
+ struct toedev *tdev = TOE_DEV(parent);
+
+ do_abort_syn_rcv(child, parent);
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
+ struct cpl_pass_accept_rpl *rpl = cplhdr(m);
+
+ rpl->opt0h = htonl(F_TCAM_BYPASS);
+ rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ } else
+ m_free(m);
+}
+#endif
+static void
+handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
+{
+ UNIMPLEMENTED();
+
+#ifdef notyet
+ struct t3cdev *cdev;
+ struct socket *parent;
+ struct socket *oreq;
+ struct t3c_tid_entry *t3c_stid;
+ struct tid_info *t;
+ struct tcpcb *otp, *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ /*
+ * If the connection is being aborted due to the parent listening
+ * socket going away there's nothing to do, the ABORT_REQ will close
+ * the connection.
+ */
+ if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
+ m_free(m);
+ return;
+ }
+
+ oreq = so->so_incomp;
+ otp = sototcpcb(oreq);
+
+ cdev = T3C_DEV(so);
+ t = &(T3C_DATA(cdev))->tid_maps;
+ t3c_stid = lookup_stid(t, otp->ts_recent);
+ parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
+
+ SOCK_LOCK(parent);
+ pass_open_abort(so, parent, m);
+ SOCK_UNLOCK(parent);
+#endif
+}
+
+/*
+ * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
+ * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
+ * connection.
+ */
+static void
+pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
+{
+
+#ifdef notyet
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+ BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
+#endif
+ handle_pass_open_arp_failure(m_get_socket(m), m);
+}
+
+/*
+ * Populate a reject CPL_PASS_ACCEPT_RPL WR.
+ */
+static void
+mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
+{
+ struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
+ struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
+ unsigned int tid = GET_TID(req);
+
+ m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
+ rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
+ rpl->opt0h = htonl(F_TCAM_BYPASS);
+ rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+ rpl->opt2 = 0;
+ rpl->rsvd = rpl->opt2; /* workaround for HW bug */
+}
+
+/*
+ * Send a deferred reject to an accept request.
+ */
+static void
+reject_pass_request(struct toedev *tdev, struct mbuf *m)
+{
+ struct mbuf *reply_mbuf;
+
+ reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
+ mk_pass_accept_rpl(reply_mbuf, m);
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
+ m_free(m);
+}
+
+static void
+handle_syncache_event(int event, void *arg)
+{
+ struct toepcb *toep = arg;
+
+ switch (event) {
+ case SC_ENTRY_PRESENT:
+ /*
+ * entry already exists - free toepcb
+ * and l2t
+ */
+ printf("syncache entry present\n");
+ toepcb_release(toep);
+ break;
+ case SC_DROP:
+ /*
+ * The syncache has given up on this entry
+ * either it timed out, or it was evicted
+ * we need to explicitly release the tid
+ */
+ printf("syncache entry dropped\n");
+ toepcb_release(toep);
+ break;
+ default:
+ log(LOG_ERR, "unknown syncache event %d\n", event);
+ break;
+ }
+}
+
+static void
+syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
+{
+ struct in_conninfo inc;
+ struct tcpopt to;
+ struct tcphdr th;
+ struct inpcb *inp;
+ int mss, wsf, sack, ts;
+
+ bzero(&to, sizeof(struct tcpopt));
+ inp = sotoinpcb(lso);
+
+ /*
+ * Fill out information for entering us into the syncache
+ */
+ inc.inc_fport = th.th_sport = req->peer_port;
+ inc.inc_lport = th.th_dport = req->local_port;
+ toep->tp_iss = th.th_seq = req->rcv_isn;
+ th.th_flags = TH_SYN;
+
+ toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
+
+ inc.inc_isipv6 = 0;
+ inc.inc_len = 0;
+ inc.inc_faddr.s_addr = req->peer_ip;
+ inc.inc_laddr.s_addr = req->local_ip;
+
+ DPRINTF("syncache add of %d:%d %d:%d\n",
+ ntohl(req->local_ip), ntohs(req->local_port),
+ ntohl(req->peer_ip), ntohs(req->peer_port));
+
+ mss = req->tcp_options.mss;
+ wsf = req->tcp_options.wsf;
+ ts = req->tcp_options.tstamp;
+ sack = req->tcp_options.sack;
+ to.to_mss = mss;
+ to.to_wscale = wsf;
+ to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
+}
+
+
+/*
+ * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
+ * lock held. Note that the sock here is a listening socket that is not owned
+ * by the TOE.
+ */
+static void
+process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
+ struct listen_ctx *lctx)
+{
+ int rt_flags;
+ struct l2t_entry *e;
+ struct iff_mac tim;
+ struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
+ struct cpl_pass_accept_rpl *rpl;
+ struct cpl_pass_accept_req *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ struct tom_data *d = TOM_DATA(tdev);
+ struct t3cdev *cdev = d->cdev;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *newtoep;
+ struct rtentry *dst;
+ struct sockaddr_in nam;
+ struct t3c_data *td = T3C_DATA(cdev);
+
+ reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+ if (__predict_false(reply_mbuf == NULL)) {
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
+ t3_defer_reply(m, tdev, reject_pass_request);
+ else {
+ cxgb_queue_tid_release(cdev, tid);
+ m_free(m);
+ }
+ DPRINTF("failed to get reply_mbuf\n");
+
+ goto out;
+ }
+
+ if (tp->t_state != TCPS_LISTEN) {
+ DPRINTF("socket not in listen state\n");
+
+ goto reject;
+ }
+
+ tim.mac_addr = req->dst_mac;
+ tim.vlan_tag = ntohs(req->vlan_tag);
+ if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
+ DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
+ goto reject;
+ }
+
+#ifdef notyet
+ /*
+ * XXX do route lookup to confirm that we're still listening on this
+ * address
+ */
+ if (ip_route_input(skb, req->local_ip, req->peer_ip,
+ G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
+ goto reject;
+ rt_flags = ((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
+ dst_release(skb->dst); // done with the input route, release it
+ skb->dst = NULL;
+
+ if ((rt_flags & RTF_LOCAL) == 0)
+ goto reject;
+#endif
+ /*
+ * XXX
+ */
+ rt_flags = RTF_LOCAL;
+ if ((rt_flags & RTF_LOCAL) == 0)
+ goto reject;
+
+ /*
+ * Calculate values and add to syncache
+ */
+
+ newtoep = toepcb_alloc();
+ if (newtoep == NULL)
+ goto reject;
+
+ bzero(&nam, sizeof(struct sockaddr_in));
+
+ nam.sin_len = sizeof(struct sockaddr_in);
+ nam.sin_family = AF_INET;
+ nam.sin_addr.s_addr = req->peer_ip;
+ dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
+
+ if (dst == NULL) {
+ printf("failed to find route\n");
+ goto reject;
+ }
+ e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
+ (struct sockaddr *)&nam);
+ if (e == NULL) {
+ DPRINTF("failed to get l2t\n");
+ goto reject;
+ }
+ /*
+ * Point to our listen socket until accept
+ */
+ newtoep->tp_tp = tp;
+ newtoep->tp_flags = TP_SYN_RCVD;
+ newtoep->tp_tid = tid;
+ newtoep->tp_toedev = tdev;
+
+ printf("inserting tid=%d\n", tid);
+ cxgb_insert_tid(cdev, d->client, newtoep, tid);
+ SOCK_LOCK(so);
+ LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
+ SOCK_UNLOCK(so);
+
+
+ if (lctx->ulp_mode) {
+ ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+
+ if (!ddp_mbuf)
+ newtoep->tp_ulp_mode = 0;
+ else
+ newtoep->tp_ulp_mode = lctx->ulp_mode;
+ }
+
+ set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
+
+ DPRINTF("adding request to syn cache\n");
+
+ /*
+ * XXX workaround for lack of syncache drop
+ */
+ toepcb_hold(newtoep);
+ syncache_add_accept_req(req, so, newtoep);
+
+ rpl = cplhdr(reply_mbuf);
+ reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ rpl->wr.wr_lo = 0;
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
+ rpl->opt2 = htonl(calc_opt2(so, tdev));
+ rpl->rsvd = rpl->opt2; /* workaround for HW bug */
+ rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
+
+ DPRINTF("accept smt_idx=%d\n", e->smt_idx);
+
+ rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
+ V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
+ rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
+ CPL_PASS_OPEN_ACCEPT);
+
+ DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
+
+ m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
+
+#ifdef DEBUG_PRINT
+ {
+ int i;
+
+ DPRINTF("rpl:\n");
+ uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
+
+ for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
+ DPRINTF("[%d] %08x\n", i, rplbuf[i]);
+ }
+#endif
+
+
+ l2t_send(cdev, reply_mbuf, e);
+ m_free(m);
+#ifdef notyet
+ /*
+ * XXX this call path has to be converted to not depend on sockets
+ */
+ if (newtoep->tp_ulp_mode)
+ __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_MASK,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_VAL, 1);
+
+#endif
+ return;
+reject:
+ if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
+ mk_pass_accept_rpl(reply_mbuf, m);
+ else
+ mk_tid_release(reply_mbuf, NULL, tid);
+ cxgb_ofld_send(cdev, reply_mbuf);
+ m_free(m);
+out:
+#if 0
+ TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+#else
+ return;
+#endif
+}
+
+/*
+ * Handle a CPL_PASS_ACCEPT_REQ message.
+ */
+static int
+do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+ struct socket *lso = listen_ctx->lso;
+ struct tom_data *d = listen_ctx->tom_data;
+
+#if VALIDATE_TID
+ struct cpl_pass_accept_req *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
+
+ if (unlikely(!lsk)) {
+ printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
+ cdev->name,
+ (unsigned long)((union listen_entry *)ctx -
+ t->stid_tab));
+ return CPL_RET_BUF_DONE;
+ }
+ if (unlikely(tid >= t->ntids)) {
+ printk(KERN_ERR "%s: passive open TID %u too large\n",
+ cdev->name, tid);
+ return CPL_RET_BUF_DONE;
+ }
+ /*
+ * For T3A the current user of the TID may have closed but its last
+ * message(s) may have been backlogged so the TID appears to be still
+ * in use. Just take the TID away, the connection can close at its
+ * own leisure. For T3B this situation is a bug.
+ */
+ if (!valid_new_tid(t, tid) &&
+ cdev->type != T3A) {
+ printk(KERN_ERR "%s: passive open uses existing TID %u\n",
+ cdev->name, tid);
+ return CPL_RET_BUF_DONE;
+ }
+#endif
+
+ process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
+ return (0);
+}
+
+/*
+ * Called when a connection is established to translate the TCP options
+ * reported by HW to the host stack's native format.
+ */
+static void
+assign_rxopt(struct socket *so, unsigned int opt)
+{
+ const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
+ tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
+ tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
+ tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
+ if (tp->t_flags & TF_RCVD_SCALE)
+ tp->rcv_scale = 0;
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to TCP_ESTABLISHED.
+ *
+ * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
+ */
+static void
+make_established(struct socket *so, u32 snd_isn, unsigned int opt)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
+ assign_rxopt(so, opt);
+ so->so_proto->pr_ctloutput = t3_ctloutput;
+
+#if 0
+ inet_sk(sk)->id = tp->write_seq ^ jiffies;
+#endif
+
+
+ /*
+ * XXX not clear what rcv_wup maps to
+ */
+ /*
+ * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
+ * pass through opt0.
+ */
+ if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
+ toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
+
+ dump_toepcb(toep);
+
+#ifdef notyet
+/*
+ * no clean interface for marking ARP up to date
+ */
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ tp->t_state = TCPS_ESTABLISHED;
+}
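+
+/*
+ * Example of the Rx-credit adjustment above, with hypothetical numbers:
+ * if the chosen receive window is 256KB but opt0 can only advertise
+ * (M_RCV_BUFSIZ << 10) bytes, tp_rcv_wup is backed up by the difference
+ * so that the first RX_DATA_ACK returns the remaining window as Rx
+ * credits.
+ */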
+
+static int
+syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
+{
+
+ struct in_conninfo inc;
+ struct tcpopt to;
+ struct tcphdr th;
+ int mss, wsf, sack, ts;
+ struct mbuf *m = NULL;
+ const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
+ unsigned int opt;
+
+#ifdef MAC
+#error "no MAC support"
+#endif
+
+ opt = ntohs(req->tcp_opt);
+
+ bzero(&to, sizeof(struct tcpopt));
+
+ /*
+ * Fill out information for entering us into the syncache
+ */
+ inc.inc_fport = th.th_sport = req->peer_port;
+ inc.inc_lport = th.th_dport = req->local_port;
+ th.th_seq = req->rcv_isn;
+ th.th_flags = TH_ACK;
+
+ inc.inc_isipv6 = 0;
+ inc.inc_len = 0;
+ inc.inc_faddr.s_addr = req->peer_ip;
+ inc.inc_laddr.s_addr = req->local_ip;
+
+ mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
+ wsf = G_TCPOPT_WSCALE_OK(opt);
+ ts = G_TCPOPT_TSTAMP(opt);
+ sack = G_TCPOPT_SACK(opt);
+
+ to.to_mss = mss;
+ to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
+ to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
+
+ DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
+ ntohl(req->local_ip), ntohs(req->local_port),
+ ntohl(req->peer_ip), ntohs(req->peer_port),
+ mss, wsf, ts, sack);
+ return (syncache_expand(&inc, &to, &th, so, m));
+}
+
+
+/*
+ * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
+ * if we are in TCP_SYN_RECV due to crossed SYNs
+ */
+static int
+do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_pass_establish *req = cplhdr(m);
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct tcpcb *tp;
+ struct socket *so, *lso;
+ struct t3c_data *td = T3C_DATA(cdev);
+ struct toedev *tdev;
+
+ /* Complete socket initialization now that we have the SND_ISN. */
+
+ so = lso = toeptoso(toep);
+ tdev = toep->tp_toedev;
+
+ SOCK_LOCK(so);
+ LIST_REMOVE(toep, synq_entry);
+ SOCK_UNLOCK(so);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ if (!syncache_expand_establish_req(req, &so, toep)) {
+ /*
+ * No entry
+ */
+ UNIMPLEMENTED();
+ }
+ if (so == NULL) {
+ /*
+ * Couldn't create the socket
+ */
+ UNIMPLEMENTED();
+ }
+
+ /*
+ * XXX workaround for lack of syncache drop
+ */
+ toepcb_release(toep);
+
+ tp = sototcpcb(so);
+ INP_LOCK(tp->t_inpcb);
+#ifdef notyet
+ so->so_snd.sb_flags |= SB_TOE;
+ so->so_rcv.sb_flags |= SB_TOE;
+#endif
+ toep->tp_tp = tp;
+ toep->tp_flags = 0;
+ tp->t_toe = toep;
+ reset_wr_list(toep);
+ tp->rcv_wnd = select_rcv_wnd(so);
+ DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
+ install_offload_ops(so);
+
+ toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
+ toep->tp_wr_unacked = 0;
+ toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
+ toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
+ tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
+ toep->tp_qset_idx = 0;
+ toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
+
+ /*
+ * XXX Cancel any keep alive timer
+ */
+
+ make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+ INP_INFO_WUNLOCK(&tcbinfo);
+ INP_UNLOCK(tp->t_inpcb);
+ soisconnected(so);
+
+#ifdef notyet
+ /*
+ * XXX not sure how these checks map to us
+ */
+ if (unlikely(sk->sk_socket)) { // simultaneous opens only
+ sk->sk_state_change(sk);
+ sk_wake_async(so, 0, POLL_OUT);
+ }
+ /*
+ * The state for the new connection is now up to date.
+ * Next check if we should add the connection to the parent's
+ * accept queue. When the parent closes it resets connections
+ * on its SYN queue, so check if we are being reset. If so we
+ * don't need to do anything more, the coming ABORT_RPL will
+ * destroy this socket. Otherwise move the connection to the
+ * accept queue.
+ *
+ * Note that we reset the synq before closing the server so if
+ * we are not being reset the stid is still open.
+ */
+ if (unlikely(!tp->forward_skb_hint)) { // removed from synq
+ __kfree_skb(skb);
+ goto unlock;
+ }
+#endif
+ m_free(m);
+
+ return (0);
+}
+
+/*
+ * Fill in the right TID for CPL messages waiting in the out-of-order queue
+ * and send them to the TOE.
+ */
+static void
+fixup_and_send_ofo(struct socket *so)
+{
+ struct mbuf *m;
+ struct toedev *tdev = TOE_DEV(so);
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ unsigned int tid = toep->tp_tid;
+
+ printf("fixup_and_send_ofo\n");
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
+ /*
+ * A variety of messages can be waiting but the fields we'll
+ * be touching are common to all so any message type will do.
+ */
+ struct cpl_close_con_req *p = cplhdr(m);
+
+ p->wr.wr_lo = htonl(V_WR_TID(tid));
+ OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
+ cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+ }
+}
+
+/*
+ * Updates socket state from an active establish CPL message. Runs with the
+ * socket lock held.
+ */
+static void
+socket_act_establish(struct socket *so, struct mbuf *m)
+{
+ struct cpl_act_establish *req = cplhdr(m);
+ u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+
+ if (__predict_false(tp->t_state != TCPS_SYN_SENT))
+ log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
+ toep->tp_tid, tp->t_state);
+
+ tp->ts_recent_age = ticks;
+ tp->irs = tp->rcv_nxt = rcv_isn;
+ toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
+
+ make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+ /*
+ * Now that we finally have a TID send any CPL messages that we had to
+ * defer for lack of a TID.
+ */
+ if (mbufq_len(&toep->out_of_order_queue))
+ fixup_and_send_ofo(so);
+
+ if (__predict_false(so->so_state & SS_NOFDREF)) {
+#ifdef notyet
+ /*
+ * XXX not clear what should be done here
+ * appears to correspond to sorwakeup_locked
+ */
+ sk->sk_state_change(sk);
+ sk_wake_async(so, 0, POLL_OUT);
+#endif
+ }
+ m_free(m);
+#ifdef notyet
+/*
+ * XXX assume no write requests permitted while socket connection is
+ * incomplete
+ */
+ /*
+ * Currently the send queue must be empty at this point because the
+ * socket layer does not send anything before a connection is
+ * established. To be future proof though we handle the possibility
+ * that there are pending buffers to send (either TX_DATA or
+ * CLOSE_CON_REQ). First we need to adjust the sequence number of the
+ * buffers according to the just learned write_seq, and then we send
+ * them on their way.
+ */
+ fixup_pending_writeq_buffers(sk);
+ if (t3_push_frames(so, 1))
+ sk->sk_write_space(sk);
+#endif
+
+ soisconnected(so);
+ toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
+ tcpstat.tcps_connects++;
+
+}
+
+/*
+ * Process a CPL_ACT_ESTABLISH message.
+ */
+static int
+do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_act_establish *req = cplhdr(m);
+ unsigned int tid = GET_TID(req);
+ unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+ struct toepcb *toep = (struct toepcb *)ctx;
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so;
+ struct toedev *tdev;
+ struct tom_data *d;
+
+ if (tp == NULL) {
+ free_atid(cdev, atid);
+ return (0);
+ }
+
+ so = toeptoso(toep);
+ tdev = TOE_DEV(so); /* blow up here if link was down */
+ d = TOM_DATA(tdev);
+
+ INP_LOCK(tp->t_inpcb);
+
+ /*
+ * It's OK if the TID is currently in use, the owning socket may have
+ * backlogged its last CPL message(s). Just take it away.
+ */
+ toep->tp_tid = tid;
+ toep->tp_tp = tp;
+ so_insert_tid(d, so, tid);
+ free_atid(cdev, atid);
+ toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
+
+ socket_act_establish(so, m);
+ INP_UNLOCK(tp->t_inpcb);
+ return (0);
+}
+
+/*
+ * Process an acknowledgment of WR completion. Advance snd_una and send the
+ * next batch of work requests from the write queue.
+ */
+static void
+wr_ack(struct toepcb *toep, struct mbuf *m)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct cpl_wr_ack *hdr = cplhdr(m);
+ struct socket *so = toeptoso(toep);
+ unsigned int credits = ntohs(hdr->credits);
+ u32 snd_una = ntohl(hdr->snd_una);
+ int bytes = 0;
+
+ DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
+
+ INP_LOCK(tp->t_inpcb);
+
+ toep->tp_wr_avail += credits;
+ if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
+ toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
+
+ while (credits) {
+ struct mbuf *p = peek_wr(toep);
+ DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ;
+
+ if (__predict_false(!p)) {
+ log(LOG_ERR, "%u WR_ACK credits for TID %u with "
+ "nothing pending, state %u\n",
+ credits, toep->tp_tid, tp->t_state);
+ break;
+ }
+ if (__predict_false(credits < p->m_pkthdr.csum_data)) {
+#if DEBUG_WR > 1
+ struct tx_data_wr *w = cplhdr(p);
+#ifdef notyet
+ log(LOG_ERR,
+ "TID %u got %u WR credits, need %u, len %u, "
+ "main body %u, frags %u, seq # %u, ACK una %u,"
+ " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
+ toep->tp_tid, credits, p->csum, p->len,
+ p->len - p->data_len, skb_shinfo(p)->nr_frags,
+ ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
+ WR_AVAIL(tp), count_pending_wrs(tp) - credits);
+#endif
+#endif
+ p->m_pkthdr.csum_data -= credits;
+ break;
+ } else {
+ dequeue_wr(toep);
+ credits -= p->m_pkthdr.csum_data;
+ bytes += p->m_pkthdr.len;
+ DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
+
+ m_free(p);
+ }
+ }
+
+#if DEBUG_WR
+ check_wr_invariants(tp);
+#endif
+
+ if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
+#if VALIDATE_SEQ
+ struct tom_data *d = TOM_DATA(TOE_DEV(so));
+
+ log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
+ "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
+ toep->tp_tid, tp->snd_una);
+#endif
+ goto out_free;
+ }
+
+ if (tp->snd_una != snd_una) {
+ tp->snd_una = snd_una;
+ tp->ts_recent_age = ticks;
+#ifdef notyet
+ /*
+ * Keep ARP entry "minty fresh"
+ */
+ dst_confirm(sk->sk_dst_cache);
+#endif
+ if (tp->snd_una == tp->snd_nxt)
+ toep->tp_flags &= ~TP_TX_WAIT_IDLE;
+ }
+ if (bytes) {
+ DPRINTF("sbdrop(%d)\n", bytes);
+ SOCKBUF_LOCK(&so->so_snd);
+ sbdrop_locked(&so->so_snd, bytes);
+ sowwakeup_locked(so);
+ }
+
+ if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
+ t3_push_frames(so, 0);
+
+out_free:
+ INP_UNLOCK(tp->t_inpcb);
+ m_free(m);
+}
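+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Model of the credit accounting in wr_ack() above, using a plain array
+ * in place of the toepcb write queue.  wr_credits[] holds the credits
+ * each pending WR consumed (m_pkthdr.csum_data in the real code); the
+ * return value is how many WRs a WR_ACK carrying "credits" retires.
+ * E.g. pending {3, 2} and credits 4: the first WR is retired, then
+ * 1 < 2 so the second WR's outstanding count drops to 1 and we stop.
+ */
+static int
+wr_ack_model(unsigned int *wr_credits, int nwrs, unsigned int credits)
+{
+ int done = 0;
+
+ while (credits && done < nwrs) {
+ if (credits < wr_credits[done]) {
+ wr_credits[done] -= credits; /* partial ack */
+ break;
+ }
+ credits -= wr_credits[done++]; /* WR fully acked */
+ }
+ return (done);
+}
+#endif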
+
+/*
+ * Handler for CPL_TX_DMA_ACK CPL messages.
+ */
+static int
+do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ DPRINTF("do_wr_ack\n");
+ dump_toepcb(toep);
+
+ VALIDATE_SOCK(so);
+
+ wr_ack(toep, m);
+ return (0);
+}
+
+
+/*
+ * Reset a connection that is on a listener's SYN queue or accept queue,
+ * i.e., one that has not had a struct socket associated with it.
+ * Must be called from process context.
+ *
+ * Modeled after code in inet_csk_listen_stop().
+ */
+static void
+t3_reset_listen_child(struct socket *child)
+{
+ struct tcpcb *tp = sototcpcb(child);
+
+ t3_send_reset(tp->t_toe);
+}
+
+/*
+ * Disconnect offloaded established but not yet accepted connections sitting
+ * on a server's accept_queue. We just send an ABORT_REQ at this point and
+ * finish off the disconnect later as we may need to wait for the ABORT_RPL.
+ */
+void
+t3_disconnect_acceptq(struct socket *listen_so)
+{
+ struct socket *so;
+ struct tcpcb *tp;
+
+ TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
+ tp = sototcpcb(so);
+
+ if (tp->t_flags & TF_TOE) {
+ INP_LOCK(tp->t_inpcb);
+ t3_reset_listen_child(so);
+ INP_UNLOCK(tp->t_inpcb);
+ }
+
+ }
+}
+
+/*
+ * Reset offloaded connections sitting on a server's syn queue. As above
+ * we send ABORT_REQ and finish off when we get ABORT_RPL.
+ */
+
+void
+t3_reset_synq(struct listen_ctx *lctx)
+{
+ struct toepcb *toep;
+
+ SOCK_LOCK(lctx->lso);
+ while (!LIST_EMPTY(&lctx->synq_head)) {
+ toep = LIST_FIRST(&lctx->synq_head);
+ LIST_REMOVE(toep, synq_entry);
+ toep->tp_tp = NULL;
+ t3_send_reset(toep);
+ cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
+ toepcb_release(toep);
+ }
+ SOCK_UNLOCK(lctx->lso);
+}
+
+void
+t3_init_wr_tab(unsigned int wr_len)
+{
+ int i;
+
+ if (mbuf_wrs[1]) /* already initialized */
+ return;
+
+ for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
+ int sgl_len = (3 * i) / 2 + (i & 1);
+
+ sgl_len += 3;
+ mbuf_wrs[i] = sgl_len <= wr_len ?
+ 1 : 1 + (sgl_len - 2) / (wr_len - 1);
+ }
+
+ wrlen = wr_len * 8;
+}
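+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Worked example of the sizing math above, for a hypothetical wr_len of
+ * 9 flits: an i-fragment mbuf needs sgl_len = (3 * i) / 2 + (i & 1) + 3
+ * flits, so i = 4 gives 9 flits (a single WR) while i = 8 gives 15
+ * flits and therefore 1 + (15 - 2) / (9 - 1) = 2 WRs.
+ */
+static unsigned int
+wrs_for_frags(unsigned int nfrags, unsigned int wr_len)
+{
+ unsigned int sgl_len = (3 * nfrags) / 2 + (nfrags & 1) + 3;
+
+ return (sgl_len <= wr_len ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
+}
+#endif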
+
+int
+t3_init_cpl_io(void)
+{
+#ifdef notyet
+ tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
+ if (!tcphdr_skb) {
+ log(LOG_ERR,
+ "Chelsio TCP offload: can't allocate sk_buff\n");
+ return -1;
+ }
+ skb_put(tcphdr_skb, sizeof(struct tcphdr));
+ tcphdr_skb->h.raw = tcphdr_skb->data;
+ memset(tcphdr_skb->data, 0, tcphdr_skb->len);
+#endif
+
+
+ t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
+ t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
+ t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
+ t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
+ t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
+ t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
+ t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
+ t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+ t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
+ t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
+ t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
+ t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
+#ifdef notyet
+ t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
+ t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
+ t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
+#endif
+ return (0);
+}
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
new file mode 100644
index 0000000..8cb42e1
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -0,0 +1,560 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <machine/bus.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_ofld.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+
+#include <dev/cxgb/sys/mvec.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
+ struct uio *uio, struct mbuf *top, struct mbuf *control,
+ int flags, struct thread *td);
+
+static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
+ int *flagsp);
+
+#ifdef notyet
+#define VM_HOLD_WRITEABLE 0x1
+static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
+ int *count, int flags);
+#endif
+static void vm_fault_unhold_pages(vm_page_t *m, int count);
+
+
+
+#define TMP_IOV_MAX 16
+
+void
+t3_init_socket_ops(void)
+{
+ struct protosw *prp;
+
+ prp = pffindtype(AF_INET, SOCK_STREAM);
+ pru_sosend = prp->pr_usrreqs->pru_sosend;
+ pru_soreceive = prp->pr_usrreqs->pru_soreceive;
+}
+
+
+struct cxgb_dma_info {
+ size_t cdi_mapped;
+ int cdi_nsegs;
+ bus_dma_segment_t *cdi_segs;
+
+};
+
+static void
+cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
+ bus_size_t mapsize, int error)
+{
+ struct cxgb_dma_info *cdi = arg;
+
+ cdi->cdi_mapped = mapsize;
+ cdi->cdi_nsegs = nsegs;
+ cdi->cdi_segs = segs;
+}
+
+static void
+iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
+{
+ struct iovec *iovtmp;
+ int iovcnttmp;
+ caddr_t ptmp;
+
+ if (count > 0) {
+ iovtmp = *iov;
+ iovcnttmp = *iovcnt;
+ while (count > 0) {
+ if (count < iovtmp->iov_len) {
+ ptmp = iovtmp->iov_base;
+ ptmp += count;
+ iovtmp->iov_base = ptmp;
+ iovtmp->iov_len -= count;
+ break;
+ } else
+ count -= iovtmp->iov_len;
+ iovtmp++;
+ iovcnttmp--;
+ }
+ *iov = iovtmp;
+ *iovcnt = iovcnttmp;
+ } else if (count < 0) {
+ iovtmp = &(*iov)[*iovcnt - 1];
+ iovcnttmp = *iovcnt;
+ while (count < 0) {
+ if (-count < iovtmp->iov_len) {
+ iovtmp->iov_len += count;
+ break;
+ } else
+ count += iovtmp->iov_len;
+ iovtmp--;
+ iovcnttmp--;
+ }
+ *iovcnt = iovcnttmp;
+ }
+}
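+
+#if 0 /* illustrative sketch, not part of the driver */
+/*
+ * Example of how iov_adj() behaves: a positive count consumes bytes from
+ * the front of the iovec array (advancing *iov and shrinking *iovcnt as
+ * entries are exhausted), while a negative count trims bytes off the
+ * tail.
+ */
+static void
+iov_adj_example(struct iovec *iov2) /* two 512-byte iovecs */
+{
+ struct iovec *iov = iov2;
+ int iovcnt = 2;
+
+ iov_adj(&iov, &iovcnt, 600); /* iov -> iov2[1], 424 bytes, cnt 1 */
+ iov_adj(&iov, &iovcnt, -100); /* iov2[1].iov_len becomes 324 */
+}
+#endif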
+
+
+static void
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+}
+
+static int
+cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
+{
+
+ return (EINVAL);
+}
+
+static void
+cxgb_wait_dma_completion(struct toepcb *tp)
+{
+
+}
+
+static int
+cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
+{
+ int i, seg_count, err, type;
+ struct mbuf *m0;
+ struct cxgb_dma_info cdi;
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *mi;
+ bus_dma_segment_t *segs;
+
+ err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
+ cxgb_dma_callback, &cdi, 0);
+
+ if (err)
+ return (err);
+ seg_count = cdi.cdi_nsegs;
+ if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
+ bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
+ return (ENOMEM);
+ }
+ segs = cdi.cdi_segs;
+ m0->m_type = type;
+ m0->m_flags = (M_EXT|M_NOFREE);
+ m0->m_ext.ext_type = EXT_EXTREF;
+ m0->m_ext.ext_free = cxgb_zero_copy_free;
+ m0->m_ext.ext_args = NULL;
+
+ mv = mtomv(m0);
+ mv->mv_count = seg_count;
+ mv->mv_first = 0;
+ for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
+ mi_collapse_sge(mi, segs);
+
+ *m = m0;
+
+ if (cdi.cdi_mapped < uio->uio_resid) {
+ uio->uio_resid -= cdi.cdi_mapped;
+ } else
+ uio->uio_resid = 0;
+
+ return (0);
+}
+
+static int
+t3_sosend(struct socket *so, struct uio *uio)
+{
+ int rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum sg entries for a single WR
+ *
+ * We're limited to holding 128 pages at once - and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages
+ *
+ */
+
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+sendmore:
+ /*
+ * Make sure we don't exceed the socket buffer
+ */
+ count = min(toep->tp_page_count, (sbspace(&so->so_snd) + 2*PAGE_SIZE) >> PAGE_SHIFT);
+ rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
+ hold_resid = uiotmp.uio_resid;
+ if (rv)
+ return (rv);
+
+ /*
+ * Bump past sent and shave off the unheld amount
+ */
+ if (hold_resid > 0) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ if (sent)
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ iov_adj(&iovtmpp, &iovcnt, -hold_resid);
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+
+ }
+ uiotmp.uio_resid = uio->uio_resid - hold_resid;
+
+ /*
+ * Push off all held pages
+ *
+ */
+ while (uiotmp.uio_resid > 0) {
+ rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
+ if (rv) {
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ return (rv);
+ }
+ uio->uio_resid -= m->m_pkthdr.len;
+ sent += m->m_pkthdr.len;
+ sbappend_locked(&so->so_snd, m);
+ t3_push_frames(so, TRUE);
+ iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
+ }
+ /*
+ * Wait for pending I/O to be DMA'd to the card
+ *
+ */
+ cxgb_wait_dma_completion(toep);
+ vm_fault_unhold_pages(toep->tp_pages, count);
+ /*
+ * If there is more data to send adjust local copy of iov
+ * to point to teh start
+ */
+ if (hold_resid) {
+ iovtmpp = iovtmp;
+ memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+ iov_adj(&iovtmpp, &iovcnt, sent);
+ uiotmp = *uio;
+ uiotmp.uio_iov = iovtmpp;
+ uiotmp.uio_iovcnt = iovcnt;
+ goto sendmore;
+ }
+
+ return (0);
+}
+
+static int
+cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toedev *tdev;
+ int zcopy_thres, zcopy_enabled, rv;
+
+ /*
+ * In order to use DMA direct from userspace the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ *
+ */
+ if (tp->t_flags & TF_TOE) {
+ tdev = TOE_DEV(so);
+ zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
+ zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
+
+ if ((uio->uio_resid > zcopy_thres) &&
+ (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0)
+ && zcopy_enabled) {
+ rv = t3_sosend(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+ }
+ return pru_sosend(so, addr, uio, top, control, flags, td);
+}
+
+
+static int
+t3_soreceive(struct socket *so, struct uio *uio)
+{
+#ifdef notyet
+ int i, rv, count, hold_resid, sent, iovcnt;
+ struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct mbuf *m;
+ struct uio uiotmp;
+
+ /*
+ * Events requiring iteration:
+ * - number of pages exceeds max hold pages for process or system
+ * - number of pages exceeds maximum sg entries for a single WR
+ *
+ * We're limited to holding 128 pages at once - and we're limited to
+ * 34 SG entries per work request, but each SG entry can be any number
+ * of contiguous pages
+ *
+ */
+
+ uiotmp = *uio;
+ iovcnt = uio->uio_iovcnt;
+ iov = uio->uio_iov;
+ sent = 0;
+ /* XXX incomplete */
+#endif
+ return (0);
+}
+
+static int
+cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct toedev *tdev;
+ int rv, zcopy_thres, zcopy_enabled;
+ struct tcpcb *tp = sototcpcb(so);
+
+ /*
+ * In order to use DMA direct from userspace the following
+ * conditions must be met:
+ * - the connection is currently offloaded
+ * - ddp is enabled
+ * - the number of bytes to be transferred exceeds the threshold
+ * - the number of bytes currently in flight won't exceed the in-flight
+ * threshold XXX TODO
+ * - vm_fault_hold_user_pages succeeds
+ * - blocking socket XXX for now
+ * - iovcnt is 1
+ *
+ */
+ if (tp->t_flags & TF_TOE) {
+ tdev = TOE_DEV(so);
+ zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
+ zcopy_enabled = TOM_TUNABLE(tdev, ddp);
+ if ((uio->uio_resid > zcopy_thres) &&
+ (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0)
+ && zcopy_enabled) {
+ rv = t3_soreceive(so, uio);
+ if (rv != EAGAIN)
+ return (rv);
+ }
+ }
+
+ return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
+}
+
+
+void
+t3_install_socket_ops(struct socket *so)
+{
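+ /*
+ * Note: pr_usrreqs is shared by all sockets of this protocol, so
+ * this replaces sosend/soreceive for every TCP socket, not just
+ * offloaded ones; the cxgb_* wrappers check TF_TOE and fall back
+ * to the saved pru_sosend/pru_soreceive for non-offloaded sockets.
+ */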
+ so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
+ so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
+}
+
+/*
+ * This routine takes a user address range and does the following:
+ * - validate that the user has access to those pages (flags indicates read or write) - if not fail
+ * - validate that count is enough to hold range number of pages - if not fail
+ * - fault in any non-resident pages
+ * - if the user is doing a read force a write fault for any COWed pages
+ * - if the user is doing a read mark all pages as dirty
+ * - hold all pages
+ * - return number of pages in count
+ */
+#ifdef notyet
+static int
+vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
+{
+
+ vm_offset_t start, va;
+ vm_paddr_t pa;
+ int pageslen, faults, rv;
+
+ struct thread *td;
+ vm_map_t map;
+ pmap_t pmap;
+ vm_page_t m, *pages;
+ vm_prot_t prot;
+
+ start = addr & ~PAGE_MASK;
+ pageslen = roundup2(addr + len, PAGE_SIZE);
+ if (*count < ((pageslen - start) >> PAGE_SHIFT))
+ return (EFBIG);
+
+ *count = (pageslen - start) >> PAGE_SHIFT;
+ /*
+ * Check that the virtual address range is legal.
+ * This check is somewhat bogus as on some architectures kernel
+ * and user do not share VA space - however, it appears that all
+ * FreeBSD architectures define VM_MAXUSER_ADDRESS.
+ */
+ if (addr + len > VM_MAXUSER_ADDRESS)
+ return (EFAULT);
+
+ td = curthread;
+ map = &td->td_proc->p_vmspace->vm_map;
+ pmap = &td->td_proc->p_vmspace->vm_pmap;
+ pages = mp;
+
+ prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
+ bzero(pages, sizeof(vm_page_t *) * (*count));
+retry:
+
+ /*
+ * First optimistically assume that all pages are resident (and R/W if
+ * for write); if so, just mark the pages as held (and dirty if for
+ * write) and return.
+ */
+ vm_page_lock_queues();
+ for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
+ /*
+ * Ensure that we only hold each page once
+ */
+ if (*pages == NULL) {
+ /*
+ * The page queue mutex is recursable so this is OK.
+ * It would be really nice if we had an unlocked version of this so
+ * we were only acquiring the pmap lock once as opposed to potentially
+ * many dozens of times.
+ */
+ m = pmap_extract_and_hold(pmap, va, prot);
+ if (m == NULL) {
+ faults++;
+ continue;
+ }
+ *pages = m;
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
+ }
+ }
+ vm_page_unlock_queues();
+
+ if (faults == 0)
+ return (0);
+ /*
+ * Pages either have insufficient permissions or are not present;
+ * trigger a fault where necessary.
+ */
+ for (va = start; va < pageslen; va += PAGE_SIZE) {
+ m = NULL;
+ pa = pmap_extract(pmap, va);
+ rv = 0;
+ if (pa)
+ m = PHYS_TO_VM_PAGE(pa);
+ if (flags & VM_HOLD_WRITEABLE) {
+ if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
+ rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
+ } else if (m == NULL)
+ rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
+ if (rv)
+ goto error;
+ }
+ goto retry;
+
+error:
+ vm_page_lock_queues();
+ for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
+ if (*pages)
+ vm_page_unhold(*pages);
+ vm_page_unlock_queues();
+ return (EFAULT);
+}
+#endif
+
+static void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+ KASSERT(count >= 0, ("negative count %d", count));
+ vm_page_lock_queues();
+ while (count--) {
+ vm_page_unhold(*mp);
+ mp++;
+ }
+ vm_page_unlock_queues();
+}
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
new file mode 100644
index 0000000..9077295
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
@@ -0,0 +1,79 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_DEFS_H_
+#define CXGB_DEFS_H_
+
+#define VALIDATE_TID 0
+
+#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe))
+#define TOE_DEV(so) (TOEPCB((so))->tp_toedev)
+#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket)
+#define sototoep(so) (sototcpcb((so))->t_toe)
+
+struct listen_ctx;
+
+typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m);
+
+void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h);
+void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
+void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
+int t3_push_frames(struct socket *so, int req_completion);
+int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt,
+ struct sockaddr *nam);
+void t3_init_listen_cpl_handlers(void);
+int t3_init_cpl_io(void);
+void t3_init_wr_tab(unsigned int wr_len);
+uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail);
+void t3_cleanup_rbuf(struct tcpcb *tp);
+
+void t3_init_socket_ops(void);
+void t3_install_socket_ops(struct socket *so);
+
+
+void t3_disconnect_acceptq(struct socket *listen_so);
+void t3_reset_synq(struct listen_ctx *ctx);
+void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler);
+
+struct toepcb *toepcb_alloc(void);
+void toepcb_hold(struct toepcb *);
+void toepcb_release(struct toepcb *);
+void toepcb_init(struct toepcb *);
+
+void t3_set_rcv_coalesce_enable(struct socket *so, int on_off);
+void t3_set_keepalive(struct socket *so, int on_off);
+void t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag);
+void t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+ unsigned int len);
+int t3_get_tcb(struct socket *so);
+
+#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
new file mode 100644
index 0000000..e785790
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -0,0 +1,345 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+
+#include <netinet/tcp_ofld.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+
+static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid);
+static int listen_hash_del(struct tom_data *d, struct socket *so);
+
+/*
+ * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release
+ * the STID.
+ */
+static int
+do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_close_listserv_rpl *rpl = cplhdr(m);
+ unsigned int stid = GET_TID(rpl);
+
+ if (rpl->status != CPL_ERR_NONE)
+ log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for "
+ "STID %u\n", rpl->status, stid);
+ else {
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+
+ cxgb_free_stid(cdev, stid);
+ free(listen_ctx, M_CXGB);
+ }
+
+ return (CPL_RET_BUF_DONE);
+}
+
+/*
+ * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash
+ * table and free the STID if there was any error, otherwise nothing to do.
+ */
+static int
+do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct cpl_pass_open_rpl *rpl = cplhdr(m);
+
+ if (rpl->status != CPL_ERR_NONE) {
+ int stid = GET_TID(rpl);
+ struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
+ struct tom_data *d = listen_ctx->tom_data;
+ struct socket *lso = listen_ctx->lso;
+
+#if VALIDATE_TID
+ if (!lso)
+ return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE);
+#endif
+ /*
+ * Note: It is safe to unconditionally call listen_hash_del()
+ * at this point without risking unhashing a reincarnation of
+ * an already closed socket (i.e., there is no listen, close,
+ * listen-again race in which the sock for the second listen is
+ * freed while we are still processing a message for the first)
+ * because we are still holding a reference on the socket. It is
+ * possible that the unhash
+ * will fail because the socket is already closed, but we can't
+ * unhash the wrong socket because it is impossible for the
+ * socket to which this message refers to have reincarnated.
+ */
+ listen_hash_del(d, lso);
+ cxgb_free_stid(cdev, stid);
+#ifdef notyet
+ /*
+ * XXX need to unreference the inpcb
+ * but we have no way of knowing that other TOMs aren't referencing it
+ */
+ sock_put(lso);
+#endif
+ free(listen_ctx, M_CXGB);
+ }
+ return CPL_RET_BUF_DONE;
+}
+
+void
+t3_init_listen_cpl_handlers(void)
+{
+ t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+ t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+}
+
+static inline int
+listen_hashfn(const struct socket *so)
+{
+ return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
+}
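+
+/*
+ * Example, with a hypothetical socket pointer and assuming
+ * LISTEN_INFO_HASH_SIZE is 32: a listening socket at 0xffffff0023a45c00
+ * hashes to bucket (0xffffff0023a45c00 >> 10) & 31 == 23.  The low 10
+ * bits are shifted out because heap-allocated sockets tend to share
+ * their low address bits, so they carry little entropy.
+ */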
+
+/*
+ * Create and add a listen_info entry to the listen hash table. This and the
+ * listen hash table functions below cannot be called from softirqs.
+ */
+static struct listen_info *
+listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid)
+{
+ struct listen_info *p;
+
+ p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO);
+ if (p) {
+ int bucket = listen_hashfn(so);
+
+ p->so = so; /* just a key, no need to take a reference */
+ p->stid = stid;
+ mtx_lock(&d->listen_lock);
+ p->next = d->listen_hash_tab[bucket];
+ d->listen_hash_tab[bucket] = p;
+ mtx_unlock(&d->listen_lock);
+ }
+ return p;
+}
+
+#if 0
+/*
+ * Given a pointer to a listening socket return its server TID by consulting
+ * the socket->stid map. Returns -1 if the socket is not in the map.
+ */
+static int
+listen_hash_find(struct tom_data *d, struct socket *so)
+{
+ int stid = -1, bucket = listen_hashfn(so);
+ struct listen_info *p;
+
+ spin_lock(&d->listen_lock);
+ for (p = d->listen_hash_tab[bucket]; p; p = p->next)
+ if (p->sk == sk) {
+ stid = p->stid;
+ break;
+ }
+ spin_unlock(&d->listen_lock);
+ return stid;
+}
+#endif
+
+/*
+ * Delete the listen_info structure for a listening socket. Returns the server
+ * TID for the socket if it is present in the socket->stid map, or -1.
+ */
+static int
+listen_hash_del(struct tom_data *d, struct socket *so)
+{
+ int bucket, stid = -1;
+ struct listen_info *p, **prev;
+
+ bucket = listen_hashfn(so);
+ prev = &d->listen_hash_tab[bucket];
+
+ mtx_lock(&d->listen_lock);
+ for (p = *prev; p; prev = &p->next, p = p->next)
+ if (p->so == so) {
+ stid = p->stid;
+ *prev = p->next;
+ free(p, M_CXGB);
+ break;
+ }
+ mtx_unlock(&d->listen_lock);
+
+ return (stid);
+}
+
+/*
+ * Start a listening server by sending a passive open request to HW.
+ */
+void
+t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+{
+ int stid;
+ struct mbuf *m;
+ struct cpl_pass_open_req *req;
+ struct tom_data *d = TOM_DATA(dev);
+ struct inpcb *inp = sotoinpcb(so);
+ struct listen_ctx *ctx;
+
+ if (!TOM_TUNABLE(dev, activated))
+ return;
+
+ printf("start listen\n");
+
+ ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT);
+
+ if (!ctx)
+ return;
+
+ ctx->tom_data = d;
+ ctx->lso = so;
+ ctx->ulp_mode = 0; /* no DDP by default */
+ LIST_INIT(&ctx->synq_head);
+
+ stid = cxgb_alloc_stid(d->cdev, d->client, ctx);
+ if (stid < 0)
+ goto free_ctx;
+
+#ifdef notyet
+ /*
+ * XXX need to mark inpcb as referenced
+ */
+ sock_hold(sk);
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ goto free_stid;
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
+ if (!listen_hash_add(d, so, stid))
+ goto free_all;
+
+ req = mtod(m, struct cpl_pass_open_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
+ req->local_port = inp->inp_lport;
+ memcpy(&req->local_ip, &inp->inp_laddr, 4);
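+ /* Zero peer port/address/netmask so this server matches any remote peer. */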
+ req->peer_port = 0;
+ req->peer_ip = 0;
+ req->peer_netmask = 0;
+ req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
+ req->opt0l = htonl(V_RCV_BUFSIZ(16));
+ req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+
+ m_set_priority(m, CPL_PRIORITY_LISTEN);
+ cxgb_ofld_send(cdev, m);
+ return;
+
+free_all:
+ m_free(m);
+free_stid:
+ cxgb_free_stid(cdev, stid);
+#if 0
+ sock_put(sk);
+#endif
+free_ctx:
+ free(ctx, M_CXGB);
+}
+
+/*
+ * Stop a listening server by sending a close_listsvr request to HW.
+ * The server TID is freed when we get the reply.
+ */
+void
+t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+{
+ struct mbuf *m;
+ struct cpl_close_listserv_req *req;
+ struct listen_ctx *lctx;
+ int stid = listen_hash_del(TOM_DATA(dev), so);
+
+ if (stid < 0)
+ return;
+
+ lctx = cxgb_get_lctx(cdev, stid);
+ /*
+ * Do this early so embryonic connections are marked as being aborted
+ * while the stid is still open. This ensures pass_establish messages
+ * that arrive while we are closing the server will be able to locate
+ * the listening socket.
+ */
+ t3_reset_synq(lctx);
+
+ /* Send the close ASAP to stop further passive opens */
+ m = m_gethdr_nofail(sizeof(*req)); /* panics until a lowmem cache exists */
+
+ req = mtod(m, struct cpl_close_listserv_req *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
+ req->cpu_idx = 0;
+ m_set_priority(m, CPL_PRIORITY_LISTEN);
+ cxgb_ofld_send(cdev, m);
+
+ t3_disconnect_acceptq(so);
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
new file mode 100644
index 0000000..9fa42b5
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
@@ -0,0 +1,185 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+
+#ifndef T3_DDP_H
+#define T3_DDP_H
+
+/* Should be 1 or 2 indicating single or double kernel buffers. */
+#define NUM_DDP_KBUF 2
+
+/* min receive window for a connection to be considered for DDP */
+#define MIN_DDP_RCV_WIN (48 << 10)
+
+/* amount of Rx window not available to DDP to avoid window exhaustion */
+#define DDP_RSVD_WIN (16 << 10)
+
+/* # of sentinel invalid page pods at the end of a group of valid page pods */
+#define NUM_SENTINEL_PPODS 0
+
+/* # of pages a pagepod can hold without needing another pagepod */
+#define PPOD_PAGES 4
+
+/* page pods are allocated in groups of this size (must be power of 2) */
+#define PPOD_CLUSTER_SIZE 16
+
+/* for each TID we reserve this many page pods up front */
+#define RSVD_PPODS_PER_TID 1
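+
+/*
+ * Rough sizing sketch (an illustration, not a hardware rule): a gather
+ * list of npages pages needs about howmany(npages, PPOD_PAGES) page pods
+ * plus NUM_SENTINEL_PPODS sentinels, carved from PPOD_CLUSTER_SIZE groups.
+ */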
+
+struct pagepod {
+ uint32_t pp_vld_tid;
+ uint32_t pp_pgsz_tag_color;
+ uint32_t pp_max_offset;
+ uint32_t pp_page_offset;
+ uint64_t pp_rsvd;
+ uint64_t pp_addr[5];
+};
+
+#define PPOD_SIZE sizeof(struct pagepod)
+
+#define S_PPOD_TID 0
+#define M_PPOD_TID 0xFFFFFF
+#define V_PPOD_TID(x) ((x) << S_PPOD_TID)
+
+#define S_PPOD_VALID 24
+#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID)
+#define F_PPOD_VALID V_PPOD_VALID(1U)
+
+#define S_PPOD_COLOR 0
+#define M_PPOD_COLOR 0x3F
+#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR)
+
+#define S_PPOD_TAG 6
+#define M_PPOD_TAG 0xFFFFFF
+#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
+
+#define S_PPOD_PGSZ 30
+#define M_PPOD_PGSZ 0x3
+#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
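+
+/*
+ * Illustrative packing of the fields above (tid, tag, and color are
+ * hypothetical values; the byte order shown is an assumption):
+ *
+ *	pp->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
+ *	pp->pp_pgsz_tag_color = htonl(V_PPOD_PGSZ(0) | V_PPOD_TAG(tag) |
+ *	    V_PPOD_COLOR(color));
+ */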
+
+struct pci_dev;
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <machine/bus.h>
+
+/* DDP gather lists can specify an offset only for the first page. */
+struct ddp_gather_list {
+ unsigned int dgl_length;
+ unsigned int dgl_offset;
+ unsigned int dgl_nelem;
+ vm_page_t *dgl_pages;
+ bus_addr_t dgl_phys_addr[0];
+};
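+/* dgl_phys_addr[] is a C89-style variable-length tail, sized at allocation. */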
+
+struct ddp_buf_state {
+ unsigned int cur_offset; /* offset of latest DDP notification */
+ unsigned int flags;
+ struct ddp_gather_list *gl;
+};
+
+struct ddp_state {
+ struct pci_dev *pdev;
+ struct ddp_buf_state buf_state[2]; /* per buffer state */
+ int cur_buf;
+ unsigned short kbuf_noinval;
+ unsigned short kbuf_idx; /* which HW buffer is used for kbuf */
+ struct ddp_gather_list *ubuf;
+ unsigned int ubuf_nppods; /* # of page pods for buffer 1 */
+ unsigned int ubuf_tag;
+ unsigned int ubuf_ddp_ready;
+ int get_tcb_count;
+ unsigned int kbuf_posted;
+ int cancel_ubuf;
+ unsigned int kbuf_nppods[NUM_DDP_KBUF];
+ unsigned int kbuf_tag[NUM_DDP_KBUF];
+ struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */
+};
+
+/* buf_state flags */
+enum {
+ DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */
+ DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */
+ DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */
+ DDP_BF_PSH = 1 << 3, /* set in the mbuf's flags if a DDP was
+ completed by a segment with the
+ PSH flag set */
+};
+
+#ifdef notyet
+/*
+ * Returns 1 if a UBUF DMA buffer might be active.
+ */
+static inline int
+t3_ddp_ubuf_pending(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct ddp_state *p = DDP_STATE(tp);
+
+ /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,
+ * but DDP_STATE() is only valid if the connection actually enabled
+ * DDP.
+ */
+ if (!p)
+ return 0;
+
+ return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) ||
+ (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY));
+}
+#endif
+
+int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
+ unsigned int nppods, unsigned int tag, unsigned int maxoff,
+ unsigned int pg_off, unsigned int color);
+int t3_alloc_ppods(struct tom_data *td, unsigned int n);
+void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
+void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl);
+int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len,
+ struct ddp_gather_list **newgl,
+ const struct ddp_gather_list *gl);
+int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to,
+ int len);
+/* void t3_repost_kbuf(struct socket *so, int modulate, int activate); */
+void t3_post_kbuf(struct socket *so, int modulate);
+int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+ int rcv_flags, int modulate, int post_kbuf);
+void t3_cancel_ubuf(struct socket *so);
+int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+ int rcv_flags, int modulate, int post_kbuf);
+int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
+void t3_cleanup_ddp(struct socket *so);
+void t3_release_ddp_resources(struct toepcb *toep);
+void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx);
+void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0,
+ unsigned int tag1, unsigned int len);
+void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0,
+ unsigned int len1, unsigned int offset1,
+ uint64_t ddp_flags, uint64_t flag_mask, int modulate);
+#endif /* T3_DDP_H */
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
new file mode 100644
index 0000000..a078bee
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
@@ -0,0 +1,112 @@
+
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef CXGB_TOEPCB_H_
+#define CXGB_TOEPCB_H_
+#include <sys/bus.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+struct toepcb {
+ struct toedev *tp_toedev;
+ struct l2t_entry *tp_l2t;
+ pr_ctloutput_t *tp_ctloutput;
+ unsigned int tp_tid;
+ int tp_wr_max;
+ int tp_wr_avail;
+ int tp_wr_unacked;
+ int tp_delack_mode;
+ int tp_mtu_idx;
+ int tp_ulp_mode;
+ int tp_qset_idx;
+ int tp_mss_clamp;
+ int tp_qset;
+ int tp_flags;
+ int tp_enqueued_bytes;
+ int tp_page_count;
+ int tp_state;
+
+ tcp_seq tp_iss;
+ tcp_seq tp_delack_seq;
+ tcp_seq tp_rcv_wup;
+ tcp_seq tp_copied_seq;
+ uint64_t tp_write_seq;
+
+ volatile int tp_refcount;
+ vm_page_t *tp_pages;
+
+ struct tcpcb *tp_tp;
+ struct mbuf *tp_m_last;
+ bus_dma_tag_t tp_tx_dmat;
+ bus_dmamap_t tp_dmamap;
+
+ LIST_ENTRY(toepcb) synq_entry;
+ struct mbuf_head wr_list;
+ struct mbuf_head out_of_order_queue;
+ struct ddp_state tp_ddp_state;
+};
+
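+/*
+ * The helpers below wrap the generic mbuf queue primitives; wr_list holds
+ * the mbufs of work requests that the hardware has not yet acknowledged.
+ */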
+static inline void
+reset_wr_list(struct toepcb *toep)
+{
+
+ mbufq_init(&toep->wr_list);
+}
+
+static inline void
+purge_wr_queue(struct toepcb *toep)
+{
+ struct mbuf *m;
+
+ while ((m = mbufq_dequeue(&toep->wr_list)) != NULL)
+ m_freem(m);
+}
+
+static inline void
+enqueue_wr(struct toepcb *toep, struct mbuf *m)
+{
+
+ mbufq_tail(&toep->wr_list, m);
+}
+
+static inline struct mbuf *
+peek_wr(struct toepcb *toep)
+{
+
+ return (mbufq_peek(&toep->wr_list));
+}
+
+static inline struct mbuf *
+dequeue_wr(struct toepcb *toep)
+{
+
+ return (mbufq_dequeue(&toep->wr_list));
+}
+
+#endif
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
new file mode 100644
index 0000000..2dc6150
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
@@ -0,0 +1,500 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/eventhandler.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/taskqueue.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_fsm.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/cxgb_include.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
+static int activated = 1;
+TUNABLE_INT("hw.t3toe.activated", &activated);
+SYSCTL_NODE(_hw, OID_AUTO, t3toe, CTLFLAG_RD, 0, "T3 toe driver parameters");
+SYSCTL_UINT(_hw_t3toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0,
+ "enable TOE at init time");
+
+static TAILQ_HEAD(, tom_data) cxgb_list;
+static struct mtx cxgb_list_lock;
+
+static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry);
+/*
+ * Handlers for each CPL opcode
+ */
+static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS];
+
+static eventhandler_tag listen_tag;
+
+static struct offload_id t3_toe_id_tab[] = {
+ { TOE_ID_CHELSIO_T3, 0 },
+ { TOE_ID_CHELSIO_T3B, 0 },
+ { 0 }
+};
+
+static struct tom_info t3_tom_info = {
+ .ti_attach = t3_toe_attach,
+ .ti_id_table = t3_toe_id_tab,
+ .ti_name = "Chelsio-T3"
+};
+
+struct cxgb_client t3c_tom_client = {
+ .name = "tom_cxgb3",
+ .remove = NULL,
+ .handlers = tom_cpl_handlers,
+ .redirect = NULL
+};
+
+/*
+ * Add an skb to the deferred skb queue for processing from process context.
+ */
+void
+t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler)
+{
+ struct tom_data *td = TOM_DATA(dev);
+
+ m_set_handler(m, handler);
+ mtx_lock(&td->deferq.lock);
+
+ mbufq_tail(&td->deferq, m);
+ if (mbufq_len(&td->deferq) == 1)
+ taskqueue_enqueue(td->tq, &td->deferq_task);
+ mtx_unlock(&td->deferq.lock);
+}
+
+struct toepcb *
+toepcb_alloc(void)
+{
+ struct toepcb *toep;
+
+ toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT);
+
+ if (toep == NULL)
+ return (NULL);
+
+ toepcb_init(toep);
+ return (toep);
+}
+
+void
+toepcb_init(struct toepcb *toep)
+{
+ bzero(toep, sizeof(*toep));
+ toep->tp_refcount = 1;
+}
+
+void
+toepcb_hold(struct toepcb *toep)
+{
+ atomic_add_acq_int(&toep->tp_refcount, 1);
+}
+
+void
+toepcb_release(struct toepcb *toep)
+{
+ /* Drop our reference atomically; free on the 1 -> 0 transition. */
+ if (atomic_fetchadd_int(&toep->tp_refcount, -1) == 1) {
+ printf("doing final toepcb free\n");
+ free(toep, M_DEVBUF);
+ }
+}
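+
+/*
+ * toepcb_hold() and toepcb_release() bracket any reference to a toepcb that
+ * may outlive the caller's context; the final release frees the structure.
+ */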
+
+/*
+ * Add a T3 offload device to the list of devices we are managing.
+ */
+static void
+t3cdev_add(struct tom_data *t)
+{
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
+ mtx_unlock(&cxgb_list_lock);
+}
+
+/*
+ * Allocate a TOM data structure, initialize its cpl_handlers, and register
+ * it as a T3C client.
+ */
+static void
+t3c_tom_add(struct t3cdev *cdev)
+{
+ int i;
+ unsigned int wr_len;
+ struct tom_data *t;
+ struct toedev *tdev;
+ struct adap_ports *port_info;
+
+ t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
+
+ if (!t)
+ return;
+
+ if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0)
+ goto out_free_tom;
+
+ port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO);
+ if (!port_info)
+ goto out_free_tom;
+
+ if (cdev->ctl(cdev, GET_PORTS, port_info) < 0)
+ goto out_free_all;
+
+ t3_init_wr_tab(wr_len);
+ t->cdev = cdev;
+ t->client = &t3c_tom_client;
+
+ /* Register TCP offload device */
+ tdev = &t->tdev;
+ tdev->tod_ttid = (cdev->type == T3A ?
+ TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B);
+ tdev->tod_lldev = cdev->lldev;
+
+ if (register_toedev(tdev, "toe%d")) {
+ printf("unable to register offload device");
+ goto out_free_all;
+ }
+ TOM_DATA(tdev) = t;
+
+ for (i = 0; i < port_info->nports; i++) {
+ struct ifnet *ifp = port_info->lldevs[i];
+ TOEDEV(ifp) = tdev;
+
+ ifp->if_capabilities |= IFCAP_TOE;
+ }
+ t->ports = port_info;
+
+ /* Add device to the list of offload devices */
+ t3cdev_add(t);
+
+ /* Activate TCP offload device */
+ activate_offload(tdev);
+ return;
+
+out_free_all:
+ free(port_info, M_CXGB);
+out_free_tom:
+ free(t, M_CXGB);
+ return;
+}
+
+/*
+ * Process a received packet with an unknown/unexpected CPL opcode.
+ */
+static int
+do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name,
+ *mtod(m, unsigned int *));
+
+ return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG);
+}
+
+
+/*
+ * Add a new handler to the CPL dispatch table. A NULL handler may be supplied
+ * to unregister an existing handler.
+ */
+void
+t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h)
+{
+ if (opcode < NUM_CPL_CMDS)
+ tom_cpl_handlers[opcode] = h ? h : do_bad_cpl;
+ else
+ log(LOG_ERR, "Chelsio T3 TOM: handler registration for "
+ "opcode %u failed\n", opcode);
+}
+
+/*
+ * Make a preliminary determination if a connection can be offloaded. It's OK
+ * to fail the offload later if we say we can offload here. For now this
+ * always accepts the offload request unless there are IP options.
+ */
+static int
+can_offload(struct toedev *dev, struct socket *so)
+{
+ struct tom_data *tomd = TOM_DATA(dev);
+ struct t3cdev *cdev = T3CDEV(dev->tod_lldev);
+ struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
+
+ return sotoinpcb(so)->inp_depend4.inp4_options == NULL &&
+ tomd->conf.activated &&
+ (tomd->conf.max_conn < 0 ||
+ atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn);
+}
+
+
+static int
+tom_ctl(struct toedev *dev, unsigned int req, void *data)
+{
+ struct tom_data *t = TOM_DATA(dev);
+ struct t3cdev *cdev = t->cdev;
+
+ if (cdev->ctl)
+ return cdev->ctl(cdev, req, data);
+
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Initialize the CPL dispatch table.
+ */
+static void
+init_cpl_handlers(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_CPL_CMDS; ++i)
+ tom_cpl_handlers[i] = do_bad_cpl;
+
+ t3_init_listen_cpl_handlers();
+}
+
+static int
+t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
+{
+ struct tom_data *t = TOM_DATA(dev);
+ struct t3cdev *cdev = t->cdev;
+ struct ddp_params ddp;
+ struct ofld_page_info rx_page_info;
+ int err;
+
+#if 0
+ skb_queue_head_init(&t->deferq);
+ T3_INIT_WORK(&t->deferq_task, process_deferq, t);
+ spin_lock_init(&t->listen_lock);
+#endif
+ t3_init_tunables(t);
+ mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF);
+
+ /* Adjust TOE activation for this module */
+ t->conf.activated = activated;
+
+ dev->tod_can_offload = can_offload;
+ dev->tod_connect = t3_connect;
+ dev->tod_ctl = tom_ctl;
+#if 0
+#ifndef NETEVENT
+ dev->tod_neigh_update = tom_neigh_update;
+#endif
+ dev->tod_failover = t3_failover;
+#endif
+ err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp);
+ if (err)
+ return err;
+
+ err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info);
+ if (err)
+ return err;
+
+ t->ddp_llimit = ddp.llimit;
+ t->ddp_ulimit = ddp.ulimit;
+ t->pdev = ddp.pdev;
+ t->rx_page_size = rx_page_info.page_size;
+#ifdef notyet
+ /* OK if this fails, we just can't do DDP */
+ t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE;
+ t->ppod_map = t3_alloc_mem(t->nppods);
+#endif
+
+#if 0
+ spin_lock_init(&t->ppod_map_lock);
+ tom_proc_init(dev);
+#ifdef CONFIG_SYSCTL
+ t->sysctl = t3_sysctl_register(dev, &t->conf);
+#endif
+#endif
+ return (0);
+}
+
+static void
+cxgb_toe_listen(void *unused, int event, struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ struct tom_data *p;
+
+ switch (event) {
+ case OFLD_LISTEN_OPEN:
+ case OFLD_LISTEN_CLOSE:
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ if (event == OFLD_LISTEN_OPEN)
+ t3_listen_start(&p->tdev, so, p->cdev);
+ else if (tp->t_state == TCPS_LISTEN) {
+ printf("stopping listen on port=%d\n",
+ ntohs(tp->t_inpcb->inp_lport));
+
+ t3_listen_stop(&p->tdev, so, p->cdev);
+ }
+
+ }
+ mtx_unlock(&cxgb_list_lock);
+ break;
+ default:
+ log(LOG_ERR, "unrecognized listen event %d\n", event);
+ break;
+ }
+}
+
+static void
+cxgb_register_listeners(void)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+
+ INP_INFO_RLOCK(&tcbinfo);
+ LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
+ tp = intotcpcb(inp);
+
+ if (tp->t_state == TCPS_LISTEN)
+ cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp);
+ }
+ INP_INFO_RUNLOCK(&tcbinfo);
+}
+
+static int
+t3_tom_init(void)
+{
+
+#if 0
+ struct socket *sock;
+ err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ printk(KERN_ERR "Could not create TCP socket, error %d\n", err);
+ return err;
+ }
+
+ t3_def_state_change = sock->sk->sk_state_change;
+ t3_def_data_ready = sock->sk->sk_data_ready;
+ t3_def_error_report = sock->sk->sk_error_report;
+ sock_release(sock);
+#endif
+ init_cpl_handlers();
+ if (t3_init_cpl_io() < 0)
+ return -1;
+ t3_init_socket_ops();
+
+ /* Register with the TOE device layer. */
+
+ if (register_tom(&t3_tom_info) != 0) {
+ log(LOG_ERR,
+ "Unable to register Chelsio T3 TCP offload module.\n");
+ return -1;
+ }
+
+ mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
+ listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY);
+ TAILQ_INIT(&cxgb_list);
+
+ /* Register to offloading devices */
+ t3c_tom_client.add = t3c_tom_add;
+ cxgb_register_client(&t3c_tom_client);
+ cxgb_register_listeners();
+ return (0);
+}
+
+static int
+t3_tom_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("wheeeeee ...\n");
+
+ t3_tom_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("uhm, ... unloading isn't really supported for toe\n");
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "t3_tom",
+ t3_tom_load,
+ 0
+};
+MODULE_VERSION(t3_tom, 1);
+MODULE_DEPEND(t3_tom, toecore, 1, 1, 1);
+MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1);
+DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
new file mode 100644
index 0000000..8d60bbd
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
@@ -0,0 +1,157 @@
+
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_TOM_H_
+#define CXGB_TOM_H_
+#include <sys/protosw.h>
+
+#define LISTEN_INFO_HASH_SIZE 32
+
+struct listen_info {
+ struct listen_info *next; /* Link to next entry */
+ struct socket *so; /* The listening socket */
+ unsigned int stid; /* The server TID */
+};
+
+
+/*
+ * TOM tunable parameters.  They can be manipulated through sysctl(2).
+ */
+struct tom_tunables {
+ int max_host_sndbuf; /* max host RAM consumed by a sndbuf */
+ int tx_hold_thres; /* push/pull threshold for non-full TX mbufs */
+ int max_wrs; /* max # of outstanding WRs per connection */
+ int rx_credit_thres; /* min # of RX credits needed for RX_DATA_ACK */
+ int cong_alg; /* congestion control algorithm */
+ int mss; /* max TX_DATA WR payload size */
+ int delack; /* delayed ACK control */
+ int max_conn; /* maximum number of offloaded connections */
+ int soft_backlog_limit; /* whether the listen backlog limit is soft */
+ int ddp; /* whether to put new connections in DDP mode */
+ int ddp_thres; /* min recvmsg size before activating DDP */
+ int ddp_copy_limit; /* capacity of kernel DDP buffer */
+ int ddp_push_wait; /* whether blocking DDP waits for PSH flag */
+ int ddp_rcvcoalesce; /* whether receive coalescing is enabled */
+ int zcopy_sosend_enabled; /* whether zero-copy sosend is enabled */
+ int zcopy_sosend_partial_thres; /* below this size is never zcopied */
+ int zcopy_sosend_partial_copy; /* bytes copied in partial zcopy */
+ int zcopy_sosend_thres; /* at or above this size is mostly zcopied */
+ int zcopy_sosend_copy; /* bytes copied in zcopy */
+ int zcopy_sosend_ret_pending_dma; /* potentially return while DMA pending */
+ int activated; /* TOE engine activation state */
+};
+
+struct tom_data {
+ TAILQ_ENTRY(tom_data) entry;
+
+ struct t3cdev *cdev;
+ struct pci_dev *pdev;
+ struct toedev tdev;
+
+ struct cxgb_client *client;
+ struct tom_tunables conf;
+ struct tom_sysctl_table *sysctl;
+
+ /*
+ * The next three locks listen_lock, deferq.lock, and tid_release_lock
+ * are used rarely so we let them potentially share a cacheline.
+ */
+
+ struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE];
+ struct mtx listen_lock;
+
+ struct mbuf_head deferq;
+ struct task deferq_task;
+
+ struct socket **tid_release_list;
+ struct mtx tid_release_lock;
+ struct task tid_release_task;
+
+ volatile int tx_dma_pending;
+
+ unsigned int ddp_llimit;
+ unsigned int ddp_ulimit;
+
+ unsigned int rx_page_size;
+
+ u8 *ppod_map;
+ unsigned int nppods;
+ struct mtx ppod_map_lock;
+
+ struct adap_ports *ports;
+ struct taskqueue *tq;
+};
+
+
+struct listen_ctx {
+ struct socket *lso;
+ struct tom_data *tom_data;
+ int ulp_mode;
+ LIST_HEAD(, toepcb) synq_head;
+};
+
+#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt)
+#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev)
+#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev)
+#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param)
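+
+/* TOM_DATA() reuses the toedev's tod_l4opt slot to carry the tom_data pointer. */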
+
+#define TP_DATASENT (1 << 0)
+#define TP_TX_WAIT_IDLE (1 << 1)
+#define TP_FIN_SENT (1 << 2)
+#define TP_ABORT_RPL_PENDING (1 << 3)
+#define TP_ABORT_SHUTDOWN (1 << 4)
+#define TP_ABORT_RPL_RCVD (1 << 5)
+#define TP_ABORT_REQ_RCVD (1 << 6)
+#define TP_CLOSE_CON_REQUESTED (1 << 7)
+#define TP_SYN_RCVD (1 << 8)
+#define TP_ESTABLISHED (1 << 9)
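+/* The TP_* values above are bit flags for struct toepcb's tp_flags field. */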
+
+void t3_init_tunables(struct tom_data *t);
+
+static __inline struct mbuf *
+m_gethdr_nofail(int len)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ panic("implement lowmem cache\n");
+ }
+
+ KASSERT(len < MHLEN, ("requested header size too large for mbuf"));
+ m->m_pkthdr.len = m->m_len = len;
+ return (m);
+}
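+
+/*
+ * Typical use (sketch): callers that must not fail allocation, e.g. when
+ * building a fixed-size CPL request:
+ *
+ *	struct cpl_close_listserv_req *req;
+ *	struct mbuf *m = m_gethdr_nofail(sizeof(*req));
+ *	req = mtod(m, struct cpl_close_listserv_req *);
+ */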
+
+
+#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
new file mode 100644
index 0000000..7219922
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
@@ -0,0 +1,106 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+
+static struct tom_tunables default_tunable_vals = {
+ .max_host_sndbuf = 32 * 1024,
+ .tx_hold_thres = 0,
+ .max_wrs = 15,
+ .rx_credit_thres = 15 * 1024,
+ .cong_alg = -1,
+ .mss = 16384,
+ .delack = 1,
+ .max_conn = -1,
+ .soft_backlog_limit = 0,
+ .ddp = 0,
+ .ddp_thres = 14 * 4096,
+ .ddp_copy_limit = 13 * 4096,
+ .ddp_push_wait = 1,
+ .ddp_rcvcoalesce = 0,
+ .zcopy_sosend_enabled = 0,
+ .zcopy_sosend_partial_thres = 40960,
+ .zcopy_sosend_partial_copy = 4096 * 3,
+ .zcopy_sosend_thres = 128 * 1024,
+ .zcopy_sosend_copy = 4096 * 2,
+ .zcopy_sosend_ret_pending_dma = 1,
+ .activated = 1,
+};
+
+void
+t3_init_tunables(struct tom_data *t)
+{
+ t->conf = default_tunable_vals;
+
+ /* Now apply device specific fixups. */
+ t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk;
+ t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs;
+}