From f96fe5e169e8cfe06b070663cdaf7a637dfde154 Mon Sep 17 00:00:00 2001
From: kmacy
Date: Sun, 16 Dec 2007 05:27:26 +0000
Subject: Add driver for TCP offload

Sponsored by: Chelsio Inc.
---
 sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c     | 3378 ++++++++++++++++++++++++++++++++
 sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c |  560 ++++++
 sys/dev/cxgb/ulp/tom/cxgb_defs.h       |   79 +
 sys/dev/cxgb/ulp/tom/cxgb_listen.c     |  345 ++++
 sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h     |  185 ++
 sys/dev/cxgb/ulp/tom/cxgb_toepcb.h     |  112 ++
 sys/dev/cxgb/ulp/tom/cxgb_tom.c        |  500 +++++
 sys/dev/cxgb/ulp/tom/cxgb_tom.h        |  157 ++
 sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c |  106 +
 9 files changed, 5422 insertions(+)
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_defs.h
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_listen.c
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tom.c
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tom.h
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c

(limited to 'sys/dev/cxgb/ulp')

diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
new file mode 100644
index 0000000..0c796b5
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -0,0 +1,3378 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ +***************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* + * For ULP connections HW may add headers, e.g., for digests, that aren't part + * of the messages sent by the host but that are part of the TCP payload and + * therefore consume TCP sequence space. Tx connection parameters that + * operate in TCP sequence space are affected by the HW additions and need to + * compensate for them to accurately track TCP sequence numbers. This array + * contains the compensating extra lengths for ULP packets. It is indexed by + * a packet's ULP submode. + */ +const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; + +#ifdef notyet +/* + * This sk_buff holds a fake header-only TCP segment that we use whenever we + * need to exploit SW TCP functionality that expects TCP headers, such as + * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple + * CPUs without locking. + */ +static struct mbuf *tcphdr_mbuf __read_mostly; +#endif + +/* + * Size of WRs in bytes. Note that we assume all devices we are handling have + * the same WR size. + */ +static unsigned int wrlen __read_mostly; + +/* + * The number of WRs needed for an skb depends on the number of page fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + */ +static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly; + +/* + * Max receive window supported by HW in bytes. Only a small part of it can + * be set through option0, the rest needs to be set through RX_DATA_ACK. + */ +#define MAX_RCV_WND ((1U << 27) - 1) + +/* + * Min receive window. We want it to be large enough to accommodate receive + * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
+ */ +#define MIN_RCV_WND (24 * 1024U) +#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS) + +#define VALIDATE_SEQ 0 +#define VALIDATE_SOCK(so) +#define DEBUG_WR 0 + +extern int tcp_do_autorcvbuf; +extern int tcp_do_autosndbuf; +extern int tcp_autorcvbuf_max; +extern int tcp_autosndbuf_max; + +static void t3_send_reset(struct toepcb *toep); +static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); +static inline void free_atid(struct t3cdev *cdev, unsigned int tid); +static void handle_syncache_event(int event, void *arg); + + +static inline int +is_t3a(const struct toedev *dev) +{ + return (dev->tod_ttid == TOE_ID_CHELSIO_T3); +} + +static void +dump_toepcb(struct toepcb *toep) +{ + DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", + toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, + toep->tp_mtu_idx, toep->tp_tid); + + DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", + toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, + toep->tp_mss_clamp, toep->tp_flags); +} + +static struct rtentry * +rtalloc2(struct sockaddr *dst, int report, u_long ignflags) +{ + struct rtentry *rt = NULL; + + if ((rt = rtalloc1(dst, report, ignflags)) != NULL) + RT_UNLOCK(rt); + + return (rt); +} + +/* + * Determine whether to send a CPL message now or defer it. A message is + * deferred if the connection is in SYN_SENT since we don't know the TID yet. + * For connections in other states the message is sent immediately. + * If through_l2t is set the message is subject to ARP processing, otherwise + * it is sent directly. + */ +static inline void +send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t) +{ + struct toepcb *toep = tp->t_toe; + + + if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { + INP_LOCK(tp->t_inpcb); + mbufq_tail(&toep->out_of_order_queue, m); // defer + INP_UNLOCK(tp->t_inpcb); + } else if (through_l2t) + l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T + else + cxgb_ofld_send(T3C_DEV(so), m); // send directly +} + +static inline unsigned int +mkprio(unsigned int cntrl, const struct socket *so) +{ + return cntrl; +} + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. + */ +static inline void +mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid) +{ + struct cpl_tid_release *req; + + m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so)); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req = mtod(m, struct cpl_tid_release *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static inline void +make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct tx_data_wr *req; + + INP_LOCK_ASSERT(tp->t_inpcb); + + req = mtod(m, struct tx_data_wr *); + m->m_len = sizeof(*req); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | + V_TX_URG(/* skb_urgent(skb) */ 0 ) | + V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && + (tail ? 
0 : 1)))); + req->sndseq = htonl(tp->snd_nxt); + if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(toep->tp_qset)); + + /* Sendbuffer is in units of 32KB. + */ + if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) + req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); + else + req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15)); + toep->tp_flags |= TP_DATASENT; + } +} + +int +t3_push_frames(struct socket *so, int req_completion) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + struct mbuf *tail, *m0, *last; + struct t3cdev *cdev; + struct tom_data *d; + int bytes, count, total_bytes; + bus_dma_segment_t segs[TX_MAX_SEGS], *segp; + segp = segs; + + if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { + DPRINTF("tcp state=%d\n", tp->t_state); + return (0); + } + + if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { + DPRINTF("disconnecting\n"); + + return (0); + } + + INP_LOCK_ASSERT(tp->t_inpcb); + + SOCKBUF_LOCK(&so->so_snd); + + d = TOM_DATA(TOE_DEV(so)); + cdev = d->cdev; + last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb; + total_bytes = 0; + DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", + toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last); + + if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) { + KASSERT(tail, ("sbdrop error")); + last = tail = tail->m_next; + } + + if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { + DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); + SOCKBUF_UNLOCK(&so->so_snd); + return (0); + } + + toep->tp_m_last = NULL; + while (toep->tp_wr_avail && (tail != NULL)) { + count = bytes = 0; + if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { + SOCKBUF_UNLOCK(&so->so_snd); + return (0); + } + while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) + && (tail != NULL) && (count < TX_MAX_SEGS)) { + bytes += tail->m_len; + count++; + last = tail; + /* + * technically an abuse to be using this for a VA + * but less gross than defining my own structure + * or calling pmap_kextract from here :-| + */ + segp->ds_addr = (bus_addr_t)tail->m_data; + segp->ds_len = tail->m_len; + DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", + count, mbuf_wrs[count], tail->m_data, tail->m_len); + + segp++; + tail = tail->m_next; + } + DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", + toep->tp_wr_avail, count, mbuf_wrs[count], tail); + if (tail) { + so->so_snd.sb_sndptr = tail; + toep->tp_m_last = NULL; + } else + toep->tp_m_last = so->so_snd.sb_sndptr = last; + + DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); + + so->so_snd.sb_sndptroff += bytes; + total_bytes += bytes; + toep->tp_write_seq += bytes; + + + SOCKBUF_UNLOCK(&so->so_snd); + + /* + * XXX can drop socket buffer lock here + */ + + toep->tp_wr_avail -= mbuf_wrs[count]; + toep->tp_wr_unacked += mbuf_wrs[count]; + + make_tx_data_wr(so, m0, bytes, tail); + m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so)); + m_set_sgl(m0, segs); + m_set_sgllen(m0, count); + /* + * remember credits used + */ + m0->m_pkthdr.csum_data = mbuf_wrs[count]; + m0->m_pkthdr.len = bytes; + if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || + toep->tp_wr_unacked >= toep->tp_wr_max / 2) { + struct work_request_hdr *wr = cplhdr(m0); + + wr->wr_hi |= htonl(F_WR_COMPL); + toep->tp_wr_unacked = 0; + } + + m0->m_type = MT_DONTFREE; + enqueue_wr(toep, m0); + DPRINTF("sending offload tx with %d bytes in %d segments\n", + bytes, 
count); + + l2t_send(cdev, m0, toep->tp_l2t); + if (toep->tp_wr_avail && (tail != NULL)) + SOCKBUF_LOCK(&so->so_snd); + } + + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + return (total_bytes); +} + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail + * under any circumstances. We take the easy way out and always queue the + * message to the write_queue. We can optimize the case where the queue is + * already empty though the optimization is probably not worth it. + */ +static void +close_conn(struct socket *so) +{ + struct mbuf *m; + struct cpl_close_con_req *req; + struct tom_data *d; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct toepcb *toep; + unsigned int tid; + + + INP_LOCK(inp); + tp = sototcpcb(so); + toep = tp->t_toe; + + if (tp->t_state != TCPS_SYN_SENT) + t3_push_frames(so, 1); + + if (toep->tp_flags & TP_FIN_SENT) { + INP_UNLOCK(inp); + return; + } + + tid = toep->tp_tid; + + d = TOM_DATA(toep->tp_toedev); + + m = m_gethdr_nofail(sizeof(*req)); + + toep->tp_flags |= TP_FIN_SENT; + req = mtod(m, struct cpl_close_con_req *); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = htonl(toep->tp_write_seq); + INP_UNLOCK(inp); + /* + * XXX - need to defer shutdown while there is still data in the queue + * + */ + cxgb_ofld_send(d->cdev, m); + +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void +abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + struct cpl_abort_req *req = cplhdr(m); + + req->cmd = CPL_ABORT_NO_RST; + cxgb_ofld_send(cdev, m); +} + +/* + * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are + * permitted to return without sending the message in case we cannot allocate + * an sk_buff. Returns the number of credits sent. + */ +uint32_t +t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m = m_gethdr_nofail(sizeof(*req)); + + DPRINTF("returning %u credits to HW\n", credits); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + return (credits); +} + + +/* + * Set of states for which we should return RX credits. + */ +#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) + +/* + * Called after some received data has been read. It returns RX credits + * to the HW for the amount of data processed. 
+ */ +void +t3_cleanup_rbuf(struct tcpcb *tp) +{ + struct toepcb *toep = tp->t_toe; + struct socket *so; + struct toedev *dev; + int dack_mode, must_send, read; + u32 thres, credits, dack = 0; + + if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || + (tp->t_state == TCPS_FIN_WAIT_2))) + return; + INP_LOCK_ASSERT(tp->t_inpcb); + + so = tp->t_inpcb->inp_socket; + SOCKBUF_LOCK(&so->so_rcv); + read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; + toep->tp_copied_seq += read; + toep->tp_enqueued_bytes -= read; + credits = toep->tp_copied_seq - toep->tp_rcv_wup; + SOCKBUF_UNLOCK(&so->so_rcv); + + if (credits > so->so_rcv.sb_mbmax) + printf("copied_seq=%u rcv_wup=%u credits=%u\n", + toep->tp_copied_seq, toep->tp_rcv_wup, credits); + /* + * XXX this won't accurately reflect credit return - we need + * to look at the difference between the amount that has been + * put in the recv sockbuf and what is there now + */ + + if (__predict_false(!credits)) + return; + + dev = toep->tp_toedev; + thres = TOM_TUNABLE(dev, rx_credit_thres); + + if (__predict_false(thres == 0)) + return; + + if (toep->tp_ulp_mode) + dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + else { + dack_mode = TOM_TUNABLE(dev, delack); + if (__predict_false(dack_mode != toep->tp_delack_mode)) { + u32 r = tp->rcv_nxt - toep->tp_delack_seq; + + if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) + dack = F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode); + } + } + + /* + * For coalescing to work effectively ensure the receive window has + * at least 16KB left. + */ + must_send = credits + 16384 >= tp->rcv_wnd; + + if (must_send || credits >= thres) + toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); +} + +static int +cxgb_toe_disconnect(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_disconnect\n"); + + so = tp->t_inpcb->inp_socket; + close_conn(so); + return (0); +} + +static int +cxgb_toe_abort(struct tcpcb *tp) +{ + struct toepcb *toep = tp->t_toe; + + + t3_send_reset(toep); + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + tp->t_toe = NULL; + return (0); +} + +static int +cxgb_toe_send(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_send\n"); + dump_toepcb(tp->t_toe); + + so = tp->t_inpcb->inp_socket; + t3_push_frames(so, 1); + return (0); +} + +static int +cxgb_toe_rcvd(struct tcpcb *tp) +{ + INP_LOCK_ASSERT(tp->t_inpcb); + t3_cleanup_rbuf(tp); + + return (0); +} + +static void +cxgb_toe_detach(struct tcpcb *tp) +{ + struct toepcb *toep; + /* + * XXX how do we handle teardown in the SYN_SENT state? 
+ * + */ + INP_INFO_WLOCK(&tcbinfo); + toep = tp->t_toe; + toep->tp_tp = NULL; + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + tp->t_toe = NULL; + INP_INFO_WUNLOCK(&tcbinfo); +} + + +static struct toe_usrreqs cxgb_toe_usrreqs = { + .tu_disconnect = cxgb_toe_disconnect, + .tu_abort = cxgb_toe_abort, + .tu_send = cxgb_toe_send, + .tu_rcvd = cxgb_toe_rcvd, + .tu_detach = cxgb_toe_detach, + .tu_detach = cxgb_toe_detach, + .tu_syncache_event = handle_syncache_event, +}; + + +static void +__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, + uint64_t mask, uint64_t val, int no_reply) +{ + struct cpl_set_tcb_field *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + req = mtod(m, struct cpl_set_tcb_field *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); + req->reply = V_NO_REPLY(no_reply); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); + + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + send_or_defer(so, tp, m, 0); +} + +static void +t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) +{ + struct mbuf *m; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + if (toep == NULL) + return; + + if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) + return; + + m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); + + __set_tcb_field(so, m, word, mask, val, 1); +} + +/* + * Set one of the t_flags bits in the TCB. + */ +static void +set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val) +{ + t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. + */ +static void +t3_set_nagle(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + + set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. + */ +void +t3_set_keepalive(struct socket *so, int on_off) +{ + set_tcb_tflag(so, S_TF_KEEPALIVE, on_off); +} + +void +t3_set_rcv_coalesce_enable(struct socket *so, int on_off) +{ + set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. + */ +static void +t3_set_tos(struct socket *so) +{ + t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), + V_TCB_TOS(SO_TOS(so))); +} + + +/* + * In DDP mode, TP fails to schedule a timer to push RX data to the host when + * DDP is disabled (data is delivered to freelist). [Note that, the peer should + * set the PSH bit in the last segment, which would trigger delivery.] + * We work around the issue by setting a DDP buffer in a partial placed state, + * which guarantees that TP will schedule a timer. 
+ */ +#define TP_DDP_TIMER_WORKAROUND_MASK\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ + V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) +#define TP_DDP_TIMER_WORKAROUND_VAL\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ + 32)) + +static void +t3_enable_ddp(struct socket *so, int on) +{ + if (on) + t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), + V_TF_DDP_OFF(0)); + else + t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL); + +} + + +void +t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color) +{ + t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), + tag_color); +} + +void +t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset, + unsigned int len) +{ + if (buf_idx == 0) + t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + else + t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); +} + +static int +t3_set_cong_control(struct socket *so, const char *name) +{ +#ifdef notyet + int cong_algo; + + for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) + if (!strcmp(name, t3_cong_ops[cong_algo].name)) + break; + + if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) + return -EINVAL; +#endif + return 0; +} + +int +t3_get_tcb(struct socket *so) +{ + struct cpl_get_tcb *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); + + if (!m) + return (ENOMEM); + + INP_LOCK_ASSERT(tp->t_inpcb); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + req = mtod(m, struct cpl_get_tcb *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); + req->cpuno = htons(toep->tp_qset); + if (sototcpcb(so)->t_state == TCPS_SYN_SENT) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + cxgb_ofld_send(T3C_DEV(so), m); + return 0; +} + +static inline void +so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid) +{ + struct toepcb *toep = sototoep(so); + toepcb_hold(toep); + + cxgb_insert_tid(d->cdev, d->client, toep, tid); +} + +/** + * find_best_mtu - find the entry in the MTU table closest to an MTU + * @d: TOM state + * @mtu: the target MTU + * + * Returns the index of the value in the MTU table that is closest to but + * does not exceed the target MTU. 
+ */ +static unsigned int +find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return (i); +} + +static unsigned int +select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) +{ + unsigned int idx; + +#ifdef notyet + struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt; +#endif + if (tp) { + tp->t_maxseg = pmtu - 40; + if (tp->t_maxseg < td->mtus[0] - 40) + tp->t_maxseg = td->mtus[0] - 40; + idx = find_best_mtu(td, tp->t_maxseg + 40); + + tp->t_maxseg = td->mtus[idx] - 40; + } else + idx = find_best_mtu(td, pmtu); + + return (idx); +} + +void +t3_release_ddp_resources(struct toepcb *toep) +{ + /* + * This is a no-op until we have DDP support + */ +} + +static inline void +free_atid(struct t3cdev *cdev, unsigned int tid) +{ + struct toepcb *toep = cxgb_free_atid(cdev, tid); + + if (toep) + toepcb_release(toep); +} + +/* + * Release resources held by an offload connection (TID, L2T entry, etc.) + */ +static void +t3_release_offload_resources(struct toepcb *toep) +{ + struct tcpcb *tp = toep->tp_tp; + struct toedev *tdev = toep->tp_toedev; + struct t3cdev *cdev; + unsigned int tid = toep->tp_tid; + + if (!tdev) + return; + + cdev = TOEP_T3C_DEV(toep); + if (!cdev) + return; + + toep->tp_qset = 0; + t3_release_ddp_resources(toep); + +#ifdef CTRL_SKB_CACHE + kfree_skb(CTRL_SKB_CACHE(tp)); + CTRL_SKB_CACHE(tp) = NULL; +#endif + + if (toep->tp_wr_avail != toep->tp_wr_max) { + purge_wr_queue(toep); + reset_wr_list(toep); + } + + if (toep->tp_l2t) { + l2t_release(L2DATA(cdev), toep->tp_l2t); + toep->tp_l2t = NULL; + } + printf("setting toep->tp_tp to NULL\n"); + + toep->tp_tp = NULL; + if (tp) { + INP_LOCK_ASSERT(tp->t_inpcb); + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + } + + if (toep->tp_state == TCPS_SYN_SENT) { + free_atid(cdev, tid); +#ifdef notyet + __skb_queue_purge(&tp->out_of_order_queue); +#endif + } else { // we have TID + cxgb_remove_tid(cdev, toep, tid); + toepcb_release(toep); + } +#if 0 + log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); +#endif +} + +static void +install_offload_ops(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + + KASSERT(tp->t_toe != NULL, ("toepcb not set")); + + t3_install_socket_ops(so); + tp->t_flags |= TF_TOE; + tp->t_tu = &cxgb_toe_usrreqs; +} + +/* + * Determine the receive window scaling factor given a target max + * receive window. + */ +static __inline int +select_rcv_wscale(int space) +{ + int wscale = 0; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + + if (tcp_do_rfc1323) + for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; + return wscale; +} + +/* + * Determine the receive window size for a socket. + */ +static unsigned int +select_rcv_wnd(struct socket *so) +{ + struct toedev *dev = TOE_DEV(so); + struct tom_data *d = TOM_DATA(dev); + unsigned int wnd; + unsigned int max_rcv_wnd; + + if (tcp_do_autorcvbuf) + wnd = tcp_autorcvbuf_max; + else + wnd = sbspace(&so->so_rcv); + + /* XXX + * For receive coalescing to work effectively we need a receive window + * that can accomodate a coalesced segment. + */ + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + /* PR 5138 */ + max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ? + (uint32_t)d->rx_page_size * 23 : + MAX_RCV_WND); + + return min(wnd, max_rcv_wnd); +} + +/* + * Assign offload parameters to some socket fields. This code is used by + * both active and passive opens. 
+ */ +static inline void +init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, + struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) +{ + struct tcpcb *tp = sototcpcb(so); + struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); + + SOCK_LOCK_ASSERT(so); + + printf("initializing offload socket\n"); +#ifdef notyet + /* + * We either need to fix push frames to work with sbcompress + * or we need to add this + */ + so->so_rcv.sb_flags |= SB_TOE; + so->so_snd.sb_flags |= SB_TOE; +#endif + tp->t_toe = toep; + toep->tp_tp = tp; + toep->tp_toedev = dev; + + toep->tp_tid = tid; + toep->tp_l2t = e; + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_delack_mode = 0; + + toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); + /* + * XXX broken + * + */ + tp->rcv_wnd = select_rcv_wnd(so); + toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; + toep->tp_qset_idx = 0; + + reset_wr_list(toep); + DPRINTF("initialization done\n"); +} + +/* + * The next two functions calculate the option 0 value for a socket. + */ +static inline unsigned int +calc_opt0h(struct socket *so, int mtu_idx) +{ + struct tcpcb *tp = sototcpcb(so); + int wscale = select_rcv_wscale(tp->rcv_wnd); + + return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | + V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | + V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); +} + +static inline unsigned int +calc_opt0l(struct socket *so, int ulp_mode) +{ + struct tcpcb *tp = sototcpcb(so); + unsigned int val; + + val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) | + V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); + + DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val); + return (val); +} + +static inline unsigned int +calc_opt2(const struct socket *so, struct toedev *dev) +{ + int flv_valid; + + flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); + + return V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0); +} +#if 0 +(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) +#endif + +static void +mk_act_open_req(struct socket *so, struct mbuf *m, + unsigned int atid, const struct l2t_entry *e) +{ + struct cpl_act_open_req *req; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = TOE_DEV(so); + + m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so)); + + req = mtod(m, struct cpl_act_open_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); + req->local_port = inp->inp_lport; + req->peer_port = inp->inp_fport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + memcpy(&req->peer_ip, &inp->inp_faddr, 4); + DPRINTF("connect smt_idx=%d\n", e->smt_idx); + req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | + V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); + req->params = 0; + req->opt2 = htonl(calc_opt2(so, tdev)); +} + + +/* + * Convert an ACT_OPEN_RPL status to an errno. 
+ */ +static int +act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static void +fail_act_open(struct toepcb *toep, int errno) +{ + struct tcpcb *tp = toep->tp_tp; + + t3_release_offload_resources(toep); + if (tp) { + INP_LOCK_ASSERT(tp->t_inpcb); + tcp_drop(tp, errno); + } + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#endif +} + +/* + * Handle active open failures. + */ +static void +active_open_failed(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_act_open_rpl *rpl = cplhdr(m); + struct inpcb *inp; + + INP_INFO_WLOCK(&tcbinfo); + if (toep->tp_tp == NULL) + goto done; + + inp = toep->tp_tp->t_inpcb; + INP_LOCK(inp); + +/* + * Don't handle connection retry for now + */ +#ifdef notyet + struct inet_connection_sock *icsk = inet_csk(sk); + + if (rpl->status == CPL_ERR_CONN_EXIST && + icsk->icsk_retransmit_timer.function != act_open_retry_timer) { + icsk->icsk_retransmit_timer.function = act_open_retry_timer; + sk_reset_timer(so, &icsk->icsk_retransmit_timer, + jiffies + HZ / 2); + } else +#endif + fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status)); + INP_UNLOCK(inp); +done: + INP_INFO_WUNLOCK(&tcbinfo); + + m_free(m); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int +act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +/* + * Process an ACT_OPEN_RPL CPL message. + */ +static int +do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct cpl_act_open_rpl *rpl = cplhdr(m); + + if (cdev->type != T3A && act_open_has_tid(rpl->status)) + cxgb_queue_tid_release(cdev, GET_TID(rpl)); + + active_open_failed(toep, m); + return (0); +} + +/* + * Handle an ARP failure for an active open. XXX purge ofo queue + * + * XXX badly broken for crossed SYNs as the ATID is no longer valid. + * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should + * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't + * free the atid. Hmm. + */ +#ifdef notyet +static void +act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m) +{ + struct toepcb *toep = m_get_toep(m); + struct tcpcb *tp = toep->tp_tp; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = toeptoso(toep); + + INP_LOCK(inp); + if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) { + fail_act_open(so, EHOSTUNREACH); + printf("freeing %p\n", m); + + m_free(m); + } + INP_UNLOCK(inp); +} +#endif +/* + * Send an active open request. 
+ */ +int +t3_connect(struct toedev *tdev, struct socket *so, + struct rtentry *rt, struct sockaddr *nam) +{ + struct mbuf *m; + struct l2t_entry *e; + struct tom_data *d = TOM_DATA(tdev); + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + struct toepcb *toep; /* allocated by init_offload_socket */ + + int atid; + + toep = toepcb_alloc(); + if (toep == NULL) + goto out_err; + + if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) + goto out_err; + + e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); + if (!e) + goto free_tid; + + INP_LOCK_ASSERT(inp); + m = m_gethdr(MT_DATA, M_WAITOK); + +#if 0 + m->m_toe.mt_toepcb = tp->t_toe; + set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); +#endif + SOCK_LOCK(so); + + init_offload_socket(so, tdev, atid, e, rt, toep); + + install_offload_ops(so); + + mk_act_open_req(so, m, atid, e); + SOCK_UNLOCK(so); + + soisconnecting(so); + toep = tp->t_toe; + m_set_toep(m, tp->t_toe); + + printf("sending off request\n"); + + toep->tp_state = TCPS_SYN_SENT; + l2t_send(d->cdev, (struct mbuf *)m, e); + + if (toep->tp_ulp_mode) + t3_enable_ddp(so, 0); + return (0); + +free_tid: + printf("failing connect - free atid\n"); + + free_atid(d->cdev, atid); +out_err: + printf("return ENOMEM\n"); + return (ENOMEM); +} + +/* + * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do + * not send multiple ABORT_REQs for the same connection and also that we do + * not try to send a message after the connection has closed. Returns 1 if + * an ABORT_REQ wasn't generated after all, 0 otherwise. + */ +static void +t3_send_reset(struct toepcb *toep) +{ + + struct cpl_abort_req *req; + unsigned int tid = toep->tp_tid; + int mode = CPL_ABORT_SEND_RST; + struct tcpcb *tp = toep->tp_tp; + struct toedev *tdev = toep->tp_toedev; + struct socket *so = NULL; + struct mbuf *m; + + if (tp) { + INP_LOCK_ASSERT(tp->t_inpcb); + so = toeptoso(toep); + } + + if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || + tdev == NULL)) + return; + toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); + + /* Purge the send queue so we don't send anything after an abort. */ + if (so) + sbflush(&so->so_snd); + if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) + mode |= CPL_ABORT_POST_CLOSE_REQ; + + m = m_gethdr_nofail(sizeof(*req)); + m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so)); + set_arp_failure_handler(m, abort_arp_failure); + + req = mtod(m, struct cpl_abort_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; + req->rsvd1 = !(toep->tp_flags & TP_DATASENT); + req->cmd = mode; + if (tp && (tp->t_state == TCPS_SYN_SENT)) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); +} + +static int +t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct inpcb *inp; + int error, optval; + + if (sopt->sopt_name == IP_OPTIONS) + return (ENOPROTOOPT); + + if (sopt->sopt_name != IP_TOS) + return (EOPNOTSUPP); + + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + + if (error) + return (error); + + if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) + return (EPERM); + + inp = sotoinpcb(so); + inp->inp_ip_tos = optval; + + t3_set_tos(so); + + return (0); +} + +static int +t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err = 0; + size_t copied; + + if (sopt->sopt_name != TCP_CONGESTION && + sopt->sopt_name != TCP_NODELAY) + return (EOPNOTSUPP); + + if (sopt->sopt_name == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + int optlen = sopt->sopt_valsize; + struct tcpcb *tp; + + if (optlen < 1) + return (EINVAL); + + err = copyinstr(sopt->sopt_val, name, + min(TCP_CA_NAME_MAX - 1, optlen), &copied); + if (err) + return (err); + if (copied < 1) + return (EINVAL); + + tp = sototcpcb(so); + /* + * XXX I need to revisit this + */ + if ((err = t3_set_cong_control(so, name)) == 0) { +#ifdef notyet + tp->t_cong_control = strdup(name, M_CXGB); +#endif + } else + return (err); + } else { + int optval, oldval; + struct inpcb *inp; + struct tcpcb *tp; + + err = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + + if (err) + return (err); + + inp = sotoinpcb(so); + tp = intotcpcb(inp); + + INP_LOCK(inp); + + oldval = tp->t_flags; + if (optval) + tp->t_flags |= TF_NODELAY; + else + tp->t_flags &= ~TF_NODELAY; + INP_UNLOCK(inp); + + if (oldval != tp->t_flags) + t3_set_nagle(so); + + } + + return (0); +} + +static int +t3_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err; + + if (sopt->sopt_level != IPPROTO_TCP) + err = t3_ip_ctloutput(so, sopt); + else + err = t3_tcp_ctloutput(so, sopt); + + if (err != EOPNOTSUPP) + return (err); + + return tcp_ctloutput(so, sopt); +} + +/* + * Process new data received for a connection. 
+ */ +static void +new_rx_data(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_data *hdr = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); + int len = be16toh(hdr->len); + + INP_LOCK(tp->t_inpcb); + +#ifdef notyet + if (__predict_false(sk_no_receive(sk))) { + handle_excess_rx(so, skb); + return; + } + + if (ULP_MODE(tp) == ULP_MODE_TCPDDP) + handle_ddp_data(so, skb); + + TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); + TCP_SKB_CB(skb)->flags = 0; + skb_ulp_mode(skb) = 0; /* for iSCSI */ +#endif +#if VALIDATE_SEQ + if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { + printk(KERN_ERR + "%s: TID %u: Bad sequence number %u, expected %u\n", + TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, + tp->rcv_nxt); + __kfree_skb(skb); + return; + } +#endif + m_adj(m, sizeof(*hdr)); + +#ifdef notyet + /* + * We don't handle urgent data yet + */ + if (__predict_false(hdr->urg)) + handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); + if (__predict_false(tp->urg_data == TCP_URG_NOTYET && + tp->urg_seq - tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - + tp->rcv_nxt]; +#endif + if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { + toep->tp_delack_mode = hdr->dack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + + DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len); + + if (len < m->m_pkthdr.len) + m->m_pkthdr.len = m->m_len = len; + + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + toep->tp_enqueued_bytes += m->m_pkthdr.len; +#ifdef T3_TRACE + T3_TRACE2(TIDTB(sk), + "new_rx_data: seq 0x%x len %u", + TCP_SKB_CB(skb)->seq, skb->len); +#endif + SOCKBUF_LOCK(&so->so_rcv); + if (sb_notify(&so->so_rcv)) + DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len); + + sbappend_locked(&so->so_rcv, m); + KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax, + + ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", + so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax)); + + INP_UNLOCK(tp->t_inpcb); + DPRINTF("sb_cc=%d sb_mbcnt=%d\n", + so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt); + + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +/* + * Handler for RX_DATA CPL messages. 
+ */ +static int +do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); + + new_rx_data(toep, m); + + return (0); +} + +static void +new_rx_data_ddp(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data_ddp *hdr; + unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + +#ifdef notyet + if (unlikely(sk_no_receive(sk))) { + handle_excess_rx(so, m); + return; + } +#endif + tp = sototcpcb(so); + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->u.ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + bsp = &q->buf_state[buf_idx]; + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " + "hdr seq 0x%x len %u offset %u", + tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), + ntohs(hdr->len), G_DDP_OFFSET(ddp_report)); + T3_TRACE1(TIDTB(sk), + "new_rx_data_ddp: ddp_report 0x%x", + ddp_report); +#endif + + ddp_len = ntohs(hdr->len); + rcv_nxt = ntohl(hdr->seq) + ddp_len; + + /* + * Overload to store old rcv_next + */ + m->m_pkthdr.csum_data = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + + /* + * Store the length in m->m_len. We are changing the meaning of + * m->m_len here, we need to be very careful that nothing from now on + * interprets ->len of this packet the usual way. + */ + m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data; + + /* + * Figure out where the new data was placed in the buffer and store it + * in when. Assumes the buffer offset starts at 0, consumer needs to + * account for page pod's pg_offset. + */ + end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; +#ifdef notyet + TCP_SKB_CB(skb)->when = end_offset - skb->len; + + /* + * We store in mac.raw the address of the gather list where the + * placement happened. + */ + skb->mac.raw = (unsigned char *)bsp->gl; +#endif + bsp->cur_offset = end_offset; + + /* + * Bit 0 of flags stores whether the DDP buffer is completed. + * Note that other parts of the code depend on this being in bit 0. + */ + if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { +#if 0 + TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */ +#endif + panic("spurious ddp completion"); + } else { + m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); + if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; /* flip buffers */ + } + + if (bsp->flags & DDP_BF_NOCOPY) { + m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY); + bsp->flags &= ~DDP_BF_NOCOPY; + } + + if (ddp_report & F_DDP_PSH) + m->m_pkthdr.csum_flags |= DDP_BF_PSH; + + tp->t_rcvtime = ticks; + sbappendstream_locked(&so->so_rcv, m); +#ifdef notyet + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); +#endif +} + +#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ + F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ + F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ + F_DDP_INVALID_PPOD) + +/* + * Handler for RX_DATA_DDP CPL messages. 
+ */ +static int +do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + struct socket *so = toeptoso(toep); + const struct cpl_rx_data_ddp *hdr = cplhdr(m); + + VALIDATE_SOCK(so); + + if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { + log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", + GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); + return CPL_RET_BUF_DONE; + } +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + new_rx_data_ddp(so, m); + return (0); +} + +static void +process_ddp_complete(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_ddp_complete *hdr; + unsigned int ddp_report, buf_idx, when; + +#ifdef notyet + if (unlikely(sk_no_receive(sk))) { + handle_excess_rx(sk, skb); + return; + } +#endif + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + bsp = &q->buf_state[buf_idx]; + + when = bsp->cur_offset; + m->m_len = G_DDP_OFFSET(ddp_report) - when; + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report 0x%x offset %u, len %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report), skb->len); +#endif + + bsp->cur_offset += m->m_len; + + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; /* flip buffers */ + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(sk), + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report %u offset %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report)); +#endif +#if 0 + skb->mac.raw = (unsigned char *)bsp->gl; +#endif + m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + m->m_pkthdr.csum_data = tp->rcv_nxt; + tp->rcv_nxt += m->m_len; + + tp->t_rcvtime = ticks; + sbappendstream_locked(&so->so_rcv, m); +#ifdef notyet + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); +#endif +} + +/* + * Handler for RX_DDP_COMPLETE CPL messages. + */ +static int +do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + struct socket *so = toeptoso(toep); + + VALIDATE_SOCK(so); +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + process_ddp_complete(so, m); + return (0); +} + +/* + * Move a socket to TIME_WAIT state. We need to make some adjustments to the + * socket state before calling tcp_time_wait to comply with its expectations. + */ +static void +enter_timewait(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + + INP_LOCK_ASSERT(tp->t_inpcb); + /* + * Bump rcv_nxt for the peer FIN. We don't do this at the time we + * process peer_close because we don't want to carry the peer FIN in + * the socket's receive queue and if we increment rcv_nxt without + * having the FIN in the receive queue we'll confuse facilities such + * as SIOCINQ. + */ + tp->rcv_nxt++; + + tp->ts_recent_age = 0; /* defeat recycling */ + tp->t_srtt = 0; /* defeat tcp_update_metrics */ + tcp_twstart(tp); +} + +/* + * Handle a peer FIN. 
+ */ +static void +do_peer_fin(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + int keep = 0, dead = (so->so_state & SS_NOFDREF); + + DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead); + +#ifdef T3_TRACE + T3_TRACE0(TIDTB(sk),"do_peer_fin:"); +#endif + + if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { + printf("abort_pending set\n"); + + goto out; + } + +#ifdef notyet + if (ULP_MODE(tp) == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(so, skb); + if (keep < 0) + return; + } + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(so, SOCK_DONE); +#endif + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(tp->t_inpcb); + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) + socantrcvmore(so); + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + case TCPS_FIN_WAIT_2: + /* + * If we've sent an abort_req we must have sent it too late, + * HW will send us a reply telling us so, and this peer_close + * is really the last message for this connection and needs to + * be treated as an abort_rpl, i.e., transition the connection + * to TCP_CLOSE (note that the host stack does this at the + * time of generating the RST but we must wait for HW). + * Otherwise we enter TIME_WAIT. + */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + tp = tcp_close(tp); + } else + enter_timewait(so); + break; + default: + log(LOG_ERR, + "%s: TID %u received PEER_CLOSE in bad state %d\n", + TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state); + } + INP_INFO_WUNLOCK(&tcbinfo); + if (tp) + INP_UNLOCK(tp->t_inpcb); + + if (!dead) { + DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags); + + sorwakeup(so); + sowwakeup(so); + wakeup(&so->so_timeo); +#ifdef notyet + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. */ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(so, 1, POLL_HUP); + else + sk_wake_async(so, 1, POLL_IN); +#endif + } +out: + if (!keep) + m_free(m); +} + +/* + * Handler for PEER_CLOSE CPL messages. + */ +static int +do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + + VALIDATE_SOCK(so); + + do_peer_fin(so, m); + return (0); +} + +static void +process_close_con_rpl(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct cpl_close_con_rpl *rpl = cplhdr(m); + struct toepcb *toep = tp->t_toe; + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + + DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state, + !!(so->so_state & SS_NOFDREF)); + if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) + goto out; + + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(tp->t_inpcb); + switch (tp->t_state) { + case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + tp = tcp_close(tp); + + } else + enter_timewait(so); + break; + case TCPS_LAST_ACK: + /* + * In this state we don't care about pending abort_rpl. + * If we've sent abort_req it was post-close and was sent too + * late, this close_con_rpl is the actual last message. 
+ */ + t3_release_offload_resources(toep); + tp = tcp_close(tp); + break; + case TCPS_FIN_WAIT_1: +#ifdef notyet + dst_confirm(sk->sk_dst_cache); +#endif + soisdisconnecting(so); + + if ((so->so_state & SS_NOFDREF) == 0) { + /* + * Wake up lingering close + */ + sowwakeup(so); + sorwakeup(so); + wakeup(&so->so_timeo); + } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 && + (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { + tp = tcp_drop(tp, 0); + } + + break; + default: + log(LOG_ERR, + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + TOE_DEV(so)->tod_name, toep->tp_tid, + tp->t_state); + } + INP_INFO_WUNLOCK(&tcbinfo); + if (tp) + INP_UNLOCK(tp->t_inpcb); +out: + m_free(m); +} + +/* + * Handler for CLOSE_CON_RPL CPL messages. + */ +static int +do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, + void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + + VALIDATE_SOCK(so); + + process_close_con_rpl(so, m); + return (0); +} + +/* + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. + */ +static void +process_abort_rpl(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + +#ifdef T3_TRACE + T3_TRACE1(TIDTB(sk), + "process_abort_rpl: GTS rpl pending %d", + sock_flag(sk, ABORT_RPL_PENDING)); +#endif + INP_LOCK(tp->t_inpcb); + + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + /* + * XXX panic on tcpdrop + */ + if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so))) + toep->tp_flags |= TP_ABORT_RPL_RCVD; + else { + toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); + if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || + !is_t3a(TOE_DEV(so))) { + if (toep->tp_flags & TP_ABORT_REQ_RCVD) + panic("TP_ABORT_REQ_RCVD set"); + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(tp->t_inpcb); + t3_release_offload_resources(toep); + tp = tcp_close(tp); + INP_INFO_WUNLOCK(&tcbinfo); + } + } + } + if (tp) + INP_UNLOCK(tp->t_inpcb); + + m_free(m); +} + +/* + * Handle an ABORT_RPL_RSS CPL message. + */ +static int +do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct socket *so; + struct cpl_abort_rpl_rss *rpl = cplhdr(m); + struct toepcb *toep; + + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) { +discard: + m_free(m); + return (0); + } + + toep = (struct toepcb *)ctx; + + /* + * Sometimes we've already closed the socket, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the socket + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 
+ */ + if (!toep) + goto discard; + + if (toep->tp_tp == NULL) { + printf("removing tid for abort\n"); + cxgb_remove_tid(cdev, toep, toep->tp_tid); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + toepcb_release(toep); + goto discard; + } + + printf("toep=%p\n", toep); + printf("tp=%p\n", toep->tp_tp); + + so = toeptoso(toep); /* <- XXX panic */ + toepcb_hold(toep); + process_abort_rpl(so, m); + toepcb_release(toep); + return (0); +} + +/* + * Convert the status code of an ABORT_REQ into a Linux error code. Also + * indicate whether RST should be sent in response. + */ +static int +abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) +{ + struct tcpcb *tp = sototcpcb(so); + + switch (abort_reason) { + case CPL_ERR_BAD_SYN: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through +#endif + case CPL_ERR_CONN_RESET: + // XXX need to handle SYN_RECV due to crossed SYNs + return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); +#endif + return (ETIMEDOUT); + default: + return (EIO); + } +} + +static inline void +set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) +{ + struct cpl_abort_rpl *rpl = cplhdr(m); + + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + m->m_len = m->m_pkthdr.len = sizeof(*rpl); + + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; +} + +static void +send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); + m_set_priority(m, CPL_PRIORITY_DATA); + m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int +is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static void +send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (!reply_mbuf) { + /* Defer the reply. Stick rst_status into req->cmd. */ + req->status = rst_status; + t3_defer_reply(m, tdev, send_deferred_abort_rpl); + return; + } + + m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); + m_free(m); + + /* + * XXX need to sync with ARP as for SYN_RECV connections we can send + * these messages while ARP is pending. For other connection states + * it's not a problem. + */ + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); +} + +#ifdef notyet +static void +cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) +{ + UNIMPLEMENTED(); +#ifdef notyet + struct request_sock *req = child->sk_user_data; + + inet_csk_reqsk_queue_removed(parent, req); + synq_remove(tcp_sk(child)); + __reqsk_free(req); + child->sk_user_data = NULL; +#endif +} + + +/* + * Performs the actual work to abort a SYN_RECV connection. 
+ */ +static void +do_abort_syn_rcv(struct socket *child, struct socket *parent) +{ + struct tcpcb *parenttp = sototcpcb(parent); + struct tcpcb *childtp = sototcpcb(child); + + /* + * If the server is still open we clean up the child connection, + * otherwise the server already did the clean up as it was purging + * its SYN queue and the skb was just sitting in its backlog. + */ + if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { + cleanup_syn_rcv_conn(child, parent); + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(childtp->t_inpcb); + t3_release_offload_resources(childtp->t_toe); + childtp = tcp_close(childtp); + INP_INFO_WUNLOCK(&tcbinfo); + if (childtp) + INP_UNLOCK(childtp->t_inpcb); + } +} +#endif + +/* + * Handle abort requests for a SYN_RECV connection. These need extra work + * because the socket is on its parent's SYN queue. + */ +static int +abort_syn_rcv(struct socket *so, struct mbuf *m) +{ + UNIMPLEMENTED(); +#ifdef notyet + struct socket *parent; + struct toedev *tdev = TOE_DEV(so); + struct t3cdev *cdev = TOM_DATA(tdev)->cdev; + struct socket *oreq = so->so_incomp; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + + if (!oreq) + return -1; /* somehow we are not on the SYN queue */ + + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, oreq->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + SOCK_LOCK(parent); + do_abort_syn_rcv(so, parent); + send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); + SOCK_UNLOCK(parent); +#endif + return (0); +} + +/* + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. + */ +static void +process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev) +{ + int rst_status = CPL_ABORT_NO_RST; + const struct cpl_abort_req_rss *req = cplhdr(m); + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + INP_LOCK(tp->t_inpcb); + if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { + toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); + m_free(m); + goto skip; + } + + toep->tp_flags &= ~TP_ABORT_REQ_RCVD; + /* + * Three cases to consider: + * a) We haven't sent an abort_req; close the connection. + * b) We have sent a post-close abort_req that will get to TP too late + * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will + * be ignored and the connection should be closed now. + * c) We have sent a regular abort_req that will get to TP too late. + * That will generate an abort_rpl with status 0, wait for it. + */ + if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || + (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { + so->so_error = abort_status_to_errno(so, req->status, + &rst_status); +#if 0 + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); +#endif + /* + * SYN_RECV needs special processing. If abort_syn_rcv() + * returns 0 is has taken care of the abort. + */ + if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) + goto skip; + + t3_release_offload_resources(toep); + tp = tcp_close(tp); + } + if (tp) + INP_UNLOCK(tp->t_inpcb); + send_abort_rpl(m, tdev, rst_status); + return; + +skip: + INP_UNLOCK(tp->t_inpcb); +} + +/* + * Handle an ABORT_REQ_RSS CPL message. 
+ */ +static int +do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so; + struct inpcb *inp; + + if (is_neg_adv_abort(req->status)) { + m_free(m); + return (0); + } + + printf("aborting tid=%d\n", toep->tp_tid); + + if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { + cxgb_remove_tid(cdev, toep, toep->tp_tid); + toep->tp_flags |= TP_ABORT_REQ_RCVD; + printf("sending abort rpl\n"); + + send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); + printf("sent\n"); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + /* + * Unhook + */ + toep->tp_tp->t_toe = NULL; + toep->tp_tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + /* + * XXX need to call syncache_chkrst - but we don't + * have a way of doing that yet + */ + toepcb_release(toep); + printf("abort for unestablished connection :-(\n"); + return (0); + } + if (toep->tp_tp == NULL) { + printf("disconnected toepcb\n"); + /* should be freed momentarily */ + return (0); + } + + so = toeptoso(toep); + inp = sotoinpcb(so); + + VALIDATE_SOCK(so); + toepcb_hold(toep); + INP_INFO_WLOCK(&tcbinfo); + process_abort_req(so, m, TOE_DEV(so)); + INP_INFO_WUNLOCK(&tcbinfo); + toepcb_release(toep); + return (0); +} +#ifdef notyet +static void +pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) +{ + struct toedev *tdev = TOE_DEV(parent); + + do_abort_syn_rcv(child, parent); + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { + struct cpl_pass_accept_rpl *rpl = cplhdr(m); + + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } else + m_free(m); +} +#endif +static void +handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) +{ + UNIMPLEMENTED(); + +#ifdef notyet + struct t3cdev *cdev; + struct socket *parent; + struct socket *oreq; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + struct tcpcb *otp, *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + /* + * If the connection is being aborted due to the parent listening + * socket going away there's nothing to do, the ABORT_REQ will close + * the connection. + */ + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + m_free(m); + return; + } + + oreq = so->so_incomp; + otp = sototcpcb(oreq); + + cdev = T3C_DEV(so); + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, otp->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + SOCK_LOCK(parent); + pass_open_abort(so, parent, m); + SOCK_UNLOCK(parent); +#endif +} + +/* + * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly + * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV + * connection. + */ +static void +pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); +#endif + handle_pass_open_arp_failure(m_get_socket(m), m); +} + +/* + * Populate a reject CPL_PASS_ACCEPT_RPL WR. 
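+ * The reply echoes the tid and peer IP of the incoming request and sets
+ * F_TCAM_BYPASS together with a CPL_PASS_OPEN_REJECT status in opt0, so
+ * the adapter rejects the passive open instead of completing it.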
+ */ +static void +mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) +{ + struct cpl_pass_accept_req *req = cplhdr(req_mbuf); + struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); + unsigned int tid = GET_TID(req); + + m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ +} + +/* + * Send a deferred reject to an accept request. + */ +static void +reject_pass_request(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); + mk_pass_accept_rpl(reply_mbuf, m); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +static void +handle_syncache_event(int event, void *arg) +{ + struct toepcb *toep = arg; + + switch (event) { + case SC_ENTRY_PRESENT: + /* + * entry already exists - free toepcb + * and l2t + */ + printf("syncache entry present\n"); + toepcb_release(toep); + break; + case SC_DROP: + /* + * The syncache has given up on this entry + * either it timed out, or it was evicted + * we need to explicitly release the tid + */ + printf("syncache entry dropped\n"); + toepcb_release(toep); + break; + default: + log(LOG_ERR, "unknown syncache event %d\n", event); + break; + } +} + +static void +syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) +{ + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + struct inpcb *inp; + int mss, wsf, sack, ts; + + bzero(&to, sizeof(struct tcpopt)); + inp = sotoinpcb(lso); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + toep->tp_iss = th.th_seq = req->rcv_isn; + th.th_flags = TH_SYN; + + toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + DPRINTF("syncache add of %d:%d %d:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port)); + + mss = req->tcp_options.mss; + wsf = req->tcp_options.wsf; + ts = req->tcp_options.tstamp; + sack = req->tcp_options.sack; + to.to_mss = mss; + to.to_wscale = wsf; + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(inp); + syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); +} + + +/* + * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket + * lock held. Note that the sock here is a listening socket that is not owned + * by the TOE. 
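+ * On success this allocates a toepcb and an L2T entry for the new
+ * connection, claims the hardware tid, links the embryonic connection onto
+ * the listener's SYN queue, enters it into the host syncache, and answers
+ * with a CPL_PASS_ACCEPT_RPL whose opt0/opt2 fields encode the negotiated
+ * parameters.  Any failure falls through to the reject path, which answers
+ * with CPL_PASS_OPEN_REJECT (or simply releases the tid) instead.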
+ */ +static void +process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, + struct listen_ctx *lctx) +{ + int rt_flags; + struct l2t_entry *e; + struct iff_mac tim; + struct mbuf *reply_mbuf, *ddp_mbuf = NULL; + struct cpl_pass_accept_rpl *rpl; + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tom_data *d = TOM_DATA(tdev); + struct t3cdev *cdev = d->cdev; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *newtoep; + struct rtentry *dst; + struct sockaddr_in nam; + struct t3c_data *td = T3C_DATA(cdev); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + if (__predict_false(reply_mbuf == NULL)) { + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + t3_defer_reply(m, tdev, reject_pass_request); + else { + cxgb_queue_tid_release(cdev, tid); + m_free(m); + } + DPRINTF("failed to get reply_mbuf\n"); + + goto out; + } + + if (tp->t_state != TCPS_LISTEN) { + DPRINTF("socket not in listen state\n"); + + goto reject; + } + + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); + goto reject; + } + +#ifdef notyet + /* + * XXX do route lookup to confirm that we're still listening on this + * address + */ + if (ip_route_input(skb, req->local_ip, req->peer_ip, + G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) + goto reject; + rt_flags = ((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); + dst_release(skb->dst); // done with the input route, release it + skb->dst = NULL; + + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; +#endif + /* + * XXX + */ + rt_flags = RTF_LOCAL; + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; + + /* + * Calculate values and add to syncache + */ + + newtoep = toepcb_alloc(); + if (newtoep == NULL) + goto reject; + + bzero(&nam, sizeof(struct sockaddr_in)); + + nam.sin_len = sizeof(struct sockaddr_in); + nam.sin_family = AF_INET; + nam.sin_addr.s_addr =req->peer_ip; + dst = rtalloc2((struct sockaddr *)&nam, 1, 0); + + if (dst == NULL) { + printf("failed to find route\n"); + goto reject; + } + e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, + (struct sockaddr *)&nam); + if (e == NULL) { + DPRINTF("failed to get l2t\n"); + } + /* + * Point to our listen socket until accept + */ + newtoep->tp_tp = tp; + newtoep->tp_flags = TP_SYN_RCVD; + newtoep->tp_tid = tid; + newtoep->tp_toedev = tdev; + + printf("inserting tid=%d\n", tid); + cxgb_insert_tid(cdev, d->client, newtoep, tid); + SOCK_LOCK(so); + LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); + SOCK_UNLOCK(so); + + + if (lctx->ulp_mode) { + ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (!ddp_mbuf) + newtoep->tp_ulp_mode = 0; + else + newtoep->tp_ulp_mode = lctx->ulp_mode; + } + + set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); + + DPRINTF("adding request to syn cache\n"); + + /* + * XXX workaround for lack of syncache drop + */ + toepcb_hold(newtoep); + syncache_add_accept_req(req, so, newtoep); + + + + rpl = cplhdr(reply_mbuf); + reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + rpl->wr.wr_lo = 0; + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->opt2 = htonl(calc_opt2(so, tdev)); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten + + DPRINTF("accept smt_idx=%d\n", e->smt_idx); + + rpl->opt0h = 
htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | + V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); + rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | + CPL_PASS_OPEN_ACCEPT); + + DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); + + m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); + +#ifdef DEBUG_PRINT + { + int i; + + DPRINTF("rpl:\n"); + uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); + + for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) + DPRINTF("[%d] %08x\n", i, rplbuf[i]); + } +#endif + + + l2t_send(cdev, reply_mbuf, e); + m_free(m); +#ifdef notyet + /* + * XXX this call path has to be converted to not depend on sockets + */ + if (newtoep->tp_ulp_mode) + __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL, 1); + +#endif + return; +reject: + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + mk_pass_accept_rpl(reply_mbuf, m); + else + mk_tid_release(reply_mbuf, NULL, tid); + cxgb_ofld_send(cdev, reply_mbuf); + m_free(m); +out: +#if 0 + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#else + return; +#endif +} + +/* + * Handle a CPL_PASS_ACCEPT_REQ message. + */ +static int +do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct socket *lso = listen_ctx->lso; + struct tom_data *d = listen_ctx->tom_data; + +#if VALIDATE_TID + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + if (unlikely(!lsk)) { + printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", + cdev->name, + (unsigned long)((union listen_entry *)ctx - + t->stid_tab)); + return CPL_RET_BUF_DONE; + } + if (unlikely(tid >= t->ntids)) { + printk(KERN_ERR "%s: passive open TID %u too large\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } + /* + * For T3A the current user of the TID may have closed but its last + * message(s) may have been backlogged so the TID appears to be still + * in use. Just take the TID away, the connection can close at its + * own leisure. For T3B this situation is a bug. + */ + if (!valid_new_tid(t, tid) && + cdev->type != T3A) { + printk(KERN_ERR "%s: passive open uses existing TID %u\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } +#endif + + process_pass_accept_req(lso, m, &d->tdev, listen_ctx); + return (0); +} + +/* + * Called when a connection is established to translate the TCP options + * reported by HW to Linux's native format. + */ +static void +assign_rxopt(struct socket *so, unsigned int opt) +{ + const struct t3c_data *td = T3C_DATA(T3C_DEV(so)); + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + INP_LOCK_ASSERT(tp->t_inpcb); + + toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; + tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; + tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; + tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; + if (tp->t_flags & TF_RCVD_SCALE) + tp->rcv_scale = 0; +} + +/* + * Completes some final bits of initialization for just established connections + * and changes their state to TCP_ESTABLISHED. + * + * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 
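+ * Note that opt0 can only grant up to M_RCV_BUFSIZ units (apparently 1KB
+ * each, hence the << 10 below) of receive window at connection setup; if
+ * the negotiated rcv_wnd is larger, tp_rcv_wup is backed off here so that
+ * the first RX_DATA_ACK returns the remaining window to the peer as rx
+ * credits.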
+ */ +static void +make_established(struct socket *so, u32 snd_isn, unsigned int opt) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; + assign_rxopt(so, opt); + so->so_proto->pr_ctloutput = t3_ctloutput; + +#if 0 + inet_sk(sk)->id = tp->write_seq ^ jiffies; +#endif + + + /* + * XXX not clear what rcv_wup maps to + */ + /* + * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't + * pass through opt0. + */ + if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) + toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); + + dump_toepcb(toep); + +#ifdef notyet +/* + * no clean interface for marking ARP up to date + */ + dst_confirm(sk->sk_dst_cache); +#endif + tp->t_state = TCPS_ESTABLISHED; +} + +static int +syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) +{ + + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + int mss, wsf, sack, ts; + struct mbuf *m = NULL; + const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); + unsigned int opt; + +#ifdef MAC +#error "no MAC support" +#endif + + opt = ntohs(req->tcp_opt); + + bzero(&to, sizeof(struct tcpopt)); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + th.th_seq = req->rcv_isn; + th.th_flags = TH_ACK; + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; + wsf = G_TCPOPT_WSCALE_OK(opt); + ts = G_TCPOPT_TSTAMP(opt); + sack = G_TCPOPT_SACK(opt); + + to.to_mss = mss; + to.to_wscale = G_TCPOPT_SND_WSCALE(opt); + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + + DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port), + mss, wsf, ts, sack); + return syncache_expand(&inc, &to, &th, so, m); +} + + +/* + * Process a CPL_PASS_ESTABLISH message. 
XXX a lot of the locking doesn't work + * if we are in TCP_SYN_RECV due to crossed SYNs + */ +static int +do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_establish *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp; + struct socket *so, *lso; + struct t3c_data *td = T3C_DATA(cdev); + // Complete socket initialization now that we have the SND_ISN + + struct toedev *tdev; + + so = lso = toeptoso(toep); + tdev = toep->tp_toedev; + + SOCK_LOCK(so); + LIST_REMOVE(toep, synq_entry); + SOCK_UNLOCK(so); + + INP_INFO_WLOCK(&tcbinfo); + if (!syncache_expand_establish_req(req, &so, toep)) { + /* + * No entry + */ + UNIMPLEMENTED(); + } + if (so == NULL) { + /* + * Couldn't create the socket + */ + UNIMPLEMENTED(); + } + + /* + * XXX workaround for lack of syncache drop + */ + toepcb_release(toep); + + tp = sototcpcb(so); + INP_LOCK(tp->t_inpcb); +#ifdef notyet + so->so_snd.sb_flags |= SB_TOE; + so->so_rcv.sb_flags |= SB_TOE; +#endif + toep->tp_tp = tp; + toep->tp_flags = 0; + tp->t_toe = toep; + reset_wr_list(toep); + tp->rcv_wnd = select_rcv_wnd(so); + DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd); + install_offload_ops(so); + + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; + toep->tp_qset_idx = 0; + toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); + + /* + * XXX Cancel any keep alive timer + */ + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + INP_INFO_WUNLOCK(&tcbinfo); + INP_UNLOCK(tp->t_inpcb); + soisconnected(so); + +#ifdef notyet + /* + * XXX not sure how these checks map to us + */ + if (unlikely(sk->sk_socket)) { // simultaneous opens only + sk->sk_state_change(sk); + sk_wake_async(so, 0, POLL_OUT); + } + /* + * The state for the new connection is now up to date. + * Next check if we should add the connection to the parent's + * accept queue. When the parent closes it resets connections + * on its SYN queue, so check if we are being reset. If so we + * don't need to do anything more, the coming ABORT_RPL will + * destroy this socket. Otherwise move the connection to the + * accept queue. + * + * Note that we reset the synq before closing the server so if + * we are not being reset the stid is still open. + */ + if (unlikely(!tp->forward_skb_hint)) { // removed from synq + __kfree_skb(skb); + goto unlock; + } +#endif + m_free(m); + + return (0); +} + +/* + * Fill in the right TID for CPL messages waiting in the out-of-order queue + * and send them to the TOE. + */ +static void +fixup_and_send_ofo(struct socket *so) +{ + struct mbuf *m; + struct toedev *tdev = TOE_DEV(so); + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + + printf("fixup_and_send_ofo\n"); + + INP_LOCK_ASSERT(tp->t_inpcb); + while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { + /* + * A variety of messages can be waiting but the fields we'll + * be touching are common to all so any message type will do. + */ + struct cpl_close_con_req *p = cplhdr(m); + + p->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } +} + +/* + * Updates socket state from an active establish CPL message. Runs with the + * socket lock held. 
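+ * Besides seeding irs/rcv_nxt from the hardware-reported rcv_isn and
+ * finishing the transition to ESTABLISHED through make_established(), this
+ * is also the point where CPL messages that were deferred for lack of a
+ * tid (queued on out_of_order_queue) get their tid filled in and are
+ * finally sent to the adapter via fixup_and_send_ofo().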
+ */ +static void +socket_act_establish(struct socket *so, struct mbuf *m) +{ + struct cpl_act_establish *req = cplhdr(m); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + if (__predict_false(tp->t_state != TCPS_SYN_SENT)) + log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n", + toep->tp_tid, tp->t_state); + + tp->ts_recent_age = ticks; + tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; + toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + /* + * Now that we finally have a TID send any CPL messages that we had to + * defer for lack of a TID. + */ + if (mbufq_len(&toep->out_of_order_queue)) + fixup_and_send_ofo(so); + + if (__predict_false(so->so_state & SS_NOFDREF)) { +#ifdef notyet + /* + * XXX not clear what should be done here + * appears to correspond to sorwakeup_locked + */ + sk->sk_state_change(sk); + sk_wake_async(so, 0, POLL_OUT); +#endif + } + m_free(m); +#ifdef notyet +/* + * XXX assume no write requests permitted while socket connection is + * incomplete + */ + /* + * Currently the send queue must be empty at this point because the + * socket layer does not send anything before a connection is + * established. To be future proof though we handle the possibility + * that there are pending buffers to send (either TX_DATA or + * CLOSE_CON_REQ). First we need to adjust the sequence number of the + * buffers according to the just learned write_seq, and then we send + * them on their way. + */ + fixup_pending_writeq_buffers(sk); + if (t3_push_frames(so, 1)) + sk->sk_write_space(sk); +#endif + + soisconnected(so); + toep->tp_state = tp->t_state = TCPS_ESTABLISHED; + tcpstat.tcps_connects++; + +} + +/* + * Process a CPL_ACT_ESTABLISH message. + */ +static int +do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_act_establish *req = cplhdr(m); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct toedev *tdev; + struct tom_data *d; + + if (tp == NULL) { + free_atid(cdev, atid); + return (0); + } + + so = toeptoso(toep); + tdev = TOE_DEV(so); /* blow up here if link was down */ + d = TOM_DATA(tdev); + + INP_LOCK(tp->t_inpcb); + + /* + * It's OK if the TID is currently in use, the owning socket may have + * backlogged its last CPL message(s). Just take it away. + */ + toep->tp_tid = tid; + toep->tp_tp = tp; + so_insert_tid(d, so, tid); + free_atid(cdev, atid); + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + + socket_act_establish(so, m); + INP_UNLOCK(tp->t_inpcb); + return (0); +} + +/* + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. 
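+ * Each queued WR mbuf records its credit cost in m_pkthdr.csum_data and
+ * its payload length in m_pkthdr.len.  The credits returned by the ACK are
+ * drained from the head of the write-request list: fully covered WRs are
+ * dequeued and their bytes dropped from the socket send buffer, while a
+ * partially covered head WR just has its remaining cost reduced.  For
+ * example, an ACK carrying 5 credits against queued WRs costing 2, 2 and 3
+ * retires the first two and leaves the third still owing 2 credits.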
+ */ +static void +wr_ack(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct cpl_wr_ack *hdr = cplhdr(m); + struct socket *so = toeptoso(toep); + unsigned int credits = ntohs(hdr->credits); + u32 snd_una = ntohl(hdr->snd_una); + int bytes = 0; + + DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); + + INP_LOCK(tp->t_inpcb); + + toep->tp_wr_avail += credits; + if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) + toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; + + while (credits) { + struct mbuf *p = peek_wr(toep); + DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; + + if (__predict_false(!p)) { + log(LOG_ERR, "%u WR_ACK credits for TID %u with " + "nothing pending, state %u\n", + credits, toep->tp_tid, tp->t_state); + break; + } + if (__predict_false(credits < p->m_pkthdr.csum_data)) { +#if DEBUG_WR > 1 + struct tx_data_wr *w = cplhdr(p); +#ifdef notyet + log(LOG_ERR, + "TID %u got %u WR credits, need %u, len %u, " + "main body %u, frags %u, seq # %u, ACK una %u," + " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", + toep->tp_tid, credits, p->csum, p->len, + p->len - p->data_len, skb_shinfo(p)->nr_frags, + ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), + WR_AVAIL(tp), count_pending_wrs(tp) - credits); +#endif +#endif + p->m_pkthdr.csum_data -= credits; + break; + } else { + dequeue_wr(toep); + credits -= p->m_pkthdr.csum_data; + bytes += p->m_pkthdr.len; + DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); + + m_free(p); + } + } + +#if DEBUG_WR + check_wr_invariants(tp); +#endif + + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { +#if VALIDATE_SEQ + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " + "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, + toep->tp_tid, tp->snd_una); +#endif + goto out_free; + } + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = ticks; +#ifdef notyet + /* + * Keep ARP entry "minty fresh" + */ + dst_confirm(sk->sk_dst_cache); +#endif + if (tp->snd_una == tp->snd_nxt) + toep->tp_flags &= ~TP_TX_WAIT_IDLE; + } + if (bytes) { + DPRINTF("sbdrop(%d)\n", bytes); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, bytes); + sowwakeup_locked(so); + } + + if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc) + t3_push_frames(so, 0); + +out_free: + INP_UNLOCK(tp->t_inpcb); + m_free(m); +} + +/* + * Handler for TX_DATA_ACK CPL messages. + */ +static int +do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + DPRINTF("do_wr_ack\n"); + dump_toepcb(toep); + + VALIDATE_SOCK(so); + + wr_ack(toep, m); + return 0; +} + + +/* + * Reset a connection that is on a listener's SYN queue or accept queue, + * i.e., one that has not had a struct socket associated with it. + * Must be called from process context. + * + * Modeled after code in inet_csk_listen_stop(). + */ +static void +t3_reset_listen_child(struct socket *child) +{ + struct tcpcb *tp = sototcpcb(child); + + t3_send_reset(tp->t_toe); +} + +/* + * Disconnect offloaded established but not yet accepted connections sitting + * on a server's accept_queue. We just send an ABORT_REQ at this point and + * finish off the disconnect later as we may need to wait for the ABORT_RPL. 
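+ * Only sockets marked TF_TOE are reset here; connections handled by the
+ * software stack are left to the normal close path.  Each offloaded entry
+ * is reset under its inpcb lock, and its teardown completes when the
+ * corresponding ABORT_RPL arrives.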
+ */ +void +t3_disconnect_acceptq(struct socket *listen_so) +{ + struct socket *so; + struct tcpcb *tp; + + TAILQ_FOREACH(so, &listen_so->so_comp, so_list) { + tp = sototcpcb(so); + + if (tp->t_flags & TF_TOE) { + INP_LOCK(tp->t_inpcb); + t3_reset_listen_child(so); + INP_UNLOCK(tp->t_inpcb); + } + + } +} + +/* + * Reset offloaded connections sitting on a server's syn queue. As above + * we send ABORT_REQ and finish off when we get ABORT_RPL. + */ + +void +t3_reset_synq(struct listen_ctx *lctx) +{ + struct toepcb *toep; + + SOCK_LOCK(lctx->lso); + while (!LIST_EMPTY(&lctx->synq_head)) { + toep = LIST_FIRST(&lctx->synq_head); + LIST_REMOVE(toep, synq_entry); + toep->tp_tp = NULL; + t3_send_reset(toep); + cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); + toepcb_release(toep); + } + SOCK_UNLOCK(lctx->lso); +} + +void +t3_init_wr_tab(unsigned int wr_len) +{ + int i; + + if (mbuf_wrs[1]) /* already initialized */ + return; + + for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + mbuf_wrs[i] = sgl_len <= wr_len ? + 1 : 1 + (sgl_len - 2) / (wr_len - 1); + } + + wrlen = wr_len * 8; +} + +int +t3_init_cpl_io(void) +{ +#ifdef notyet + tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); + if (!tcphdr_skb) { + log(LOG_ERR, + "Chelsio TCP offload: can't allocate sk_buff\n"); + return -1; + } + skb_put(tcphdr_skb, sizeof(struct tcphdr)); + tcphdr_skb->h.raw = tcphdr_skb->data; + memset(tcphdr_skb->data, 0, tcphdr_skb->len); +#endif + + + t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); + t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); + t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); + t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); + t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); + t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); + t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); + t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); + t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); + t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); +#ifdef notyet + t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); + t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); + t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); +#endif + return (0); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c new file mode 100644 index 0000000..8cb42e1 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -0,0 +1,560 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, + struct uio *uio, struct mbuf *top, struct mbuf *control, + int flags, struct thread *td); + +static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, + int *flagsp); + +#ifdef notyet +#define VM_HOLD_WRITEABLE 0x1 +static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, + int *count, int flags); +#endif +static void vm_fault_unhold_pages(vm_page_t *m, int count); + + + +#define TMP_IOV_MAX 16 + +void +t3_init_socket_ops(void) +{ + struct protosw *prp; + + prp = pffindtype(AF_INET, SOCK_STREAM); + pru_sosend = prp->pr_usrreqs->pru_sosend; + pru_soreceive = prp->pr_usrreqs->pru_soreceive; +} + + +struct cxgb_dma_info { + size_t cdi_mapped; + int cdi_nsegs; + bus_dma_segment_t *cdi_segs; + +}; + +static void +cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs, + bus_size_t mapsize, int error) +{ + struct cxgb_dma_info *cdi = arg; + + cdi->cdi_mapped = mapsize; + cdi->cdi_nsegs = nsegs; + cdi->cdi_segs = segs; +} + +static void +iov_adj(struct iovec **iov, int *iovcnt, size_t count) +{ + struct iovec *iovtmp; + int iovcnttmp; + caddr_t ptmp; + + if (count > 0) { + iovtmp = *iov; + iovcnttmp = *iovcnt; + while (count > 0) { + if (count < iovtmp->iov_len) { + ptmp = iovtmp->iov_base; + ptmp += count; + iovtmp->iov_base = ptmp; + iovtmp->iov_len -= count; + break; + } else + count -= iovtmp->iov_len; + iovtmp++; + iovcnttmp--; + } + *iov = iovtmp; + *iovcnt = iovcnttmp; + } else if (count < 0) { + iovtmp = &(*iov)[*iovcnt - 1]; + iovcnttmp = *iovcnt; + while (count < 0) { + if (-count < iovtmp->iov_len) { + iovtmp->iov_len += count; + break; + } else + count += iovtmp->iov_len; + iovtmp--; + iovcnttmp--; + } + *iovcnt = iovcnttmp; + } +} + + +static void +cxgb_zero_copy_free(void *cl, void *arg) {} + +static int +cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) +{ + + return (EINVAL); +} + +static void +cxgb_wait_dma_completion(struct toepcb *tp) +{ + +} + +static int +cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) +{ + int i, 
seg_count, err, type; + struct mbuf *m0; + struct cxgb_dma_info cdi; + struct mbuf_vec *mv; + struct mbuf_iovec *mi; + bus_dma_segment_t *segs; + + err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio, + cxgb_dma_callback, &cdi, 0); + + if (err) + return (err); + seg_count = cdi.cdi_nsegs; + if ((m0 = mcl_alloc(seg_count, &type)) == NULL) { + bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap); + return (ENOMEM); + } + segs = cdi.cdi_segs; + m0->m_type = type; + m0->m_flags = (M_EXT|M_NOFREE); + m0->m_ext.ext_type = EXT_EXTREF; + m0->m_ext.ext_free = cxgb_zero_copy_free; + m0->m_ext.ext_args = NULL; + + mv = mtomv(m0); + mv->mv_count = seg_count; + mv->mv_first = 0; + for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++) + mi_collapse_sge(mi, segs); + + *m = m0; + + if (cdi.cdi_mapped < uio->uio_resid) { + uio->uio_resid -= cdi.cdi_mapped; + } else + uio->uio_resid = 0; + + return (0); +} + +static int +t3_sosend(struct socket *so, struct uio *uio) +{ + int rv, count, hold_resid, sent, iovcnt; + struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + struct uio uiotmp; + + /* + * Events requiring iteration: + * - number of pages exceeds max hold pages for process or system + * - number of pages exceeds maximum sg entries for a single WR + * + * We're limited to holding 128 pages at once - and we're limited to + * 34 SG entries per work request, but each SG entry can be any number + * of contiguous pages + * + */ + + uiotmp = *uio; + iovcnt = uio->uio_iovcnt; + iov = uio->uio_iov; + sent = 0; +sendmore: + /* + * Make sure we don't exceed the socket buffer + */ + count = min(toep->tp_page_count, (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE); + rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0); + hold_resid = uiotmp.uio_resid; + if (rv) + return (rv); + + /* + * Bump past sent and shave off the unheld amount + */ + if (hold_resid > 0) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + if (sent) + iov_adj(&iovtmpp, &iovcnt, sent); + iov_adj(&iovtmpp, &iovcnt, -hold_resid); + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + + } + uiotmp.uio_resid = uio->uio_resid - hold_resid; + + /* + * Push off all held pages + * + */ + while (uiotmp.uio_resid > 0) { + rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m); + if (rv) { + vm_fault_unhold_pages(toep->tp_pages, count); + return (rv); + } + uio->uio_resid -= m->m_pkthdr.len; + sent += m->m_pkthdr.len; + sbappend_locked(&so->so_snd, m); + t3_push_frames(so, TRUE); + iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); + } + /* + * Wait for pending I/O to be DMA'd to the card + * + */ + cxgb_wait_dma_completion(toep); + vm_fault_unhold_pages(toep->tp_pages, count); + /* + * If there is more data to send adjust local copy of iov + * to point to teh start + */ + if (hold_resid) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + iov_adj(&iovtmpp, &iovcnt, sent); + uiotmp = *uio; + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + goto sendmore; + } + + return (0); +} + +static int +cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags, struct thread *td) +{ + struct tcpcb *tp = sototcpcb(so); + struct toedev *tdev; + int zcopy_thres, zcopy_enabled, rv; + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the 
number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * + */ + if (tp->t_flags & TF_TOE) { + tdev = TOE_DEV(so); + zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); + zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); + + if ((uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) + && zcopy_enabled) { + rv = t3_sosend(so, uio); + if (rv != EAGAIN) + return (rv); + } + } + return pru_sosend(so, addr, uio, top, control, flags, td); +} + + +static int +t3_soreceive(struct socket *so, struct uio *uio) +{ +#ifdef notyet + int i, rv, count, hold_resid, sent, iovcnt; + struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + struct uio uiotmp; + + /* + * Events requiring iteration: + * - number of pages exceeds max hold pages for process or system + * - number of pages exceeds maximum sg entries for a single WR + * + * We're limited to holding 128 pages at once - and we're limited to + * 34 SG entries per work request, but each SG entry can be any number + * of contiguous pages + * + */ + + uiotmp = *uio; + iovcnt = uio->uio_iovcnt; + iov = uio->uio_iov; + sent = 0; + re; +#endif + return (0); +} + +static int +cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct toedev *tdev; + int rv, zcopy_thres, zcopy_enabled; + struct tcpcb *tp = sototcpcb(so); + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * - iovcnt is 1 + * + */ + if (tp->t_flags & TF_TOE) { + tdev = TOE_DEV(so); + zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); + zcopy_enabled = TOM_TUNABLE(tdev, ddp); + if ((uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) + && zcopy_enabled) { + rv = t3_soreceive(so, uio); + if (rv != EAGAIN) + return (rv); + } + } + + return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); +} + + +void +t3_install_socket_ops(struct socket *so) +{ + so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; + so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; +} + +/* + * This routine takes a user address range and does the following: + * - validate that the user has access to those pages (flags indicates read or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + * - return number of pages in count + */ +#ifdef notyet +static int +vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) +{ + + vm_offset_t start, va; + vm_paddr_t pa; + int pageslen, faults, rv; + + struct thread *td; + vm_map_t map; + pmap_t pmap; + vm_page_t m, *pages; + vm_prot_t prot; + + start = addr & ~PAGE_MASK; + pageslen = roundup2(addr + len, PAGE_SIZE); + if (*count < (pageslen >> 
PAGE_SHIFT)) + return (EFBIG); + + *count = pageslen >> PAGE_SHIFT; + /* + * Check that virtual address range is legal + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + if (addr + len > VM_MAXUSER_ADDRESS) + return (EFAULT); + + td = curthread; + map = &td->td_proc->p_vmspace->vm_map; + pmap = &td->td_proc->p_vmspace->vm_pmap; + pages = mp; + + prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ; + bzero(pages, sizeof(vm_page_t *) * (*count)); +retry: + + /* + * First optimistically assume that all pages are resident (and R/W if for write) + * if so just mark pages as held (and dirty if for write) and return + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) { + /* + * Assure that we only hold the page once + */ + if (*pages == NULL) { + /* + * page queue mutex is recursable so this is OK + * it would be really nice if we had an unlocked version of this so + * we were only acquiring the pmap lock 1 time as opposed to potentially + * many dozens of times + */ + m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + *pages = m; + if (flags & VM_HOLD_WRITEABLE) + vm_page_dirty(m); + } + } + vm_page_unlock_queues(); + + if (faults == 0) + return (0); + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary + * + */ + for (va = start; va < pageslen; va += PAGE_SIZE) { + m = NULL; + pa = pmap_extract(pmap, va); + rv = 0; + if (pa) + m = PHYS_TO_VM_PAGE(pa); + if (flags & VM_HOLD_WRITEABLE) { + if (m == NULL || (m->flags & PG_WRITEABLE) == 0) + rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); + } else if (m == NULL) + rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); + if (rv) + goto error; + } + goto retry; + +error: + vm_page_lock_queues(); + for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++) + if (*pages) + vm_page_unhold(*pages); + vm_page_unlock_queues(); + return (EFAULT); +} +#endif + +static void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h new file mode 100644 index 0000000..9077295 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h @@ -0,0 +1,79 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_DEFS_H_ +#define CXGB_DEFS_H_ + +#define VALIDATE_TID 0 + +#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe)) +#define TOE_DEV(so) (TOEPCB((so))->tp_toedev) +#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) +#define sototoep(so) (sototcpcb((so))->t_toe) + +struct listen_ctx; + +typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); + +void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h); +void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +int t3_push_frames(struct socket *so, int req_completion); +int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt, + struct sockaddr *nam); +void t3_init_listen_cpl_handlers(void); +int t3_init_cpl_io(void); +void t3_init_wr_tab(unsigned int wr_len); +uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); +void t3_cleanup_rbuf(struct tcpcb *tp); + +void t3_init_socket_ops(void); +void t3_install_socket_ops(struct socket *so); + + +void t3_disconnect_acceptq(struct socket *listen_so); +void t3_reset_synq(struct listen_ctx *ctx); +void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler); + +struct toepcb *toepcb_alloc(void); +void toepcb_hold(struct toepcb *); +void toepcb_release(struct toepcb *); +void toepcb_init(struct toepcb *); + +void t3_set_rcv_coalesce_enable(struct socket *so, int on_off); +void t3_set_keepalive(struct socket *so, int on_off); +void t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag); +void t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset, + unsigned int len); +int t3_get_tcb(struct socket *so); + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c new file mode 100644 index 0000000..e785790 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -0,0 +1,345 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid); +static int listen_hash_del(struct tom_data *d, struct socket *so); + +/* + * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release + * the STID. + */ +static int +do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_close_listserv_rpl *rpl = cplhdr(m); + unsigned int stid = GET_TID(rpl); + + if (rpl->status != CPL_ERR_NONE) + log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for " + "STID %u\n", rpl->status, stid); + else { + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + + cxgb_free_stid(cdev, stid); + free(listen_ctx, M_CXGB); + } + + return (CPL_RET_BUF_DONE); +} + +/* + * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash + * table and free the STID if there was any error, otherwise nothing to do. + */ +static int +do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(m); + + if (rpl->status != CPL_ERR_NONE) { + int stid = GET_TID(rpl); + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct tom_data *d = listen_ctx->tom_data; + struct socket *lso = listen_ctx->lso; + +#if VALIDATE_TID + if (!lso) + return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE); +#endif + /* + * Note: It is safe to unconditionally call listen_hash_del() + * at this point without risking unhashing a reincarnation of + * an already closed socket (i.e., there is no listen, close, + * listen, free the sock for the second listen while processing + * a message for the first race) because we are still holding + * a reference on the socket. It is possible that the unhash + * will fail because the socket is already closed, but we can't + * unhash the wrong socket because it is impossible for the + * socket to which this message refers to have reincarnated. 
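+ * On such an error we therefore unhash the listener, give back the stid
+ * and free the listen_ctx; in the success case there is nothing to do, as
+ * the server tid remains in use until the listen is stopped.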
+ */ + listen_hash_del(d, lso); + cxgb_free_stid(cdev, stid); +#ifdef notyet + /* + * XXX need to unreference the inpcb + * but we have no way of knowing that other TOMs aren't referencing it + */ + sock_put(lso); +#endif + free(listen_ctx, M_CXGB); + } + return CPL_RET_BUF_DONE; +} + +void +t3_init_listen_cpl_handlers(void) +{ + t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); + t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); +} + +static inline int +listen_hashfn(const struct socket *so) +{ + return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1); +} + +/* + * Create and add a listen_info entry to the listen hash table. This and the + * listen hash table functions below cannot be called from softirqs. + */ +static struct listen_info * +listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) +{ + struct listen_info *p; + + p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO); + if (p) { + int bucket = listen_hashfn(so); + + p->so = so; /* just a key, no need to take a reference */ + p->stid = stid; + mtx_lock(&d->listen_lock); + p->next = d->listen_hash_tab[bucket]; + d->listen_hash_tab[bucket] = p; + mtx_unlock(&d->listen_lock); + } + return p; +} + +#if 0 +/* + * Given a pointer to a listening socket return its server TID by consulting + * the socket->stid map. Returns -1 if the socket is not in the map. + */ +static int +listen_hash_find(struct tom_data *d, struct socket *so) +{ + int stid = -1, bucket = listen_hashfn(so); + struct listen_info *p; + + spin_lock(&d->listen_lock); + for (p = d->listen_hash_tab[bucket]; p; p = p->next) + if (p->sk == sk) { + stid = p->stid; + break; + } + spin_unlock(&d->listen_lock); + return stid; +} +#endif + +/* + * Delete the listen_info structure for a listening socket. Returns the server + * TID for the socket if it is present in the socket->stid map, or -1. + */ +static int +listen_hash_del(struct tom_data *d, struct socket *so) +{ + int bucket, stid = -1; + struct listen_info *p, **prev; + + bucket = listen_hashfn(so); + prev = &d->listen_hash_tab[bucket]; + + mtx_lock(&d->listen_lock); + for (p = *prev; p; prev = &p->next, p = p->next) + if (p->so == so) { + stid = p->stid; + *prev = p->next; + free(p, M_CXGB); + break; + } + mtx_unlock(&d->listen_lock); + + return (stid); +} + +/* + * Start a listening server by sending a passive open request to HW. 
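+ * This allocates a listen_ctx for the server, reserves a server tid (stid)
+ * with the driver, records the socket/stid pair in the listen hash, and
+ * then issues a CPL_PASS_OPEN_REQ carrying the local address and port with
+ * a wildcard peer.  CPL_CONN_POLICY_ASK presumably asks the adapter to
+ * hand each incoming SYN up to the host as a CPL_PASS_ACCEPT_REQ rather
+ * than completing the connection autonomously.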
+ */ +void +t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + int stid; + struct mbuf *m; + struct cpl_pass_open_req *req; + struct tom_data *d = TOM_DATA(dev); + struct inpcb *inp = sotoinpcb(so); + struct listen_ctx *ctx; + + if (!TOM_TUNABLE(dev, activated)) + return; + + printf("start listen\n"); + + ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT); + + if (!ctx) + return; + + ctx->tom_data = d; + ctx->lso = so; + ctx->ulp_mode = 0; /* DDP if the default */ + LIST_INIT(&ctx->synq_head); + + stid = cxgb_alloc_stid(d->cdev, d->client, ctx); + if (stid < 0) + goto free_ctx; + +#ifdef notyet + /* + * XXX need to mark inpcb as referenced + */ + sock_hold(sk); +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + goto free_stid; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + if (!listen_hash_add(d, so, stid)) + goto free_all; + + req = mtod(m, struct cpl_pass_open_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = inp->inp_lport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(16)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + return; + +free_all: + m_free(m); +free_stid: + cxgb_free_stid(cdev, stid); +#if 0 + sock_put(sk); +#endif +free_ctx: + free(ctx, M_CXGB); +} + +/* + * Stop a listening server by sending a close_listsvr request to HW. + * The server TID is freed when we get the reply. + */ +void +t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + struct mbuf *m; + struct cpl_close_listserv_req *req; + struct listen_ctx *lctx; + int stid = listen_hash_del(TOM_DATA(dev), so); + + if (stid < 0) + return; + + lctx = cxgb_get_lctx(cdev, stid); + /* + * Do this early so embryonic connections are marked as being aborted + * while the stid is still open. This ensures pass_establish messages + * that arrive while we are closing the server will be able to locate + * the listening socket. + */ + t3_reset_synq(lctx); + + /* Send the close ASAP to stop further passive opens */ + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + /* + * XXX allocate from lowmem cache + */ + } + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req = mtod(m, struct cpl_close_listserv_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req->cpu_idx = 0; + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + + t3_disconnect_acceptq(so); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h new file mode 100644 index 0000000..9fa42b5 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h @@ -0,0 +1,185 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +$FreeBSD$ + +***************************************************************************/ + +#ifndef T3_DDP_H +#define T3_DDP_H + +/* Should be 1 or 2 indicating single or double kernel buffers. */ +#define NUM_DDP_KBUF 2 + +/* min receive window for a connection to be considered for DDP */ +#define MIN_DDP_RCV_WIN (48 << 10) + +/* amount of Rx window not available to DDP to avoid window exhaustion */ +#define DDP_RSVD_WIN (16 << 10) + +/* # of sentinel invalid page pods at the end of a group of valid page pods */ +#define NUM_SENTINEL_PPODS 0 + +/* # of pages a pagepod can hold without needing another pagepod */ +#define PPOD_PAGES 4 + +/* page pods are allocated in groups of this size (must be power of 2) */ +#define PPOD_CLUSTER_SIZE 16 + +/* for each TID we reserve this many page pods up front */ +#define RSVD_PPODS_PER_TID 1 + +struct pagepod { + uint32_t pp_vld_tid; + uint32_t pp_pgsz_tag_color; + uint32_t pp_max_offset; + uint32_t pp_page_offset; + uint64_t pp_rsvd; + uint64_t pp_addr[5]; +}; + +#define PPOD_SIZE sizeof(struct pagepod) + +#define S_PPOD_TID 0 +#define M_PPOD_TID 0xFFFFFF +#define V_PPOD_TID(x) ((x) << S_PPOD_TID) + +#define S_PPOD_VALID 24 +#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID) +#define F_PPOD_VALID V_PPOD_VALID(1U) + +#define S_PPOD_COLOR 0 +#define M_PPOD_COLOR 0x3F +#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR) + +#define S_PPOD_TAG 6 +#define M_PPOD_TAG 0xFFFFFF +#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG) + +#define S_PPOD_PGSZ 30 +#define M_PPOD_PGSZ 0x3 +#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) + +struct pci_dev; +#include +#include +#include + +/* DDP gather lists can specify an offset only for the first page. 
*/ +struct ddp_gather_list { + unsigned int dgl_length; + unsigned int dgl_offset; + unsigned int dgl_nelem; + vm_page_t *dgl_pages; + bus_addr_t dgl_phys_addr[0]; +}; + +struct ddp_buf_state { + unsigned int cur_offset; /* offset of latest DDP notification */ + unsigned int flags; + struct ddp_gather_list *gl; +}; + +struct ddp_state { + struct pci_dev *pdev; + struct ddp_buf_state buf_state[2]; /* per buffer state */ + int cur_buf; + unsigned short kbuf_noinval; + unsigned short kbuf_idx; /* which HW buffer is used for kbuf */ + struct ddp_gather_list *ubuf; + unsigned int ubuf_nppods; /* # of page pods for buffer 1 */ + unsigned int ubuf_tag; + unsigned int ubuf_ddp_ready; + int get_tcb_count; + unsigned int kbuf_posted; + int cancel_ubuf; + unsigned int kbuf_nppods[NUM_DDP_KBUF]; + unsigned int kbuf_tag[NUM_DDP_KBUF]; + struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ +}; + +/* buf_state flags */ +enum { + DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */ + DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */ + DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */ + DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was + completed with a segment having the + PSH flag set */ +}; + +#ifdef notyet +/* + * Returns 1 if a UBUF DMA buffer might be active. + */ +static inline int t3_ddp_ubuf_pending(struct sock *so) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct ddp_state *p = DDP_STATE(tp); + + /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, + * but DDP_STATE() is only valid if the connection actually enabled + * DDP. + */ + if (!p) + return 0; + + return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || + (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); +} +#endif + +int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color); +int t3_alloc_ppods(struct tom_data *td, unsigned int n); +void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); +void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl); +int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len, + struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl); +int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, + int len); +//void t3_repost_kbuf(struct socket *so, int modulate, int activate); +void t3_post_kbuf(struct socket *so, int modulate); +int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +void t3_cancel_ubuf(struct socket *so); +int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall); +void t3_cleanup_ddp(struct socket *so); +void t3_release_ddp_resources(struct toepcb *toep); +void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx); +void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0, + unsigned int tag1, unsigned int len); +void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate); +#endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h new file mode 100644 index 0000000..a078bee --- /dev/null +++ 
b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -0,0 +1,112 @@ + +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef CXGB_TOEPCB_H_ +#define CXGB_TOEPCB_H_ +#include +#include + +struct toepcb { + struct toedev *tp_toedev; + struct l2t_entry *tp_l2t; + pr_ctloutput_t *tp_ctloutput; + unsigned int tp_tid; + int tp_wr_max; + int tp_wr_avail; + int tp_wr_unacked; + int tp_delack_mode; + int tp_mtu_idx; + int tp_ulp_mode; + int tp_qset_idx; + int tp_mss_clamp; + int tp_qset; + int tp_flags; + int tp_enqueued_bytes; + int tp_page_count; + int tp_state; + + tcp_seq tp_iss; + tcp_seq tp_delack_seq; + tcp_seq tp_rcv_wup; + tcp_seq tp_copied_seq; + uint64_t tp_write_seq; + + volatile int tp_refcount; + vm_page_t *tp_pages; + + struct tcpcb *tp_tp; + struct mbuf *tp_m_last; + bus_dma_tag_t tp_tx_dmat; + bus_dmamap_t tp_dmamap; + + LIST_ENTRY(toepcb) synq_entry; + struct mbuf_head wr_list; + struct mbuf_head out_of_order_queue; + struct ddp_state tp_ddp_state; +}; + +static inline void +reset_wr_list(struct toepcb *toep) +{ + + mbufq_init(&toep->wr_list); +} + +static inline void +purge_wr_queue(struct toepcb *toep) +{ + struct mbuf *m; + + while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) + m_freem(m); +} + +static inline void +enqueue_wr(struct toepcb *toep, struct mbuf *m) +{ + + mbufq_tail(&toep->wr_list, m); +} + +static inline struct mbuf * +peek_wr(struct toepcb *toep) +{ + + return (mbufq_peek(&toep->wr_list)); +} + +static inline struct mbuf * +dequeue_wr(struct toepcb *toep) +{ + + return (mbufq_dequeue(&toep->wr_list)); +} + +#endif + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c new file mode 100644 index 0000000..2dc6150 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -0,0 +1,500 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int activated = 1; +TUNABLE_INT("hw.t3toe.activated", &activated); +SYSCTL_NODE(_hw, OID_AUTO, t3toe, CTLFLAG_RD, 0, "T3 toe driver parameters"); +SYSCTL_UINT(_hw_t3toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0, + "enable TOE at init time"); + +static TAILQ_HEAD(, tom_data) cxgb_list; +static struct mtx cxgb_list_lock; + +static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); +/* + * Handlers for each CPL opcode + */ +static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS]; + +static eventhandler_tag listen_tag; + +static struct offload_id t3_toe_id_tab[] = { + { TOE_ID_CHELSIO_T3, 0 }, + { TOE_ID_CHELSIO_T3B, 0 }, + { 0 } +}; + +static struct tom_info t3_tom_info = { + .ti_attach = t3_toe_attach, + .ti_id_table = t3_toe_id_tab, + .ti_name = "Chelsio-T3" +}; + +struct cxgb_client t3c_tom_client = { + .name = "tom_cxgb3", + .remove = NULL, + .handlers = tom_cpl_handlers, + .redirect = NULL +}; + +/* + * Add an skb to the deferred skb queue for processing from process context. 
+ */ +void +t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler) +{ + struct tom_data *td = TOM_DATA(dev); + + m_set_handler(m, handler); + mtx_lock(&td->deferq.lock); + + mbufq_tail(&td->deferq, m); + if (mbufq_len(&td->deferq) == 1) + taskqueue_enqueue(td->tq, &td->deferq_task); + mtx_lock(&td->deferq.lock); +} + +struct toepcb * +toepcb_alloc(void) +{ + struct toepcb *toep; + + toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT); + + if (toep == NULL) + return (NULL); + + toepcb_init(toep); + return (toep); +} + +void +toepcb_init(struct toepcb *toep) +{ + bzero(toep, sizeof(*toep)); + toep->tp_refcount = 1; +} + +void +toepcb_hold(struct toepcb *toep) +{ + atomic_add_acq_int(&toep->tp_refcount, 1); +} + +void +toepcb_release(struct toepcb *toep) +{ + if (toep->tp_refcount == 1) { + printf("doing final toepcb free\n"); + + free(toep, M_DEVBUF); + return; + } + + atomic_add_acq_int(&toep->tp_refcount, -1); +} + +/* + * Add a T3 offload device to the list of devices we are managing. + */ +static void +t3cdev_add(struct tom_data *t) +{ + mtx_lock(&cxgb_list_lock); + TAILQ_INSERT_TAIL(&cxgb_list, t, entry); + mtx_unlock(&cxgb_list_lock); +} + +/* + * Allocate a TOM data structure, + * initialize its cpl_handlers + * and register it as a T3C client + */ +static void t3c_tom_add(struct t3cdev *cdev) +{ + int i; + unsigned int wr_len; + struct tom_data *t; + struct toedev *tdev; + struct adap_ports *port_info; + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); + + if (!t) + return; + + if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0) + goto out_free_tom; + + port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO); + if (!port_info) + goto out_free_tom; + + if (cdev->ctl(cdev, GET_PORTS, port_info) < 0) + goto out_free_all; + + t3_init_wr_tab(wr_len); + t->cdev = cdev; + t->client = &t3c_tom_client; + + /* Register TCP offload device */ + tdev = &t->tdev; + tdev->tod_ttid = (cdev->type == T3A ? + TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B); + tdev->tod_lldev = cdev->lldev; + + if (register_toedev(tdev, "toe%d")) { + printf("unable to register offload device"); + goto out_free_all; + } + TOM_DATA(tdev) = t; + + for (i = 0; i < port_info->nports; i++) { + struct ifnet *ifp = port_info->lldevs[i]; + TOEDEV(ifp) = tdev; + + ifp->if_capabilities |= IFCAP_TOE; + } + t->ports = port_info; + + /* Add device to the list of offload devices */ + t3cdev_add(t); + + /* Activate TCP offload device */ + activate_offload(tdev); + return; + +out_free_all: + free(port_info, M_CXGB); +out_free_tom: + free(t, M_CXGB); + return; +} + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int +do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, + *mtod(m, unsigned int *)); + + return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); +} + + +/* + * Add a new handler to the CPL dispatch table. A NULL handler may be supplied + * to unregister an existing handler. + */ +void +t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) +{ + if (opcode < NUM_CPL_CMDS) + tom_cpl_handlers[opcode] = h ? h : do_bad_cpl; + else + log(LOG_ERR, "Chelsio T3 TOM: handler registration for " + "opcode %u failed\n", opcode); +} + +/* + * Make a preliminary determination if a connection can be offloaded. It's OK + * to fail the offload later if we say we can offload here. For now this + * always accepts the offload request unless there are IP options. 
+ */ +static int +can_offload(struct toedev *dev, struct socket *so) +{ + struct tom_data *tomd = TOM_DATA(dev); + struct t3cdev *cdev = T3CDEV(dev->tod_lldev); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + return sotoinpcb(so)->inp_depend4.inp4_options == NULL && + tomd->conf.activated && + (tomd->conf.max_conn < 0 || + atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn); +} + + +static int tom_ctl(struct toedev *dev, unsigned int req, void *data) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + + if (cdev->ctl) + return cdev->ctl(cdev, req, data); + + return (EOPNOTSUPP); +} + +/* + * Initialize the CPL dispatch table. + */ +static void +init_cpl_handlers(void) +{ + int i; + + for (i = 0; i < NUM_CPL_CMDS; ++i) + tom_cpl_handlers[i] = do_bad_cpl; + + t3_init_listen_cpl_handlers(); +} + +static int +t3_toe_attach(struct toedev *dev, const struct offload_id *entry) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + struct ddp_params ddp; + struct ofld_page_info rx_page_info; + int err; + +#if 0 + skb_queue_head_init(&t->deferq); + T3_INIT_WORK(&t->deferq_task, process_deferq, t); + spin_lock_init(&t->listen_lock); +#endif + t3_init_tunables(t); + mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); + + /* Adjust TOE activation for this module */ + t->conf.activated = activated; + + dev->tod_can_offload = can_offload; + dev->tod_connect = t3_connect; + dev->tod_ctl = tom_ctl; +#if 0 +#ifndef NETEVENT + dev->tod_neigh_update = tom_neigh_update; +#endif + dev->tod_failover = t3_failover; +#endif + err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp); + if (err) + return err; + + err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info); + if (err) + return err; + + t->ddp_llimit = ddp.llimit; + t->ddp_ulimit = ddp.ulimit; + t->pdev = ddp.pdev; + t->rx_page_size = rx_page_info.page_size; +#ifdef notyet + /* OK if this fails, we just can't do DDP */ + t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; + t->ppod_map = t3_alloc_mem(t->nppods); +#endif + +#if 0 + spin_lock_init(&t->ppod_map_lock); + tom_proc_init(dev); +#ifdef CONFIG_SYSCTL + t->sysctl = t3_sysctl_register(dev, &t->conf); +#endif +#endif + return (0); +} + +static void +cxgb_toe_listen(void *unused, int event, struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + struct tom_data *p; + + switch (event) { + case OFLD_LISTEN_OPEN: + case OFLD_LISTEN_CLOSE: + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + if (event == OFLD_LISTEN_OPEN) + t3_listen_start(&p->tdev, so, p->cdev); + else if (tp->t_state == TCPS_LISTEN) { + printf("stopping listen on port=%d\n", + ntohs(tp->t_inpcb->inp_lport)); + + t3_listen_stop(&p->tdev, so, p->cdev); + } + + } + mtx_unlock(&cxgb_list_lock); + break; + default: + log(LOG_ERR, "unrecognized listen event %d\n", event); + break; + } +} + +static void +cxgb_register_listeners(void) +{ + struct inpcb *inp; + struct tcpcb *tp; + + INP_INFO_RLOCK(&tcbinfo); + LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { + tp = intotcpcb(inp); + + if (tp->t_state == TCPS_LISTEN) + cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp); + } + INP_INFO_RUNLOCK(&tcbinfo); +} + +static int +t3_tom_init(void) +{ + +#if 0 + struct socket *sock; + err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + printk(KERN_ERR "Could not create TCP socket, error %d\n", err); + return err; + } + + t3_def_state_change = sock->sk->sk_state_change; + t3_def_data_ready = sock->sk->sk_data_ready; + 
t3_def_error_report = sock->sk->sk_error_report; + sock_release(sock); +#endif + init_cpl_handlers(); + if (t3_init_cpl_io() < 0) + return -1; + t3_init_socket_ops(); + + /* Register with the TOE device layer. */ + + if (register_tom(&t3_tom_info) != 0) { + log(LOG_ERR, + "Unable to register Chelsio T3 TCP offload module.\n"); + return -1; + } + + mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); + listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY); + TAILQ_INIT(&cxgb_list); + + /* Register to offloading devices */ + t3c_tom_client.add = t3c_tom_add; + cxgb_register_client(&t3c_tom_client); + cxgb_register_listeners(); + return (0); +} + +static int +t3_tom_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("wheeeeee ...\n"); + + t3_tom_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("uhm, ... unloading isn't really supported for toe\n"); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data= { + "t3_tom", + t3_tom_load, + 0 +}; +MODULE_VERSION(t3_tom, 1); +MODULE_DEPEND(t3_tom, toecore, 1, 1, 1); +MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1); +DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h new file mode 100644 index 0000000..8d60bbd --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -0,0 +1,157 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_TOM_H_ +#define CXGB_TOM_H_ +#include + +#define LISTEN_INFO_HASH_SIZE 32 + +struct listen_info { + struct listen_info *next; /* Link to next entry */ + struct socket *so; /* The listening socket */ + unsigned int stid; /* The server TID */ +}; + + +/* + * TOM tunable parameters. They can be manipulated through sysctl(2) or /proc. 
+ */ +struct tom_tunables { + int max_host_sndbuf; // max host RAM consumed by a sndbuf + int tx_hold_thres; // push/pull threshold for non-full TX sk_buffs + int max_wrs; // max # of outstanding WRs per connection + int rx_credit_thres; // min # of RX credits needed for RX_DATA_ACK + int cong_alg; // Congestion control algorithm + int mss; // max TX_DATA WR payload size + int delack; // delayed ACK control + int max_conn; // maximum number of offloaded connections + int soft_backlog_limit; // whether the listen backlog limit is soft + int ddp; // whether to put new connections in DDP mode + int ddp_thres; // min recvmsg size before activating DDP + int ddp_copy_limit; // capacity of kernel DDP buffer + int ddp_push_wait; // whether blocking DDP waits for PSH flag + int ddp_rcvcoalesce; // whether receive coalescing is enabled + int zcopy_sosend_enabled; // < is never zcopied + int zcopy_sosend_partial_thres; // < is never zcopied + int zcopy_sosend_partial_copy; // bytes copied in partial zcopy + int zcopy_sosend_thres;// >= are mostly zcopied + int zcopy_sosend_copy; // bytes coped in zcopied + int zcopy_sosend_ret_pending_dma;// pot. return while pending DMA + int activated; // TOE engine activation state +}; + +struct tom_data { + TAILQ_ENTRY(tom_data) entry; + + struct t3cdev *cdev; + struct pci_dev *pdev; + struct toedev tdev; + + struct cxgb_client *client; + struct tom_tunables conf; + struct tom_sysctl_table *sysctl; + + /* + * The next three locks listen_lock, deferq.lock, and tid_release_lock + * are used rarely so we let them potentially share a cacheline. + */ + + struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE]; + struct mtx listen_lock; + + struct mbuf_head deferq; + struct task deferq_task; + + struct socket **tid_release_list; + struct mtx tid_release_lock; + struct task tid_release_task; + + volatile int tx_dma_pending; + + unsigned int ddp_llimit; + unsigned int ddp_ulimit; + + unsigned int rx_page_size; + + u8 *ppod_map; + unsigned int nppods; + struct mtx ppod_map_lock; + + struct adap_ports *ports; + struct taskqueue *tq; +}; + + +struct listen_ctx { + struct socket *lso; + struct tom_data *tom_data; + int ulp_mode; + LIST_HEAD(, toepcb) synq_head; + +}; + +#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt) +#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev) +#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev) +#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param) + +#define TP_DATASENT (1 << 0) +#define TP_TX_WAIT_IDLE (1 << 1) +#define TP_FIN_SENT (1 << 2) +#define TP_ABORT_RPL_PENDING (1 << 3) +#define TP_ABORT_SHUTDOWN (1 << 4) +#define TP_ABORT_RPL_RCVD (1 << 5) +#define TP_ABORT_REQ_RCVD (1 << 6) +#define TP_CLOSE_CON_REQUESTED (1 << 7) +#define TP_SYN_RCVD (1 << 8) +#define TP_ESTABLISHED (1 << 9) + +void t3_init_tunables(struct tom_data *t); + +static __inline struct mbuf * +m_gethdr_nofail(int len) +{ + struct mbuf *m; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + panic("implement lowmem cache\n"); + } + + KASSERT(len < MHLEN, ("requested header size too large for mbuf")); + m->m_pkthdr.len = m->m_len = len; + return (m); +} + + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c new file mode 100644 index 0000000..7219922 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c @@ -0,0 +1,106 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct tom_tunables default_tunable_vals = { + .max_host_sndbuf = 32 * 1024, + .tx_hold_thres = 0, + .max_wrs = 15, + .rx_credit_thres = 15 * 1024, + .cong_alg = -1, + .mss = 16384, + .delack = 1, + .max_conn = -1, + .soft_backlog_limit = 0, + .ddp = 0, + .ddp_thres = 14 * 4096, + .ddp_copy_limit = 13 * 4096, + .ddp_push_wait = 1, + .ddp_rcvcoalesce = 0, + .zcopy_sosend_enabled = 0, + .zcopy_sosend_partial_thres = 40960, + .zcopy_sosend_partial_copy = 4096 * 3, + .zcopy_sosend_thres = 128 * 1024, + .zcopy_sosend_copy = 4096 * 2, + .zcopy_sosend_ret_pending_dma = 1, + .activated = 1, +}; + +void t3_init_tunables(struct tom_data *t) +{ + t->conf = default_tunable_vals; + + /* Now apply device specific fixups. */ + t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; + t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; +} -- cgit v1.1
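
The server-TID bookkeeping used by t3_listen_start() and t3_listen_stop() above is a small hash table declared in cxgb_tom.h: each bucket of listen_hash_tab[] chains struct listen_info records mapping a listening socket to the stid returned by cxgb_alloc_stid(). The listen_hash_add()/listen_hash_del() helpers themselves live in cxgb_listen.c and are not reproduced in this hunk, so the userland sketch below is only a guess at the shape of such a table; the pointer hash, the malloc-based allocation, and the exact signatures are illustrative assumptions, not the driver's code.

/*
 * Hypothetical stand-in for the listen hash: maps a listening socket
 * (represented here by an opaque pointer) to its server TID.
 */
#include <stdint.h>
#include <stdlib.h>

#define LISTEN_INFO_HASH_SIZE 32

struct listen_info {
	struct listen_info *next;	/* hash-bucket link */
	void *so;			/* the listening socket */
	unsigned int stid;		/* server TID from cxgb_alloc_stid() */
};

static struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE];

/* Simple pointer hash; the real driver may mix the bits differently. */
static unsigned int
listen_hashfn(const void *so)
{
	return (((uintptr_t)so >> 8) & (LISTEN_INFO_HASH_SIZE - 1));
}

/* Record the so <-> stid mapping; returns NULL if allocation fails. */
static struct listen_info *
listen_hash_add(void *so, unsigned int stid)
{
	unsigned int bucket = listen_hashfn(so);
	struct listen_info *p;

	p = malloc(sizeof(*p));
	if (p == NULL)
		return (NULL);
	p->so = so;
	p->stid = stid;
	p->next = listen_hash_tab[bucket];
	listen_hash_tab[bucket] = p;
	return (p);
}

/* Remove the entry for 'so' and return its stid, or -1 if not found. */
static int
listen_hash_del(void *so)
{
	unsigned int bucket = listen_hashfn(so);
	struct listen_info **prev, *p;
	int stid = -1;

	for (prev = &listen_hash_tab[bucket]; (p = *prev) != NULL;
	    prev = &p->next) {
		if (p->so == so) {
			stid = p->stid;
			*prev = p->next;
			free(p);
			break;
		}
	}
	return (stid);
}

int
main(void)
{
	int dummy;

	listen_hash_add(&dummy, 7);
	return (listen_hash_del(&dummy) == 7 ? 0 : 1);
}

Returning the stid from the delete path matches how t3_listen_stop() uses it: a negative value means the socket was never offloaded, and the CPL_CLOSE_LISTSRV_REQ is skipped.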
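
cxgb_t3_ddp.h packs each DDP page pod's two header words with shift-and-mask macros: the first word carries the valid bit and the owning connection's TID, the second the page-size selector, buffer tag, and color. The routine that actually writes pods to adapter memory (t3_setup_ppods) is only prototyped in this header, so the example values, the host byte order, and the main() harness below are assumptions; the sketch just shows how the V_PPOD_* macros compose.

/*
 * Illustrative, userland-only packing of a page pod's header words using
 * the field macros declared in cxgb_t3_ddp.h.  Byte order and the DMA of
 * the pod to adapter memory are not shown.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define S_PPOD_TID	0
#define M_PPOD_TID	0xFFFFFF
#define V_PPOD_TID(x)	((x) << S_PPOD_TID)

#define S_PPOD_VALID	24
#define V_PPOD_VALID(x)	((x) << S_PPOD_VALID)
#define F_PPOD_VALID	V_PPOD_VALID(1U)

#define S_PPOD_COLOR	0
#define M_PPOD_COLOR	0x3F
#define V_PPOD_COLOR(x)	((x) << S_PPOD_COLOR)

#define S_PPOD_TAG	6
#define M_PPOD_TAG	0xFFFFFF
#define V_PPOD_TAG(x)	((x) << S_PPOD_TAG)

#define S_PPOD_PGSZ	30
#define M_PPOD_PGSZ	0x3
#define V_PPOD_PGSZ(x)	((x) << S_PPOD_PGSZ)

int
main(void)
{
	unsigned int tid = 0x1234, tag = 0x42, color = 0, pgsz = 0; /* example values */
	uint32_t vld_tid, pgsz_tag_color;

	/* First header word: valid bit plus the owning connection's TID. */
	vld_tid = F_PPOD_VALID | V_PPOD_TID(tid & M_PPOD_TID);

	/* Second header word: page size selector, buffer tag, and color. */
	pgsz_tag_color = V_PPOD_PGSZ(pgsz & M_PPOD_PGSZ) |
	    V_PPOD_TAG(tag & M_PPOD_TAG) |
	    V_PPOD_COLOR(color & M_PPOD_COLOR);

	printf("pp_vld_tid=0x%08" PRIx32 " pp_pgsz_tag_color=0x%08" PRIx32 "\n",
	    vld_tid, pgsz_tag_color);
	return (0);
}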
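
The wr_list helpers at the bottom of cxgb_toepcb.h give the per-connection toepcb a FIFO of the mbufs behind outstanding work requests, alongside the tp_wr_max/tp_wr_avail credit counters. The transmit path that consumes credits and the CPL handler that returns them live in cxgb_cpl_io.c and are not part of this section, so the sketch below only illustrates the presumed bookkeeping: a credit is consumed when a WR is handed to the adapter and returned, with its buffer freed, when the adapter acknowledges it. The wr_queue type and wrq_* names are stand-ins, not driver API.

/*
 * Simplified, userland model of WR credit accounting.  A plain linked
 * list stands in for struct mbuf_head and the "WR" is an opaque pointer.
 */
#include <stdio.h>
#include <stdlib.h>

struct wr_entry {
	struct wr_entry *next;
	void *payload;
};

struct wr_queue {
	struct wr_entry *head, *tail;
	int wr_max;	/* total credits granted by the adapter */
	int wr_avail;	/* credits still available for new WRs */
};

static void
wrq_init(struct wr_queue *q, int credits)
{
	q->head = q->tail = NULL;
	q->wr_max = q->wr_avail = credits;
}

/* Hand a WR to the "adapter": consume a credit and remember the buffer. */
static int
wrq_send(struct wr_queue *q, void *payload)
{
	struct wr_entry *e;

	if (q->wr_avail == 0)
		return (-1);		/* no credits: caller must hold the WR */
	e = malloc(sizeof(*e));
	if (e == NULL)
		return (-1);
	e->payload = payload;
	e->next = NULL;
	if (q->tail != NULL)
		q->tail->next = e;
	else
		q->head = e;
	q->tail = e;
	q->wr_avail--;
	return (0);
}

/* Credit return: the adapter acknowledged 'n' WRs, free that many buffers. */
static void
wrq_ack(struct wr_queue *q, int n)
{
	struct wr_entry *e;

	while (n-- > 0 && (e = q->head) != NULL) {
		q->head = e->next;
		if (q->head == NULL)
			q->tail = NULL;
		free(e);
		q->wr_avail++;
	}
}

int
main(void)
{
	struct wr_queue q;

	wrq_init(&q, 15);		/* cf. the max_wrs default below */
	wrq_send(&q, "wr0");
	wrq_send(&q, "wr1");
	wrq_ack(&q, 2);
	printf("credits available: %d of %d\n", q.wr_avail, q.wr_max);
	return (0);
}

The initial credit count of 15 mirrors the max_wrs default in cxgb_tom_sysctl.c, which t3_init_tunables() then overrides with the device's own limit.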
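
t3_defer_reply() in cxgb_tom.c hands CPL messages that need process-context work to a taskqueue: the mbuf is tagged with its handler, appended to tom_data's deferq under its lock, and deferq_task is enqueued only when the queue goes from empty to non-empty, so the worker is not poked while it is already draining. The pthread-based sketch below restates that pattern outside the kernel; the mutex/condvar pair stands in for the kernel mutex and taskqueue, and every name in it is mine, not the driver's.

/*
 * Userland sketch of the defer-to-process-context pattern: enqueue under
 * a lock, wake the worker only on the empty -> non-empty transition, and
 * run each item's handler from the worker thread.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct defer_item {
	struct defer_item *next;
	void (*handler)(struct defer_item *);
};

static struct defer_item *deferq_head, *deferq_tail;
static int deferq_len;
static pthread_mutex_t deferq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t deferq_cv = PTHREAD_COND_INITIALIZER;

/* Analogue of t3_defer_reply(): tag, enqueue, kick the worker if idle. */
static void
defer_reply(struct defer_item *it, void (*handler)(struct defer_item *))
{
	it->handler = handler;
	it->next = NULL;
	pthread_mutex_lock(&deferq_lock);
	if (deferq_tail != NULL)
		deferq_tail->next = it;
	else
		deferq_head = it;
	deferq_tail = it;
	if (++deferq_len == 1)		/* empty -> non-empty: kick the worker */
		pthread_cond_signal(&deferq_cv);
	pthread_mutex_unlock(&deferq_lock);
}

/* Worker loop, playing the role of the deferq_task handler. */
static void *
defer_worker(void *arg)
{
	struct defer_item *it;

	(void)arg;
	for (;;) {
		pthread_mutex_lock(&deferq_lock);
		while (deferq_head == NULL)
			pthread_cond_wait(&deferq_cv, &deferq_lock);
		it = deferq_head;
		deferq_head = it->next;
		if (deferq_head == NULL)
			deferq_tail = NULL;
		deferq_len--;
		pthread_mutex_unlock(&deferq_lock);
		it->handler(it);	/* runs outside the "interrupt" path */
	}
	return (NULL);
}

static void
print_handler(struct defer_item *it)
{
	(void)it;
	printf("deferred CPL handled in process context\n");
}

int
main(void)
{
	pthread_t tid;
	static struct defer_item item;

	pthread_create(&tid, NULL, defer_worker, NULL);
	defer_reply(&item, print_handler);
	sleep(1);			/* give the worker a moment, then exit */
	return (0);
}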