diff options
author | kmacy <kmacy@FreeBSD.org> | 2008-02-23 01:06:17 +0000 |
---|---|---|
committer | kmacy <kmacy@FreeBSD.org> | 2008-02-23 01:06:17 +0000 |
commit | 48fe676ff5ddc104ebc346eebf48c7c0e285f833 (patch) | |
tree | 02a3e854ca5eb4caea80ce68a9a12f620befb52d /sys/dev/cxgb/ulp | |
parent | df26e399aa077b14fb965be866012bccf2847bae (diff) | |
download | FreeBSD-src-48fe676ff5ddc104ebc346eebf48c7c0e285f833.zip FreeBSD-src-48fe676ff5ddc104ebc346eebf48c7c0e285f833.tar.gz |
- update firmware to 5.0
- add support for T3C
- add DDP support (zero-copy receive)
- fix TOE transmit of large requests
- fix shutdown so that sockets don't remain in CLOSING state indefinitely
- register listeners when an interface is brought up after tom is loaded
- fix setting of multicast filter
- enable link at device attach
- exit tick handler if shutdown is in progress
- add helper for logging TCB
- add sysctls for dumping transmit queues
- note that TOE will not be MFC'd until after 7.0 has been finalized
MFC after: 3 days
Diffstat (limited to 'sys/dev/cxgb/ulp')
-rw-r--r-- | sys/dev/cxgb/ulp/toecore/cxgb_toedev.h | 4 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 1569 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c | 729 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_ddp.c | 735 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_defs.h | 10 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_listen.c | 22 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h | 52 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c | 694 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c | 1362 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_toepcb.h | 81 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom.c | 102 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom.h | 2 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c | 18 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_vm.c | 180 | ||||
-rw-r--r-- | sys/dev/cxgb/ulp/tom/cxgb_vm.h | 40 |
15 files changed, 2898 insertions, 2702 deletions
diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h index 8e88d6b..c70c37d 100644 --- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h +++ b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h @@ -41,6 +41,8 @@ enum { TOE_ID_CHELSIO_T2, TOE_ID_CHELSIO_T3, TOE_ID_CHELSIO_T3B, -}; + TOE_ID_CHELSIO_T3C, +} + ; #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index 0f2f2ee..96e5b65 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/limits.h> +#include <sys/ktr.h> #include <sys/lock.h> #include <sys/mbuf.h> #include <sys/mutex.h> @@ -63,9 +64,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_offload.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_syncache.h> +#include <netinet/tcp_timer.h> #include <net/route.h> - #include <dev/cxgb/t3cdev.h> #include <dev/cxgb/common/cxgb_firmware_exports.h> #include <dev/cxgb/common/cxgb_t3_cpl.h> @@ -84,8 +85,6 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/ulp/tom/cxgb_toepcb.h> #include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - /* * For ULP connections HW may add headers, e.g., for digests, that aren't part * of the messages sent by the host but that are part of the TCP payload and @@ -118,7 +117,7 @@ static unsigned int wrlen __read_mostly; * in the skb and whether it has any payload in its main body. This maps the * length of the gather list represented by an skb into the # of necessary WRs. */ -static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly; +static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; /* * Max receive window supported by HW in bytes. 
Only a small part of it can @@ -147,6 +146,37 @@ static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); static inline void free_atid(struct t3cdev *cdev, unsigned int tid); static void handle_syncache_event(int event, void *arg); +static inline void +SBAPPEND(struct sockbuf *sb, struct mbuf *n) +{ + struct mbuf * m; + + m = sb->sb_mb; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + m = n; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + sbappend_locked(sb, n); + m = sb->sb_mb; + while (m) { + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } +} static inline int is_t3a(const struct toedev *dev) @@ -166,6 +196,7 @@ dump_toepcb(struct toepcb *toep) toep->tp_mss_clamp, toep->tp_flags); } +#ifndef RTALLOC2_DEFINED static struct rtentry * rtalloc2(struct sockaddr *dst, int report, u_long ignflags) { @@ -176,7 +207,7 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags) return (rt); } - +#endif /* * Determine whether to send a CPL message now or defer it. A message is * deferred if the connection is in SYN_SENT since we don't know the TID yet. @@ -185,39 +216,39 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags) * it is sent directly. 
*/ static inline void -send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t) +send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) { - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp = toep->tp_tp; - if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { INP_LOCK(tp->t_inpcb); mbufq_tail(&toep->out_of_order_queue, m); // defer INP_UNLOCK(tp->t_inpcb); } else if (through_l2t) - l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T else - cxgb_ofld_send(T3C_DEV(so), m); // send directly + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly } static inline unsigned int -mkprio(unsigned int cntrl, const struct socket *so) +mkprio(unsigned int cntrl, const struct toepcb *toep) { - return cntrl; + return (cntrl); } /* * Populate a TID_RELEASE WR. The skb must be already propely sized. */ static inline void -mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid) +mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) { struct cpl_tid_release *req; - m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); m->m_pkthdr.len = m->m_len = sizeof(*req); req = mtod(m, struct cpl_tid_release *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); } @@ -257,6 +288,8 @@ make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) } } +#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ + int t3_push_frames(struct socket *so, int req_completion) { @@ -266,9 +299,8 @@ t3_push_frames(struct socket *so, int req_completion) struct mbuf *tail, *m0, *last; struct t3cdev *cdev; struct tom_data *d; - int bytes, count, total_bytes; + int i, bytes, count, total_bytes; bus_dma_segment_t segs[TX_MAX_SEGS], *segp; - segp = segs; if (tp->t_state == TCPS_SYN_SENT || tp->t_state 
== TCPS_CLOSED) { DPRINTF("tcp state=%d\n", tp->t_state); @@ -281,10 +313,9 @@ t3_push_frames(struct socket *so, int req_completion) return (0); } - INP_LOCK_ASSERT(tp->t_inpcb); + INP_LOCK_ASSERT(tp->t_inpcb); SOCKBUF_LOCK(&so->so_snd); - d = TOM_DATA(TOE_DEV(so)); cdev = d->cdev; last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb; @@ -306,61 +337,103 @@ t3_push_frames(struct socket *so, int req_completion) toep->tp_m_last = NULL; while (toep->tp_wr_avail && (tail != NULL)) { count = bytes = 0; + segp = segs; if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } - while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) - && (tail != NULL) && (count < TX_MAX_SEGS)) { - bytes += tail->m_len; - count++; + /* + * If the data in tail fits as in-line, then + * make an immediate data wr. + */ + if (tail->m_len <= IMM_LEN) { + count = 1; + bytes = tail->m_len; last = tail; - /* - * technically an abuse to be using this for a VA - * but less gross than defining my own structure - * or calling pmap_kextract from here :-| - */ - segp->ds_addr = (bus_addr_t)tail->m_data; - segp->ds_len = tail->m_len; - DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", - count, mbuf_wrs[count], tail->m_data, tail->m_len); - - segp++; tail = tail->m_next; + m_set_sgl(m0, NULL); + m_set_sgllen(m0, 0); + make_tx_data_wr(so, m0, bytes, tail); + m_append(m0, bytes, mtod(last, caddr_t)); + KASSERT(!m0->m_next, ("bad append")); + } else { + while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) + && (tail != NULL) && (count < TX_MAX_SEGS-1)) { + bytes += tail->m_len; + last = tail; + count++; + /* + * technically an abuse to be using this for a VA + * but less gross than defining my own structure + * or calling pmap_kextract from here :-| + */ + segp->ds_addr = (bus_addr_t)tail->m_data; + segp->ds_len = tail->m_len; + DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", + count, mbuf_wrs[count], tail->m_data, tail->m_len); + segp++; + 
tail = tail->m_next; + } + DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", + toep->tp_wr_avail, count, mbuf_wrs[count], tail); + + m_set_sgl(m0, segs); + m_set_sgllen(m0, count); + make_tx_data_wr(so, m0, bytes, tail); } - DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", - toep->tp_wr_avail, count, mbuf_wrs[count], tail); + m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); + if (tail) { so->so_snd.sb_sndptr = tail; toep->tp_m_last = NULL; } else toep->tp_m_last = so->so_snd.sb_sndptr = last; + DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); so->so_snd.sb_sndptroff += bytes; total_bytes += bytes; toep->tp_write_seq += bytes; - - - SOCKBUF_UNLOCK(&so->so_snd); - - /* - * XXX can drop socket buffer lock here - */ + CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d", + toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff); + if (tail) + CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una); + else + CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tp->snd_una); + + + i = 0; + while (i < count && m_get_sgllen(m0)) { + if ((count - i) >= 3) { + CTR6(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len, + segs[i + 2].ds_addr, segs[i + 2].ds_len); + i += 3; + } else if ((count - i) == 2) { + CTR4(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len); + i += 2; + } else { + CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len); + i++; + } - toep->tp_wr_avail -= mbuf_wrs[count]; - toep->tp_wr_unacked += mbuf_wrs[count]; + } - make_tx_data_wr(so, m0, bytes, tail); - m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so)); - m_set_sgl(m0, 
segs); - m_set_sgllen(m0, count); - /* + /* * remember credits used */ m0->m_pkthdr.csum_data = mbuf_wrs[count]; m0->m_pkthdr.len = bytes; + toep->tp_wr_avail -= mbuf_wrs[count]; + toep->tp_wr_unacked += mbuf_wrs[count]; + if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || toep->tp_wr_unacked >= toep->tp_wr_max / 2) { struct work_request_hdr *wr = cplhdr(m0); @@ -368,18 +441,16 @@ t3_push_frames(struct socket *so, int req_completion) wr->wr_hi |= htonl(F_WR_COMPL); toep->tp_wr_unacked = 0; } - + KASSERT((m0->m_pkthdr.csum_data > 0) && + (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", + m0->m_pkthdr.csum_data)); m0->m_type = MT_DONTFREE; enqueue_wr(toep, m0); DPRINTF("sending offload tx with %d bytes in %d segments\n", bytes, count); - l2t_send(cdev, m0, toep->tp_l2t); - if (toep->tp_wr_avail && (tail != NULL)) - SOCKBUF_LOCK(&so->so_snd); } - - SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); return (total_bytes); } @@ -467,13 +538,105 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail req = mtod(m, struct cpl_rx_data_ack *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); - m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); return (credits); } +/* + * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. + * This is only used in DDP mode, so we take the opportunity to also set the + * DACK mode and flush any Rx credits. 
+ */ +void +t3_send_rx_modulate(struct toepcb *toep) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + + m = m_gethdr_nofail(sizeof(*req)); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | + V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + toep->tp_rcv_wup = toep->tp_copied_seq; +} + +/* + * Handle receipt of an urgent pointer. + */ +static void +handle_urg_ptr(struct socket *so, uint32_t urg_seq) +{ +#ifdef URGENT_DATA_SUPPORTED + struct tcpcb *tp = sototcpcb(so); + + urg_seq--; /* initially points past the urgent data, per BSD */ + + if (tp->urg_data && !after(urg_seq, tp->urg_seq)) + return; /* duplicate pointer */ + sk_send_sigurg(sk); + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + tp->copied_seq++; + if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) + tom_eat_skb(sk, skb, 0); + } + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = urg_seq; +#endif +} + +/* + * Returns true if a socket cannot accept new Rx data. + */ +static inline int +so_no_receive(const struct socket *so) +{ + return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); +} + +/* + * Process an urgent data notification. + */ +static void +rx_urg_notify(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_urg_notify *hdr = cplhdr(m); + struct socket *so = toeptoso(toep); + + VALIDATE_SOCK(so); + + if (!so_no_receive(so)) + handle_urg_ptr(so, ntohl(hdr->seq)); + + m_freem(m); +} + +/* + * Handler for RX_URG_NOTIFY CPL messages. 
+ */ +static int +do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + rx_urg_notify(toep, m); + return (0); +} /* * Set of states for which we should return RX credits. @@ -485,7 +648,7 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail * to the HW for the amount of data processed. */ void -t3_cleanup_rbuf(struct tcpcb *tp) +t3_cleanup_rbuf(struct tcpcb *tp, int copied) { struct toepcb *toep = tp->t_toe; struct socket *so; @@ -493,23 +656,38 @@ t3_cleanup_rbuf(struct tcpcb *tp) int dack_mode, must_send, read; u32 thres, credits, dack = 0; + so = tp->t_inpcb->inp_socket; if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || - (tp->t_state == TCPS_FIN_WAIT_2))) + (tp->t_state == TCPS_FIN_WAIT_2))) { + if (copied) { + SOCKBUF_LOCK(&so->so_rcv); + toep->tp_copied_seq += copied; + SOCKBUF_UNLOCK(&so->so_rcv); + } + return; - INP_LOCK_ASSERT(tp->t_inpcb); + } - so = tp->t_inpcb->inp_socket; + INP_LOCK_ASSERT(tp->t_inpcb); SOCKBUF_LOCK(&so->so_rcv); - read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; - toep->tp_copied_seq += read; - toep->tp_enqueued_bytes -= read; + if (copied) + toep->tp_copied_seq += copied; + else { + read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; + toep->tp_copied_seq += read; + } credits = toep->tp_copied_seq - toep->tp_rcv_wup; + toep->tp_enqueued_bytes = so->so_rcv.sb_cc; SOCKBUF_UNLOCK(&so->so_rcv); - if (credits > so->so_rcv.sb_mbmax) + if (credits > so->so_rcv.sb_mbmax) { printf("copied_seq=%u rcv_wup=%u credits=%u\n", toep->tp_copied_seq, toep->tp_rcv_wup, credits); - /* + credits = so->so_rcv.sb_mbmax; + } + + + /* * XXX this won't accurately reflect credit return - we need * to look at the difference between the amount that has been * put in the recv sockbuf and what is there now @@ -593,7 +771,7 @@ static int cxgb_toe_rcvd(struct tcpcb *tp) { INP_LOCK_ASSERT(tp->t_inpcb); - t3_cleanup_rbuf(tp); + 
t3_cleanup_rbuf(tp, 0); return (0); } @@ -631,16 +809,18 @@ static struct toe_usrreqs cxgb_toe_usrreqs = { static void -__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, +__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, uint64_t mask, uint64_t val, int no_reply) { struct cpl_set_tcb_field *req; - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + + CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + toep->tp_tid, word, mask, val); req = mtod(m, struct cpl_set_tcb_field *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); req->reply = V_NO_REPLY(no_reply); req->cpu_idx = 0; @@ -648,8 +828,8 @@ __set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, req->mask = htobe64(mask); req->val = htobe64(val); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); - send_or_defer(so, tp, m, 0); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + send_or_defer(toep, m, 0); } static void @@ -661,13 +841,15 @@ t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) if (toep == NULL) return; - - if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) + + if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { + printf("not seting field\n"); return; - + } + m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); - __set_tcb_field(so, m, word, mask, val, 1); + __set_tcb_field(toep, m, word, mask, val, 1); } /* @@ -735,10 +917,11 @@ t3_set_tos(struct socket *so) static void t3_enable_ddp(struct socket *so, int on) { - if (on) + if (on) { + t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), V_TF_DDP_OFF(0)); - else + } else t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, @@ -747,7 +930,6 @@ t3_enable_ddp(struct socket *so, int on) } - void 
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color) { @@ -777,7 +959,7 @@ t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset, static int t3_set_cong_control(struct socket *so, const char *name) { -#ifdef notyet +#ifdef CONGESTION_CONTROL_SUPPORTED int cong_algo; for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) @@ -802,12 +984,14 @@ t3_get_tcb(struct socket *so) return (ENOMEM); INP_LOCK_ASSERT(tp->t_inpcb); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); req = mtod(m, struct cpl_get_tcb *); m->m_pkthdr.len = m->m_len = sizeof(*req); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); req->cpuno = htons(toep->tp_qset); + req->rsvd = 0; if (sototcpcb(so)->t_state == TCPS_SYN_SENT) mbufq_tail(&toep->out_of_order_queue, m); // defer else @@ -863,14 +1047,6 @@ select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) return (idx); } -void -t3_release_ddp_resources(struct toepcb *toep) -{ - /* - * This is a no-op until we have DDP support - */ -} - static inline void free_atid(struct t3cdev *cdev, unsigned int tid) { @@ -915,8 +1091,6 @@ t3_release_offload_resources(struct toepcb *toep) l2t_release(L2DATA(cdev), toep->tp_l2t); toep->tp_l2t = NULL; } - printf("setting toep->tp_tp to NULL\n"); - toep->tp_tp = NULL; if (tp) { INP_LOCK_ASSERT(tp->t_inpcb); @@ -964,16 +1138,16 @@ select_rcv_wscale(int space) if (tcp_do_rfc1323) for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; - return wscale; + + return (wscale); } /* * Determine the receive window size for a socket. 
*/ -static unsigned int -select_rcv_wnd(struct socket *so) +static unsigned long +select_rcv_wnd(struct toedev *dev, struct socket *so) { - struct toedev *dev = TOE_DEV(so); struct tom_data *d = TOM_DATA(dev); unsigned int wnd; unsigned int max_rcv_wnd; @@ -981,7 +1155,9 @@ select_rcv_wnd(struct socket *so) if (tcp_do_autorcvbuf) wnd = tcp_autorcvbuf_max; else - wnd = sbspace(&so->so_rcv); + wnd = so->so_rcv.sb_hiwat; + + /* XXX * For receive coalescing to work effectively we need a receive window @@ -991,7 +1167,7 @@ select_rcv_wnd(struct socket *so) wnd = MIN_RCV_WND; /* PR 5138 */ - max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ? + max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? (uint32_t)d->rx_page_size * 23 : MAX_RCV_WND); @@ -1017,7 +1193,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, * or we need to add this */ so->so_snd.sb_flags |= SB_NOCOALESCE; - + so->so_rcv.sb_flags |= SB_NOCOALESCE; + tp->t_toe = toep; toep->tp_tp = tp; toep->tp_toedev = dev; @@ -1033,7 +1210,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, * XXX broken * */ - tp->rcv_wnd = select_rcv_wnd(so); + tp->rcv_wnd = select_rcv_wnd(dev, so); + toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) && tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; toep->tp_qset_idx = 0; @@ -1076,9 +1254,23 @@ calc_opt2(const struct socket *so, struct toedev *dev) flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); - return V_FLAVORS_VALID(flv_valid) | - V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0); + return (V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0)); } + +#if DEBUG_WR > 1 +static int +count_pending_wrs(const struct toepcb *toep) +{ + const struct mbuf *m; + int n = 0; + + wr_queue_walk(toep, m) + n += m->m_pkthdr.csum_data; + return (n); +} +#endif + #if 0 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) #endif @@ -1093,18 +1285,18 @@ mk_act_open_req(struct socket *so, struct mbuf *m, struct toepcb *toep = tp->t_toe; struct toedev *tdev = TOE_DEV(so); - m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so)); + m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); req = mtod(m, struct cpl_act_open_req *); m->m_pkthdr.len = m->m_len = sizeof(*req); - + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); req->local_port = inp->inp_lport; req->peer_port = inp->inp_fport; memcpy(&req->local_ip, &inp->inp_laddr, 4); memcpy(&req->peer_ip, &inp->inp_faddr, 4); - DPRINTF("connect smt_idx=%d\n", e->smt_idx); req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); @@ -1144,7 +1336,7 @@ fail_act_open(struct toepcb *toep, int errno) t3_release_offload_resources(toep); if (tp) { INP_LOCK_ASSERT(tp->t_inpcb); - cxgb_tcp_drop(tp, errno); + tcp_drop(tp, errno); } #ifdef notyet @@ -1289,8 +1481,6 @@ t3_connect(struct toedev *tdev, struct socket *so, toep = tp->t_toe; m_set_toep(m, tp->t_toe); - printf("sending off request\n"); - toep->tp_state = TCPS_SYN_SENT; l2t_send(d->cdev, (struct mbuf *)m, e); @@ -1342,7 +1532,7 @@ t3_send_reset(struct toepcb *toep) mode |= CPL_ABORT_POST_CLOSE_REQ; m = m_gethdr_nofail(sizeof(*req)); - m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so)); + m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); set_arp_failure_handler(m, abort_arp_failure); req = mtod(m, struct cpl_abort_req *); @@ -1416,7 +1606,7 @@ t3_tcp_ctloutput(struct socket *so, struct 
sockopt *sopt) * XXX I need to revisit this */ if ((err = t3_set_cong_control(so, name)) == 0) { -#ifdef notyet +#ifdef CONGESTION_CONTROL_SUPPORTED tp->t_cong_control = strdup(name, M_CXGB); #endif } else @@ -1465,7 +1655,280 @@ t3_ctloutput(struct socket *so, struct sockopt *sopt) if (err != EOPNOTSUPP) return (err); - return tcp_ctloutput(so, sopt); + return (tcp_ctloutput(so, sopt)); +} + +/* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int +need_rst_on_excess_rx(const struct toepcb *toep) +{ + return (1); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. + */ +static void +handle_excess_rx(struct toepcb *toep, struct mbuf *m) +{ + + if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN)) + t3_send_reset(toep); + m_freem(m); +} + +/* + * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) + * by getting the DDP offset from the TCB. + */ +static void +tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct ddp_state *q = &toep->tp_ddp_state; + struct ddp_buf_state *bsp; + struct cpl_get_tcb_rpl *hdr; + unsigned int ddp_offset; + struct socket *so; + struct tcpcb *tp; + + uint64_t t; + __be64 *tcb; + + so = toeptoso(toep); + tp = toep->tp_tp; + + INP_LOCK_ASSERT(tp->t_inpcb); + SOCKBUF_LOCK(&so->so_rcv); + + /* Note that we only accout for CPL_GET_TCB issued by the DDP code. We + * really need a cookie in order to dispatch the RPLs. + */ + q->get_tcb_count--; + + /* It is a possible that a previous CPL already invalidated UBUF DDP + * and moved the cur_buf idx and hence no further processing of this + * skb is required. However, the app might be sleeping on + * !q->get_tcb_count and we need to wake it up. 
+ */ + if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { + struct socket *so = toeptoso(toep); + + m_freem(m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); + return; + } + + bsp = &q->buf_state[q->cur_buf]; + hdr = cplhdr(m); + tcb = (__be64 *)(hdr + 1); + if (q->cur_buf == 0) { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); + ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); + } else { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); + ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; + } + ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset = ddp_offset; + m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset; + + CTR5(KTR_TOM, + "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u", + q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset); + KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u", + ddp_offset, m->m_cur_offset)); + +#ifdef T3_TRACE + T3_TRACE3(TIDTB(so), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u", + tp->rcv_nxt, q->cur_buf, ddp_offset); +#endif + +#if 0 +{ + unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; + + t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); + ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; + + t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); + rcv_nxt = t >> S_TCB_RCV_NXT; + rcv_nxt &= M_TCB_RCV_NXT; + + t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); + rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); + rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; + + T3_TRACE2(TIDTB(sk), + "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", + ddp_flags, rcv_nxt - rx_hdr_offset); + T3_TRACE4(TB(q), + "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", + tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset 
%u", + rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); + T3_TRACE2(TB(q), + "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", + q->buf_state[0].flags, q->buf_state[1].flags); + +} +#endif + if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { + handle_excess_rx(toep, m); + return; + } + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); + } +#endif + if (bsp->flags & DDP_BF_NOCOPY) { +#ifdef T3_TRACE + T3_TRACE0(TB(q), + "tcb_rpl_as_ddp_complete: CANCEL UBUF"); + + if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + printk("!cancel_ubuf"); + t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); + } +#endif + m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; + bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); + q->cur_buf ^= 1; + } else if (bsp->flags & DDP_BF_NOFLIP) { + + m->m_ddp_flags = 1; /* always a kernel buffer */ + + /* now HW buffer carries a user buffer */ + bsp->flags &= ~DDP_BF_NOFLIP; + bsp->flags |= DDP_BF_NOCOPY; + + /* It is possible that the CPL_GET_TCB_RPL doesn't indicate + * any new data in which case we're done. If in addition the + * offset is 0, then there wasn't a completion for the kbuf + * and we need to decrement the posted count. + */ + if (m->m_pkthdr.len == 0) { + if (ddp_offset == 0) { + q->kbuf_posted--; + bsp->flags |= DDP_BF_NODATA; + } + SOCKBUF_UNLOCK(&so->so_rcv); + + m_free(m); + return; + } + } else { + SOCKBUF_UNLOCK(&so->so_rcv); + /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, + * but it got here way late and nobody cares anymore. 
+ */ + m_free(m); + return; + } + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; +#ifdef T3_TRACE + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); +#endif + CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); + if (m->m_pkthdr.len == 0) + q->user_ddp_pending = 0; + else + SBAPPEND(&so->so_rcv, m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +/* + * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, + * in that case they are similar to DDP completions. + */ +static int +do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + /* OK if socket doesn't exist */ + if (toep == NULL) { + printf("null toep in do_get_tcb_rpl\n"); + return (CPL_RET_BUF_DONE); + } + + INP_LOCK(toep->tp_tp->t_inpcb); + tcb_rpl_as_ddp_complete(toep, m); + INP_UNLOCK(toep->tp_tp->t_inpcb); + + return (0); +} + +static void +handle_ddp_data(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data *hdr = cplhdr(m); + unsigned int rcv_nxt = ntohl(hdr->seq); + + if (tp->rcv_nxt == rcv_nxt) + return; + + INP_LOCK_ASSERT(tp->t_inpcb); + SOCKBUF_LOCK(&so->so_rcv); + q = &toep->tp_ddp_state; + bsp = &q->buf_state[q->cur_buf]; + KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", + rcv_nxt, tp->rcv_nxt)); + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", + rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); + +#ifdef T3_TRACE + if 
((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "handle_ddp_data: neg len"); + } +#endif + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + /* + * For now, don't re-enable DDP after a connection fell out of DDP + * mode. + */ + q->ubuf_ddp_ready = 0; + SOCKBUF_UNLOCK(&so->so_rcv); } /* @@ -1481,32 +1944,33 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) INP_LOCK(tp->t_inpcb); -#ifdef notyet - if (__predict_false(sk_no_receive(sk))) { - handle_excess_rx(so, skb); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + INP_UNLOCK(tp->t_inpcb); + TRACE_EXIT; return; } - if (ULP_MODE(tp) == ULP_MODE_TCPDDP) - handle_ddp_data(so, skb); + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) + handle_ddp_data(toep, m); + + m->m_seq = ntohl(hdr->seq); + m->m_ulp_mode = 0; /* for iSCSI */ - TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); - TCP_SKB_CB(skb)->flags = 0; - skb_ulp_mode(skb) = 0; /* for iSCSI */ -#endif #if VALIDATE_SEQ - if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { - printk(KERN_ERR + if (__predict_false(m->m_seq != tp->rcv_nxt)) { + log(LOG_ERR, "%s: TID %u: Bad sequence number %u, expected %u\n", - TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, + TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq, tp->rcv_nxt); - __kfree_skb(skb); + m_freem(m); + INP_UNLOCK(tp->t_inpcb); return; } #endif m_adj(m, sizeof(*hdr)); -#ifdef notyet +#ifdef URGENT_DATA_SUPPORTED /* * We don't handle urgent data yet */ @@ -1521,8 +1985,8 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) toep->tp_delack_mode = hdr->dack_mode; toep->tp_delack_seq = tp->rcv_nxt; } - - DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, 
m->m_pkthdr.len, m->m_len, len); + CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", + m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); if (len < m->m_pkthdr.len) m->m_pkthdr.len = m->m_len = len; @@ -1532,21 +1996,29 @@ new_rx_data(struct toepcb *toep, struct mbuf *m) toep->tp_enqueued_bytes += m->m_pkthdr.len; #ifdef T3_TRACE T3_TRACE2(TIDTB(sk), - "new_rx_data: seq 0x%x len %u", - TCP_SKB_CB(skb)->seq, skb->len); + "new_rx_data: seq 0x%x len %u", + m->m_seq, m->m_pkthdr.len); #endif + INP_UNLOCK(tp->t_inpcb); SOCKBUF_LOCK(&so->so_rcv); if (sb_notify(&so->so_rcv)) DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len); - sbappend_locked(&so->so_rcv, m); - KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax, + SBAPPEND(&so->so_rcv, m); + +#ifdef notyet + /* + * We're giving too many credits to the card - but disable this check so we can keep on moving :-| + * + */ + KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1), ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax)); +#endif - INP_UNLOCK(tp->t_inpcb); - DPRINTF("sb_cc=%d sb_mbcnt=%d\n", + + CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt); if (__predict_true((so->so_state & SS_NOFDREF) == 0)) @@ -1571,22 +2043,26 @@ do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) } static void -new_rx_data_ddp(struct socket *so, struct mbuf *m) +new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data_ddp *hdr; unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + struct socket *so = toeptoso(toep); + int nomoredata = 0; -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(so, m); + tp = sototcpcb(so); + + INP_LOCK(tp->t_inpcb); + if 
(__predict_false(so_no_receive(so))) { + + handle_excess_rx(toep, m); + INP_UNLOCK(tp->t_inpcb); return; } -#endif - tp = sototcpcb(so); + q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->u.ddp_report); @@ -1603,69 +2079,91 @@ new_rx_data_ddp(struct socket *so, struct mbuf *m) "new_rx_data_ddp: ddp_report 0x%x", ddp_report); #endif - + CTR4(KTR_TOM, + "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " + "hdr seq 0x%x len %u", + tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), + ntohs(hdr->len)); + CTR3(KTR_TOM, + "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", + G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); + ddp_len = ntohs(hdr->len); rcv_nxt = ntohl(hdr->seq) + ddp_len; - /* - * Overload to store old rcv_next - */ - m->m_pkthdr.csum_data = tp->rcv_nxt; + m->m_seq = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; + tp->t_rcvtime = ticks; /* * Store the length in m->m_len. We are changing the meaning of * m->m_len here, we need to be very careful that nothing from now on * interprets ->len of this packet the usual way. */ - m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data; - + m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; + INP_UNLOCK(tp->t_inpcb); + CTR3(KTR_TOM, + "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", + m->m_len, rcv_nxt, m->m_seq); /* * Figure out where the new data was placed in the buffer and store it * in when. Assumes the buffer offset starts at 0, consumer needs to * account for page pod's pg_offset. */ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; -#ifdef notyet - TCP_SKB_CB(skb)->when = end_offset - skb->len; + m->m_cur_offset = end_offset - m->m_pkthdr.len; - /* - * We store in mac.raw the address of the gather list where the - * placement happened. 
- */ - skb->mac.raw = (unsigned char *)bsp->gl; -#endif + SOCKBUF_LOCK(&so->so_rcv); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; bsp->cur_offset = end_offset; + toep->tp_enqueued_bytes += m->m_pkthdr.len; /* + * Length is only meaningful for kbuf + */ + if (!(bsp->flags & DDP_BF_NOCOPY)) + KASSERT(m->m_len <= bsp->gl->dgl_length, + ("length received exceeds ddp pages: len=%d dgl_length=%d", + m->m_len, bsp->gl->dgl_length)); + + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); + + + /* * Bit 0 of flags stores whether the DDP buffer is completed. * Note that other parts of the code depend on this being in bit 0. */ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { -#if 0 - TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */ -#endif panic("spurious ddp completion"); } else { - m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); - if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP)) + m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); + if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) q->cur_buf ^= 1; /* flip buffers */ } if (bsp->flags & DDP_BF_NOCOPY) { - m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY); + m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); bsp->flags &= ~DDP_BF_NOCOPY; } if (ddp_report & F_DDP_PSH) - m->m_pkthdr.csum_flags |= DDP_BF_PSH; + m->m_ddp_flags |= DDP_BF_PSH; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = G_DDP_DACK_MODE(ddp_report); + toep->tp_delack_seq = tp->rcv_nxt; + } + + SBAPPEND(&so->so_rcv, m); - tp->t_rcvtime = ticks; - sbappendstream_locked(&so->so_rcv, m); -#ifdef notyet - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); -#endif + if ((so->so_state & SS_NOFDREF) == 0) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); } #define DDP_ERR 
(F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ @@ -1680,7 +2178,6 @@ static int do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); const struct cpl_rx_data_ddp *hdr = cplhdr(m); VALIDATE_SOCK(so); @@ -1688,40 +2185,50 @@ do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); - return CPL_RET_BUF_DONE; + return (CPL_RET_BUF_DONE); } #if 0 skb->h.th = tcphdr_skb->h.th; #endif - new_rx_data_ddp(so, m); + new_rx_data_ddp(toep, m); return (0); } static void -process_ddp_complete(struct socket *so, struct mbuf *m) +process_ddp_complete(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_ddp_complete *hdr; unsigned int ddp_report, buf_idx, when; + int nomoredata = 0; -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(sk, skb); + INP_LOCK(tp->t_inpcb); + if (__predict_false(so_no_receive(so))) { + struct inpcb *inp = sotoinpcb(so); + + handle_excess_rx(toep, m); + INP_UNLOCK(inp); return; } -#endif q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->ddp_report); buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; - bsp = &q->buf_state[buf_idx]; + m->m_pkthdr.csum_data = tp->rcv_nxt; + + SOCKBUF_LOCK(&so->so_rcv); + bsp = &q->buf_state[buf_idx]; when = bsp->cur_offset; - m->m_len = G_DDP_OFFSET(ddp_report) - when; + m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; + tp->rcv_nxt += m->m_len; + tp->t_rcvtime = ticks; + INP_UNLOCK(tp->t_inpcb); + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); #ifdef T3_TRACE T3_TRACE5(TIDTB(sk), "process_ddp_complete: 
tp->rcv_nxt 0x%x cur_offset %u " @@ -1729,12 +2236,20 @@ process_ddp_complete(struct socket *so, struct mbuf *m) tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report), skb->len); #endif - + CTR5(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report 0x%x offset %u, len %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report), m->m_len); + bsp->cur_offset += m->m_len; - if (!(bsp->flags & DDP_BF_NOFLIP)) + if (!(bsp->flags & DDP_BF_NOFLIP)) { q->cur_buf ^= 1; /* flip buffers */ - + if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) + nomoredata=1; + } + #ifdef T3_TRACE T3_TRACE4(TIDTB(sk), "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " @@ -1742,21 +2257,26 @@ process_ddp_complete(struct socket *so, struct mbuf *m) tp->rcv_nxt, bsp->cur_offset, ddp_report, G_DDP_OFFSET(ddp_report)); #endif -#if 0 - skb->mac.raw = (unsigned char *)bsp->gl; -#endif - m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; + CTR4(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report %u offset %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report)); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; if (bsp->flags & DDP_BF_NOCOPY) bsp->flags &= ~DDP_BF_NOCOPY; - m->m_pkthdr.csum_data = tp->rcv_nxt; - tp->rcv_nxt += m->m_len; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; - tp->t_rcvtime = ticks; - sbappendstream_locked(&so->so_rcv, m); -#ifdef notyet - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); -#endif + SBAPPEND(&so->so_rcv, m); + + if ((so->so_state & SS_NOFDREF) == 0) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); } /* @@ -1766,13 +2286,12 @@ static int do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); VALIDATE_SOCK(so); #if 0 skb->h.th = tcphdr_skb->h.th; #endif - 
process_ddp_complete(so, m); + process_ddp_complete(toep, m); return (0); } @@ -1801,6 +2320,65 @@ enter_timewait(struct socket *so) } /* + * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This + * function deals with the data that may be reported along with the FIN. + * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to + * perform normal FIN-related processing. In the latter case 1 indicates that + * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the + * skb can be freed. + */ +static int +handle_peer_close_data(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_peer_close *req = cplhdr(m); + unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ + + if (tp->rcv_nxt == rcv_nxt) /* no data */ + return (0); + + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + + /* + * Although we discard the data we want to process the FIN so + * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + + * PEER_CLOSE without data. In particular this PEER_CLOSE + * may be what will close the connection. We return 1 because + * handle_excess_rx() already freed the packet. 
+ */ + return (1); + } + + INP_LOCK_ASSERT(tp->t_inpcb); + q = &toep->tp_ddp_state; + SOCKBUF_LOCK(&so->so_rcv); + bsp = &q->buf_state[q->cur_buf]; + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = + DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + tp->t_rcvtime = ticks; + SBAPPEND(&so->so_rcv, m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); + return (1); +} + +/* * Handle a peer FIN. */ static void @@ -1808,9 +2386,8 @@ do_peer_fin(struct socket *so, struct mbuf *m) { struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; - int keep = 0, dead = (so->so_state & SS_NOFDREF); - - DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead); + int keep = 0; + DPRINTF("do_peer_fin state=%d\n", tp->t_state); #ifdef T3_TRACE T3_TRACE0(TIDTB(sk),"do_peer_fin:"); @@ -1821,20 +2398,32 @@ do_peer_fin(struct socket *so, struct mbuf *m) goto out; } - -#ifdef notyet - if (ULP_MODE(tp) == ULP_MODE_TCPDDP) { - keep = handle_peer_close_data(so, skb); - if (keep < 0) - return; - } - sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(so, SOCK_DONE); -#endif INP_INFO_WLOCK(&tcbinfo); INP_LOCK(tp->t_inpcb); - if (TCPS_HAVERCVDFIN(tp->t_state) == 0) + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(so, m); + if (keep < 0) { + INP_INFO_WUNLOCK(&tcbinfo); + INP_UNLOCK(tp->t_inpcb); + return; + } + } + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. 
+ * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; @@ -1858,8 +2447,9 @@ do_peer_fin(struct socket *so, struct mbuf *m) t3_release_offload_resources(toep); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { tp = tcp_close(tp); - } else + } else { enter_timewait(so); + } break; default: log(LOG_ERR, @@ -1870,23 +2460,17 @@ do_peer_fin(struct socket *so, struct mbuf *m) if (tp) INP_UNLOCK(tp->t_inpcb); - if (!dead) { - DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags); - - sorwakeup(so); - sowwakeup(so); - wakeup(&so->so_timeo); -#ifdef notyet - sk->sk_state_change(sk); + DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags); - /* Do not send POLL_HUP for half duplex close. */ - if ((sk->sk_shutdown & SEND_SHUTDOWN) || - sk->sk_state == TCP_CLOSE) - sk_wake_async(so, 1, POLL_HUP); - else - sk_wake_async(so, 1, POLL_IN); +#ifdef notyet + /* Do not send POLL_HUP for half duplex close. 
*/ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(so, 1, POLL_HUP); + else + sk_wake_async(so, 1, POLL_IN); #endif - } + out: if (!keep) m_free(m); @@ -1929,8 +2513,10 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) if (toep->tp_flags & TP_ABORT_RPL_PENDING) { tp = tcp_close(tp); - } else + } else { enter_timewait(so); + soisdisconnected(so); + } break; case TCPS_LAST_ACK: /* @@ -1942,21 +2528,29 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) tp = tcp_close(tp); break; case TCPS_FIN_WAIT_1: -#ifdef notyet - dst_confirm(sk->sk_dst_cache); -#endif - soisdisconnecting(so); - - if ((so->so_state & SS_NOFDREF) == 0) { - /* - * Wake up lingering close - */ - sowwakeup(so); - sorwakeup(so); - wakeup(&so->so_timeo); - } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 && + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + int timeout; + + soisdisconnected(so); + timeout = (tcp_fast_finwait2_recycle) ? 
+ tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + tp->t_state = TCPS_FIN_WAIT_2; + if ((so->so_options & SO_LINGER) && so->so_linger == 0 && (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { - tp = cxgb_tcp_drop(tp, 0); + tp = tcp_drop(tp, 0); } break; @@ -1970,7 +2564,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) if (tp) INP_UNLOCK(tp->t_inpcb); out: - m_free(m); + m_freem(m); } /* @@ -2006,6 +2600,8 @@ process_abort_rpl(struct socket *so, struct mbuf *m) "process_abort_rpl: GTS rpl pending %d", sock_flag(sk, ABORT_RPL_PENDING)); #endif + + INP_INFO_WLOCK(&tcbinfo); INP_LOCK(tp->t_inpcb); if (toep->tp_flags & TP_ABORT_RPL_PENDING) { @@ -2020,16 +2616,14 @@ process_abort_rpl(struct socket *so, struct mbuf *m) !is_t3a(TOE_DEV(so))) { if (toep->tp_flags & TP_ABORT_REQ_RCVD) panic("TP_ABORT_REQ_RCVD set"); - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(tp->t_inpcb); t3_release_offload_resources(toep); tp = tcp_close(tp); - INP_INFO_WUNLOCK(&tcbinfo); } } } if (tp) INP_UNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&tcbinfo); m_free(m); } @@ -2089,7 +2683,7 @@ discard: } /* - * Convert the status code of an ABORT_REQ into a Linux error code. Also + * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also * indicate whether RST should be sent in response. */ static int @@ -2289,10 +2883,8 @@ process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev) (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { so->so_error = abort_status_to_errno(so, req->status, &rst_status); -#if 0 - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_error_report(sk); -#endif + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup(so); /* * SYN_RECV needs special processing. If abort_syn_rcv() * returns 0 it has taken care of the abort. 
@@ -2513,7 +3105,8 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str struct tcphdr th; struct inpcb *inp; int mss, wsf, sack, ts; - + uint32_t rcv_isn = ntohl(req->rcv_isn); + bzero(&to, sizeof(struct tcpopt)); inp = sotoinpcb(lso); @@ -2522,10 +3115,11 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str */ inc.inc_fport = th.th_sport = req->peer_port; inc.inc_lport = th.th_dport = req->local_port; - toep->tp_iss = th.th_seq = req->rcv_isn; + th.th_seq = req->rcv_isn; th.th_flags = TH_SYN; - toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); + toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; + inc.inc_isipv6 = 0; inc.inc_len = 0; @@ -2543,7 +3137,6 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str to.to_mss = mss; to.to_wscale = wsf; to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); - INP_INFO_WLOCK(&tcbinfo); INP_LOCK(inp); syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); @@ -2654,34 +3247,31 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, newtoep->tp_flags = TP_SYN_RCVD; newtoep->tp_tid = tid; newtoep->tp_toedev = tdev; + tp->rcv_wnd = select_rcv_wnd(tdev, so); - printf("inserting tid=%d\n", tid); cxgb_insert_tid(cdev, d->client, newtoep, tid); SOCK_LOCK(so); LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); SOCK_UNLOCK(so); - - if (lctx->ulp_mode) { + newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; + + if (newtoep->tp_ulp_mode) { ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); - if (!ddp_mbuf) + if (ddp_mbuf == NULL) newtoep->tp_ulp_mode = 0; - else - newtoep->tp_ulp_mode = lctx->ulp_mode; } - + + CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", + TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); - - DPRINTF("adding request to syn cache\n"); - /* * XXX workaround for lack of syncache drop */ toepcb_hold(newtoep); syncache_add_accept_req(req, so, newtoep); - - rpl = cplhdr(reply_mbuf); reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); @@ -2692,50 +3282,34 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, rpl->rsvd = rpl->opt2; /* workaround for HW bug */ rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten - DPRINTF("accept smt_idx=%d\n", e->smt_idx); - rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); - rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | + rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | CPL_PASS_OPEN_ACCEPT); DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); - m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); - -#ifdef DEBUG_PRINT - { - int i; - - DPRINTF("rpl:\n"); - uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); - - for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) - DPRINTF("[%d] %08x\n", i, rplbuf[i]); - } -#endif - + m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); l2t_send(cdev, reply_mbuf, e); m_free(m); -#ifdef notyet - /* - * XXX this call path has to be converted to not depend on sockets - */ - if (newtoep->tp_ulp_mode) - __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, + if (newtoep->tp_ulp_mode) { + __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_MASK, V_TF_DDP_OFF(1) | - 
TP_DDP_TIMER_WORKAROUND_VAL, 1); + TP_DDP_TIMER_WORKAROUND_VAL, 1); + } else + printf("not offloading\n"); + + -#endif return; reject: if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) mk_pass_accept_rpl(reply_mbuf, m); else - mk_tid_release(reply_mbuf, NULL, tid); + mk_tid_release(reply_mbuf, newtoep, tid); cxgb_ofld_send(cdev, reply_mbuf); m_free(m); out: @@ -2793,7 +3367,7 @@ do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) /* * Called when a connection is established to translate the TCP options - * reported by HW to Linux's native format. + * reported by HW to FreeBSD's native format. */ static void assign_rxopt(struct socket *so, unsigned int opt) @@ -2808,8 +3382,9 @@ assign_rxopt(struct socket *so, unsigned int opt) tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; - if (tp->t_flags & TF_RCVD_SCALE) - tp->rcv_scale = 0; + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) + tp->rcv_scale = tp->request_r_scale; } /* @@ -2831,8 +3406,6 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt) #if 0 inet_sk(sk)->id = tp->write_seq ^ jiffies; #endif - - /* * XXX not clear what rcv_wup maps to */ @@ -2851,7 +3424,9 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt) */ dst_confirm(sk->sk_dst_cache); #endif + tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; + soisconnected(so); } static int @@ -2948,23 +3523,21 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) tp = sototcpcb(so); INP_LOCK(tp->t_inpcb); -#ifdef notyet - so->so_snd.sb_flags |= SB_TOE; - so->so_rcv.sb_flags |= SB_TOE; -#endif + + so->so_snd.sb_flags |= SB_NOCOALESCE; + so->so_rcv.sb_flags |= SB_NOCOALESCE; + toep->tp_tp = tp; toep->tp_flags = 0; tp->t_toe = toep; reset_wr_list(toep); - tp->rcv_wnd = select_rcv_wnd(so); - DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd); + tp->rcv_wnd = 
select_rcv_wnd(tdev, so); + tp->rcv_nxt = toep->tp_copied_seq; install_offload_ops(so); toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); toep->tp_wr_unacked = 0; toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); - toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) && - tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; toep->tp_qset_idx = 0; toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); @@ -2975,8 +3548,9 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); INP_INFO_WUNLOCK(&tcbinfo); INP_UNLOCK(tp->t_inpcb); - soisconnected(so); + CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); #ifdef notyet /* * XXX not sure how these checks map to us @@ -3066,14 +3640,10 @@ socket_act_establish(struct socket *so, struct mbuf *m) fixup_and_send_ofo(so); if (__predict_false(so->so_state & SS_NOFDREF)) { -#ifdef notyet - /* - * XXX not clear what should be done here - * appears to correspond to sorwakeup_locked + /* + * XXX does this even make sense? 
*/ - sk->sk_state_change(sk); - sk_wake_async(so, 0, POLL_OUT); -#endif + sorwakeup(so); } m_free(m); #ifdef notyet @@ -3095,8 +3665,7 @@ socket_act_establish(struct socket *so, struct mbuf *m) sk->sk_write_space(sk); #endif - soisconnected(so); - toep->tp_state = tp->t_state = TCPS_ESTABLISHED; + toep->tp_state = tp->t_state; tcpstat.tcps_connects++; } @@ -3139,6 +3708,9 @@ do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) socket_act_establish(so, m); INP_UNLOCK(tp->t_inpcb); + CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); + return (0); } @@ -3156,7 +3728,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; - DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); + CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); INP_LOCK(tp->t_inpcb); @@ -3166,18 +3738,21 @@ wr_ack(struct toepcb *toep, struct mbuf *m) while (credits) { struct mbuf *p = peek_wr(toep); - DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; if (__predict_false(!p)) { log(LOG_ERR, "%u WR_ACK credits for TID %u with " - "nothing pending, state %u\n", - credits, toep->tp_tid, tp->t_state); + "nothing pending, state %u wr_avail=%u\n", + credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } + CTR2(KTR_TOM, + "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len); + + KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list")); if (__predict_false(credits < p->m_pkthdr.csum_data)) { + #if DEBUG_WR > 1 struct tx_data_wr *w = cplhdr(p); -#ifdef notyet log(LOG_ERR, "TID %u got %u WR credits, need %u, len %u, " "main body %u, frags %u, seq # %u, ACK una %u," @@ -3185,8 +3760,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) toep->tp_tid, credits, p->csum, p->len, p->len - p->data_len, skb_shinfo(p)->nr_frags, ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), - WR_AVAIL(tp), count_pending_wrs(tp) - 
credits); -#endif + toep->tp_wr_avail, count_pending_wrs(tp) - credits); #endif p->m_pkthdr.csum_data -= credits; break; @@ -3194,7 +3768,9 @@ wr_ack(struct toepcb *toep, struct mbuf *m) dequeue_wr(toep); credits -= p->m_pkthdr.csum_data; bytes += p->m_pkthdr.len; - DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); + CTR3(KTR_TOM, + "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", + p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); m_free(p); } @@ -3228,7 +3804,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } if (bytes) { - DPRINTF("sbdrop(%d)\n", bytes); + CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, bytes); sowwakeup_locked(so); @@ -3250,15 +3826,21 @@ do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) { struct toepcb *toep = (struct toepcb *)ctx; - DPRINTF("do_wr_ack\n"); - dump_toepcb(toep); - VALIDATE_SOCK(so); wr_ack(toep, m); return 0; } +/* + * Handler for TRACE_PKT CPL messages. Just sink these packets. 
+ */ +static int +do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + m_freem(m); + return 0; +} /* * Reset a connection that is on a listener's SYN queue or accept queue, @@ -3320,6 +3902,336 @@ t3_reset_synq(struct listen_ctx *lctx) SOCK_UNLOCK(lctx->lso); } + +int +t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(TOE_DEV(so)); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", + gl, nppods, tag, maxoff, pg_off, color); + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->wr.wr_lo = 0; + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? 
+ htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(toep, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + +/* + * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_cpl_barrier_ulp(struct cpl_barrier *b) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); + b->opcode = CPL_BARRIER; +} + +/* + * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + txpkt = (struct ulp_txpkt *)req; + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); + req->cpuno = htons(cpuno); +} + +/* + * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, + unsigned int word, uint64_t mask, uint64_t val) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + tid, word, mask, val); + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(1); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); +} + +/* + * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
+ */ +static void +mk_rx_data_ack_ulp(struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); + OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); + ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | V_RX_CREDITS(credits)); +} + +void +t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_barrier *lock; + struct cpl_set_tcb_field *req; + struct cpl_get_tcb *getreq; + struct ddp_state *p = &toep->tp_ddp_state; + + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + lock = (struct cpl_barrier *)(wr + 1); + mk_cpl_barrier_ulp(lock); + + req = (struct cpl_set_tcb_field *)(lock + 1); + + CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); + + /* Hmmm, not sure if this actually a good thing: reactivating + * the other buffer might be an issue if it has been completed + * already. However, that is unlikely, since the fact that the UBUF + * is not completed indicates that there is no oustanding data. 
+ */ + if (bufidx == 0) + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_ACTIVE_BUF(1) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_ACTIVE_BUF(1)); + else + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_ACTIVE_BUF(1) | + V_TF_DDP_BUF1_VALID(1), 0); + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_cancel_ddpbuf: bufidx %u", bufidx); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/** + * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one + * @sk: the socket associated with the buffers + * @bufidx: index of HW DDP buffer (0 or 1) + * @tag0: new tag for HW buffer 0 + * @tag1: new tag for HW buffer 1 + * @len: new length for HW buf @bufidx + * + * Sends a compound WR to overlay a new DDP buffer on top of an existing + * buffer by changing the buffer tag and length and setting the valid and + * active flag accordingly. The caller must ensure the new buffer is at + * least as big as the existing one. Since we typically reprogram both HW + * buffers this function sets both tags for convenience. Read the TCB to + * determine how made data was written into the buffer before the overlay + * took place. 
+ */ +void +t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, + unsigned int tag1, unsigned int len) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_get_tcb *getreq; + struct cpl_set_tcb_field *req; + struct ddp_state *p = &toep->tp_ddp_state; + + CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", + bufidx, tag0, tag1, len); + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + m->m_pkthdr.len = m->m_len = wrlen; + bzero(wr, wrlen); + + + /* Set the ATOMIC flag to make sure that TP processes the following + * CPLs in an atomic manner and no wire segments can be interleaved. + */ + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); + req = (struct cpl_set_tcb_field *)(wr + 1); + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | + V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, + V_TCB_RX_DDP_BUF0_TAG(tag0) | + V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); + req++; + if (bufidx == 0) { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_0(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_0(0) | + V_TF_DDP_BUF0_VALID(1)); + } else { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_1(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_1(0) | + V_TF_DDP_BUF1_VALID(1) | 
V_TF_DDP_ACTIVE_BUF(1)); + } + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(sk), + "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " + "len %d", + bufidx, tag0, tag1, len); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/* + * Sends a compound WR containing all the CPL messages needed to program the + * two HW DDP buffers, namely optionally setting up the length and offset of + * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. + */ +void +t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_set_tcb_field *req; + + CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", + len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); + + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); + wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + + (len1 ? sizeof(*req) : 0) + + (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + req = (struct cpl_set_tcb_field *)(wr + 1); + if (len0) { /* program buffer 0 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); + req++; + } + if (len1) { /* program buffer 1 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); + req++; + } + + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, + ddp_flags); + + if (modulate) { + mk_rx_data_ack_ulp((struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, + toep->tp_copied_seq - toep->tp_rcv_wup); + toep->tp_rcv_wup = toep->tp_copied_seq; + } + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " + "modulate %d", + len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, + modulate); +#endif + + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + void t3_init_wr_tab(unsigned int wr_len) { @@ -3353,7 +4265,6 @@ t3_init_cpl_io(void) tcphdr_skb->h.raw = tcphdr_skb->data; memset(tcphdr_skb->data, 0, tcphdr_skb->len); #endif - t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); @@ -3367,11 +4278,9 @@ t3_init_cpl_io(void) t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 
t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); -#ifdef notyet t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); -#endif return (0); } diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index a3dd692..6edeacd 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -38,14 +38,18 @@ __FBSDID("$FreeBSD$"); #include <sys/limits.h> #include <sys/lock.h> #include <sys/mbuf.h> +#include <sys/condvar.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/syslog.h> #include <sys/socketvar.h> #include <sys/uio.h> +#include <sys/file.h> #include <machine/bus.h> +#include <machine/cpu.h> #include <net/if.h> #include <net/route.h> @@ -56,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_var.h> +#include <dev/cxgb/cxgb_config.h> #include <dev/cxgb/cxgb_osdep.h> #include <dev/cxgb/sys/mbufq.h> @@ -72,6 +77,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/common/cxgb_ctl_defs.h> #include <dev/cxgb/cxgb_l2t.h> #include <dev/cxgb/cxgb_offload.h> + #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_map.h> @@ -85,6 +91,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> #include <dev/cxgb/ulp/tom/cxgb_toepcb.h> #include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, @@ -94,13 +101,11 @@ static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); -#ifdef notyet -#define VM_HOLD_WRITEABLE 0x1 -static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, - int *count, int flags); -#endif -static void 
vm_fault_unhold_pages(vm_page_t *m, int count); #define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) void t3_init_socket_ops(void) @@ -110,20 +115,8 @@ t3_init_socket_ops(void) prp = pffindtype(AF_INET, SOCK_STREAM); pru_sosend = prp->pr_usrreqs->pru_sosend; pru_soreceive = prp->pr_usrreqs->pru_soreceive; -#ifdef TCP_USRREQS_OVERLOAD - tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen; - tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send; - tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; - tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect; - tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close; - tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown; - tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd; -#endif } - struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; @@ -182,21 +175,172 @@ iov_adj(struct iovec **iov, int *iovcnt, size_t count) } } - static void -cxgb_zero_copy_free(void *cl, void *arg) {} +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} + static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uintptr_t)iov->iov_base; + end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end 
&= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, npages); + + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; - return (EINVAL); + return (0); +} + +/* + * Returns whether a connection should enable DDP. This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + + DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", + toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), + toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); + + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} + +static inline int +is_ddp(const struct mbuf *m) +{ + return (m->m_flags & M_DDP); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); +} + +static int +m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + int curlen, startlen, resid_init, err = 0; + caddr_t buf; + + DPRINTF("m_uiomove(m=%p, offset=%d, 
len=%d, ...)\n", + m, offset, len); + + startlen = len; + resid_init = uio->uio_resid; + while (m && len) { + buf = mtod(m, caddr_t); + curlen = m->m_len; + if (offset && (offset < curlen)) { + curlen -= offset; + buf += offset; + offset = 0; + } else if (offset) { + offset -= curlen; + m = m->m_next; + continue; + } + err = uiomove(buf, min(len, curlen), uio); + if (err) { + printf("uiomove returned %d\n", err); + return (err); + } + + len -= min(len, curlen); + m = m->m_next; + } + DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", + startlen - len, resid_init, uio->uio_resid); + return (err); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. + */ +static inline int +copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + struct iovec *to = uio->uio_iov; + int err; + + + if (__predict_true(!is_ddp(m))) { /* RX_DATA */ + return m_uiomove(m, offset, len, uio); + } if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base = ((caddr_t)to->iov_base) + len; + uio->uio_iov = to; + uio->uio_resid -= len; + return (0); + } + err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ + return (err); } static void -cxgb_wait_dma_completion(struct toepcb *tp) +cxgb_wait_dma_completion(struct toepcb *toep) { + struct mtx *lock; + lock = &toep->tp_tp->t_inpcb->inp_mtx; + INP_LOCK(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); } static int @@ -234,7 +378,13 @@ cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) mi_collapse_sge(mi, segs); *m = m0; - + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else @@ -305,10 +455,11 @@ sendmore: } uio->uio_resid -= m->m_pkthdr.len; 
sent += m->m_pkthdr.len; - sbappend_locked(&so->so_snd, m); + sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } + /* * Wait for pending I/O to be DMA'd to the card * @@ -357,7 +508,7 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); - if ((uio->uio_resid > zcopy_thres) && + if (uio && (uio->uio_resid > zcopy_thres) && (uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) { rv = t3_sosend(so, uio); @@ -368,36 +519,378 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, return pru_sosend(so, addr, uio, top, control, flags, td); } +/* + * Following replacement or removal of the first mbuf on the first mbuf chain + * of a socket buffer, push necessary state changes back into the socket + * buffer so that other consumers see the values consistently. 'nextrecord' + * is the callers locally stored value of the original value of + * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. + * NOTE: 'nextrecord' may be NULL. + */ +static __inline void +sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + + SOCKBUF_LOCK_ASSERT(sb); + /* + * First, update for the new value of nextrecord. If necessary, make + * it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + * addition of a second clause that takes care of the case where + * sb_mb has been updated, but remains the last record. 
+ */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +#define IS_NONBLOCKING(so) ((so)->so_state & SS_NBIO) + static int -t3_soreceive(struct socket *so, struct uio *uio) +t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) { -#ifdef notyet - int i, rv, count, hold_resid, sent, iovcnt; - struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; struct tcpcb *tp = sototcpcb(so); struct toepcb *toep = tp->t_toe; struct mbuf *m; - struct uio uiotmp; + uint32_t offset; + int err, flags, avail, len, copied, copied_unacked; + int target; /* Read at least this many bytes */ + int user_ddp_ok; + struct ddp_state *p; + struct inpcb *inp = sotoinpcb(so); + + avail = offset = copied = copied_unacked = 0; + flags = flagsp ? (*flagsp &~ MSG_EOR) : 0; + err = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + p = &toep->tp_ddp_state; + + if (err) + return (err); + SOCKBUF_LOCK(&so->so_rcv); + p->user_ddp_pending = 0; +restart: + len = uio->uio_resid; + m = so->so_rcv.sb_mb; + target = (flags & MSG_WAITALL) ? 
len : so->so_rcv.sb_lowat; + user_ddp_ok = p->ubuf_ddp_ready; + p->cancel_ubuf = 0; + + if (len == 0) + goto done; +#if 0 + while (m && m->m_len == 0) { + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } +#endif + if (m) + goto got_mbuf; + + /* empty receive queue */ + if (copied >= target && (so->so_rcv.sb_mb == NULL) && + !p->user_ddp_pending) + goto done; + + if (copied) { + if (so->so_error || tp->t_state == TCPS_CLOSED || + (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) + goto done; + } else { + if (so->so_state & SS_NOFDREF) + goto done; + if (so->so_error) { + err = so->so_error; + so->so_error = 0; + goto done; + } + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + goto done; + if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) + goto done; + if (tp->t_state == TCPS_CLOSED) { + err = ENOTCONN; + goto done; + } + } + if (so->so_rcv.sb_mb && !p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + SOCKBUF_LOCK(&so->so_rcv); + copied_unacked = 0; + goto restart; + } + if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + } + if (p->kbuf[0] && (p->kbuf_posted == 0)) { + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } + if (p->user_ddp_pending) { + /* One shot at DDP if we already have enough data */ + if (copied >= target) + user_ddp_ok = 0; + + DPRINTF("sbwaiting 1\n"); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; +//for timers to work await_ddp_completion(sk, flags, &timeo); + } else if (copied >= target) + goto done; + else { + if (copied_unacked) { + int i = 0; + + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + if (mp_ncpus 
> 1) + while (i++ < 200 && so->so_rcv.sb_mb == NULL) + cpu_spinwait(); + SOCKBUF_LOCK(&so->so_rcv); + } + + if (so->so_rcv.sb_mb) + goto restart; + DPRINTF("sbwaiting 2 copied=%d target=%d avail=%d so=%p mb=%p cc=%d\n", copied, target, avail, so, + so->so_rcv.sb_mb, so->so_rcv.sb_cc); + if ((err = sbwait(&so->so_rcv)) != 0) + goto done; + } + goto restart; +got_mbuf: + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d", + m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); + if (m->m_pkthdr.len == 0) { + if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) + panic("empty mbuf and NOCOPY not set\n"); + CTR0(KTR_TOM, "ddp done notification"); + p->user_ddp_pending = 0; + sbdroprecord_locked(&so->so_rcv); + goto done; + } + + offset = toep->tp_copied_seq + copied_unacked - m->m_seq; + DPRINTF("m=%p copied_seq=0x%x copied_unacked=%d m_seq=0x%x offset=%d pktlen=%d is_ddp(m)=%d\n", + m, toep->tp_copied_seq, copied_unacked, m->m_seq, offset, m->m_pkthdr.len, !!is_ddp(m)); + + if (offset >= m->m_pkthdr.len) + panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x seq 0x%x " + "pktlen %d ddp flags 0x%x", offset, toep->tp_copied_seq + copied_unacked, m->m_seq, + m->m_pkthdr.len, m->m_ddp_flags); + + avail = m->m_pkthdr.len - offset; + if (len < avail) { + if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) + panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); + avail = len; + } + CTR4(KTR_TOM, "t3_soreceive: m_len=%u offset=%u len=%u m_seq=0%08x", m->m_pkthdr.len, offset, len, m->m_seq); + +#ifdef URGENT_DATA_SUPPORTED /* - * Events requiring iteration: - * - number of pages exceeds max hold pages for process or system - * - number of pages exceeds maximum sg 
entries for a single WR - * - * We're limited to holding 128 pages at once - and we're limited to - * 34 SG entries per work request, but each SG entry can be any number - * of contiguous pages - * + * Check if the data we are preparing to copy contains urgent + * data. Either stop short of urgent data or skip it if it's + * first and we are not delivering urgent data inline. + */ + if (__predict_false(toep->tp_urg_data)) { + uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; + + if (urg_offset < avail) { + if (urg_offset) { + /* stop short of the urgent data */ + avail = urg_offset; + } else if ((so->so_options & SO_OOBINLINE) == 0) { + /* First byte is urgent, skip */ + toep->tp_copied_seq++; + offset++; + avail--; + if (!avail) + goto skip_copy; + } + } + } +#endif + if (is_ddp_psh(m) || offset) { + user_ddp_ok = 0; +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); +#endif + } + + if (user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); + } else + DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", + user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, + p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); + + /* + * If MSG_TRUNC is specified the data is discarded. 
+ * XXX need to check pr_atomic */ + KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); + if (__predict_true(!(flags & MSG_TRUNC))) { + int resid = uio->uio_resid; + + SOCKBUF_UNLOCK(&so->so_rcv); + if ((err = copy_data(m, offset, avail, uio))) { + if (err) + err = EFAULT; + goto done_unlocked; + } + SOCKBUF_LOCK(&so->so_rcv); + if (avail != (resid - uio->uio_resid)) + printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", + avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); + } + + copied += avail; + copied_unacked += avail; + len -= avail; + +#ifdef URGENT_DATA_SUPPORTED +skip_copy: + if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) + tp->urg_data = 0; +#endif + /* + * If the buffer is fully consumed free it. If it's a DDP + * buffer also handle any events it indicates. + */ + if (avail + offset >= m->m_pkthdr.len) { + unsigned int fl = m->m_ddp_flags; + int exitnow, got_psh = 0, nomoredata = 0; + int count; + struct mbuf *nextrecord; + + if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { + if (is_ddp_psh(m) && p->user_ddp_pending) + got_psh = 1; + + if (fl & DDP_BF_NOCOPY) + p->user_ddp_pending = 0; + else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { + p->kbuf_posted--; + nomoredata = 1; + } else { + p->kbuf_posted--; + p->ubuf_ddp_ready = 1; + } + } - uiotmp = *uio; - iovcnt = uio->uio_iovcnt; - iov = uio->uio_iov; - sent = 0; - re; -#endif - return (0); + nextrecord = m->m_nextpkt; + count = m->m_pkthdr.len; + while (count > 0) { + count -= m->m_len; + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + sbfree(&so->so_rcv, m); + so->so_rcv.sb_mb = m_free(m); + m = so->so_rcv.sb_mb; + } + sockbuf_pushsync(&so->so_rcv, nextrecord); +#if 0 + 
sbdrop_locked(&so->so_rcv, m->m_pkthdr.len); +#endif + exitnow = got_psh || nomoredata; + if ((so->so_rcv.sb_mb == NULL) && exitnow) + goto done; + if (copied_unacked > (so->so_rcv.sb_hiwat >> 2)) { + SOCKBUF_UNLOCK(&so->so_rcv); + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + copied_unacked = 0; + SOCKBUF_LOCK(&so->so_rcv); + } + } + if (len > 0) + goto restart; + + done: + /* + * If we can still receive decide what to do in preparation for the + * next receive. Note that RCV_SHUTDOWN is set if the connection + * transitioned to CLOSE but not if it was in that state to begin with. + */ + if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { + if (p->user_ddp_pending) { + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_rcv); + user_ddp_ok = 0; + t3_cancel_ubuf(toep); + if (so->so_rcv.sb_mb) { + if (copied < 0) + copied = 0; + if (len > 0) + goto restart; + } + p->user_ddp_pending = 0; + } + if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), + "chelsio_recvmsg: about to exit, repost kbuf"); +#endif + + t3_post_kbuf(so, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { + CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); + if (!t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so), + ddp_copy_limit), 0, IS_NONBLOCKING(so))) + p->kbuf_posted = 1; + } + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " + "kbuf_posted %d user_ddp_pending %u", + copied, len, buffers_freed, p ? 
p->kbuf_posted : -1, + p->user_ddp_pending); +#endif + SOCKBUF_UNLOCK(&so->so_rcv); +done_unlocked: + if (copied_unacked) { + INP_LOCK(inp); + t3_cleanup_rbuf(tp, copied_unacked); + INP_UNLOCK(inp); + } + sbunlock(&so->so_rcv); + + return (err); } static int @@ -405,9 +898,11 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct toedev *tdev; - int rv, zcopy_thres, zcopy_enabled; + int rv, zcopy_thres, zcopy_enabled, flags; struct tcpcb *tp = sototcpcb(so); + flags = flagsp ? *flagsp &~ MSG_EOR : 0; + /* * In order to use DMA direct from userspace the following * conditions must be met: @@ -421,150 +916,30 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, * - iovcnt is 1 * */ - if (tp->t_flags & TF_TOE) { + + if ((tp->t_flags & TF_TOE) && uio && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) + && (uio->uio_iovcnt == 1) && (mp0 == NULL)) { tdev = TOE_DEV(so); zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); zcopy_enabled = TOM_TUNABLE(tdev, ddp); if ((uio->uio_resid > zcopy_thres) && - (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) + (uio->uio_iovcnt == 1) && zcopy_enabled) { - rv = t3_soreceive(so, uio); + rv = t3_soreceive(so, flagsp, uio); if (rv != EAGAIN) return (rv); - } - } - + else + printf("returned EAGAIN\n"); + } + } else if ((tp->t_flags & TF_TOE) && uio && mp0 == NULL) + printf("skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", + flags, uio->uio_iovcnt, so->so_rcv.sb_state); return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); } - void t3_install_socket_ops(struct socket *so) { so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; } - -/* - * This routine takes a user address range and does the following: - * - validate that the user has access to those pages (flags indicates read or write) - if not fail - * - validate that count is enough to hold 
range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - * - return number of pages in count - */ -#ifdef notyet -static int -vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) -{ - - vm_offset_t start, va; - vm_paddr_t pa; - int pageslen, faults, rv; - - struct thread *td; - vm_map_t map; - pmap_t pmap; - vm_page_t m, *pages; - vm_prot_t prot; - - start = addr & ~PAGE_MASK; - pageslen = roundup2(addr + len, PAGE_SIZE); - if (*count < (pageslen >> PAGE_SHIFT)) - return (EFBIG); - - *count = pageslen >> PAGE_SHIFT; - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - if (addr + len > VM_MAXUSER_ADDRESS) - return (EFAULT); - - td = curthread; - map = &td->td_proc->p_vmspace->vm_map; - pmap = &td->td_proc->p_vmspace->vm_pmap; - pages = mp; - - prot = (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : VM_PROT_READ; - bzero(pages, sizeof(vm_page_t *) * (*count)); -retry: - - /* - * First optimistically assume that all pages are resident (and R/W if for write) - * if so just mark pages as held (and dirty if for write) and return - */ - vm_page_lock_queues(); - for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) { - /* - * Assure that we only hold the page once - */ - if (*pages == NULL) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked version of this so - * we were only acquiring the pmap lock 1 time as opposed to potentially - * many dozens of times - */ - m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - *pages = m; - if (flags & VM_HOLD_WRITEABLE) - vm_page_dirty(m); - } - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - for (va = start; va < pageslen; va += PAGE_SIZE) { - m = NULL; - pa = pmap_extract(pmap, va); - rv = 0; - if (pa) - m = PHYS_TO_VM_PAGE(pa); - if (flags & VM_HOLD_WRITEABLE) { - if (m == NULL || (m->flags & PG_WRITEABLE) == 0) - rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); - } else if (m == NULL) - rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); - if (rv) - goto error; - } - goto retry; - -error: - vm_page_lock_queues(); - for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++) - if (*pages) - vm_page_unhold(*pages); - vm_page_unlock_queues(); - return (EFAULT); -} -#endif - -static void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c new file mode 100644 index 0000000..8bdcb65 --- /dev/null +++ 
b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -0,0 +1,735 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/socketvar.h> +#include <sys/uio.h> + +#include <machine/bus.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_l2t.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define MAX_SCHEDULE_TIMEOUT 300 + +/* + * Return the # of page pods needed to accommodate a # of pages. 
+ */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. + */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, vm_offset_t addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i = 0, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + /* + * XXX need x86 agnostic check + */ + if (addr + len > VM_MAXUSER_ADDRESS) + return (EFAULT); + + pg_off = addr & PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (p == NULL) + return (ENOMEM); + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; i++) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#ifdef NEED_BUSDMA + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE 
- pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + *newgl = p; + return (0); +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + + free(p, M_DEVBUF); + *newgl = NULL; + return (err); +} + +static void +unmap_ddp_gl(const struct ddp_gather_list *gl) +{ +#ifdef NEED_BUSDMA + int i; + + if (!gl->nelem) + return; + + pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, + PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); + for (i = 1; i < gl->nelem; ++i) + pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, + PCI_DMA_FROMDEVICE); + +#endif +} + +static void +ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) +{ + /* + * XXX mark pages as dirty before unholding + */ + vm_fault_unhold_pages(gl->dgl_pages, gl->dgl_nelem); +} + +void +t3_free_ddp_gl(struct ddp_gather_list *gl) +{ + unmap_ddp_gl(gl); + ddp_gl_free_pages(gl, 0); + free(gl, M_DEVBUF); +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. 
+ */ +static int +alloc_buf1_ppods(struct socket *so, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int err, tag, npages, nppods; + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); + nppods = roundup2(nppods, PPOD_CLUSTER_SIZE); + err = t3_alloc_ppods(d, nppods, &tag); + if (err && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + err = t3_alloc_ppods(d, nppods, &tag); + } + if (err) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(so, 1, tag << 6); +#endif + return (0); +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). + */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct socket *so, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so), + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait)); +} + +/* + * Reposts the kernel DDP buffer after it has been previously become full and + * invalidated. We just need to reset the offset and adjust the DDP flags. + * Conveniently, we can set the flags and the offset with a single message. + * Note that this function does not set the buffer length. Again conveniently + * our kernel buffer is of fixed size. 
If the length needs to be changed it + * needs to be done separately. + */ +static void +t3_repost_kbuf(struct socket *so, unsigned int bufidx, int modulate, + int activate, int nonblock) +{ + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long flags; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; + p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; + p->buf_state[bufidx].gl = p->kbuf[bufidx]; + p->cur_buf = bufidx; + p->kbuf_idx = bufidx; + + flags = select_ddp_flags(so, bufidx, nonblock, 0); + if (!bufidx) + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_BUF0_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF0_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), modulate); + else + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF1_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), + V_TF_DDP_BUF1_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + modulate); + +} + +/** + * setup_uio_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @uio: the uio + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + struct iovec *iov = uio->uio_iov; + vm_offset_t addr = (vm_offset_t)iov->iov_base - oft; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + if (__predict_false(p->ubuf_nppods == 0)) { + err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft); + if (err) + return (err); + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, sototcpcb(so)->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) { + printf("length too short\n"); + return (EINVAL); + } + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err) + return (err); + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->ubuf); + p->ubuf = gl; + t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + *length = len; + return (0); +} + +/* + * + */ +void +t3_cancel_ubuf(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int ubuf_pending = t3_ddp_ubuf_pending(toep); + struct socket *so = toeptoso(toep); + int err = 0, count=0; + + if (p->ubuf == NULL) + return; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + p->cancel_ubuf = 1; + while (ubuf_pending && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { +#ifdef T3_TRACE + T3_TRACE3(TB(p), + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->get_tcb_count); +#endif + CTR3(KTR_TOM, + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | 
DDP_BF_NOCOPY), + p->get_tcb_count); + if (p->get_tcb_count == 0) + t3_cancel_ddpbuf(toep, p->cur_buf); + else + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p SBS_CANTRCVMORE=%d", + err, p->get_tcb_count, so->so_rcv.sb_timeo, so, + !!(so->so_rcv.sb_state & SBS_CANTRCVMORE)); + + while (p->get_tcb_count && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { + if (count & 0xfffffff) + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p count=%d", + err, p->get_tcb_count, so->so_rcv.sb_timeo, so, count); + count++; + err = sbwait(&so->so_rcv); + } + ubuf_pending = t3_ddp_ubuf_pending(toep); + } + p->cancel_ubuf = 0; +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \ + V_TF_DDP_PSH_NO_INVALIDATE1(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct socket *so, const struct uio *uio, + int nonblock, int rcv_flags, int modulate, int post_kbuf) +{ + int err, len, ubuf_idx; + unsigned long flags; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + if (p->kbuf[0] == NULL) { + return (EINVAL); + } + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + err = setup_uio_ppods(so, uio, 0, &len); + if (err) { + return (err); + } + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. */ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? 
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + CTR3(KTR_TOM, + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x", + p->ubuf_tag, flags, OVERLAY_MASK); + CTR3(KTR_TOM, + "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d", + ubuf_idx, p->kbuf_idx, post_kbuf); + + return (0); +} + +/* + * Clean up DDP state that needs to survive until socket close time, such as the + * DDP buffers. The buffers are already unmapped at this point as unmapping + * needs the PCI device and a socket may close long after the device is removed. + */ +void +t3_cleanup_ddp(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) + if (p->kbuf[idx]) { + ddp_gl_free_pages(p->kbuf[idx], 0); + free(p->kbuf[idx], M_DEVBUF); + } + if (p->ubuf) { + ddp_gl_free_pages(p->ubuf, 0); + free(p->ubuf, M_DEVBUF); + p->ubuf = NULL; + } + toep->tp_ulp_mode = 0; +} + +/* + * This is a companion to t3_cleanup_ddp() and releases the HW resources + * associated with a connection's DDP state, such as the page pods. + * It's called when HW is done with a connection. The rest of the state + * remains available until both HW and the app are done with the connection. 
+ */ +void +t3_release_ddp_resources(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + t3_free_ppods(d, p->kbuf_tag[idx], + p->kbuf_nppods[idx]); + unmap_ddp_gl(p->kbuf[idx]); + } + + if (p->ubuf_nppods) { + t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); + p->ubuf_nppods = 0; + } + if (p->ubuf) + unmap_ddp_gl(p->ubuf); + +} + +void +t3_post_kbuf(struct socket *so, int modulate, int nonblock) +{ + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + t3_set_ddp_tag(so, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); + t3_set_ddp_buf(so, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); + t3_repost_kbuf(so, p->cur_buf, modulate, 1, nonblock); +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +#endif + CTR1(KTR_TOM, + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +} + +/* + * Prepare a socket for DDP. Must be called when the socket is known to be + * open. 
+ */ +int +t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock) +{ + int i, err = ENOMEM; + static vm_pindex_t color; + unsigned int nppods, kbuf_pages, idx = 0; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + + + if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) + return (EINVAL); + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + + kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = pages2ppods(kbuf_pages); + + p->kbuf_noinval = !!waitall; + p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + p->kbuf[idx] = + malloc(sizeof (struct ddp_gather_list) + kbuf_pages * + sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); + if (p->kbuf[idx] == NULL) + goto err; + err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]); + if (err) { + printf("t3_alloc_ppods failed err=%d\n", err); + goto err; + } + + p->kbuf_nppods[idx] = nppods; + p->kbuf[idx]->dgl_length = kbuf_size; + p->kbuf[idx]->dgl_offset = 0; + p->kbuf[idx]->dgl_nelem = kbuf_pages; + + for (i = 0; i < kbuf_pages; ++i) { + p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color, + VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (p->kbuf[idx]->dgl_pages[i] == NULL) { + p->kbuf[idx]->dgl_nelem = i; + printf("failed to allocate kbuf pages\n"); + goto err; + } + } +#ifdef NEED_BUSDMA + /* + * XXX we'll need this for VT-d or any platform with an iommu :-/ + * + */ + for (i = 0; i < kbuf_pages; ++i) + p->kbuf[idx]->phys_addr[i] = + pci_map_page(p->pdev, p->kbuf[idx]->pages[i], + 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); +#endif + t3_setup_ppods(so, p->kbuf[idx], nppods, p->kbuf_tag[idx], + p->kbuf[idx]->dgl_length, 0, 0); + } + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + + t3_set_ddp_tag(so, 0, p->kbuf_tag[0] << 6); + t3_set_ddp_buf(so, 0, 0, p->kbuf[0]->dgl_length); + t3_repost_kbuf(so, 0, 0, 1, nonblock); + + 
t3_set_rcv_coalesce_enable(so, + TOM_TUNABLE(TOE_DEV(so), ddp_rcvcoalesce)); + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(so), + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); +#endif + CTR4(KTR_TOM, + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); + DELAY(100000); + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + return (0); + +err: + t3_release_ddp_resources(toep); + t3_cleanup_ddp(toep); + return (err); +} + +int +t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len) +{ + int page_off, resid_init, err; + struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl; + + resid_init = uio->uio_resid; + + if (!gl->dgl_pages) + panic("pages not set\n"); + + offset += gl->dgl_offset + m->m_cur_offset; + page_off = offset & PAGE_MASK; + KASSERT(len <= gl->dgl_length, + ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length)); + + err = uiomove_fromphys(gl->dgl_pages, page_off, len, uio); + return (err); +} + + +/* + * Allocate n page pods. Returns -1 on failure or the page pod tag. 
+ */ +int +t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag) +{ + unsigned int i, j; + + if (__predict_false(!td->ppod_map)) { + printf("ppod_map not set\n"); + return (EINVAL); + } + + mtx_lock(&td->ppod_map_lock); + for (i = 0; i < td->nppods; ) { + + for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ + if (td->ppod_map[i + j]) { + i = i + j + 1; + goto next; + } + memset(&td->ppod_map[i], 1, n); /* allocate range */ + mtx_unlock(&td->ppod_map_lock); + CTR2(KTR_TOM, + "t3_alloc_ppods: n=%u tag=%u", n, i); + *ptag = i; + return (0); + next: ; + } + mtx_unlock(&td->ppod_map_lock); + return (0); +} + +void +t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) +{ + /* No need to take ppod_lock here */ + memset(&td->ppod_map[tag], 0, n); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h index 9077295..8989fd9 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_defs.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h @@ -40,6 +40,13 @@ $FreeBSD$ #define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) #define sototoep(so) (sototcpcb((so))->t_toe) +#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__) + +#define KTR_TOM KTR_SPARE2 +#define KTR_TCB KTR_SPARE3 + +struct toepcb; struct listen_ctx; typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); @@ -54,7 +61,8 @@ void t3_init_listen_cpl_handlers(void); int t3_init_cpl_io(void); void t3_init_wr_tab(unsigned int wr_len); uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); -void t3_cleanup_rbuf(struct tcpcb *tp); +void t3_send_rx_modulate(struct toepcb *toep); +void t3_cleanup_rbuf(struct tcpcb *tp, int copied); void t3_init_socket_ops(void); void t3_install_socket_ops(struct socket *so); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index a88b26e..acbad6f 100644 --- 
a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -180,7 +180,6 @@ listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) return p; } -#if 0 /* * Given a pointer to a listening socket return its server TID by consulting * the socket->stid map. Returns -1 if the socket is not in the map. @@ -191,16 +190,15 @@ listen_hash_find(struct tom_data *d, struct socket *so) int stid = -1, bucket = listen_hashfn(so); struct listen_info *p; - spin_lock(&d->listen_lock); + mtx_lock(&d->listen_lock); for (p = d->listen_hash_tab[bucket]; p; p = p->next) - if (p->sk == sk) { + if (p->so == so) { stid = p->stid; break; } - spin_unlock(&d->listen_lock); + mtx_unlock(&d->listen_lock); return stid; } -#endif /* * Delete the listen_info structure for a listening socket. Returns the server @@ -244,28 +242,24 @@ t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev) if (!TOM_TUNABLE(dev, activated)) return; - printf("start listen\n"); + if (listen_hash_find(d, so) != -1) + return; - ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT); + CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport)); + ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO); if (!ctx) return; ctx->tom_data = d; ctx->lso = so; - ctx->ulp_mode = 0; /* DDP if the default */ + ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) ? ULP_MODE_TCPDDP : 0; LIST_INIT(&ctx->synq_head); stid = cxgb_alloc_stid(d->cdev, d->client, ctx); if (stid < 0) goto free_ctx; -#ifdef notyet - /* - * XXX need to mark inpcb as referenced - */ - sock_hold(sk); -#endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) goto free_stid; diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h index 9fa42b5..e37c9b1 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. 
@@ -86,7 +85,6 @@ struct pagepod { #define M_PPOD_PGSZ 0x3 #define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) -struct pci_dev; #include <vm/vm.h> #include <vm/vm_page.h> #include <machine/bus.h> @@ -96,8 +94,7 @@ struct ddp_gather_list { unsigned int dgl_length; unsigned int dgl_offset; unsigned int dgl_nelem; - vm_page_t *dgl_pages; - bus_addr_t dgl_phys_addr[0]; + vm_page_t dgl_pages[0]; }; struct ddp_buf_state { @@ -107,7 +104,6 @@ struct ddp_buf_state { }; struct ddp_state { - struct pci_dev *pdev; struct ddp_buf_state buf_state[2]; /* per buffer state */ int cur_buf; unsigned short kbuf_noinval; @@ -119,6 +115,7 @@ struct ddp_state { int get_tcb_count; unsigned int kbuf_posted; int cancel_ubuf; + int user_ddp_pending; unsigned int kbuf_nppods[NUM_DDP_KBUF]; unsigned int kbuf_tag[NUM_DDP_KBUF]; struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ @@ -132,54 +129,51 @@ enum { DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was completed with a segment having the PSH flag set */ + DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */ }; -#ifdef notyet +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> + /* * Returns 1 if a UBUF DMA buffer might be active. */ -static inline int t3_ddp_ubuf_pending(struct sock *so) +static inline int +t3_ddp_ubuf_pending(struct toepcb *toep) { - struct tcp_sock *tp = tcp_sk(sk); - struct ddp_state *p = DDP_STATE(tp); + struct ddp_state *p = &toep->tp_ddp_state; /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, * but DDP_STATE() is only valid if the connection actually enabled * DDP. 
*/ - if (!p) - return 0; + if (p->kbuf[0] == NULL) + return (0); return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); } -#endif int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, unsigned int nppods, unsigned int tag, unsigned int maxoff, unsigned int pg_off, unsigned int color); -int t3_alloc_ppods(struct tom_data *td, unsigned int n); +int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); -void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl); -int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len, - struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl); -int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, - int len); +void t3_free_ddp_gl(struct ddp_gather_list *gl); +int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); //void t3_repost_kbuf(struct socket *so, int modulate, int activate); -void t3_post_kbuf(struct socket *so, int modulate); -int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock, +void t3_post_kbuf(struct socket *so, int modulate, int nonblock); +int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock, int rcv_flags, int modulate, int post_kbuf); -void t3_cancel_ubuf(struct socket *so); -int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock, - int rcv_flags, int modulate, int post_kbuf); -int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall); -void t3_cleanup_ddp(struct socket *so); +void t3_cancel_ubuf(struct toepcb *toep); +int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock); +void t3_cleanup_ddp(struct toepcb *toep); void 
t3_release_ddp_resources(struct toepcb *toep); -void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx); -void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0, +void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); +void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, unsigned int tag1, unsigned int len); -void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0, +void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0, unsigned int len1, unsigned int offset1, uint64_t ddp_flags, uint64_t flag_mask, int modulate); #endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c deleted file mode 100644 index 2eca099..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c +++ /dev/null @@ -1,694 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_compat.h" -#include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_ipsec.h" -#include "opt_mac.h" -#include "opt_tcpdebug.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/callout.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#ifdef INET6 -#include <sys/domain.h> -#endif -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> -#include <sys/random.h> - -#include <vm/uma.h> - -#include <net/route.h> -#include <net/if.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> -#ifdef INET6 -#include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 -#include <netinet6/ip6_var.h> -#include <netinet6/scope6_var.h> -#include <netinet6/nd6.h> -#endif -#include <netinet/ip_icmp.h> -#include <netinet/tcp.h> -#include <netinet/tcp_fsm.h> -#include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> -#include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> -#ifdef INET6 -#include <netinet6/tcp6_var.h> -#endif -#include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif 
-#include <netinet6/ip6protosw.h> - -#ifdef IPSEC -#include <netipsec/ipsec.h> -#include <netipsec/xform.h> -#ifdef INET6 -#include <netipsec/ipsec6.h> -#endif -#include <netipsec/key.h> -#endif /*IPSEC*/ - -#include <machine/in_cksum.h> -#include <sys/md5.h> - -#include <security/mac/mac_framework.h> - -#include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - -SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "chelsio TOE"); - -static int tcp_log_debug = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW, - &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); - -static int tcp_tcbhashsize = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, - &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); - -static int do_tcpdrain = 1; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW, - &do_tcpdrain, 0, - "Enable tcp_drain routine for extra help when low on mbufs"); - -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD, - &tcbinfo.ipi_count, 0, "Number of active PCBs"); - -static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW, - &icmp_may_rst, 0, - "Certain ICMP unreachable messages may abort connections in SYN_SENT"); - -static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, - &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); - -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. 
- */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static int tcp_inflight_enable = 1; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); - -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static int tcp_inflight_min = 6144; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); - -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); - -static int tcp_inflight_stab = 20; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); - -uma_zone_t sack_hole_zone; - -static struct inpcb *tcp_notify(struct inpcb *, int); -static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno); - -/* - * Target size of TCP PCB hash tables. Must be a power of two. - * - * Note that this can be overridden by the kernel environment - * variable net.inet.tcp.tcbhashsize - */ -#ifndef TCBHASHSIZE -#define TCBHASHSIZE 512 -#endif - -/* - * XXX - * Callouts should be moved into struct tcp directly. They are currently - * separate because the tcpcb structure is exported to userland for sysctl - * parsing purposes, which do not know about callouts. 
- */ -struct tcpcb_mem { - struct tcpcb tcb; - struct tcp_timer tt; -}; - -MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); - -/* - * Drop a TCP connection, reporting - * the specified error. If connection is synchronized, - * then send a RST to peer. - */ -struct tcpcb * -cxgb_tcp_drop(struct tcpcb *tp, int errno) -{ - struct socket *so = tp->t_inpcb->inp_socket; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); - - if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_state = TCPS_CLOSED; - (void) tcp_gen_reset(tp); - tcpstat.tcps_drops++; - } else - tcpstat.tcps_conndrops++; - if (errno == ETIMEDOUT && tp->t_softerror) - errno = tp->t_softerror; - so->so_error = errno; - return (cxgb_tcp_close(tp)); -} - -/* - * Attempt to close a TCP control block, marking it as dropped, and freeing - * the socket if we hold the only reference. - */ -struct tcpcb * -cxgb_tcp_close(struct tcpcb *tp) -{ - struct inpcb *inp = tp->t_inpcb; - struct socket *so; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (tp->t_state == TCPS_LISTEN) - tcp_gen_listen_close(tp); - in_pcbdrop(inp); - tcpstat.tcps_closed++; - KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); - so = inp->inp_socket; - soisdisconnected(so); - if (inp->inp_vflag & INP_SOCKREF) { - KASSERT(so->so_state & SS_PROTOREF, - ("tcp_close: !SS_PROTOREF")); - inp->inp_vflag &= ~INP_SOCKREF; - INP_UNLOCK(inp); - ACCEPT_LOCK(); - SOCK_LOCK(so); - so->so_state &= ~SS_PROTOREF; - sofree(so); - return (NULL); - } - return (tp); -} - -/* - * Notify a tcp user of an asynchronous error; - * store error as soft error, but wake up user - * (for now, won't do anything until can select for soft error). - * - * Do not wake up user since there currently is no mechanism for - * reporting soft errors (yet - a kqueue filter may be added). 
- */ -static struct inpcb * -tcp_notify(struct inpcb *inp, int error) -{ - struct tcpcb *tp; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if ((inp->inp_vflag & INP_TIMEWAIT) || - (inp->inp_vflag & INP_DROPPED)) - return (inp); - - tp = intotcpcb(inp); - KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); - - /* - * Ignore some errors if we are hooked up. - * If connection hasn't completed, has retransmitted several times, - * and receives a second error, give up now. This is better - * than waiting a long time to establish a connection that - * can never complete. - */ - if (tp->t_state == TCPS_ESTABLISHED && - (error == EHOSTUNREACH || error == ENETUNREACH || - error == EHOSTDOWN)) { - return (inp); - } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && - tp->t_softerror) { - tp = cxgb_tcp_drop(tp, error); - if (tp != NULL) - return (inp); - else - return (NULL); - } else { - tp->t_softerror = error; - return (inp); - } -#if 0 - wakeup( &so->so_timeo); - sorwakeup(so); - sowwakeup(so); -#endif -} - -void -cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) -{ - struct ip *ip = vip; - struct tcphdr *th; - struct in_addr faddr; - struct inpcb *inp; - struct tcpcb *tp; - struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; - struct icmp *icp; - struct in_conninfo inc; - tcp_seq icmp_tcp_seq; - int mtu; - - faddr = ((struct sockaddr_in *)sa)->sin_addr; - if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) - return; - - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc; - else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || - cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) - notify = cxgb_tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) - return; - /* - * Source quench is depreciated. - */ - else if (cmd == PRC_QUENCH) - return; - /* - * Hostdead is ugly because it goes linearly through all PCBs. 
- * XXX: We never get this from ICMP, otherwise it makes an - * excellent DoS attack on machines with many connections. - */ - else if (cmd == PRC_HOSTDEAD) - ip = NULL; - else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) - return; - if (ip != NULL) { - icp = (struct icmp *)((caddr_t)ip - - offsetof(struct icmp, icmp_ip)); - th = (struct tcphdr *)((caddr_t)ip - + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&tcbinfo); - inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); - if (inp != NULL) { - INP_LOCK(inp); - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED) && - !(inp->inp_socket == NULL)) { - icmp_tcp_seq = htonl(th->th_seq); - tp = intotcpcb(inp); - if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && - SEQ_LT(icmp_tcp_seq, tp->snd_max)) { - if (cmd == PRC_MSGSIZE) { - /* - * MTU discovery: - * If we got a needfrag set the MTU - * in the route to the suggested new - * value (if given) and then notify. - */ - bzero(&inc, sizeof(inc)); - inc.inc_flags = 0; /* IPv4 */ - inc.inc_faddr = faddr; - - mtu = ntohs(icp->icmp_nextmtu); - /* - * If no alternative MTU was - * proposed, try the next smaller - * one. ip->ip_len has already - * been swapped in icmp_input(). - */ - if (!mtu) - mtu = ip_next_mtu(ip->ip_len, - 1); - if (mtu < max(296, (tcp_minmss) - + sizeof(struct tcpiphdr))) - mtu = 0; - if (!mtu) - mtu = tcp_mssdflt - + sizeof(struct tcpiphdr); - /* - * Only cache the the MTU if it - * is smaller than the interface - * or route MTU. tcp_mtudisc() - * will do right thing by itself. 
- */ - if (mtu <= tcp_maxmtu(&inc, NULL)) - tcp_hc_updatemtu(&inc, mtu); - } - - inp = (*notify)(inp, inetctlerrmap[cmd]); - } - } - if (inp != NULL) - INP_UNLOCK(inp); - } else { - inc.inc_fport = th->th_dport; - inc.inc_lport = th->th_sport; - inc.inc_faddr = faddr; - inc.inc_laddr = ip->ip_src; -#ifdef INET6 - inc.inc_isipv6 = 0; -#endif - syncache_unreach(&inc, th); - } - INP_INFO_WUNLOCK(&tcbinfo); - } else - in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); -} - -#ifdef INET6 -void -tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) -{ - struct tcphdr th; - struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; - struct ip6_hdr *ip6; - struct mbuf *m; - struct ip6ctlparam *ip6cp = NULL; - const struct sockaddr_in6 *sa6_src = NULL; - int off; - struct tcp_portonly { - u_int16_t th_sport; - u_int16_t th_dport; - } *thp; - - if (sa->sa_family != AF_INET6 || - sa->sa_len != sizeof(struct sockaddr_in6)) - return; - - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc; - else if (!PRC_IS_REDIRECT(cmd) && - ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) - return; - /* Source quench is depreciated. */ - else if (cmd == PRC_QUENCH) - return; - - /* if the parameter is from icmp6, decode it. */ - if (d != NULL) { - ip6cp = (struct ip6ctlparam *)d; - m = ip6cp->ip6c_m; - ip6 = ip6cp->ip6c_ip6; - off = ip6cp->ip6c_off; - sa6_src = ip6cp->ip6c_src; - } else { - m = NULL; - ip6 = NULL; - off = 0; /* fool gcc */ - sa6_src = &sa6_any; - } - - if (ip6 != NULL) { - struct in_conninfo inc; - /* - * XXX: We assume that when IPV6 is non NULL, - * M and OFF are valid. 
- */ - - /* check if we can safely examine src and dst ports */ - if (m->m_pkthdr.len < off + sizeof(*thp)) - return; - - bzero(&th, sizeof(th)); - m_copydata(m, off, sizeof(*thp), (caddr_t)&th); - - in6_pcbnotify(&tcbinfo, sa, th.th_dport, - (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, NULL, notify); - - inc.inc_fport = th.th_dport; - inc.inc_lport = th.th_sport; - inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; - inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; - inc.inc_isipv6 = 1; - INP_INFO_WLOCK(&tcbinfo); - syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&tcbinfo); - } else - in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, NULL, notify); -} -#endif /* INET6 */ - - -/* - * Following is where TCP initial sequence number generation occurs. - * - * There are two places where we must use initial sequence numbers: - * 1. In SYN-ACK packets. - * 2. In SYN packets. - * - * All ISNs for SYN-ACK packets are generated by the syncache. See - * tcp_syncache.c for details. - * - * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling - * depends on this property. In addition, these ISNs should be - * unguessable so as to prevent connection hijacking. To satisfy - * the requirements of this situation, the algorithm outlined in - * RFC 1948 is used, with only small modifications. - * - * Implementation details: - * - * Time is based off the system timer, and is corrected so that it - * increases by one megabyte per second. This allows for proper - * recycling on high speed LANs while still leaving over an hour - * before rollover. - * - * As reading the *exact* system time is too expensive to be done - * whenever setting up a TCP connection, we increment the time - * offset in two ways. First, a small random positive increment - * is added to isn_offset for each connection that is set up. 
- * Second, the function tcp_isn_tick fires once per clock tick - * and increments isn_offset as necessary so that sequence numbers - * are incremented at approximately ISN_BYTES_PER_SECOND. The - * random positive increments serve only to ensure that the same - * exact sequence number is never sent out twice (as could otherwise - * happen when a port is recycled in less than the system tick - * interval.) - * - * net.inet.tcp.isn_reseed_interval controls the number of seconds - * between seeding of isn_secret. This is normally set to zero, - * as reseeding should not be necessary. - * - * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, - * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In - * general, this means holding an exclusive (write) lock. - */ - -#define ISN_BYTES_PER_SECOND 1048576 -#define ISN_STATIC_INCREMENT 4096 -#define ISN_RANDOM_INCREMENT (4096 - 1) - - -/* - * When a specific ICMP unreachable message is received and the - * connection state is SYN-SENT, drop the connection. This behavior - * is controlled by the icmp_may_rst sysctl. - */ -static struct inpcb * -cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno) -{ - struct tcpcb *tp; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if ((inp->inp_vflag & INP_TIMEWAIT) || - (inp->inp_vflag & INP_DROPPED)) - return (inp); - - tp = intotcpcb(inp); - if (tp->t_state != TCPS_SYN_SENT) - return (inp); - - tp = cxgb_tcp_drop(tp, errno); - if (tp != NULL) - return (inp); - else - return (NULL); -} - -static int -cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS) -{ - /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ - struct sockaddr_storage addrs[2]; - struct inpcb *inp; - struct tcpcb *tp; - struct tcptw *tw; - struct sockaddr_in *fin, *lin; -#ifdef INET6 - struct sockaddr_in6 *fin6, *lin6; - struct in6_addr f6, l6; -#endif - int error; - - inp = NULL; - fin = lin = NULL; -#ifdef INET6 - fin6 = lin6 = NULL; -#endif - error = 0; - - if (req->oldptr != NULL || req->oldlen != 0) - return (EINVAL); - if (req->newptr == NULL) - return (EPERM); - if (req->newlen < sizeof(addrs)) - return (ENOMEM); - error = SYSCTL_IN(req, &addrs, sizeof(addrs)); - if (error) - return (error); - - switch (addrs[0].ss_family) { -#ifdef INET6 - case AF_INET6: - fin6 = (struct sockaddr_in6 *)&addrs[0]; - lin6 = (struct sockaddr_in6 *)&addrs[1]; - if (fin6->sin6_len != sizeof(struct sockaddr_in6) || - lin6->sin6_len != sizeof(struct sockaddr_in6)) - return (EINVAL); - if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { - if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) - return (EINVAL); - in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); - in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); - fin = (struct sockaddr_in *)&addrs[0]; - lin = (struct sockaddr_in *)&addrs[1]; - break; - } - error = sa6_embedscope(fin6, ip6_use_defzone); - if (error) - return (error); - error = sa6_embedscope(lin6, ip6_use_defzone); - if (error) - return (error); - break; -#endif - case AF_INET: - fin = (struct sockaddr_in *)&addrs[0]; - lin = (struct sockaddr_in *)&addrs[1]; - if (fin->sin_len != sizeof(struct sockaddr_in) || - lin->sin_len != sizeof(struct sockaddr_in)) - return (EINVAL); - break; - default: - return (EINVAL); - } - INP_INFO_WLOCK(&tcbinfo); - switch (addrs[0].ss_family) { -#ifdef INET6 - case AF_INET6: - inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, - &l6, lin6->sin6_port, 0, NULL); - break; -#endif - case AF_INET: - inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, - lin->sin_addr, lin->sin_port, 0, NULL); - break; - } - if (inp != NULL) { - INP_LOCK(inp); - if (inp->inp_vflag & 
INP_TIMEWAIT) { - /* - * XXXRW: There currently exists a state where an - * inpcb is present, but its timewait state has been - * discarded. For now, don't allow dropping of this - * type of inpcb. - */ - tw = intotw(inp); - if (tw != NULL) - tcp_twclose(tw, 0); - else - INP_UNLOCK(inp); - } else if (!(inp->inp_vflag & INP_DROPPED) && - !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { - tp = intotcpcb(inp); - tp = cxgb_tcp_drop(tp, ECONNABORTED); - if (tp != NULL) - INP_UNLOCK(inp); - } else - INP_UNLOCK(inp); - } else - error = ESRCH; - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop, - CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, - 0, cxgb_sysctl_drop, "", "Drop TCP connection"); - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c deleted file mode 100644 index bd940b2..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c +++ /dev/null @@ -1,1362 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1988, 1993 - * The Regents of the University of California. - * Copyright (c) 2006-2007 Robert N. M. Watson - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_ddb.h" -#include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_tcpdebug.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/mbuf.h> -#ifdef INET6 -#include <sys/domain.h> -#endif /* INET6 */ -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> -#include <sys/proc.h> -#include <sys/jail.h> - -#ifdef DDB -#include <ddb/ddb.h> -#endif - -#include <net/if.h> -#include <net/route.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#ifdef INET6 -#include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 -#include <netinet6/ip6_var.h> -#include <netinet6/scope6_var.h> -#endif -#include <netinet/tcp.h> -#include <netinet/tcp_fsm.h> -#include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> -#include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif -#include 
<netinet/tcp_offload.h> -#include <dev/cxgb/ulp/tom/cxgb_tcp.h> - - -/* - * TCP protocol interface to socket abstraction. - */ -static int tcp_attach(struct socket *); -static int tcp_connect(struct tcpcb *, struct sockaddr *, - struct thread *td); -#ifdef INET6 -static int tcp6_connect(struct tcpcb *, struct sockaddr *, - struct thread *td); -#endif /* INET6 */ -static void tcp_disconnect(struct tcpcb *); -static void tcp_usrclosed(struct tcpcb *); - -#ifdef TCPDEBUG -#define TCPDEBUG0 int ostate = 0 -#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 -#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ - tcp_trace(TA_USER, ostate, tp, 0, 0, req) -#else -#define TCPDEBUG0 -#define TCPDEBUG1() -#define TCPDEBUG2(req) -#endif - -/* - * TCP attaches to socket via pru_attach(), reserving space, - * and an internet control block. - */ -static int -tcp_usr_attach(struct socket *so, int proto, struct thread *td) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); - TCPDEBUG1(); - - error = tcp_attach(so); - if (error) - goto out; - - if ((so->so_options & SO_LINGER) && so->so_linger == 0) - so->so_linger = TCP_LINGERTIME; - - inp = sotoinpcb(so); - tp = intotcpcb(inp); -out: - TCPDEBUG2(PRU_ATTACH); - return error; -} - -/* - * tcp_detach is called when the socket layer loses its final reference - * to the socket, be it a file descriptor reference, a reference from TCP, - * etc. At this point, there is only one case in which we will keep around - * inpcb state: time wait. - * - * This function can probably be re-absorbed back into tcp_usr_detach() now - * that there is a single detach path. 
- */ -static void -tcp_detach(struct socket *so, struct inpcb *inp) -{ - struct tcpcb *tp; -#ifdef INET6 - int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; -#endif - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); - KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); - - tp = intotcpcb(inp); - - if (inp->inp_vflag & INP_TIMEWAIT) { - /* - * There are two cases to handle: one in which the time wait - * state is being discarded (INP_DROPPED), and one in which - * this connection will remain in timewait. In the former, - * it is time to discard all state (except tcptw, which has - * already been discarded by the timewait close code, which - * should be further up the call stack somewhere). In the - * latter case, we detach from the socket, but leave the pcb - * present until timewait ends. - * - * XXXRW: Would it be cleaner to free the tcptw here? - */ - if (inp->inp_vflag & INP_DROPPED) { - KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " - "INP_DROPPED && tp != NULL")); -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - } else { -#ifdef INET6 - if (isipv6) - in6_pcbdetach(inp); - else -#endif - in_pcbdetach(inp); - INP_UNLOCK(inp); - } - } else { - /* - * If the connection is not in timewait, we consider two - * two conditions: one in which no further processing is - * necessary (dropped || embryonic), and one in which TCP is - * not yet done, but no longer requires the socket, so the - * pcb will persist for the time being. - * - * XXXRW: Does the second case still occur? 
- */ - if (inp->inp_vflag & INP_DROPPED || - tp->t_state < TCPS_SYN_SENT) { - tcp_discardcb(tp); -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - } else { -#ifdef INET6 - if (isipv6) - in6_pcbdetach(inp); - else -#endif - in_pcbdetach(inp); - } - } -} - -/* - * pru_detach() detaches the TCP protocol from the socket. - * If the protocol state is non-embryonic, then can't - * do this directly: have to initiate a pru_disconnect(), - * which may finish later; embryonic TCB's can just - * be discarded here. - */ -static void -tcp_usr_detach(struct socket *so) -{ - struct inpcb *inp; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_detach: inp_socket == NULL")); - tcp_detach(so, inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * Give the socket an address. - */ -static int -tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in *sinp; - - sinp = (struct sockaddr_in *)nam; - if (nam->sa_len != sizeof (*sinp)) - return (EINVAL); - /* - * Must check for multicast addresses and disallow binding - * to them. 
- */ - if (sinp->sin_family == AF_INET && - IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - error = in_pcbbind(inp, nam, td->td_ucred); -out: - TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; - - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) - return (EINVAL); - /* - * Must check for multicast addresses and disallow binding - * to them. - */ - if (sin6p->sin6_family == AF_INET6 && - IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) - return (EAFNOSUPPORT); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { - if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) - inp->inp_vflag |= INP_IPV4; - else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { - struct sockaddr_in sin; - - in6_sin6_2_sin(&sin, sin6p); - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; - error = in_pcbbind(inp, (struct sockaddr *)&sin, - td->td_ucred); - goto out; - } - } - error = in6_pcbbind(inp, nam, td->td_ucred); -out: - TCPDEBUG2(PRU_BIND); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Prepare to accept connections. 
- */ -static int -tcp_usr_listen(struct socket *so, int backlog, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0 && inp->inp_lport == 0) - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error == 0) { - tp->t_state = TCPS_LISTEN; - solisten_proto(so, backlog); - tcp_gen_listen_open(tp); - } - SOCK_UNLOCK(so); - -out: - TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0 && inp->inp_lport == 0) { - inp->inp_vflag &= ~INP_IPV4; - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) - inp->inp_vflag |= INP_IPV4; - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - } - if (error == 0) { - tp->t_state = TCPS_LISTEN; - solisten_proto(so, backlog); - } - SOCK_UNLOCK(so); - -out: - TCPDEBUG2(PRU_LISTEN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Initiate connection to peer. - * Create a template for use in transmissions on this connection. - * Enter SYN_SENT state, and mark socket as connecting. - * Start keep-alive timer, and seed output sequence space. 
- * Send initial segment on connection. - */ -static int -tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in *sinp; - - sinp = (struct sockaddr_in *)nam; - if (nam->sa_len != sizeof (*sinp)) - return (EINVAL); - /* - * Must disallow TCP ``connections'' to multicast addresses. - */ - if (sinp->sin_family == AF_INET - && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) - return (EAFNOSUPPORT); - if (jailed(td->td_ucred)) - prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if ((error = tcp_connect(tp, nam, td)) != 0) - goto out; - printf("calling tcp_gen_connect\n"); - - error = tcp_gen_connect(so, nam); -out: - TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -#ifdef INET6 -static int -tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - struct sockaddr_in6 *sin6p; - - TCPDEBUG0; - - sin6p = (struct sockaddr_in6 *)nam; - if (nam->sa_len != sizeof (*sin6p)) - return (EINVAL); - /* - * Must disallow TCP ``connections'' to multicast addresses. 
- */ - if (sin6p->sin6_family == AF_INET6 - && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) - return (EAFNOSUPPORT); - - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { - struct sockaddr_in sin; - - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { - error = EINVAL; - goto out; - } - - in6_sin6_2_sin(&sin, sin6p); - inp->inp_vflag |= INP_IPV4; - inp->inp_vflag &= ~INP_IPV6; - if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) - goto out; - error = tcp_gen_connect(so, nam); - goto out; - } - inp->inp_vflag &= ~INP_IPV4; - inp->inp_vflag |= INP_IPV6; - inp->inp_inc.inc_isipv6 = 1; - if ((error = tcp6_connect(tp, nam, td)) != 0) - goto out; - error = tcp_gen_connect(so, nam); - -out: - TCPDEBUG2(PRU_CONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} -#endif /* INET6 */ - -/* - * Initiate disconnect from peer. - * If connection never passed embryonic stage, just drop; - * else if don't need to let data drain, then can just drop anyways, - * else have to begin TCP shutdown process: mark socket disconnecting, - * drain unread data, state switch to reflect user close, and - * send segment (e.g. FIN) to peer. Socket will be really disconnected - * when peer sends FIN and acks ours. - * - * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 
- */ -static int -tcp_usr_disconnect(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error = 0; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_disconnect(tp); -out: - TCPDEBUG2(PRU_DISCONNECT); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -/* - * Accept a connection. Essentially all the work is - * done at higher levels; just return the address - * of the peer, storing through addr. - */ -static int -tcp_usr_accept(struct socket *so, struct sockaddr **nam) -{ - int error = 0; - struct inpcb *inp = NULL; - struct tcpcb *tp = NULL; - struct in_addr addr; - in_port_t port = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - - /* - * We inline in_getpeeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. 
- */ - port = inp->inp_fport; - addr = inp->inp_faddr; - -out: - TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); - if (error == 0) - *nam = in_sockaddr(port, &addr); - return error; -} - -#ifdef INET6 -static int -tcp6_usr_accept(struct socket *so, struct sockaddr **nam) -{ - struct inpcb *inp = NULL; - int error = 0; - struct tcpcb *tp = NULL; - struct in_addr addr; - struct in6_addr addr6; - in_port_t port = 0; - int v4 = 0; - TCPDEBUG0; - - if (so->so_state & SS_ISDISCONNECTED) - return (ECONNABORTED); - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNABORTED; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - - /* - * We inline in6_mapped_peeraddr and COMMON_END here, so that we can - * copy the data of interest and defer the malloc until after we - * release the lock. - */ - if (inp->inp_vflag & INP_IPV4) { - v4 = 1; - port = inp->inp_fport; - addr = inp->inp_faddr; - } else { - port = inp->inp_fport; - addr6 = inp->in6p_faddr; - } - -out: - TCPDEBUG2(PRU_ACCEPT); - INP_UNLOCK(inp); - if (error == 0) { - if (v4) - *nam = in6_v4mapsin6_sockaddr(port, &addr); - else - *nam = in6_sockaddr(port, &addr6); - } - return error; -} -#endif /* INET6 */ - -/* - * Mark the connection as being incapable of further output. - */ -static int -tcp_usr_shutdown(struct socket *so) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - INP_INFO_WLOCK(&tcbinfo); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - socantsendmore(so); - tcp_usrclosed(tp); - error = tcp_gen_disconnect(tp); - -out: - TCPDEBUG2(PRU_SHUTDOWN); - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - - return (error); -} - -/* - * After a receive, possibly send window update to peer. 
- */ -static int -tcp_usr_rcvd(struct socket *so, int flags) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - int error = 0; - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_gen_rcvd(tp); - -out: - TCPDEBUG2(PRU_RCVD); - INP_UNLOCK(inp); - return (error); -} - -/* - * Do a send by putting data in output queue and updating urgent - * marker if URG set. Possibly send more data. Unlike the other - * pru_*() routines, the mbuf chains are our responsibility. We - * must either enqueue them or free them. The other pru_* routines - * generally are caller-frees. - */ -static int -tcp_usr_send(struct socket *so, int flags, struct mbuf *m, - struct sockaddr *nam, struct mbuf *control, struct thread *td) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - int headlocked = 0; -#ifdef INET6 - int isipv6; -#endif - TCPDEBUG0; - - /* - * We require the pcbinfo lock in two cases: - * - * (1) An implied connect is taking place, which can result in - * binding IPs and ports and hence modification of the pcb hash - * chains. - * - * (2) PRUS_EOF is set, resulting in explicit close on the send. 
- */ - if ((nam != NULL) || (flags & PRUS_EOF)) { - INP_INFO_WLOCK(&tcbinfo); - headlocked = 1; - } - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - if (control) - m_freem(control); - if (m) - m_freem(m); - error = ECONNRESET; - goto out; - } -#ifdef INET6 - isipv6 = nam && nam->sa_family == AF_INET6; -#endif /* INET6 */ - tp = intotcpcb(inp); - TCPDEBUG1(); - if (control) { - /* TCP doesn't do control messages (rights, creds, etc) */ - if (control->m_len) { - m_freem(control); - if (m) - m_freem(m); - error = EINVAL; - goto out; - } - m_freem(control); /* empty control, just free it */ - } - if (!(flags & PRUS_OOB)) { - sbappendstream(&so->so_snd, m); - if (nam && tp->t_state < TCPS_SYN_SENT) { - /* - * Do implied connect if not yet connected, - * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); -#ifdef INET6 - if (isipv6) - error = tcp6_connect(tp, nam, td); - else -#endif /* INET6 */ - error = tcp_connect(tp, nam, td); - if (error) - goto out; - tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); - } - if (flags & PRUS_EOF) { - /* - * Close the send side of the connection after - * the data is sent. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); - socantsendmore(so); - tcp_usrclosed(tp); - } - if (headlocked) { - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } - if (tp != NULL) { - if (flags & PRUS_MORETOCOME) - tp->t_flags |= TF_MORETOCOME; - error = tcp_gen_send(tp); - if (flags & PRUS_MORETOCOME) - tp->t_flags &= ~TF_MORETOCOME; - } - } else { - /* - * XXXRW: PRUS_EOF not implemented with PRUS_OOB? - */ - SOCKBUF_LOCK(&so->so_snd); - if (sbspace(&so->so_snd) < -512) { - SOCKBUF_UNLOCK(&so->so_snd); - m_freem(m); - error = ENOBUFS; - goto out; - } - /* - * According to RFC961 (Assigned Protocols), - * the urgent pointer points to the last octet - * of urgent data. 
We continue, however, - * to consider it to indicate the first octet - * of data past the urgent section. - * Otherwise, snd_up should be one lower. - */ - sbappendstream_locked(&so->so_snd, m); - SOCKBUF_UNLOCK(&so->so_snd); - if (nam && tp->t_state < TCPS_SYN_SENT) { - /* - * Do implied connect if not yet connected, - * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. - */ - INP_INFO_WLOCK_ASSERT(&tcbinfo); -#ifdef INET6 - if (isipv6) - error = tcp6_connect(tp, nam, td); - else -#endif /* INET6 */ - error = tcp_connect(tp, nam, td); - if (error) - goto out; - tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } else if (nam) { - INP_INFO_WUNLOCK(&tcbinfo); - headlocked = 0; - } - tp->snd_up = tp->snd_una + so->so_snd.sb_cc; - tp->t_flags |= TF_FORCEDATA; - error = tcp_gen_send(tp); - tp->t_flags &= ~TF_FORCEDATA; - } -out: - TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : - ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); - INP_UNLOCK(inp); - if (headlocked) - INP_INFO_WUNLOCK(&tcbinfo); - return (error); -} - -/* - * Abort the TCP. Drop the connection abruptly. - */ -static void -tcp_usr_abort(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_abort: inp_socket == NULL")); - - /* - * If we still have full TCP state, and we're not dropped, drop. - */ - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED)) { - tp = intotcpcb(inp); - TCPDEBUG1(); - cxgb_tcp_drop(tp, ECONNABORTED); - TCPDEBUG2(PRU_ABORT); - } - if (!(inp->inp_vflag & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); - inp->inp_vflag |= INP_SOCKREF; - } - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * TCP socket is closed. 
Start friendly disconnect. - */ -static void -tcp_usr_close(struct socket *so) -{ - struct inpcb *inp; - struct tcpcb *tp = NULL; - TCPDEBUG0; - - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - - INP_INFO_WLOCK(&tcbinfo); - INP_LOCK(inp); - KASSERT(inp->inp_socket != NULL, - ("tcp_usr_close: inp_socket == NULL")); - - /* - * If we still have full TCP state, and we're not dropped, initiate - * a disconnect. - */ - if (!(inp->inp_vflag & INP_TIMEWAIT) && - !(inp->inp_vflag & INP_DROPPED)) { - tp = intotcpcb(inp); - TCPDEBUG1(); - tcp_disconnect(tp); - TCPDEBUG2(PRU_CLOSE); - } - if (!(inp->inp_vflag & INP_DROPPED)) { - SOCK_LOCK(so); - so->so_state |= SS_PROTOREF; - SOCK_UNLOCK(so); - inp->inp_vflag |= INP_SOCKREF; - } - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); -} - -/* - * Receive out-of-band data. - */ -static int -tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) -{ - int error = 0; - struct inpcb *inp; - struct tcpcb *tp = NULL; - - TCPDEBUG0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); - INP_LOCK(inp); - if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { - error = ECONNRESET; - goto out; - } - tp = intotcpcb(inp); - TCPDEBUG1(); - if ((so->so_oobmark == 0 && - (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || - so->so_options & SO_OOBINLINE || - tp->t_oobflags & TCPOOB_HADDATA) { - error = EINVAL; - goto out; - } - if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { - error = EWOULDBLOCK; - goto out; - } - m->m_len = 1; - *mtod(m, caddr_t) = tp->t_iobc; - if ((flags & MSG_PEEK) == 0) - tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); - -out: - TCPDEBUG2(PRU_RCVOOB); - INP_UNLOCK(inp); - return (error); -} - -struct pr_usrreqs cxgb_tcp_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp_usr_bind, - .pru_connect = tcp_usr_connect, - .pru_control = in_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = 
tcp_usr_disconnect, - .pru_listen = tcp_usr_listen, - .pru_peeraddr = in_getpeeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in_getsockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, -}; - -#ifdef INET6 -struct pr_usrreqs cxgb_tcp6_usrreqs = { - .pru_abort = tcp_usr_abort, - .pru_accept = tcp6_usr_accept, - .pru_attach = tcp_usr_attach, - .pru_bind = tcp6_usr_bind, - .pru_connect = tcp6_usr_connect, - .pru_control = in6_control, - .pru_detach = tcp_usr_detach, - .pru_disconnect = tcp_usr_disconnect, - .pru_listen = tcp6_usr_listen, - .pru_peeraddr = in6_mapped_peeraddr, - .pru_rcvd = tcp_usr_rcvd, - .pru_rcvoob = tcp_usr_rcvoob, - .pru_send = tcp_usr_send, - .pru_shutdown = tcp_usr_shutdown, - .pru_sockaddr = in6_mapped_sockaddr, - .pru_sosetlabel = in_pcbsosetlabel, - .pru_close = tcp_usr_close, -}; -#endif /* INET6 */ - -/* - * Common subroutine to open a TCP connection to remote host specified - * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local - * port number if needed. Call in_pcbconnect_setup to do the routing and - * to choose a local host address (interface). If there is an existing - * incarnation of the same connection in TIME-WAIT state and if the remote - * host was sending CC options and if the connection duration was < MSL, then - * truncate the previous TIME-WAIT state and proceed. - * Initialize connection parameters and enter SYN-SENT state. 
- */ -static int -tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) -{ - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct in_addr laddr; - u_short lport; - int error; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (inp->inp_lport == 0) { - error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - return error; - } - - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - */ - laddr = inp->inp_laddr; - lport = inp->inp_lport; - error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, - &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); - if (error && oinp == NULL) - return error; - if (oinp) - return EADDRINUSE; - inp->inp_laddr = laddr; - in_pcbrehash(inp); - - /* - * Compute window scaling to request: - * Scale to fit into sweet spot. See tcp_syncache.c. - * XXX: This should move to tcp_output(). 
- */ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < sb_max) - tp->request_r_scale++; - - soisconnecting(so); - tcpstat.tcps_connattempt++; - tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); - tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; - tcp_sendseqinit(tp); - - return 0; -} - -#ifdef INET6 -static int -tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) -{ - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; - struct in6_addr *addr6; - int error; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - if (inp->inp_lport == 0) { - error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - if (error) - return error; - } - - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - * in6_pcbladdr() also handles scope zone IDs. - */ - error = in6_pcbladdr(inp, nam, &addr6); - if (error) - return error; - oinp = in6_pcblookup_hash(inp->inp_pcbinfo, - &sin6->sin6_addr, sin6->sin6_port, - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? addr6 - : &inp->in6p_laddr, - inp->inp_lport, 0, NULL); - if (oinp) - return EADDRINUSE; - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - inp->in6p_laddr = *addr6; - inp->in6p_faddr = sin6->sin6_addr; - inp->inp_fport = sin6->sin6_port; - /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ - inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; - if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) - inp->in6p_flowinfo |= - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - in_pcbrehash(inp); - - /* Compute window scaling to request. 
*/ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) - tp->request_r_scale++; - - soisconnecting(so); - tcpstat.tcps_connattempt++; - tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); - tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; - tcp_sendseqinit(tp); - - return 0; -} -#endif /* INET6 */ - -/* - * tcp_sendspace and tcp_recvspace are the default send and receive window - * sizes, respectively. These are obsolescent (this information should - * be set by the route). - */ -u_long tcp_sendspace = 1024*32; -SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, - &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); -u_long tcp_recvspace = 1024*64; -SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, - &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); - -/* - * Attach TCP protocol to socket, allocating - * internet protocol control block, tcp control block, - * bufer space, and entering LISTEN state if to accept connections. 
- */ -static int -tcp_attach(struct socket *so) -{ - struct tcpcb *tp; - struct inpcb *inp; - int error; -#ifdef INET6 - int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; -#endif - - if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); - if (error) - return (error); - } - so->so_rcv.sb_flags |= SB_AUTOSIZE; - so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&tcbinfo); - error = in_pcballoc(so, &tcbinfo); - if (error) { - INP_INFO_WUNLOCK(&tcbinfo); - return (error); - } - inp = sotoinpcb(so); -#ifdef INET6 - if (isipv6) { - inp->inp_vflag |= INP_IPV6; - inp->in6p_hops = -1; /* use kernel default */ - } - else -#endif - inp->inp_vflag |= INP_IPV4; - tp = tcp_newtcpcb(inp); - if (tp == NULL) { -#ifdef INET6 - if (isipv6) { - in6_pcbdetach(inp); - in6_pcbfree(inp); - } else { -#endif - in_pcbdetach(inp); - in_pcbfree(inp); -#ifdef INET6 - } -#endif - INP_INFO_WUNLOCK(&tcbinfo); - return (ENOBUFS); - } - tp->t_state = TCPS_CLOSED; - INP_UNLOCK(inp); - INP_INFO_WUNLOCK(&tcbinfo); - return (0); -} - -/* - * Initiate (or continue) disconnect. - * If embryonic state, just send reset (once). - * If in ``let data drain'' option and linger null, just drop. - * Otherwise (hard), mark socket disconnecting and drop - * current input data; switch states based on user close, and - * send segment to peer (with FIN). - */ -static void -tcp_disconnect(struct tcpcb *tp) -{ - struct inpcb *inp = tp->t_inpcb; - struct socket *so = inp->inp_socket; - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(inp); - - /* - * Neither tcp_close() nor tcp_drop() should return NULL, as the - * socket is still open. 
- */ - if (tp->t_state < TCPS_ESTABLISHED) { - tp = cxgb_tcp_close(tp); - KASSERT(tp != NULL, - ("tcp_disconnect: tcp_close() returned NULL")); - } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { - tp = cxgb_tcp_drop(tp, 0); - KASSERT(tp != NULL, - ("tcp_disconnect: tcp_drop() returned NULL")); - } else { - soisdisconnecting(so); - sbflush(&so->so_rcv); - tcp_usrclosed(tp); - if (!(inp->inp_vflag & INP_DROPPED)) - tcp_gen_disconnect(tp); - } -} - -/* - * User issued close, and wish to trail through shutdown states: - * if never received SYN, just forget it. If got a SYN from peer, - * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. - * If already got a FIN from peer, then almost done; go to LAST_ACK - * state. In all other cases, have already sent FIN to peer (e.g. - * after PRU_SHUTDOWN), and just have to play tedious game waiting - * for peer to send FIN or not respond to keep-alives, etc. - * We can let the user exit from the close as soon as the FIN is acked. - */ -static void -tcp_usrclosed(struct tcpcb *tp) -{ - - INP_INFO_WLOCK_ASSERT(&tcbinfo); - INP_LOCK_ASSERT(tp->t_inpcb); - - switch (tp->t_state) { - case TCPS_LISTEN: - tcp_gen_listen_close(tp); - case TCPS_CLOSED: - tp->t_state = TCPS_CLOSED; - tp = cxgb_tcp_close(tp); - /* - * tcp_close() should never return NULL here as the socket is - * still open. - */ - KASSERT(tp != NULL, - ("tcp_usrclosed: tcp_close() returned NULL")); - break; - - case TCPS_SYN_SENT: - case TCPS_SYN_RECEIVED: - tp->t_flags |= TF_NEEDFIN; - break; - - case TCPS_ESTABLISHED: - tp->t_state = TCPS_FIN_WAIT_1; - break; - - case TCPS_CLOSE_WAIT: - tp->t_state = TCPS_LAST_ACK; - break; - } - if (tp->t_state >= TCPS_FIN_WAIT_2) { - soisdisconnected(tp->t_inpcb->inp_socket); - /* Prevent the connection hanging in FIN_WAIT_2 forever. */ - if (tp->t_state == TCPS_FIN_WAIT_2) { - int timeout; - - timeout = (tcp_fast_finwait2_recycle) ? 
- tcp_finwait2_timeout : tcp_maxidle; - tcp_timer_activate(tp, TT_2MSL, timeout); - } - } -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h index a078bee..8a9c498 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -30,45 +30,49 @@ #ifndef CXGB_TOEPCB_H_ #define CXGB_TOEPCB_H_ #include <sys/bus.h> +#include <sys/condvar.h> #include <dev/cxgb/sys/mbufq.h> struct toepcb { - struct toedev *tp_toedev; - struct l2t_entry *tp_l2t; - pr_ctloutput_t *tp_ctloutput; - unsigned int tp_tid; - int tp_wr_max; - int tp_wr_avail; - int tp_wr_unacked; - int tp_delack_mode; - int tp_mtu_idx; - int tp_ulp_mode; - int tp_qset_idx; - int tp_mss_clamp; - int tp_qset; - int tp_flags; - int tp_enqueued_bytes; - int tp_page_count; - int tp_state; - - tcp_seq tp_iss; - tcp_seq tp_delack_seq; - tcp_seq tp_rcv_wup; - tcp_seq tp_copied_seq; - uint64_t tp_write_seq; - - volatile int tp_refcount; - vm_page_t *tp_pages; + struct toedev *tp_toedev; + struct l2t_entry *tp_l2t; + pr_ctloutput_t *tp_ctloutput; + unsigned int tp_tid; + int tp_wr_max; + int tp_wr_avail; + int tp_wr_unacked; + int tp_delack_mode; + int tp_mtu_idx; + int tp_ulp_mode; + int tp_qset_idx; + int tp_mss_clamp; + int tp_qset; + int tp_flags; + int tp_enqueued_bytes; + int tp_page_count; + int tp_state; + + tcp_seq tp_iss; + tcp_seq tp_delack_seq; + tcp_seq tp_rcv_wup; + tcp_seq tp_copied_seq; + uint64_t tp_write_seq; + + volatile int tp_refcount; + vm_page_t *tp_pages; - struct tcpcb *tp_tp; - struct mbuf *tp_m_last; - bus_dma_tag_t tp_tx_dmat; - bus_dmamap_t tp_dmamap; - - LIST_ENTRY(toepcb) synq_entry; - struct mbuf_head wr_list; - struct mbuf_head out_of_order_queue; - struct ddp_state tp_ddp_state; + struct tcpcb *tp_tp; + struct mbuf *tp_m_last; + bus_dma_tag_t tp_tx_dmat; + bus_dma_tag_t tp_rx_dmat; + bus_dmamap_t tp_dmamap; + + LIST_ENTRY(toepcb) synq_entry; + struct mbuf_head wr_list; + struct mbuf_head out_of_order_queue; + struct ddp_state 
tp_ddp_state; + struct cv tp_cv; + }; static inline void @@ -95,7 +99,7 @@ enqueue_wr(struct toepcb *toep, struct mbuf *m) } static inline struct mbuf * -peek_wr(struct toepcb *toep) +peek_wr(const struct toepcb *toep) { return (mbufq_peek(&toep->wr_list)); @@ -108,5 +112,10 @@ dequeue_wr(struct toepcb *toep) return (mbufq_dequeue(&toep->wr_list)); } +#define wr_queue_walk(toep, m) \ + for (m = peek_wr(toep); m; m = m->m_nextpkt) + + + #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c index b5b87b7..4015cd3 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -34,11 +34,13 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/kernel.h> #include <sys/fcntl.h> +#include <sys/ktr.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/eventhandler.h> #include <sys/mbuf.h> #include <sys/module.h> +#include <sys/condvar.h> #include <sys/mutex.h> #include <sys/socket.h> #include <sys/sysctl.h> @@ -90,16 +92,20 @@ static TAILQ_HEAD(, tom_data) cxgb_list; static struct mtx cxgb_list_lock; static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); +static void cxgb_register_listeners(void); + /* * Handlers for each CPL opcode */ -static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS]; +static cxgb_cpl_handler_func tom_cpl_handlers[256]; + static eventhandler_tag listen_tag; static struct offload_id t3_toe_id_tab[] = { { TOE_ID_CHELSIO_T3, 0 }, { TOE_ID_CHELSIO_T3B, 0 }, + { TOE_ID_CHELSIO_T3C, 0 }, { 0 } }; @@ -138,7 +144,7 @@ toepcb_alloc(void) { struct toepcb *toep; - toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT); + toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT|M_ZERO); if (toep == NULL) return (NULL); @@ -150,8 +156,8 @@ toepcb_alloc(void) void toepcb_init(struct toepcb *toep) { - bzero(toep, sizeof(*toep)); toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); } void @@ -164,12 +170,9 @@ void toepcb_release(struct toepcb *toep) { if 
(toep->tp_refcount == 1) { - printf("doing final toepcb free\n"); - free(toep, M_DEVBUF); return; } - atomic_add_acq_int(&toep->tp_refcount, -1); } @@ -179,13 +182,30 @@ toepcb_release(struct toepcb *toep) static void t3cdev_add(struct tom_data *t) { - printf("t3cdev_add\n"); - mtx_lock(&cxgb_list_lock); TAILQ_INSERT_TAIL(&cxgb_list, t, entry); mtx_unlock(&cxgb_list_lock); } +static inline int +cdev2type(struct t3cdev *cdev) +{ + int type = 0; + + switch (cdev->type) { + case T3A: + type = TOE_ID_CHELSIO_T3; + break; + case T3B: + type = TOE_ID_CHELSIO_T3B; + break; + case T3C: + type = TOE_ID_CHELSIO_T3C; + break; + } + return (type); +} + /* * Allocate a TOM data structure, * initialize its cpl_handlers @@ -200,11 +220,7 @@ t3c_tom_add(struct t3cdev *cdev) struct toedev *tdev; struct adap_ports *port_info; - printf("%s called\n", __FUNCTION__); - - t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); - if (t == NULL) return; @@ -224,8 +240,7 @@ t3c_tom_add(struct t3cdev *cdev) /* Register TCP offload device */ tdev = &t->tdev; - tdev->tod_ttid = (cdev->type == T3A ? 
- TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B); + tdev->tod_ttid = cdev2type(cdev); tdev->tod_lldev = cdev->lldev; if (register_toedev(tdev, "toe%d")) { @@ -234,13 +249,11 @@ t3c_tom_add(struct t3cdev *cdev) } TOM_DATA(tdev) = t; - printf("nports=%d\n", port_info->nports); for (i = 0; i < port_info->nports; i++) { struct ifnet *ifp = port_info->lldevs[i]; TOEDEV(ifp) = tdev; - printf("enabling toe on %p\n", ifp); - + CTR1(KTR_TOM, "enabling toe on %p", ifp); ifp->if_capabilities |= IFCAP_TOE4; ifp->if_capenable |= IFCAP_TOE4; } @@ -251,6 +264,7 @@ t3c_tom_add(struct t3cdev *cdev) /* Activate TCP offload device */ activate_offload(tdev); + cxgb_register_listeners(); return; out_free_all: @@ -269,8 +283,8 @@ static int do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) { log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, - *mtod(m, unsigned int *)); - + 0xFF & *mtod(m, unsigned int *)); + kdb_backtrace(); return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); } @@ -282,7 +296,7 @@ do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) { - if (opcode < NUM_CPL_CMDS) + if (opcode < 256) tom_cpl_handlers[opcode] = h ? 
h : do_bad_cpl; else log(LOG_ERR, "Chelsio T3 TOM: handler registration for " @@ -327,7 +341,7 @@ init_cpl_handlers(void) { int i; - for (i = 0; i < NUM_CPL_CMDS; ++i) + for (i = 0; i < 256; ++i) tom_cpl_handlers[i] = do_bad_cpl; t3_init_listen_cpl_handlers(); @@ -349,7 +363,7 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry) #endif t3_init_tunables(t); mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); - + CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry); /* Adjust TOE activation for this module */ t->conf.activated = activated; @@ -374,19 +388,14 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry) t->ddp_ulimit = ddp.ulimit; t->pdev = ddp.pdev; t->rx_page_size = rx_page_info.page_size; -#ifdef notyet /* OK if this fails, we just can't do DDP */ t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; - t->ppod_map = t3_alloc_mem(t->nppods); -#endif + t->ppod_map = malloc(t->nppods, M_DEVBUF, M_WAITOK|M_ZERO); -#if 0 - spin_lock_init(&t->ppod_map_lock); - tom_proc_init(dev); -#ifdef CONFIG_SYSCTL - t->sysctl = t3_sysctl_register(dev, &t->conf); -#endif -#endif + mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF); + + + t3_sysctl_register(cdev->adapter, &t->conf); return (0); } @@ -411,11 +420,8 @@ cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) mtx_lock(&cxgb_list_lock); TAILQ_FOREACH(p, &cxgb_list, entry) { - if (tp->t_state == TCPS_LISTEN) { - printf("stopping listen on port=%d\n", - ntohs(tp->t_inpcb->inp_lport)); + if (tp->t_state == TCPS_LISTEN) t3_listen_stop(&p->tdev, so, p->cdev); - } } mtx_unlock(&cxgb_list_lock); } @@ -439,23 +445,12 @@ cxgb_register_listeners(void) static int t3_tom_init(void) { - -#if 0 - struct socket *sock; - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) { - printk(KERN_ERR "Could not create TCP socket, error %d\n", err); - return err; - } - - t3_def_state_change = sock->sk->sk_state_change; - t3_def_data_ready = 
sock->sk->sk_data_ready; - t3_def_error_report = sock->sk->sk_error_report; - sock_release(sock); -#endif init_cpl_handlers(); - if (t3_init_cpl_io() < 0) + if (t3_init_cpl_io() < 0) { + log(LOG_ERR, + "Unable to initialize cpl io ops\n"); return -1; + } t3_init_socket_ops(); /* Register with the TOE device layer. */ @@ -466,7 +461,6 @@ t3_tom_init(void) return -1; } INP_INFO_WLOCK(&tcbinfo); - INP_INFO_WUNLOCK(&tcbinfo); mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); @@ -477,10 +471,8 @@ t3_tom_init(void) TAILQ_INIT(&cxgb_list); /* Register to offloading devices */ - printf("setting add to %p\n", t3c_tom_add); t3c_tom_client.add = t3c_tom_add; cxgb_register_client(&t3c_tom_client); - cxgb_register_listeners(); return (0); } @@ -491,8 +483,6 @@ t3_tom_load(module_t mod, int cmd, void *arg) switch (cmd) { case MOD_LOAD: - printf("wheeeeee ...\n"); - t3_tom_init(); break; case MOD_QUIESCE: diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h index 8d60bbd..bcda2c3 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -138,6 +138,8 @@ struct listen_ctx { void t3_init_tunables(struct tom_data *t); +void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p); + static __inline struct mbuf * m_gethdr_nofail(int len) { diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c index 7219922..b4ff748 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c @@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$"); #include <dev/cxgb/common/cxgb_ctl_defs.h> #include <dev/cxgb/common/cxgb_t3_cpl.h> #include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/cxgb_include.h> #include <dev/cxgb/cxgb_l2t.h> #include <dev/cxgb/ulp/toecore/cxgb_toedev.h> #include <dev/cxgb/ulp/tom/cxgb_tom.h> @@ -82,7 +83,7 @@ static struct tom_tunables default_tunable_vals = { .delack = 1, .max_conn = -1, .soft_backlog_limit = 0, - .ddp = 0, + .ddp = 1, .ddp_thres 
= 14 * 4096, .ddp_copy_limit = 13 * 4096, .ddp_push_wait = 1, @@ -96,7 +97,8 @@ static struct tom_tunables default_tunable_vals = { .activated = 1, }; -void t3_init_tunables(struct tom_data *t) +void +t3_init_tunables(struct tom_data *t) { t->conf = default_tunable_vals; @@ -104,3 +106,15 @@ void t3_init_tunables(struct tom_data *t) t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; } + +void +t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list *children; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c new file mode 100644 index 0000000..7036005 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c @@ -0,0 +1,180 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define TRACE_ENTER printf("%s:%s entered", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited", __FUNCTION__, __FILE__, __LINE__) + +/* + * This routine takes a user address range and does the following: + * - validate that the user has access to those pages (flags indicates read or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + * - return number of pages in count + */ +int +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) +{ + + vm_offset_t end, va; + vm_paddr_t pa; + int faults, rv; + + struct thread *td; + vm_map_t map; + pmap_t pmap; + vm_page_t m, *pages; + vm_prot_t prot; + + + /* + * Check that virtual 
address range is legal + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) { + printf("bad address passed\n"); + return (EFAULT); + } + + td = curthread; + map = &td->td_proc->p_vmspace->vm_map; + pmap = &td->td_proc->p_vmspace->vm_pmap; + pages = mp; + + prot = VM_PROT_READ; + prot |= (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : 0; + bzero(pages, sizeof(vm_page_t *) * count); +retry: + + /* + * First optimistically assume that all pages are resident (and R/W if for write) + * if so just mark pages as held (and dirty if for write) and return + */ + vm_page_lock_queues(); + for (pages = mp, faults = 0, va = addr; va < end; va += PAGE_SIZE, pages++) { + /* + * Assure that we only hold the page once + */ + if (*pages == NULL) { + /* + * page queue mutex is recursable so this is OK + * it would be really nice if we had an unlocked version of this so + * we were only acquiring the pmap lock 1 time as opposed to potentially + * many dozens of times + */ + m = pmap_extract_and_hold(pmap, va, prot); + if (m == NULL) { + faults++; + continue; + } + + *pages = m; + if (flags & VM_HOLD_WRITEABLE) + vm_page_dirty(m); + } + } + vm_page_unlock_queues(); + + if (faults == 0) { + return (0); + } + + /* + * Pages either have insufficient permissions or are not present + * trigger a fault where neccessary + * + */ + for (va = addr; va < end; va += PAGE_SIZE) { + m = NULL; + pa = pmap_extract(pmap, va); + rv = 0; + if (pa) + m = PHYS_TO_VM_PAGE(pa); + if (flags & VM_HOLD_WRITEABLE) { + if (m == NULL || (m->flags & PG_WRITEABLE) == 0) + rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); + } else if (m == NULL) + rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); + if (rv) { + printf("vm_fault bad return rv=%d va=0x%zx\n", rv, va); + + goto error; + } + } + + goto retry; + +error: + 
vm_page_lock_queues(); + for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++) + if (*pages) + vm_page_unhold(*pages); + vm_page_unlock_queues(); + return (EFAULT); +} + +void +vm_fault_unhold_pages(vm_page_t *mp, int count) +{ + + KASSERT(count >= 0, ("negative count %d", count)); + vm_page_lock_queues(); + while (count--) { + vm_page_unhold(*mp); + mp++; + } + vm_page_unlock_queues(); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h new file mode 100644 index 0000000..29418b6 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.h @@ -0,0 +1,40 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_VM_H_ +#define CXGB_VM_H_ + +#define VM_HOLD_WRITEABLE 0x1 + +int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); +void vm_fault_unhold_pages(vm_page_t *mp, int count); + +#endif |