path: root/sys/dev/cxgb/ulp
author	kmacy <kmacy@FreeBSD.org>	2008-02-23 01:06:17 +0000
committer	kmacy <kmacy@FreeBSD.org>	2008-02-23 01:06:17 +0000
commit	48fe676ff5ddc104ebc346eebf48c7c0e285f833 (patch)
tree	02a3e854ca5eb4caea80ce68a9a12f620befb52d /sys/dev/cxgb/ulp
parent	df26e399aa077b14fb965be866012bccf2847bae (diff)
- update firmware to 5.0
- add support for T3C
- add DDP support (zero-copy receive)
- fix TOE transmit of large requests
- fix shutdown so that sockets don't remain in CLOSING state indefinitely
- register listeners when an interface is brought up after tom is loaded
- fix setting of multicast filter
- enable link at device attach
- exit tick handler if shutdown is in progress
- add helper for logging TCB
- add sysctls for dumping transmit queues
- note that TOE will not be MFC'd until after 7.0 has been finalized

MFC after: 3 days
Diffstat (limited to 'sys/dev/cxgb/ulp')
-rw-r--r--	sys/dev/cxgb/ulp/toecore/cxgb_toedev.h	4
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c	1569
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c	729
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_ddp.c	735
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_defs.h	10
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_listen.c	22
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h	52
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c	694
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c	1362
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_toepcb.h	81
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_tom.c	102
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_tom.h	2
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c	18
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_vm.c	180
-rw-r--r--	sys/dev/cxgb/ulp/tom/cxgb_vm.h	40
15 files changed, 2898 insertions, 2702 deletions
diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h
index 8e88d6b..c70c37d 100644
--- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h
+++ b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h
@@ -41,6 +41,8 @@ enum {
TOE_ID_CHELSIO_T2,
TOE_ID_CHELSIO_T3,
TOE_ID_CHELSIO_T3B,
-};
+ TOE_ID_CHELSIO_T3C,
+};
#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
index 0f2f2ee..96e5b65 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
+#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
@@ -63,9 +64,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_timer.h>
#include <net/route.h>
-
#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
@@ -84,8 +85,6 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
-
-
/*
* For ULP connections HW may add headers, e.g., for digests, that aren't part
* of the messages sent by the host but that are part of the TCP payload and
@@ -118,7 +117,7 @@ static unsigned int wrlen __read_mostly;
* in the skb and whether it has any payload in its main body. This maps the
* length of the gather list represented by an skb into the # of necessary WRs.
*/
-static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;
+static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
/*
* Max receive window supported by HW in bytes. Only a small part of it can
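The mbuf_wrs change in this hunk grows the table by one entry: the transmit loop below indexes mbuf_wrs[count + 1], which with the old bound could read one element past the end of a TX_MAX_SEGS-sized array. A plausible initialization of the table, shown only as a sketch (the per-WR scatter/gather capacity sge_per_wr is an assumption, not taken from this commit):

    /*
     * Sketch: map an i-entry gather list to the number of work requests
     * it consumes, assuming each WR carries at most sge_per_wr entries.
     */
    static void
    init_mbuf_wrs(unsigned int sge_per_wr)
    {
            unsigned int i;

            mbuf_wrs[0] = 1;        /* an empty WR still costs a credit */
            for (i = 1; i <= TX_MAX_SEGS; i++)
                    mbuf_wrs[i] = howmany(i, sge_per_wr);
    }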
@@ -147,6 +146,37 @@ static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);
+static inline void
+SBAPPEND(struct sockbuf *sb, struct mbuf *n)
+{
+ struct mbuf * m;
+
+ m = sb->sb_mb;
+ while (m) {
+ KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
+ !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
+ !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
+ KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
+ m->m_next, m->m_nextpkt, m->m_flags));
+ m = m->m_next;
+ }
+ m = n;
+ while (m) {
+ KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
+ !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
+ !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
+ KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
+ m->m_next, m->m_nextpkt, m->m_flags));
+ m = m->m_next;
+ }
+ sbappend_locked(sb, n);
+ m = sb->sb_mb;
+ while (m) {
+ KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
+ m->m_next, m->m_nextpkt, m->m_flags));
+ m = m->m_next;
+ }
+}
static inline int
is_t3a(const struct toedev *dev)
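SBAPPEND above is a debugging wrapper for sbappend_locked(): it walks both the socket buffer's existing chain and the chain being appended, asserting that each mbuf is either plain or an EXT_EXTREF external buffer (the DDP-placed case), and that no m_next link holds the 0xffffffff trap value used to catch dangling references. The repeated walk could be factored into a helper; a minimal sketch of the loop it inlines three times:

    /* Sketch: the chain-validation loop SBAPPEND repeats inline. */
    static inline void
    sanity_check_chain(struct mbuf *m)
    {
            for (; m != NULL; m = m->m_next) {
                    KASSERT(!(m->m_flags & M_EXT) ||
                        m->m_ext.ext_type == EXT_EXTREF,
                        ("unexpected ext_type=%d", m->m_ext.ext_type));
                    KASSERT(m->m_next != (struct mbuf *)0xffffffff,
                        ("trap value in m_next=%p", m->m_next));
            }
    }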
@@ -166,6 +196,7 @@ dump_toepcb(struct toepcb *toep)
toep->tp_mss_clamp, toep->tp_flags);
}
+#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
@@ -176,7 +207,7 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
return (rt);
}
-
+#endif
/*
* Determine whether to send a CPL message now or defer it. A message is
* deferred if the connection is in SYN_SENT since we don't know the TID yet.
@@ -185,39 +216,39 @@ rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
* it is sent directly.
*/
static inline void
-send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
+send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
- struct toepcb *toep = tp->t_toe;
+ struct tcpcb *tp = toep->tp_tp;
-
if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
INP_LOCK(tp->t_inpcb);
mbufq_tail(&toep->out_of_order_queue, m); // defer
INP_UNLOCK(tp->t_inpcb);
} else if (through_l2t)
- l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T
+ l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
else
- cxgb_ofld_send(T3C_DEV(so), m); // send directly
+ cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
}
static inline unsigned int
-mkprio(unsigned int cntrl, const struct socket *so)
+mkprio(unsigned int cntrl, const struct toepcb *toep)
{
- return cntrl;
+ return (cntrl);
}
/*
* Populate a TID_RELEASE WR. The skb must be already properly sized.
*/
static inline void
-mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
+mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
struct cpl_tid_release *req;
- m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
+ m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
m->m_pkthdr.len = m->m_len = sizeof(*req);
req = mtod(m, struct cpl_tid_release *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
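send_or_defer() parks CPL messages on out_of_order_queue while the connection is still in SYN_SENT, because the hardware TID needed to address the connection is not known until the active open completes; fixup_and_send_ofo(), called from the establish path later in this file, drains that queue. A minimal sketch of such a drain loop (the mbufq_dequeue helper name is an assumption here):

    /* Sketch: flush CPLs that were deferred before the TID was known. */
    static void
    drain_deferred_cpls(struct toepcb *toep)
    {
            struct mbuf *m;

            while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL)
                    cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
    }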
@@ -257,6 +288,8 @@ make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
}
}
+#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
+
int
t3_push_frames(struct socket *so, int req_completion)
{
@@ -266,9 +299,8 @@ t3_push_frames(struct socket *so, int req_completion)
struct mbuf *tail, *m0, *last;
struct t3cdev *cdev;
struct tom_data *d;
- int bytes, count, total_bytes;
+ int i, bytes, count, total_bytes;
bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
- segp = segs;
if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
DPRINTF("tcp state=%d\n", tp->t_state);
@@ -281,10 +313,9 @@ t3_push_frames(struct socket *so, int req_completion)
return (0);
}
- INP_LOCK_ASSERT(tp->t_inpcb);
+ INP_LOCK_ASSERT(tp->t_inpcb);
SOCKBUF_LOCK(&so->so_snd);
-
d = TOM_DATA(TOE_DEV(so));
cdev = d->cdev;
last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
@@ -306,61 +337,103 @@ t3_push_frames(struct socket *so, int req_completion)
toep->tp_m_last = NULL;
while (toep->tp_wr_avail && (tail != NULL)) {
count = bytes = 0;
+ segp = segs;
if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
return (0);
}
- while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
- && (tail != NULL) && (count < TX_MAX_SEGS)) {
- bytes += tail->m_len;
- count++;
+ /*
+ * If the data in tail fits in-line, then
+ * make an immediate data wr.
+ */
+ if (tail->m_len <= IMM_LEN) {
+ count = 1;
+ bytes = tail->m_len;
last = tail;
- /*
- * technically an abuse to be using this for a VA
- * but less gross than defining my own structure
- * or calling pmap_kextract from here :-|
- */
- segp->ds_addr = (bus_addr_t)tail->m_data;
- segp->ds_len = tail->m_len;
- DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
- count, mbuf_wrs[count], tail->m_data, tail->m_len);
-
- segp++;
tail = tail->m_next;
+ m_set_sgl(m0, NULL);
+ m_set_sgllen(m0, 0);
+ make_tx_data_wr(so, m0, bytes, tail);
+ m_append(m0, bytes, mtod(last, caddr_t));
+ KASSERT(!m0->m_next, ("bad append"));
+ } else {
+ while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
+ && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
+ bytes += tail->m_len;
+ last = tail;
+ count++;
+ /*
+ * technically an abuse to be using this for a VA
+ * but less gross than defining my own structure
+ * or calling pmap_kextract from here :-|
+ */
+ segp->ds_addr = (bus_addr_t)tail->m_data;
+ segp->ds_len = tail->m_len;
+ DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
+ count, mbuf_wrs[count], tail->m_data, tail->m_len);
+ segp++;
+ tail = tail->m_next;
+ }
+ DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
+ toep->tp_wr_avail, count, mbuf_wrs[count], tail);
+
+ m_set_sgl(m0, segs);
+ m_set_sgllen(m0, count);
+ make_tx_data_wr(so, m0, bytes, tail);
}
- DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
- toep->tp_wr_avail, count, mbuf_wrs[count], tail);
+ m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
+
if (tail) {
so->so_snd.sb_sndptr = tail;
toep->tp_m_last = NULL;
} else
toep->tp_m_last = so->so_snd.sb_sndptr = last;
+
DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
so->so_snd.sb_sndptroff += bytes;
total_bytes += bytes;
toep->tp_write_seq += bytes;
-
-
- SOCKBUF_UNLOCK(&so->so_snd);
-
- /*
- * XXX can drop socket buffer lock here
- */
+ CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
+ toep->tp_wr_avail, count, mbuf_wrs[count], tail, so->so_snd.sb_sndptr, so->so_snd.sb_sndptroff);
+ if (tail)
+ CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
+ total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
+ else
+ CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
+ total_bytes, toep->tp_m_last, tp->snd_una);
+
+
+ i = 0;
+ while (i < count && m_get_sgllen(m0)) {
+ if ((count - i) >= 3) {
+ CTR6(KTR_TOM,
+ "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
+ segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
+ segs[i + 2].ds_addr, segs[i + 2].ds_len);
+ i += 3;
+ } else if ((count - i) == 2) {
+ CTR4(KTR_TOM,
+ "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
+ segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
+ i += 2;
+ } else {
+ CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
+ segs[i].ds_addr, segs[i].ds_len);
+ i++;
+ }
- toep->tp_wr_avail -= mbuf_wrs[count];
- toep->tp_wr_unacked += mbuf_wrs[count];
+ }
- make_tx_data_wr(so, m0, bytes, tail);
- m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
- m_set_sgl(m0, segs);
- m_set_sgllen(m0, count);
- /*
+ /*
* remember credits used
*/
m0->m_pkthdr.csum_data = mbuf_wrs[count];
m0->m_pkthdr.len = bytes;
+ toep->tp_wr_avail -= mbuf_wrs[count];
+ toep->tp_wr_unacked += mbuf_wrs[count];
+
if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
struct work_request_hdr *wr = cplhdr(m0);
@@ -368,18 +441,16 @@ t3_push_frames(struct socket *so, int req_completion)
wr->wr_hi |= htonl(F_WR_COMPL);
toep->tp_wr_unacked = 0;
}
-
+ KASSERT((m0->m_pkthdr.csum_data > 0) &&
+ (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
+ m0->m_pkthdr.csum_data));
m0->m_type = MT_DONTFREE;
enqueue_wr(toep, m0);
DPRINTF("sending offload tx with %d bytes in %d segments\n",
bytes, count);
-
l2t_send(cdev, m0, toep->tp_l2t);
- if (toep->tp_wr_avail && (tail != NULL))
- SOCKBUF_LOCK(&so->so_snd);
}
-
- SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
return (total_bytes);
}
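Two pieces of bookkeeping in the loop above are easy to miss: each work request's credit cost is remembered in m_pkthdr.csum_data (the field is otherwise unused on this path), and a completion is requested from the hardware once half the credit window is outstanding, so that wr_ack() can replenish tp_wr_avail. Condensed, the pattern is:

    /* Sketch of the per-WR credit accounting done in t3_push_frames. */
    m0->m_pkthdr.csum_data = mbuf_wrs[count];   /* credits this WR costs */
    toep->tp_wr_avail -= mbuf_wrs[count];
    toep->tp_wr_unacked += mbuf_wrs[count];
    if (toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
            struct work_request_hdr *wr = cplhdr(m0);

            wr->wr_hi |= htonl(F_WR_COMPL);     /* ask HW to acknowledge */
            toep->tp_wr_unacked = 0;
    }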
@@ -467,13 +538,105 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail
req = mtod(m, struct cpl_rx_data_ack *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
- m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
+ m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
return (credits);
}
+/*
+ * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
+ * This is only used in DDP mode, so we take the opportunity to also set the
+ * DACK mode and flush any Rx credits.
+ */
+void
+t3_send_rx_modulate(struct toepcb *toep)
+{
+ struct mbuf *m;
+ struct cpl_rx_data_ack *req;
+
+ m = m_gethdr_nofail(sizeof(*req));
+
+ req = mtod(m, struct cpl_rx_data_ack *);
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
+ req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
+ V_RX_DACK_MODE(1) |
+ V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+ toep->tp_rcv_wup = toep->tp_copied_seq;
+}
+
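The credit count carried by these RX_DATA_ACK messages is simply the number of bytes the host has consumed since the hardware was last told: tp_copied_seq tracks what the application has read, tp_rcv_wup what has already been reported. A worked example with illustrative values:

    /*
     * Example: with tp_copied_seq = 0x5400 and tp_rcv_wup = 0x5000,
     * the host returns 0x400 (1 KB) of receive-window credit, then
     * records that the hardware is up to date.
     */
    credits = toep->tp_copied_seq - toep->tp_rcv_wup;   /* 0x400 */
    toep->tp_rcv_wup = toep->tp_copied_seq;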
+/*
+ * Handle receipt of an urgent pointer.
+ */
+static void
+handle_urg_ptr(struct socket *so, uint32_t urg_seq)
+{
+#ifdef URGENT_DATA_SUPPORTED
+ struct tcpcb *tp = sototcpcb(so);
+
+ urg_seq--; /* initially points past the urgent data, per BSD */
+
+ if (tp->urg_data && !after(urg_seq, tp->urg_seq))
+ return; /* duplicate pointer */
+ sk_send_sigurg(sk);
+ if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+ !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ tp->copied_seq++;
+ if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
+ tom_eat_skb(sk, skb, 0);
+ }
+ tp->urg_data = TCP_URG_NOTYET;
+ tp->urg_seq = urg_seq;
+#endif
+}
+
+/*
+ * Returns true if a socket cannot accept new Rx data.
+ */
+static inline int
+so_no_receive(const struct socket *so)
+{
+ return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
+}
+
+/*
+ * Process an urgent data notification.
+ */
+static void
+rx_urg_notify(struct toepcb *toep, struct mbuf *m)
+{
+ struct cpl_rx_urg_notify *hdr = cplhdr(m);
+ struct socket *so = toeptoso(toep);
+
+ VALIDATE_SOCK(so);
+
+ if (!so_no_receive(so))
+ handle_urg_ptr(so, ntohl(hdr->seq));
+
+ m_freem(m);
+}
+
+/*
+ * Handler for RX_URG_NOTIFY CPL messages.
+ */
+static int
+do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ rx_urg_notify(toep, m);
+ return (0);
+}
/*
* Set of states for which we should return RX credits.
@@ -485,7 +648,7 @@ t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail
* to the HW for the amount of data processed.
*/
void
-t3_cleanup_rbuf(struct tcpcb *tp)
+t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
struct toepcb *toep = tp->t_toe;
struct socket *so;
@@ -493,23 +656,38 @@ t3_cleanup_rbuf(struct tcpcb *tp)
int dack_mode, must_send, read;
u32 thres, credits, dack = 0;
+ so = tp->t_inpcb->inp_socket;
if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
- (tp->t_state == TCPS_FIN_WAIT_2)))
+ (tp->t_state == TCPS_FIN_WAIT_2))) {
+ if (copied) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ toep->tp_copied_seq += copied;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ }
+
return;
- INP_LOCK_ASSERT(tp->t_inpcb);
+ }
- so = tp->t_inpcb->inp_socket;
+ INP_LOCK_ASSERT(tp->t_inpcb);
SOCKBUF_LOCK(&so->so_rcv);
- read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
- toep->tp_copied_seq += read;
- toep->tp_enqueued_bytes -= read;
+ if (copied)
+ toep->tp_copied_seq += copied;
+ else {
+ read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
+ toep->tp_copied_seq += read;
+ }
credits = toep->tp_copied_seq - toep->tp_rcv_wup;
+ toep->tp_enqueued_bytes = so->so_rcv.sb_cc;
SOCKBUF_UNLOCK(&so->so_rcv);
- if (credits > so->so_rcv.sb_mbmax)
+ if (credits > so->so_rcv.sb_mbmax) {
printf("copied_seq=%u rcv_wup=%u credits=%u\n",
toep->tp_copied_seq, toep->tp_rcv_wup, credits);
- /*
+ credits = so->so_rcv.sb_mbmax;
+ }
+
+
+ /*
* XXX this won't accurately reflect credit return - we need
* to look at the difference between the amount that has been
* put in the recv sockbuf and what is there now
@@ -593,7 +771,7 @@ static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
INP_LOCK_ASSERT(tp->t_inpcb);
- t3_cleanup_rbuf(tp);
+ t3_cleanup_rbuf(tp, 0);
return (0);
}
@@ -631,16 +809,18 @@ static struct toe_usrreqs cxgb_toe_usrreqs = {
static void
-__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
+__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
uint64_t mask, uint64_t val, int no_reply)
{
struct cpl_set_tcb_field *req;
- struct tcpcb *tp = sototcpcb(so);
- struct toepcb *toep = tp->t_toe;
+
+ CTR4(KTR_TCB, "__set_tcb_field(tid=%u word=0x%x mask=%jx val=%jx)",
+ toep->tp_tid, word, mask, val);
req = mtod(m, struct cpl_set_tcb_field *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
req->reply = V_NO_REPLY(no_reply);
req->cpu_idx = 0;
@@ -648,8 +828,8 @@ __set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
req->mask = htobe64(mask);
req->val = htobe64(val);
- m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
- send_or_defer(so, tp, m, 0);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ send_or_defer(toep, m, 0);
}
static void
@@ -661,13 +841,15 @@ t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
if (toep == NULL)
return;
-
- if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
+
+ if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
+ printf("not seting field\n");
return;
-
+ }
+
m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
- __set_tcb_field(so, m, word, mask, val, 1);
+ __set_tcb_field(toep, m, word, mask, val, 1);
}
/*
@@ -735,10 +917,11 @@ t3_set_tos(struct socket *so)
static void
t3_enable_ddp(struct socket *so, int on)
{
- if (on)
+ if (on) {
+
t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
V_TF_DDP_OFF(0));
- else
+ } else
t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
@@ -747,7 +930,6 @@ t3_enable_ddp(struct socket *so, int on)
}
-
void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
@@ -777,7 +959,7 @@ t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
static int
t3_set_cong_control(struct socket *so, const char *name)
{
-#ifdef notyet
+#ifdef CONGESTION_CONTROL_SUPPORTED
int cong_algo;
for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
@@ -802,12 +984,14 @@ t3_get_tcb(struct socket *so)
return (ENOMEM);
INP_LOCK_ASSERT(tp->t_inpcb);
- m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
req = mtod(m, struct cpl_get_tcb *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
req->cpuno = htons(toep->tp_qset);
+ req->rsvd = 0;
if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
mbufq_tail(&toep->out_of_order_queue, m); // defer
else
@@ -863,14 +1047,6 @@ select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
return (idx);
}
-void
-t3_release_ddp_resources(struct toepcb *toep)
-{
- /*
- * This is a no-op until we have DDP support
- */
-}
-
static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
@@ -915,8 +1091,6 @@ t3_release_offload_resources(struct toepcb *toep)
l2t_release(L2DATA(cdev), toep->tp_l2t);
toep->tp_l2t = NULL;
}
- printf("setting toep->tp_tp to NULL\n");
-
toep->tp_tp = NULL;
if (tp) {
INP_LOCK_ASSERT(tp->t_inpcb);
@@ -964,16 +1138,16 @@ select_rcv_wscale(int space)
if (tcp_do_rfc1323)
for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
- return wscale;
+
+ return (wscale);
}
/*
* Determine the receive window size for a socket.
*/
-static unsigned int
-select_rcv_wnd(struct socket *so)
+static unsigned long
+select_rcv_wnd(struct toedev *dev, struct socket *so)
{
- struct toedev *dev = TOE_DEV(so);
struct tom_data *d = TOM_DATA(dev);
unsigned int wnd;
unsigned int max_rcv_wnd;
@@ -981,7 +1155,9 @@ select_rcv_wnd(struct socket *so)
if (tcp_do_autorcvbuf)
wnd = tcp_autorcvbuf_max;
else
- wnd = sbspace(&so->so_rcv);
+ wnd = so->so_rcv.sb_hiwat;
+
+
/* XXX
* For receive coalescing to work effectively we need a receive window
@@ -991,7 +1167,7 @@ select_rcv_wnd(struct socket *so)
wnd = MIN_RCV_WND;
/* PR 5138 */
- max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
+ max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
(uint32_t)d->rx_page_size * 23 :
MAX_RCV_WND);
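select_rcv_wscale() above picks the smallest RFC 1323 window-scale shift that brings the buffer size under the 16-bit window field, capped at 14. For example, a 1 MB receive buffer yields wscale = 5:

    /*
     * Worked example of the wscale loop for space = 1 MB:
     * 1048576 -> 524288 (1) -> 262144 (2) -> 131072 (3) -> 65536 (4)
     *         -> 32768 (5); 32768 <= 65535, so the loop stops: wscale = 5.
     */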
@@ -1017,7 +1193,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
* or we need to add this
*/
so->so_snd.sb_flags |= SB_NOCOALESCE;
-
+ so->so_rcv.sb_flags |= SB_NOCOALESCE;
+
tp->t_toe = toep;
toep->tp_tp = tp;
toep->tp_toedev = dev;
@@ -1033,7 +1210,8 @@ init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
* XXX broken
*
*/
- tp->rcv_wnd = select_rcv_wnd(so);
+ tp->rcv_wnd = select_rcv_wnd(dev, so);
+
toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
toep->tp_qset_idx = 0;
@@ -1076,9 +1254,23 @@ calc_opt2(const struct socket *so, struct toedev *dev)
flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
- return V_FLAVORS_VALID(flv_valid) |
- V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
+ return (V_FLAVORS_VALID(flv_valid) |
+ V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}
+
+#if DEBUG_WR > 1
+static int
+count_pending_wrs(const struct toepcb *toep)
+{
+ const struct mbuf *m;
+ int n = 0;
+
+ wr_queue_walk(toep, m)
+ n += m->m_pkthdr.csum_data;
+ return (n);
+}
+#endif
+
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif
@@ -1093,18 +1285,18 @@ mk_act_open_req(struct socket *so, struct mbuf *m,
struct toepcb *toep = tp->t_toe;
struct toedev *tdev = TOE_DEV(so);
- m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));
+ m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
req = mtod(m, struct cpl_act_open_req *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
-
+
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
req->local_port = inp->inp_lport;
req->peer_port = inp->inp_fport;
memcpy(&req->local_ip, &inp->inp_laddr, 4);
memcpy(&req->peer_ip, &inp->inp_faddr, 4);
- DPRINTF("connect smt_idx=%d\n", e->smt_idx);
req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
V_TX_CHANNEL(e->smt_idx));
req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
@@ -1144,7 +1336,7 @@ fail_act_open(struct toepcb *toep, int errno)
t3_release_offload_resources(toep);
if (tp) {
INP_LOCK_ASSERT(tp->t_inpcb);
- cxgb_tcp_drop(tp, errno);
+ tcp_drop(tp, errno);
}
#ifdef notyet
@@ -1289,8 +1481,6 @@ t3_connect(struct toedev *tdev, struct socket *so,
toep = tp->t_toe;
m_set_toep(m, tp->t_toe);
- printf("sending off request\n");
-
toep->tp_state = TCPS_SYN_SENT;
l2t_send(d->cdev, (struct mbuf *)m, e);
@@ -1342,7 +1532,7 @@ t3_send_reset(struct toepcb *toep)
mode |= CPL_ABORT_POST_CLOSE_REQ;
m = m_gethdr_nofail(sizeof(*req));
- m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
+ m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
set_arp_failure_handler(m, abort_arp_failure);
req = mtod(m, struct cpl_abort_req *);
@@ -1416,7 +1606,7 @@ t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
* XXX I need to revisit this
*/
if ((err = t3_set_cong_control(so, name)) == 0) {
-#ifdef notyet
+#ifdef CONGESTION_CONTROL_SUPPORTED
tp->t_cong_control = strdup(name, M_CXGB);
#endif
} else
@@ -1465,7 +1655,280 @@ t3_ctloutput(struct socket *so, struct sockopt *sopt)
if (err != EOPNOTSUPP)
return (err);
- return tcp_ctloutput(so, sopt);
+ return (tcp_ctloutput(so, sopt));
+}
+
+/*
+ * Returns true if we need to explicitly request RST when we receive new data
+ * on an RX-closed connection.
+ */
+static inline int
+need_rst_on_excess_rx(const struct toepcb *toep)
+{
+ return (1);
+}
+
+/*
+ * Handles Rx data that arrives in a state where the socket isn't accepting
+ * new data.
+ */
+static void
+handle_excess_rx(struct toepcb *toep, struct mbuf *m)
+{
+
+ if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
+ t3_send_reset(toep);
+ m_freem(m);
+}
+
+/*
+ * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
+ * by getting the DDP offset from the TCB.
+ */
+static void
+tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
+{
+ struct ddp_state *q = &toep->tp_ddp_state;
+ struct ddp_buf_state *bsp;
+ struct cpl_get_tcb_rpl *hdr;
+ unsigned int ddp_offset;
+ struct socket *so;
+ struct tcpcb *tp;
+
+ uint64_t t;
+ __be64 *tcb;
+
+ so = toeptoso(toep);
+ tp = toep->tp_tp;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ SOCKBUF_LOCK(&so->so_rcv);
+
+ /* Note that we only account for CPL_GET_TCB issued by the DDP code. We
+ * really need a cookie in order to dispatch the RPLs.
+ */
+ q->get_tcb_count--;
+
+ /* It is possible that a previous CPL already invalidated UBUF DDP
+ * and moved the cur_buf idx and hence no further processing of this
+ * skb is required. However, the app might be sleeping on
+ * !q->get_tcb_count and we need to wake it up.
+ */
+ if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
+ struct socket *so = toeptoso(toep);
+
+ m_freem(m);
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return;
+ }
+
+ bsp = &q->buf_state[q->cur_buf];
+ hdr = cplhdr(m);
+ tcb = (__be64 *)(hdr + 1);
+ if (q->cur_buf == 0) {
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
+ ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
+ } else {
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
+ ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
+ }
+ ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
+ m->m_cur_offset = bsp->cur_offset;
+ bsp->cur_offset = ddp_offset;
+ m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
+
+ CTR5(KTR_TOM,
+ "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
+ q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
+ KASSERT(ddp_offset >= m->m_cur_offset, ("ddp_offset=%u less than cur_offset=%u",
+ ddp_offset, m->m_cur_offset));
+
+#ifdef T3_TRACE
+ T3_TRACE3(TIDTB(so),
+ "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
+ tp->rcv_nxt, q->cur_buf, ddp_offset);
+#endif
+
+#if 0
+{
+ unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
+
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
+ ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
+
+ t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
+ rcv_nxt = t >> S_TCB_RCV_NXT;
+ rcv_nxt &= M_TCB_RCV_NXT;
+
+ t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
+ rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
+ rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
+
+ T3_TRACE2(TIDTB(sk),
+ "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
+ ddp_flags, rcv_nxt - rx_hdr_offset);
+ T3_TRACE4(TB(q),
+ "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
+ tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
+ T3_TRACE3(TB(q),
+ "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
+ rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
+ T3_TRACE2(TB(q),
+ "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
+ q->buf_state[0].flags, q->buf_state[1].flags);
+
+}
+#endif
+ if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
+ handle_excess_rx(toep, m);
+ return;
+ }
+
+#ifdef T3_TRACE
+ if ((int)m->m_pkthdr.len < 0) {
+ t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
+ }
+#endif
+ if (bsp->flags & DDP_BF_NOCOPY) {
+#ifdef T3_TRACE
+ T3_TRACE0(TB(q),
+ "tcb_rpl_as_ddp_complete: CANCEL UBUF");
+
+ if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ printk("!cancel_ubuf");
+ t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
+ }
+#endif
+ m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
+ bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
+ q->cur_buf ^= 1;
+ } else if (bsp->flags & DDP_BF_NOFLIP) {
+
+ m->m_ddp_flags = 1; /* always a kernel buffer */
+
+ /* now HW buffer carries a user buffer */
+ bsp->flags &= ~DDP_BF_NOFLIP;
+ bsp->flags |= DDP_BF_NOCOPY;
+
+ /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
+ * any new data in which case we're done. If in addition the
+ * offset is 0, then there wasn't a completion for the kbuf
+ * and we need to decrement the posted count.
+ */
+ if (m->m_pkthdr.len == 0) {
+ if (ddp_offset == 0) {
+ q->kbuf_posted--;
+ bsp->flags |= DDP_BF_NODATA;
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ m_free(m);
+ return;
+ }
+ } else {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
+ * but it got here way late and nobody cares anymore.
+ */
+ m_free(m);
+ return;
+ }
+
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_flags |= M_DDP;
+ m->m_seq = tp->rcv_nxt;
+ tp->rcv_nxt += m->m_pkthdr.len;
+ tp->t_rcvtime = ticks;
+#ifdef T3_TRACE
+ T3_TRACE3(TB(q),
+ "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
+ m->m_seq, q->cur_buf, m->m_pkthdr.len);
+#endif
+ CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
+ m->m_seq, q->cur_buf, m->m_pkthdr.len);
+ if (m->m_pkthdr.len == 0)
+ q->user_ddp_pending = 0;
+ else
+ SBAPPEND(&so->so_rcv, m);
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
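The TCB image returned in a CPL_GET_TCB_RPL is an array of big-endian 64-bit words indexed from the top, which is where the (31 - W_TCB_...) / 2 arithmetic above comes from; fields living in the upper 32-bit half get an extra shift of 32 (the BUF0 offset case). The extraction pattern could be wrapped in a helper like this sketch (not part of the commit):

    /* Sketch: pull one field out of a TCB image from CPL_GET_TCB_RPL. */
    static inline uint64_t
    tcb_field(const __be64 *tcb, int word, int shift, uint64_t mask)
    {
            uint64_t t = be64toh(tcb[(31 - word) / 2]);

            return ((t >> shift) & mask);  /* pass shift + 32 for high half */
    }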
+/*
+ * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
+ * in that case they are similar to DDP completions.
+ */
+static int
+do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ /* OK if socket doesn't exist */
+ if (toep == NULL) {
+ printf("null toep in do_get_tcb_rpl\n");
+ return (CPL_RET_BUF_DONE);
+ }
+
+ INP_LOCK(toep->tp_tp->t_inpcb);
+ tcb_rpl_as_ddp_complete(toep, m);
+ INP_UNLOCK(toep->tp_tp->t_inpcb);
+
+ return (0);
+}
+
+static void
+handle_ddp_data(struct toepcb *toep, struct mbuf *m)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so = toeptoso(toep);
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_data *hdr = cplhdr(m);
+ unsigned int rcv_nxt = ntohl(hdr->seq);
+
+ if (tp->rcv_nxt == rcv_nxt)
+ return;
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ SOCKBUF_LOCK(&so->so_rcv);
+ q = &toep->tp_ddp_state;
+ bsp = &q->buf_state[q->cur_buf];
+ KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x did not advance past tp->rcv_nxt=0x%08x",
+ rcv_nxt, tp->rcv_nxt));
+ m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
+ KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
+ CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
+ rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
+
+#ifdef T3_TRACE
+ if ((int)m->m_pkthdr.len < 0) {
+ t3_ddp_error(so, "handle_ddp_data: neg len");
+ }
+#endif
+
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_flags |= M_DDP;
+ m->m_cur_offset = bsp->cur_offset;
+ m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
+ if (bsp->flags & DDP_BF_NOCOPY)
+ bsp->flags &= ~DDP_BF_NOCOPY;
+
+ m->m_seq = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+ bsp->cur_offset += m->m_pkthdr.len;
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1;
+ /*
+ * For now, don't re-enable DDP after a connection fell out of DDP
+ * mode.
+ */
+ q->ubuf_ddp_ready = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
}
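handle_ddp_data() reconciles the stack with data the hardware has already placed: the CPL's sequence number says how far the wire has advanced, and the difference from tp->rcv_nxt is the number of DDP-placed bytes the mbuf is made to represent. The two DDP buffers are used ping-pong style, which the cur_buf flip implements; the invariant, condensed (placed_bytes is a hypothetical name for the delta computed above):

    /*
     * Sketch: DDP alternates between two HW buffer slots; when the
     * current slot completes and is not pinned by DDP_BF_NOFLIP,
     * the other slot becomes current.
     */
    bsp = &q->buf_state[q->cur_buf];
    bsp->cur_offset += placed_bytes;        /* bytes the HW wrote */
    if (!(bsp->flags & DDP_BF_NOFLIP))
            q->cur_buf ^= 1;                /* flip to the other buffer */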
/*
@@ -1481,32 +1944,33 @@ new_rx_data(struct toepcb *toep, struct mbuf *m)
INP_LOCK(tp->t_inpcb);
-#ifdef notyet
- if (__predict_false(sk_no_receive(sk))) {
- handle_excess_rx(so, skb);
+ if (__predict_false(so_no_receive(so))) {
+ handle_excess_rx(toep, m);
+ INP_UNLOCK(tp->t_inpcb);
+ TRACE_EXIT;
return;
}
- if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
- handle_ddp_data(so, skb);
+ if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
+ handle_ddp_data(toep, m);
+
+ m->m_seq = ntohl(hdr->seq);
+ m->m_ulp_mode = 0; /* for iSCSI */
- TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
- TCP_SKB_CB(skb)->flags = 0;
- skb_ulp_mode(skb) = 0; /* for iSCSI */
-#endif
#if VALIDATE_SEQ
- if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
- printk(KERN_ERR
+ if (__predict_false(m->m_seq != tp->rcv_nxt)) {
+ log(LOG_ERR,
"%s: TID %u: Bad sequence number %u, expected %u\n",
- TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
+ TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
tp->rcv_nxt);
- __kfree_skb(skb);
+ m_freem(m);
+ INP_UNLOCK(tp->t_inpcb);
return;
}
#endif
m_adj(m, sizeof(*hdr));
-#ifdef notyet
+#ifdef URGENT_DATA_SUPPORTED
/*
* We don't handle urgent data yet
*/
@@ -1521,8 +1985,8 @@ new_rx_data(struct toepcb *toep, struct mbuf *m)
toep->tp_delack_mode = hdr->dack_mode;
toep->tp_delack_seq = tp->rcv_nxt;
}
-
- DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);
+ CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
+ m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
if (len < m->m_pkthdr.len)
m->m_pkthdr.len = m->m_len = len;
@@ -1532,21 +1996,29 @@ new_rx_data(struct toepcb *toep, struct mbuf *m)
toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
T3_TRACE2(TIDTB(sk),
- "new_rx_data: seq 0x%x len %u",
- TCP_SKB_CB(skb)->seq, skb->len);
+ "new_rx_data: seq 0x%x len %u",
+ m->m_seq, m->m_pkthdr.len);
#endif
+ INP_UNLOCK(tp->t_inpcb);
SOCKBUF_LOCK(&so->so_rcv);
if (sb_notify(&so->so_rcv))
DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);
- sbappend_locked(&so->so_rcv, m);
- KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,
+ SBAPPEND(&so->so_rcv, m);
+
+#ifdef notyet
+ /*
+ * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
+ *
+ */
+ KASSERT(so->so_rcv.sb_cc < (so->so_rcv.sb_mbmax << 1),
("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
+#endif
- INP_UNLOCK(tp->t_inpcb);
- DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
+
+ CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
if (__predict_true((so->so_state & SS_NOFDREF) == 0))
@@ -1571,22 +2043,26 @@ do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
}
static void
-new_rx_data_ddp(struct socket *so, struct mbuf *m)
+new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
- struct tcpcb *tp = sototcpcb(so);
- struct toepcb *toep = tp->t_toe;
+ struct tcpcb *tp;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data_ddp *hdr;
unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+ struct socket *so = toeptoso(toep);
+ int nomoredata = 0;
-#ifdef notyet
- if (unlikely(sk_no_receive(sk))) {
- handle_excess_rx(so, m);
+ tp = sototcpcb(so);
+
+ INP_LOCK(tp->t_inpcb);
+ if (__predict_false(so_no_receive(so))) {
+
+ handle_excess_rx(toep, m);
+ INP_UNLOCK(tp->t_inpcb);
return;
}
-#endif
- tp = sototcpcb(so);
+
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
ddp_report = ntohl(hdr->u.ddp_report);
@@ -1603,69 +2079,91 @@ new_rx_data_ddp(struct socket *so, struct mbuf *m)
"new_rx_data_ddp: ddp_report 0x%x",
ddp_report);
#endif
-
+ CTR4(KTR_TOM,
+ "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
+ "hdr seq 0x%x len %u",
+ tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
+ ntohs(hdr->len));
+ CTR3(KTR_TOM,
+ "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
+ G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
+
ddp_len = ntohs(hdr->len);
rcv_nxt = ntohl(hdr->seq) + ddp_len;
- /*
- * Overload to store old rcv_next
- */
- m->m_pkthdr.csum_data = tp->rcv_nxt;
+ m->m_seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
+ tp->t_rcvtime = ticks;
/*
* Store the length in m->m_len. We are changing the meaning of
* m->m_len here, we need to be very careful that nothing from now on
* interprets ->len of this packet the usual way.
*/
- m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;
-
+ m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
+ INP_UNLOCK(tp->t_inpcb);
+ CTR3(KTR_TOM,
+ "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
+ m->m_len, rcv_nxt, m->m_seq);
/*
* Figure out where the new data was placed in the buffer and store it
* in when. Assumes the buffer offset starts at 0, consumer needs to
* account for page pod's pg_offset.
*/
end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
-#ifdef notyet
- TCP_SKB_CB(skb)->when = end_offset - skb->len;
+ m->m_cur_offset = end_offset - m->m_pkthdr.len;
- /*
- * We store in mac.raw the address of the gather list where the
- * placement happened.
- */
- skb->mac.raw = (unsigned char *)bsp->gl;
-#endif
+ SOCKBUF_LOCK(&so->so_rcv);
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_flags |= M_DDP;
bsp->cur_offset = end_offset;
+ toep->tp_enqueued_bytes += m->m_pkthdr.len;
/*
+ * Length is only meaningful for kbuf
+ */
+ if (!(bsp->flags & DDP_BF_NOCOPY))
+ KASSERT(m->m_len <= bsp->gl->dgl_length,
+ ("length received exceeds ddp pages: len=%d dgl_length=%d",
+ m->m_len, bsp->gl->dgl_length));
+
+ KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
+ KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
+
+
+ /*
* Bit 0 of flags stores whether the DDP buffer is completed.
* Note that other parts of the code depend on this being in bit 0.
*/
if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
-#if 0
- TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */
-#endif
panic("spurious ddp completion");
} else {
- m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
- if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
+ m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
+ if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1; /* flip buffers */
}
if (bsp->flags & DDP_BF_NOCOPY) {
- m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
+ m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
bsp->flags &= ~DDP_BF_NOCOPY;
}
if (ddp_report & F_DDP_PSH)
- m->m_pkthdr.csum_flags |= DDP_BF_PSH;
+ m->m_ddp_flags |= DDP_BF_PSH;
+ if (nomoredata)
+ m->m_ddp_flags |= DDP_BF_NODATA;
+
+ if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
+ toep->tp_delack_mode = G_DDP_DACK_MODE(ddp_report);
+ toep->tp_delack_seq = tp->rcv_nxt;
+ }
+
+ SBAPPEND(&so->so_rcv, m);
- tp->t_rcvtime = ticks;
- sbappendstream_locked(&so->so_rcv, m);
-#ifdef notyet
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
-#endif
+ if ((so->so_state & SS_NOFDREF) == 0)
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
}
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
@@ -1680,7 +2178,6 @@ static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
- struct socket *so = toeptoso(toep);
const struct cpl_rx_data_ddp *hdr = cplhdr(m);
VALIDATE_SOCK(so);
@@ -1688,40 +2185,50 @@ do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
- return CPL_RET_BUF_DONE;
+ return (CPL_RET_BUF_DONE);
}
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
- new_rx_data_ddp(so, m);
+ new_rx_data_ddp(toep, m);
return (0);
}
static void
-process_ddp_complete(struct socket *so, struct mbuf *m)
+process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
- struct tcpcb *tp = sototcpcb(so);
- struct toepcb *toep = tp->t_toe;
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so = toeptoso(toep);
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_ddp_complete *hdr;
unsigned int ddp_report, buf_idx, when;
+ int nomoredata = 0;
-#ifdef notyet
- if (unlikely(sk_no_receive(sk))) {
- handle_excess_rx(sk, skb);
+ INP_LOCK(tp->t_inpcb);
+ if (__predict_false(so_no_receive(so))) {
+ struct inpcb *inp = sotoinpcb(so);
+
+ handle_excess_rx(toep, m);
+ INP_UNLOCK(inp);
return;
}
-#endif
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
ddp_report = ntohl(hdr->ddp_report);
buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
- bsp = &q->buf_state[buf_idx];
+ m->m_pkthdr.csum_data = tp->rcv_nxt;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ bsp = &q->buf_state[buf_idx];
when = bsp->cur_offset;
- m->m_len = G_DDP_OFFSET(ddp_report) - when;
+ m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
+ tp->rcv_nxt += m->m_len;
+ tp->t_rcvtime = ticks;
+ INP_UNLOCK(tp->t_inpcb);
+ KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
@@ -1729,12 +2236,20 @@ process_ddp_complete(struct socket *so, struct mbuf *m)
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report), skb->len);
#endif
-
+ CTR5(KTR_TOM,
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report 0x%x offset %u, len %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report), m->m_len);
+
bsp->cur_offset += m->m_len;
- if (!(bsp->flags & DDP_BF_NOFLIP))
+ if (!(bsp->flags & DDP_BF_NOFLIP)) {
q->cur_buf ^= 1; /* flip buffers */
-
+ if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
+ nomoredata = 1;
+ }
+
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
@@ -1742,21 +2257,26 @@ process_ddp_complete(struct socket *so, struct mbuf *m)
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report));
#endif
-#if 0
- skb->mac.raw = (unsigned char *)bsp->gl;
-#endif
- m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
+ CTR4(KTR_TOM,
+ "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+ "ddp_report %u offset %u",
+ tp->rcv_nxt, bsp->cur_offset, ddp_report,
+ G_DDP_OFFSET(ddp_report));
+
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_flags |= M_DDP;
+ m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
if (bsp->flags & DDP_BF_NOCOPY)
bsp->flags &= ~DDP_BF_NOCOPY;
- m->m_pkthdr.csum_data = tp->rcv_nxt;
- tp->rcv_nxt += m->m_len;
+ if (nomoredata)
+ m->m_ddp_flags |= DDP_BF_NODATA;
- tp->t_rcvtime = ticks;
- sbappendstream_locked(&so->so_rcv, m);
-#ifdef notyet
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
-#endif
+ SBAPPEND(&so->so_rcv, m);
+
+ if ((so->so_state & SS_NOFDREF) == 0)
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
}
/*
@@ -1766,13 +2286,12 @@ static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
- struct socket *so = toeptoso(toep);
VALIDATE_SOCK(so);
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
- process_ddp_complete(so, m);
+ process_ddp_complete(toep, m);
return (0);
}
@@ -1801,6 +2320,65 @@ enter_timewait(struct socket *so)
}
/*
+ * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
+ * function deals with the data that may be reported along with the FIN.
+ * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
+ * perform normal FIN-related processing. In the latter case 1 indicates that
+ * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
+ * skb can be freed.
+ */
+static int
+handle_peer_close_data(struct socket *so, struct mbuf *m)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_peer_close *req = cplhdr(m);
+ unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
+
+ if (tp->rcv_nxt == rcv_nxt) /* no data */
+ return (0);
+
+ if (__predict_false(so_no_receive(so))) {
+ handle_excess_rx(toep, m);
+
+ /*
+ * Although we discard the data we want to process the FIN so
+ * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
+ * PEER_CLOSE without data. In particular this PEER_CLOSE
+ * may be what will close the connection. We return 1 because
+ * handle_excess_rx() already freed the packet.
+ */
+ return (1);
+ }
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ q = &toep->tp_ddp_state;
+ SOCKBUF_LOCK(&so->so_rcv);
+ bsp = &q->buf_state[q->cur_buf];
+ m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
+ KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_flags |= M_DDP;
+ m->m_cur_offset = bsp->cur_offset;
+ m->m_ddp_flags =
+ DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
+ m->m_seq = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+ bsp->cur_offset += m->m_pkthdr.len;
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1;
+ tp->t_rcvtime = ticks;
+ SBAPPEND(&so->so_rcv, m);
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup_locked(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (1);
+}
+
+/*
* Handle a peer FIN.
*/
static void
@@ -1808,9 +2386,8 @@ do_peer_fin(struct socket *so, struct mbuf *m)
{
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
- int keep = 0, dead = (so->so_state & SS_NOFDREF);
-
- DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);
+ int keep = 0;
+ DPRINTF("do_peer_fin state=%d\n", tp->t_state);
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk),"do_peer_fin:");
@@ -1821,20 +2398,32 @@ do_peer_fin(struct socket *so, struct mbuf *m)
goto out;
}
-
-#ifdef notyet
- if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
- keep = handle_peer_close_data(so, skb);
- if (keep < 0)
- return;
- }
- sk->sk_shutdown |= RCV_SHUTDOWN;
- sock_set_flag(so, SOCK_DONE);
-#endif
INP_INFO_WLOCK(&tcbinfo);
INP_LOCK(tp->t_inpcb);
- if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
+ if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
+ keep = handle_peer_close_data(so, m);
+ if (keep < 0) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ INP_UNLOCK(tp->t_inpcb);
+ return;
+ }
+ }
+ if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
+ /*
+ * If connection is half-synchronized
+ * (ie NEEDSYN flag on) then delay ACK,
+ * so it may be piggybacked when SYN is sent.
+ * Otherwise, since we received a FIN then no
+ * more input can be expected, send ACK now.
+ */
+ if (tp->t_flags & TF_NEEDSYN)
+ tp->t_flags |= TF_DELACK;
+ else
+ tp->t_flags |= TF_ACKNOW;
+ tp->rcv_nxt++;
+ }
+
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
tp->t_starttime = ticks;
@@ -1858,8 +2447,9 @@ do_peer_fin(struct socket *so, struct mbuf *m)
t3_release_offload_resources(toep);
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
tp = tcp_close(tp);
- } else
+ } else {
enter_timewait(so);
+ }
break;
default:
log(LOG_ERR,
@@ -1870,23 +2460,17 @@ do_peer_fin(struct socket *so, struct mbuf *m)
if (tp)
INP_UNLOCK(tp->t_inpcb);
- if (!dead) {
- DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
-
- sorwakeup(so);
- sowwakeup(so);
- wakeup(&so->so_timeo);
-#ifdef notyet
- sk->sk_state_change(sk);
+ DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
- /* Do not send POLL_HUP for half duplex close. */
- if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
- sk->sk_state == TCP_CLOSE)
- sk_wake_async(so, 1, POLL_HUP);
- else
- sk_wake_async(so, 1, POLL_IN);
+#ifdef notyet
+ /* Do not send POLL_HUP for half duplex close. */
+ if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(so, 1, POLL_HUP);
+ else
+ sk_wake_async(so, 1, POLL_IN);
#endif
- }
+
out:
if (!keep)
m_free(m);
@@ -1929,8 +2513,10 @@ process_close_con_rpl(struct socket *so, struct mbuf *m)
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
tp = tcp_close(tp);
- } else
+ } else {
enter_timewait(so);
+ soisdisconnected(so);
+ }
break;
case TCPS_LAST_ACK:
/*
@@ -1942,21 +2528,29 @@ process_close_con_rpl(struct socket *so, struct mbuf *m)
tp = tcp_close(tp);
break;
case TCPS_FIN_WAIT_1:
-#ifdef notyet
- dst_confirm(sk->sk_dst_cache);
-#endif
- soisdisconnecting(so);
-
- if ((so->so_state & SS_NOFDREF) == 0) {
- /*
- * Wake up lingering close
- */
- sowwakeup(so);
- sorwakeup(so);
- wakeup(&so->so_timeo);
- } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
+ /*
+ * If we can't receive any more
+ * data, then closing user can proceed.
+ * Starting the timer is contrary to the
+ * specification, but if we don't get a FIN
+ * we'll hang forever.
+ *
+ * XXXjl:
+ * we should release the tp also, and use a
+ * compressed state.
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ int timeout;
+
+ soisdisconnected(so);
+ timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle;
+ tcp_timer_activate(tp, TT_2MSL, timeout);
+ }
+ tp->t_state = TCPS_FIN_WAIT_2;
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
(toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
- tp = cxgb_tcp_drop(tp, 0);
+ tp = tcp_drop(tp, 0);
}
break;
@@ -1970,7 +2564,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m)
if (tp)
INP_UNLOCK(tp->t_inpcb);
out:
- m_free(m);
+ m_freem(m);
}
/*
@@ -2006,6 +2600,8 @@ process_abort_rpl(struct socket *so, struct mbuf *m)
"process_abort_rpl: GTS rpl pending %d",
sock_flag(sk, ABORT_RPL_PENDING));
#endif
+
+ INP_INFO_WLOCK(&tcbinfo);
INP_LOCK(tp->t_inpcb);
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
@@ -2020,16 +2616,14 @@ process_abort_rpl(struct socket *so, struct mbuf *m)
!is_t3a(TOE_DEV(so))) {
if (toep->tp_flags & TP_ABORT_REQ_RCVD)
panic("TP_ABORT_REQ_RCVD set");
- INP_INFO_WLOCK(&tcbinfo);
- INP_LOCK(tp->t_inpcb);
t3_release_offload_resources(toep);
tp = tcp_close(tp);
- INP_INFO_WUNLOCK(&tcbinfo);
}
}
}
if (tp)
INP_UNLOCK(tp->t_inpcb);
+ INP_INFO_WUNLOCK(&tcbinfo);
m_free(m);
}
@@ -2089,7 +2683,7 @@ discard:
}
/*
- * Convert the status code of an ABORT_REQ into a Linux error code. Also
+ * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
* indicate whether RST should be sent in response.
*/
static int
@@ -2289,10 +2883,8 @@ process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
(is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
so->so_error = abort_status_to_errno(so, req->status,
&rst_status);
-#if 0
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
-#endif
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup(so);
/*
* SYN_RECV needs special processing. If abort_syn_rcv()
* returns 0 is has taken care of the abort.
@@ -2513,7 +3105,8 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str
struct tcphdr th;
struct inpcb *inp;
int mss, wsf, sack, ts;
-
+ uint32_t rcv_isn = ntohl(req->rcv_isn);
+
bzero(&to, sizeof(struct tcpopt));
inp = sotoinpcb(lso);
@@ -2522,10 +3115,11 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str
*/
inc.inc_fport = th.th_sport = req->peer_port;
inc.inc_lport = th.th_dport = req->local_port;
- toep->tp_iss = th.th_seq = req->rcv_isn;
+ th.th_seq = req->rcv_isn;
th.th_flags = TH_SYN;
- toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
+ toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
+
inc.inc_isipv6 = 0;
inc.inc_len = 0;
@@ -2543,7 +3137,6 @@ syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, str
to.to_mss = mss;
to.to_wscale = wsf;
to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
-
INP_INFO_WLOCK(&tcbinfo);
INP_LOCK(inp);
syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
@@ -2654,34 +3247,31 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
newtoep->tp_flags = TP_SYN_RCVD;
newtoep->tp_tid = tid;
newtoep->tp_toedev = tdev;
+ tp->rcv_wnd = select_rcv_wnd(tdev, so);
- printf("inserting tid=%d\n", tid);
cxgb_insert_tid(cdev, d->client, newtoep, tid);
SOCK_LOCK(so);
LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
SOCK_UNLOCK(so);
-
- if (lctx->ulp_mode) {
+ newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
+ tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
+
+ if (newtoep->tp_ulp_mode) {
ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
- if (!ddp_mbuf)
+ if (ddp_mbuf == NULL)
newtoep->tp_ulp_mode = 0;
- else
- newtoep->tp_ulp_mode = lctx->ulp_mode;
}
-
+
+ CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
+ TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
-
- DPRINTF("adding request to syn cache\n");
-
/*
* XXX workaround for lack of syncache drop
*/
toepcb_hold(newtoep);
syncache_add_accept_req(req, so, newtoep);
-
-
rpl = cplhdr(reply_mbuf);
reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
@@ -2692,50 +3282,34 @@ process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
rpl->rsvd = rpl->opt2; /* workaround for HW bug */
rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
- DPRINTF("accept smt_idx=%d\n", e->smt_idx);
-
rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
- rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
+ rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
CPL_PASS_OPEN_ACCEPT);
DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
- m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
-
-#ifdef DEBUG_PRINT
- {
- int i;
-
- DPRINTF("rpl:\n");
- uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
-
- for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
- DPRINTF("[%d] %08x\n", i, rplbuf[i]);
- }
-#endif
-
+ m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
l2t_send(cdev, reply_mbuf, e);
m_free(m);
-#ifdef notyet
- /*
- * XXX this call path has to be converted to not depend on sockets
- */
- if (newtoep->tp_ulp_mode)
- __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
+ if (newtoep->tp_ulp_mode) {
+ __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
V_TF_DDP_OFF(1) |
- TP_DDP_TIMER_WORKAROUND_VAL, 1);
+ TP_DDP_TIMER_WORKAROUND_VAL, 1);
+ } else
+ printf("not offloading\n");
+
+
-#endif
return;
reject:
if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
mk_pass_accept_rpl(reply_mbuf, m);
else
- mk_tid_release(reply_mbuf, NULL, tid);
+ mk_tid_release(reply_mbuf, newtoep, tid);
cxgb_ofld_send(cdev, reply_mbuf);
m_free(m);
out:
@@ -2793,7 +3367,7 @@ do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
/*
* Called when a connection is established to translate the TCP options
- * reported by HW to Linux's native format.
+ * reported by HW to FreeBSD's native format.
*/
static void
assign_rxopt(struct socket *so, unsigned int opt)
@@ -2808,8 +3382,9 @@ assign_rxopt(struct socket *so, unsigned int opt)
tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
- if (tp->t_flags & TF_RCVD_SCALE)
- tp->rcv_scale = 0;
+ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE|TF_REQ_SCALE))
+ tp->rcv_scale = tp->request_r_scale;
}
/*
@@ -2831,8 +3406,6 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt)
#if 0
inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
-
-
/*
* XXX not clear what rcv_wup maps to
*/
@@ -2851,7 +3424,9 @@ make_established(struct socket *so, u32 snd_isn, unsigned int opt)
*/
dst_confirm(sk->sk_dst_cache);
#endif
+ tp->t_starttime = ticks;
tp->t_state = TCPS_ESTABLISHED;
+ soisconnected(so);
}
static int
@@ -2948,23 +3523,21 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
tp = sototcpcb(so);
INP_LOCK(tp->t_inpcb);
-#ifdef notyet
- so->so_snd.sb_flags |= SB_TOE;
- so->so_rcv.sb_flags |= SB_TOE;
-#endif
+
+ so->so_snd.sb_flags |= SB_NOCOALESCE;
+ so->so_rcv.sb_flags |= SB_NOCOALESCE;
+
toep->tp_tp = tp;
toep->tp_flags = 0;
tp->t_toe = toep;
reset_wr_list(toep);
- tp->rcv_wnd = select_rcv_wnd(so);
- DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
+ tp->rcv_wnd = select_rcv_wnd(tdev, so);
+ tp->rcv_nxt = toep->tp_copied_seq;
install_offload_ops(so);
toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
toep->tp_wr_unacked = 0;
toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
- toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
- tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
toep->tp_qset_idx = 0;
toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
@@ -2975,8 +3548,9 @@ do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
INP_INFO_WUNLOCK(&tcbinfo);
INP_UNLOCK(tp->t_inpcb);
- soisconnected(so);
+ CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
+ cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
/*
* XXX not sure how these checks map to us
@@ -3066,14 +3640,10 @@ socket_act_establish(struct socket *so, struct mbuf *m)
fixup_and_send_ofo(so);
if (__predict_false(so->so_state & SS_NOFDREF)) {
-#ifdef notyet
- /*
- * XXX not clear what should be done here
- * appears to correspond to sorwakeup_locked
+ /*
+ * XXX does this even make sense?
*/
- sk->sk_state_change(sk);
- sk_wake_async(so, 0, POLL_OUT);
-#endif
+ sorwakeup(so);
}
m_free(m);
#ifdef notyet
@@ -3095,8 +3665,7 @@ socket_act_establish(struct socket *so, struct mbuf *m)
sk->sk_write_space(sk);
#endif
- soisconnected(so);
- toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
+ toep->tp_state = tp->t_state;
tcpstat.tcps_connects++;
}
@@ -3139,6 +3708,9 @@ do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
socket_act_establish(so, m);
INP_UNLOCK(tp->t_inpcb);
+ CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
+ cxgb_log_tcb(cdev->adapter, toep->tp_tid);
+
return (0);
}
@@ -3156,7 +3728,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
u32 snd_una = ntohl(hdr->snd_una);
int bytes = 0;
- DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);
+ CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
INP_LOCK(tp->t_inpcb);
@@ -3166,18 +3738,21 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
while (credits) {
struct mbuf *p = peek_wr(toep);
- DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ;
if (__predict_false(!p)) {
log(LOG_ERR, "%u WR_ACK credits for TID %u with "
- "nothing pending, state %u\n",
- credits, toep->tp_tid, tp->t_state);
+ "nothing pending, state %u wr_avail=%u\n",
+ credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
break;
}
+ CTR2(KTR_TOM,
+ "wr_ack: p->credits=%d p->bytes=%d", p->m_pkthdr.csum_data, p->m_pkthdr.len);
+
+ KASSERT(p->m_pkthdr.csum_data != 0, ("empty request still on list"));
if (__predict_false(credits < p->m_pkthdr.csum_data)) {
+
#if DEBUG_WR > 1
struct tx_data_wr *w = cplhdr(p);
-#ifdef notyet
log(LOG_ERR,
"TID %u got %u WR credits, need %u, len %u, "
"main body %u, frags %u, seq # %u, ACK una %u,"
@@ -3185,8 +3760,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
toep->tp_tid, credits, p->csum, p->len,
p->len - p->data_len, skb_shinfo(p)->nr_frags,
ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
- WR_AVAIL(tp), count_pending_wrs(tp) - credits);
-#endif
+ toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
p->m_pkthdr.csum_data -= credits;
break;
@@ -3194,7 +3768,9 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
dequeue_wr(toep);
credits -= p->m_pkthdr.csum_data;
bytes += p->m_pkthdr.len;
- DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
+ CTR3(KTR_TOM,
+ "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
+ p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
m_free(p);
}
@@ -3228,7 +3804,7 @@ wr_ack(struct toepcb *toep, struct mbuf *m)
toep->tp_flags &= ~TP_TX_WAIT_IDLE;
}
if (bytes) {
- DPRINTF("sbdrop(%d)\n", bytes);
+ CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
SOCKBUF_LOCK(&so->so_snd);
sbdrop_locked(&so->so_snd, bytes);
sowwakeup_locked(so);
@@ -3250,15 +3826,21 @@ do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
- DPRINTF("do_wr_ack\n");
- dump_toepcb(toep);
-
VALIDATE_SOCK(so);
wr_ack(toep, m);
return 0;
}
+/*
+ * Handler for TRACE_PKT CPL messages. Just sink these packets.
+ */
+static int
+do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
+{
+ m_freem(m);
+ return 0;
+}
/*
* Reset a connection that is on a listener's SYN queue or accept queue,
@@ -3320,6 +3902,336 @@ t3_reset_synq(struct listen_ctx *lctx)
SOCK_UNLOCK(lctx->lso);
}
+
+int
+t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
+ unsigned int nppods, unsigned int tag, unsigned int maxoff,
+ unsigned int pg_off, unsigned int color)
+{
+ unsigned int i, j, pidx;
+ struct pagepod *p;
+ struct mbuf *m;
+ struct ulp_mem_io *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ unsigned int tid = toep->tp_tid;
+ const struct tom_data *td = TOM_DATA(TOE_DEV(so));
+ unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
+
+ CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
+ gl, nppods, tag, maxoff, pg_off, color);
+
+ for (i = 0; i < nppods; ++i) {
+ m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ req = mtod(m, struct ulp_mem_io *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+ req->wr.wr_lo = 0;
+ req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
+ V_ULPTX_CMD(ULP_MEM_WRITE));
+ req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
+ V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
+
+ p = (struct pagepod *)(req + 1);
+		if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
+ p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
+ p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
+ V_PPOD_COLOR(color));
+ p->pp_max_offset = htonl(maxoff);
+ p->pp_page_offset = htonl(pg_off);
+ p->pp_rsvd = 0;
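+			/*
+			 * Each pod advertises PPOD_PAGES pages but carries
+			 * five page addresses: the fifth duplicates the
+			 * first page of the next pod, hence pidx starting
+			 * at 4 * i in the loop below.
+			 */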
+ for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
+ p->pp_addr[j] = pidx < gl->dgl_nelem ?
+ htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
+ } else
+ p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
+ send_or_defer(toep, m, 0);
+ ppod_addr += PPOD_SIZE;
+ }
+ return (0);
+}
+
+/*
+ * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
+ */
+static inline void
+mk_cpl_barrier_ulp(struct cpl_barrier *b)
+{
+ struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
+
+ txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
+ txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
+ b->opcode = CPL_BARRIER;
+}
+
+/*
+ * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
+ */
+static inline void
+mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
+{
+	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
+
+ txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
+ txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
+ req->cpuno = htons(cpuno);
+}
+
+/*
+ * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
+ */
+static inline void
+mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
+ unsigned int word, uint64_t mask, uint64_t val)
+{
+ struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
+
+	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx)",
+ tid, word, mask, val);
+
+ txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
+ txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+ req->reply = V_NO_REPLY(1);
+ req->cpu_idx = 0;
+ req->word = htons(word);
+ req->mask = htobe64(mask);
+ req->val = htobe64(val);
+}
+
+/*
+ * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
+ */
+static void
+mk_rx_data_ack_ulp(struct cpl_rx_data_ack *ack, unsigned int tid, unsigned int credits)
+{
+ struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
+
+ txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
+ txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
+ OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
+ ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
+ V_RX_DACK_MODE(1) | V_RX_CREDITS(credits));
+}
+
+void
+t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
+{
+ unsigned int wrlen;
+ struct mbuf *m;
+ struct work_request_hdr *wr;
+ struct cpl_barrier *lock;
+ struct cpl_set_tcb_field *req;
+ struct cpl_get_tcb *getreq;
+ struct ddp_state *p = &toep->tp_ddp_state;
+
+ SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
+ wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
+ sizeof(*getreq);
+ m = m_gethdr_nofail(wrlen);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ wr = mtod(m, struct work_request_hdr *);
+ bzero(wr, wrlen);
+
+ wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+ m->m_pkthdr.len = m->m_len = wrlen;
+
+ lock = (struct cpl_barrier *)(wr + 1);
+ mk_cpl_barrier_ulp(lock);
+
+ req = (struct cpl_set_tcb_field *)(lock + 1);
+
+ CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
+
+	/*
+	 * Hmmm, not sure if this is actually a good thing: reactivating
+	 * the other buffer might be an issue if it has been completed
+	 * already. However, that is unlikely, since the fact that the UBUF
+	 * is not completed indicates that there is no outstanding data.
+	 */
+ if (bufidx == 0)
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_ACTIVE_BUF(1) |
+ V_TF_DDP_BUF0_VALID(1),
+ V_TF_DDP_ACTIVE_BUF(1));
+ else
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_ACTIVE_BUF(1) |
+ V_TF_DDP_BUF1_VALID(1), 0);
+
+ getreq = (struct cpl_get_tcb *)(req + 1);
+ mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
+
+ mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
+
+	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
+ p->get_tcb_count++;
+
+#ifdef T3_TRACE
+ T3_TRACE1(TIDTB(so),
+ "t3_cancel_ddpbuf: bufidx %u", bufidx);
+#endif
+ cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+}
+
+/**
+ * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
+ * @toep: the toepcb associated with the buffers
+ * @bufidx: index of HW DDP buffer (0 or 1)
+ * @tag0: new tag for HW buffer 0
+ * @tag1: new tag for HW buffer 1
+ * @len: new length for HW buf @bufidx
+ *
+ * Sends a compound WR to overlay a new DDP buffer on top of an existing
+ * buffer by changing the buffer tag and length and setting the valid and
+ * active flag accordingly. The caller must ensure the new buffer is at
+ * least as big as the existing one. Since we typically reprogram both HW
+ * buffers this function sets both tags for convenience. Read the TCB to
+ * determine how much data was written into the buffer before the overlay
+ * took place.
+ */
+void
+t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
+ unsigned int tag1, unsigned int len)
+{
+ unsigned int wrlen;
+ struct mbuf *m;
+ struct work_request_hdr *wr;
+ struct cpl_get_tcb *getreq;
+ struct cpl_set_tcb_field *req;
+ struct ddp_state *p = &toep->tp_ddp_state;
+
+	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
+ bufidx, tag0, tag1, len);
+ SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
+ wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
+ m = m_gethdr_nofail(wrlen);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ wr = mtod(m, struct work_request_hdr *);
+ m->m_pkthdr.len = m->m_len = wrlen;
+ bzero(wr, wrlen);
+
+	/*
+	 * Set the ATOMIC flag to make sure that TP processes the following
+	 * CPLs in an atomic manner and no wire segments can be interleaved.
+	 */
+ wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
+ req = (struct cpl_set_tcb_field *)(wr + 1);
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
+ V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
+ V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
+ V_TCB_RX_DDP_BUF0_TAG(tag0) |
+ V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
+ req++;
+ if (bufidx == 0) {
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
+ V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+ V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
+ req++;
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_PUSH_DISABLE_0(1) |
+ V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
+ V_TF_DDP_PUSH_DISABLE_0(0) |
+ V_TF_DDP_BUF0_VALID(1));
+ } else {
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
+ V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
+ V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
+ req++;
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_PUSH_DISABLE_1(1) |
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
+ V_TF_DDP_PUSH_DISABLE_1(0) |
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
+ }
+
+ getreq = (struct cpl_get_tcb *)(req + 1);
+ mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
+
+	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
+ p->get_tcb_count++;
+
+#ifdef T3_TRACE
+ T3_TRACE4(TIDTB(sk),
+ "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
+ "len %d",
+ bufidx, tag0, tag1, len);
+#endif
+ cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+}
+
+/*
+ * Sends a compound WR containing all the CPL messages needed to program the
+ * two HW DDP buffers, namely optionally setting up the length and offset of
+ * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
+ */
+void
+t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
+ unsigned int len1, unsigned int offset1,
+ uint64_t ddp_flags, uint64_t flag_mask, int modulate)
+{
+ unsigned int wrlen;
+ struct mbuf *m;
+ struct work_request_hdr *wr;
+ struct cpl_set_tcb_field *req;
+
+	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x)",
+ len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
+
+ SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
+ wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
+ (len1 ? sizeof(*req) : 0) +
+ (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
+ m = m_gethdr_nofail(wrlen);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+ wr = mtod(m, struct work_request_hdr *);
+ bzero(wr, wrlen);
+
+ wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+ m->m_pkthdr.len = m->m_len = wrlen;
+
+ req = (struct cpl_set_tcb_field *)(wr + 1);
+ if (len0) { /* program buffer 0 offset and length */
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
+ V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+ V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+ V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
+ V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
+ req++;
+ }
+ if (len1) { /* program buffer 1 offset and length */
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
+ V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+ V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
+ V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
+ V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
+ req++;
+ }
+
+ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
+ ddp_flags);
+
+ if (modulate) {
+ mk_rx_data_ack_ulp((struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
+ toep->tp_copied_seq - toep->tp_rcv_wup);
+ toep->tp_rcv_wup = toep->tp_copied_seq;
+ }
+
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(sk),
+ "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
+ "modulate %d",
+ len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
+ modulate);
+#endif
+
+ cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+}
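+
+/*
+ * Usage sketch (hypothetical caller, shown only to illustrate the
+ * mask/value convention): flag_mask selects the TCB bits to change and
+ * ddp_flags supplies their new state, so disabling DDP without touching
+ * any other flag would look like:
+ *
+ *	SOCKBUF_LOCK(&so->so_rcv);
+ *	t3_setup_ddpbufs(toep, 0, 0, 0, 0, V_TF_DDP_OFF(1),
+ *	    V_TF_DDP_OFF(1), 0);
+ *	SOCKBUF_UNLOCK(&so->so_rcv);
+ */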
+
void
t3_init_wr_tab(unsigned int wr_len)
{
@@ -3353,7 +4265,6 @@ t3_init_cpl_io(void)
tcphdr_skb->h.raw = tcphdr_skb->data;
memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif
-
t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
@@ -3367,11 +4278,9 @@ t3_init_cpl_io(void)
t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
-#ifdef notyet
t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
-#endif
return (0);
}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
index a3dd692..6edeacd 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -38,14 +38,18 @@ __FBSDID("$FreeBSD$");
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
+#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
+#include <sys/file.h>
#include <machine/bus.h>
+#include <machine/cpu.h>
#include <net/if.h>
#include <net/route.h>
@@ -56,6 +60,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_var.h>
+#include <dev/cxgb/cxgb_config.h>
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>
@@ -72,6 +77,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
+
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
@@ -85,6 +91,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+#include <dev/cxgb/ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -94,13 +101,11 @@ static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
-#ifdef notyet
-#define VM_HOLD_WRITEABLE 0x1
-static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
- int *count, int flags);
-#endif
-static void vm_fault_unhold_pages(vm_page_t *m, int count);
#define TMP_IOV_MAX 16
+#ifndef PG_FRAME
+#define PG_FRAME ~PAGE_MASK
+#endif
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
void
t3_init_socket_ops(void)
@@ -110,20 +115,8 @@ t3_init_socket_ops(void)
prp = pffindtype(AF_INET, SOCK_STREAM);
pru_sosend = prp->pr_usrreqs->pru_sosend;
pru_soreceive = prp->pr_usrreqs->pru_soreceive;
-#ifdef TCP_USRREQS_OVERLOAD
- tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect;
- tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
- tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen;
- tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send;
- tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
- tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect;
- tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close;
- tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown;
- tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd;
-#endif
}
-
struct cxgb_dma_info {
size_t cdi_mapped;
int cdi_nsegs;
@@ -182,21 +175,172 @@ iov_adj(struct iovec **iov, int *iovcnt, size_t count)
}
}
-
static void
-cxgb_zero_copy_free(void *cl, void *arg) {}
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+ struct mbuf_vec *mv;
+ struct mbuf *m = (struct mbuf *)cl;
+
+ mv = mtomv(m);
+ /*
+	 * These are physical page references; don't try to free them here,
+	 * the pages are unheld separately from sbdrop.
+	 */
+ mv->mv_count = 0;
+ m_free_iovec(m, m->m_type);
+}
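+
+/*
+ * A zero-copy mbuf reaches this destructor through external storage of
+ * type EXT_EXTREF; a minimal sketch of attaching such storage (buffer
+ * and length are hypothetical) is:
+ *
+ *	m_extadd(m, buf, len, cxgb_zero_copy_free, NULL, 0, EXT_EXTREF);
+ */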
+
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
{
+ struct iovec *iov = uio->uio_iov;
+ int iovcnt = uio->uio_iovcnt;
+ int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
+ uint64_t start, end;
+ vm_page_t *mp;
+
+ totbytes = totcount = 0;
+ maxcount = *held;
+
+ mp = m;
+ for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
+ count = maxcount - totcount;
+
+ start = (uintptr_t)iov->iov_base;
+ end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
+ start &= PG_FRAME;
+ end += PAGE_MASK;
+ end &= PG_FRAME;
+ npages = (end - start) >> PAGE_SHIFT;
+
+ count = min(count, npages);
+
+ err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags);
+ if (err) {
+ vm_fault_unhold_pages(m, totcount);
+ return (err);
+ }
+ mp += count;
+ totcount += count;
+ curbytes = iov->iov_len;
+ if (count != npages)
+ curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK);
+ totbytes += curbytes;
+ }
+ uio->uio_resid -= totbytes;
- return (EINVAL);
+ return (0);
+}
+
+/*
+ * Returns whether a connection should enable DDP. This happens when all of
+ * the following conditions are met:
+ * - the connection's ULP mode is DDP
+ * - DDP is not already enabled
+ * - the last receive was above the DDP threshold
+ * - receive buffers are in user space
+ * - receive side isn't shutdown (handled by caller)
+ * - the connection's receive window is big enough so that sizable buffers
+ * can be posted without closing the window in the middle of DDP (checked
+ * when the connection is offloaded)
+ */
+static int
+so_should_ddp(const struct toepcb *toep, int last_recv_len)
+{
+
+ DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
+ toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres),
+ toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));
+
+ return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) &&
+ last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
+ toep->tp_tp->rcv_wnd >
+ (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
+}
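+
+/*
+ * Worked example with made-up numbers: with ddp_thres = 16KB,
+ * ddp_copy_limit = 64KB and DDP_RSVD_WIN = 32KB, a ULP_MODE_TCPDDP
+ * connection whose last receive was 20KB enters DDP only if its
+ * receive window exceeds 96KB.
+ */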
+
+static inline int
+is_ddp(const struct mbuf *m)
+{
+ return (m->m_flags & M_DDP);
+}
+
+static inline int
+is_ddp_psh(const struct mbuf *m)
+{
+ return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
+}
+
+static int
+m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+ int curlen, startlen, resid_init, err = 0;
+ caddr_t buf;
+
+ DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
+ m, offset, len);
+
+ startlen = len;
+ resid_init = uio->uio_resid;
+ while (m && len) {
+ buf = mtod(m, caddr_t);
+ curlen = m->m_len;
+ if (offset && (offset < curlen)) {
+ curlen -= offset;
+ buf += offset;
+ offset = 0;
+ } else if (offset) {
+ offset -= curlen;
+ m = m->m_next;
+ continue;
+ }
+ err = uiomove(buf, min(len, curlen), uio);
+ if (err) {
+ printf("uiomove returned %d\n", err);
+ return (err);
+ }
+
+ len -= min(len, curlen);
+ m = m->m_next;
+ }
+ DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
+ startlen - len, resid_init, uio->uio_resid);
+ return (err);
+}
+
+/*
+ * Copy data from an mbuf to an iovec. Deals with RX_DATA, which carries the
+ * data in the mbuf body, and with RX_DATA_DDP, which places the data in a
+ * DDP buffer.
+ */
+static inline int
+copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+ struct iovec *to = uio->uio_iov;
+ int err;
+
+	if (__predict_true(!is_ddp(m))) {	/* RX_DATA */
+		return m_uiomove(m, offset, len, uio);
+	}
+	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
+ to->iov_len -= len;
+ to->iov_base = ((caddr_t)to->iov_base) + len;
+ uio->uio_iov = to;
+ uio->uio_resid -= len;
+ return (0);
+ }
+ err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */
+ return (err);
}
static void
-cxgb_wait_dma_completion(struct toepcb *tp)
+cxgb_wait_dma_completion(struct toepcb *toep)
{
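+	/*
+	 * Sleep on the toepcb's condvar until the pending transmit DMA
+	 * completes; cv_wait_unlock() atomically releases the inpcb lock
+	 * and leaves it unlocked on return.
+	 */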
+ struct mtx *lock;
+ lock = &toep->tp_tp->t_inpcb->inp_mtx;
+ INP_LOCK(toep->tp_tp->t_inpcb);
+ cv_wait_unlock(&toep->tp_cv, lock);
}
static int
@@ -234,7 +378,13 @@ cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
mi_collapse_sge(mi, segs);
*m = m0;
-
+
+	/*
+	 * This appears to be a no-op at the moment: busdma mapping is all
+	 * or nothing, so we need to make sure the tag values are large
+	 * enough.
+	 */
if (cdi.cdi_mapped < uio->uio_resid) {
uio->uio_resid -= cdi.cdi_mapped;
} else
@@ -305,10 +455,11 @@ sendmore:
}
uio->uio_resid -= m->m_pkthdr.len;
sent += m->m_pkthdr.len;
- sbappend_locked(&so->so_snd, m);
+ sbappend(&so->so_snd, m);
t3_push_frames(so, TRUE);
iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
}
+
/*
* Wait for pending I/O to be DMA'd to the card
*
@@ -357,7 +508,7 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
- if ((uio->uio_resid > zcopy_thres) &&
+ if (uio && (uio->uio_resid > zcopy_thres) &&
(uio->uio_iovcnt < TMP_IOV_MAX) && ((so->so_state & SS_NBIO) == 0)
&& zcopy_enabled) {
rv = t3_sosend(so, uio);
@@ -368,36 +519,378 @@ cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
return pru_sosend(so, addr, uio, top, control, flags, td);
}
+/*
+ * Following replacement or removal of the first mbuf on the first mbuf chain
+ * of a socket buffer, push necessary state changes back into the socket
+ * buffer so that other consumers see the values consistently. 'nextrecord'
+ * is the callers locally stored value of the original value of
+ * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
+ * NOTE: 'nextrecord' may be NULL.
+ */
+static __inline void
+sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ /*
+ * First, update for the new value of nextrecord. If necessary, make
+ * it the first record.
+ */
+ if (sb->sb_mb != NULL)
+ sb->sb_mb->m_nextpkt = nextrecord;
+ else
+ sb->sb_mb = nextrecord;
+
+ /*
+ * Now update any dependent socket buffer fields to reflect the new
+ * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
+ * addition of a second clause that takes care of the case where
+ * sb_mb has been updated, but remains the last record.
+ */
+ if (sb->sb_mb == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (sb->sb_mb->m_nextpkt == NULL)
+ sb->sb_lastrecord = sb->sb_mb;
+}
+
+#define IS_NONBLOCKING(so) ((so)->so_state & SS_NBIO)
+
static int
-t3_soreceive(struct socket *so, struct uio *uio)
+t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
-#ifdef notyet
- int i, rv, count, hold_resid, sent, iovcnt;
- struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
struct mbuf *m;
- struct uio uiotmp;
+ uint32_t offset;
+ int err, flags, avail, len, copied, copied_unacked;
+ int target; /* Read at least this many bytes */
+ int user_ddp_ok;
+ struct ddp_state *p;
+ struct inpcb *inp = sotoinpcb(so);
+
+ avail = offset = copied = copied_unacked = 0;
+ flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
+ err = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ p = &toep->tp_ddp_state;
+
+ if (err)
+ return (err);
+ SOCKBUF_LOCK(&so->so_rcv);
+ p->user_ddp_pending = 0;
+restart:
+ len = uio->uio_resid;
+ m = so->so_rcv.sb_mb;
+ target = (flags & MSG_WAITALL) ? len : so->so_rcv.sb_lowat;
+ user_ddp_ok = p->ubuf_ddp_ready;
+ p->cancel_ubuf = 0;
+
+ if (len == 0)
+ goto done;
+#if 0
+ while (m && m->m_len == 0) {
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+#endif
+ if (m)
+ goto got_mbuf;
+
+ /* empty receive queue */
+ if (copied >= target && (so->so_rcv.sb_mb == NULL) &&
+ !p->user_ddp_pending)
+ goto done;
+
+ if (copied) {
+ if (so->so_error || tp->t_state == TCPS_CLOSED ||
+ (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
+ goto done;
+ } else {
+ if (so->so_state & SS_NOFDREF)
+ goto done;
+ if (so->so_error) {
+ err = so->so_error;
+ so->so_error = 0;
+ goto done;
+ }
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ goto done;
+ if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
+ goto done;
+ if (tp->t_state == TCPS_CLOSED) {
+ err = ENOTCONN;
+ goto done;
+ }
+ }
+ if (so->so_rcv.sb_mb && !p->user_ddp_pending) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp, copied_unacked);
+ INP_UNLOCK(inp);
+ SOCKBUF_LOCK(&so->so_rcv);
+ copied_unacked = 0;
+ goto restart;
+ }
+ if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending &&
+ uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
+ p->ubuf_ddp_ready) {
+ p->user_ddp_pending =
+ !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1);
+ if (p->user_ddp_pending) {
+ p->kbuf_posted++;
+ user_ddp_ok = 0;
+ }
+ }
+ if (p->kbuf[0] && (p->kbuf_posted == 0)) {
+ t3_post_kbuf(so, 1, IS_NONBLOCKING(so));
+ p->kbuf_posted++;
+ }
+ if (p->user_ddp_pending) {
+ /* One shot at DDP if we already have enough data */
+ if (copied >= target)
+ user_ddp_ok = 0;
+
+ DPRINTF("sbwaiting 1\n");
+ if ((err = sbwait(&so->so_rcv)) != 0)
+ goto done;
+		/* XXX for timers to work: await_ddp_completion(sk, flags, &timeo); */
+ } else if (copied >= target)
+ goto done;
+ else {
+ if (copied_unacked) {
+ int i = 0;
+
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp, copied_unacked);
+ INP_UNLOCK(inp);
+ copied_unacked = 0;
+ if (mp_ncpus > 1)
+ while (i++ < 200 && so->so_rcv.sb_mb == NULL)
+ cpu_spinwait();
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+
+ if (so->so_rcv.sb_mb)
+ goto restart;
+ DPRINTF("sbwaiting 2 copied=%d target=%d avail=%d so=%p mb=%p cc=%d\n", copied, target, avail, so,
+ so->so_rcv.sb_mb, so->so_rcv.sb_cc);
+ if ((err = sbwait(&so->so_rcv)) != 0)
+ goto done;
+ }
+ goto restart;
+got_mbuf:
+ KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len, m->m_pkthdr.len));
+ KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d",
+ m->m_next, m->m_nextpkt, m->m_flags, m->m_len));
+ if (m->m_pkthdr.len == 0) {
+ if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
+ panic("empty mbuf and NOCOPY not set\n");
+ CTR0(KTR_TOM, "ddp done notification");
+ p->user_ddp_pending = 0;
+ sbdroprecord_locked(&so->so_rcv);
+ goto done;
+ }
+
+ offset = toep->tp_copied_seq + copied_unacked - m->m_seq;
+ DPRINTF("m=%p copied_seq=0x%x copied_unacked=%d m_seq=0x%x offset=%d pktlen=%d is_ddp(m)=%d\n",
+ m, toep->tp_copied_seq, copied_unacked, m->m_seq, offset, m->m_pkthdr.len, !!is_ddp(m));
+
+ if (offset >= m->m_pkthdr.len)
+ panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x seq 0x%x "
+ "pktlen %d ddp flags 0x%x", offset, toep->tp_copied_seq + copied_unacked, m->m_seq,
+ m->m_pkthdr.len, m->m_ddp_flags);
+
+ avail = m->m_pkthdr.len - offset;
+ if (len < avail) {
+ if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
+ panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset);
+ avail = len;
+ }
+	CTR4(KTR_TOM, "t3_soreceive: m_len=%u offset=%u len=%u m_seq=0x%08x", m->m_pkthdr.len, offset, len, m->m_seq);
+
+#ifdef URGENT_DATA_SUPPORTED
/*
- * Events requiring iteration:
- * - number of pages exceeds max hold pages for process or system
- * - number of pages exceeds maximum sg entries for a single WR
- *
- * We're limited to holding 128 pages at once - and we're limited to
- * 34 SG entries per work request, but each SG entry can be any number
- * of contiguous pages
- *
+ * Check if the data we are preparing to copy contains urgent
+ * data. Either stop short of urgent data or skip it if it's
+ * first and we are not delivering urgent data inline.
+ */
+ if (__predict_false(toep->tp_urg_data)) {
+ uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;
+
+ if (urg_offset < avail) {
+ if (urg_offset) {
+ /* stop short of the urgent data */
+ avail = urg_offset;
+ } else if ((so->so_options & SO_OOBINLINE) == 0) {
+ /* First byte is urgent, skip */
+ toep->tp_copied_seq++;
+ offset++;
+ avail--;
+ if (!avail)
+ goto skip_copy;
+ }
+ }
+ }
+#endif
+ if (is_ddp_psh(m) || offset) {
+ user_ddp_ok = 0;
+#ifdef T3_TRACE
+		T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
+#endif
+ }
+
+ if (user_ddp_ok && !p->user_ddp_pending &&
+ uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
+ p->ubuf_ddp_ready) {
+ p->user_ddp_pending =
+ !t3_overlay_ubuf(so, uio, IS_NONBLOCKING(so), flags, 1, 1);
+ if (p->user_ddp_pending) {
+ p->kbuf_posted++;
+ user_ddp_ok = 0;
+ }
+ DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
+ } else
+ DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n",
+ user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
+ p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted);
+
+ /*
+ * If MSG_TRUNC is specified the data is discarded.
+ * XXX need to check pr_atomic
*/
+ KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset));
+ if (__predict_true(!(flags & MSG_TRUNC))) {
+ int resid = uio->uio_resid;
+
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ if ((err = copy_data(m, offset, avail, uio))) {
+			err = EFAULT;
+ goto done_unlocked;
+ }
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (avail != (resid - uio->uio_resid))
+ printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n",
+ avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m));
+ }
+
+ copied += avail;
+ copied_unacked += avail;
+ len -= avail;
+
+#ifdef URGENT_DATA_SUPPORTED
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
+ tp->urg_data = 0;
+#endif
+ /*
+ * If the buffer is fully consumed free it. If it's a DDP
+ * buffer also handle any events it indicates.
+ */
+ if (avail + offset >= m->m_pkthdr.len) {
+ unsigned int fl = m->m_ddp_flags;
+ int exitnow, got_psh = 0, nomoredata = 0;
+ int count;
+ struct mbuf *nextrecord;
+
+ if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
+ if (is_ddp_psh(m) && p->user_ddp_pending)
+ got_psh = 1;
+
+ if (fl & DDP_BF_NOCOPY)
+ p->user_ddp_pending = 0;
+ else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
+ p->kbuf_posted--;
+ nomoredata = 1;
+ } else {
+ p->kbuf_posted--;
+ p->ubuf_ddp_ready = 1;
+ }
+ }
- uiotmp = *uio;
- iovcnt = uio->uio_iovcnt;
- iov = uio->uio_iov;
- sent = 0;
- re;
-#endif
- return (0);
+ nextrecord = m->m_nextpkt;
+ count = m->m_pkthdr.len;
+ while (count > 0) {
+ count -= m->m_len;
+ KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+#if 0
+ sbdrop_locked(&so->so_rcv, m->m_pkthdr.len);
+#endif
+ exitnow = got_psh || nomoredata;
+ if ((so->so_rcv.sb_mb == NULL) && exitnow)
+ goto done;
+ if (copied_unacked > (so->so_rcv.sb_hiwat >> 2)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp, copied_unacked);
+ INP_UNLOCK(inp);
+ copied_unacked = 0;
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ }
+ if (len > 0)
+ goto restart;
+
+ done:
+ /*
+ * If we can still receive decide what to do in preparation for the
+ * next receive. Note that RCV_SHUTDOWN is set if the connection
+ * transitioned to CLOSE but not if it was in that state to begin with.
+ */
+ if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
+ if (p->user_ddp_pending) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_rcv);
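+			/*
+			 * XXX the unlock/relock above presumably exists to
+			 * give pending lock waiters a chance to run before
+			 * the cancel.
+			 */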
+ user_ddp_ok = 0;
+ t3_cancel_ubuf(toep);
+ if (so->so_rcv.sb_mb) {
+ if (copied < 0)
+ copied = 0;
+ if (len > 0)
+ goto restart;
+ }
+ p->user_ddp_pending = 0;
+ }
+ if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
+#ifdef T3_TRACE
+ T3_TRACE0(TIDTB(so),
+ "chelsio_recvmsg: about to exit, repost kbuf");
+#endif
+
+ t3_post_kbuf(so, 1, IS_NONBLOCKING(so));
+ p->kbuf_posted++;
+ } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
+			CTR1(KTR_TOM, "entering ddp on tid=%u", toep->tp_tid);
+ if (!t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so),
+ ddp_copy_limit), 0, IS_NONBLOCKING(so)))
+ p->kbuf_posted = 1;
+ }
+ }
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(so),
+ "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
+ "kbuf_posted %d user_ddp_pending %u",
+ copied, len, buffers_freed, p ? p->kbuf_posted : -1,
+ p->user_ddp_pending);
+#endif
+ SOCKBUF_UNLOCK(&so->so_rcv);
+done_unlocked:
+ if (copied_unacked) {
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp, copied_unacked);
+ INP_UNLOCK(inp);
+ }
+ sbunlock(&so->so_rcv);
+
+ return (err);
}
static int
@@ -405,9 +898,11 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct toedev *tdev;
- int rv, zcopy_thres, zcopy_enabled;
+ int rv, zcopy_thres, zcopy_enabled, flags;
struct tcpcb *tp = sototcpcb(so);
+ flags = flagsp ? *flagsp &~ MSG_EOR : 0;
+
/*
* In order to use DMA direct from userspace the following
* conditions must be met:
@@ -421,150 +916,30 @@ cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
* - iovcnt is 1
*
*/
- if (tp->t_flags & TF_TOE) {
+
+ if ((tp->t_flags & TF_TOE) && uio && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
+ && (uio->uio_iovcnt == 1) && (mp0 == NULL)) {
tdev = TOE_DEV(so);
zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
zcopy_enabled = TOM_TUNABLE(tdev, ddp);
if ((uio->uio_resid > zcopy_thres) &&
- (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0)
+ (uio->uio_iovcnt == 1)
&& zcopy_enabled) {
- rv = t3_soreceive(so, uio);
+ rv = t3_soreceive(so, flagsp, uio);
if (rv != EAGAIN)
return (rv);
- }
- }
-
+ else
+ printf("returned EAGAIN\n");
+ }
+ } else if ((tp->t_flags & TF_TOE) && uio && mp0 == NULL)
+ printf("skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
+ flags, uio->uio_iovcnt, so->so_rcv.sb_state);
return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
-
void
t3_install_socket_ops(struct socket *so)
{
so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}
-
-/*
- * This routine takes a user address range and does the following:
- * - validate that the user has access to those pages (flags indicates read or write) - if not fail
- * - validate that count is enough to hold range number of pages - if not fail
- * - fault in any non-resident pages
- * - if the user is doing a read force a write fault for any COWed pages
- * - if the user is doing a read mark all pages as dirty
- * - hold all pages
- * - return number of pages in count
- */
-#ifdef notyet
-static int
-vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
-{
-
- vm_offset_t start, va;
- vm_paddr_t pa;
- int pageslen, faults, rv;
-
- struct thread *td;
- vm_map_t map;
- pmap_t pmap;
- vm_page_t m, *pages;
- vm_prot_t prot;
-
- start = addr & ~PAGE_MASK;
- pageslen = roundup2(addr + len, PAGE_SIZE);
- if (*count < (pageslen >> PAGE_SHIFT))
- return (EFBIG);
-
- *count = pageslen >> PAGE_SHIFT;
- /*
- * Check that virtual address range is legal
- * This check is somewhat bogus as on some architectures kernel
- * and user do not share VA - however, it appears that all FreeBSD
- * architectures define it
- */
- if (addr + len > VM_MAXUSER_ADDRESS)
- return (EFAULT);
-
- td = curthread;
- map = &td->td_proc->p_vmspace->vm_map;
- pmap = &td->td_proc->p_vmspace->vm_pmap;
- pages = mp;
-
- prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
- bzero(pages, sizeof(vm_page_t *) * (*count));
-retry:
-
- /*
- * First optimistically assume that all pages are resident (and R/W if for write)
- * if so just mark pages as held (and dirty if for write) and return
- */
- vm_page_lock_queues();
- for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
- /*
- * Assure that we only hold the page once
- */
- if (*pages == NULL) {
- /*
- * page queue mutex is recursable so this is OK
- * it would be really nice if we had an unlocked version of this so
- * we were only acquiring the pmap lock 1 time as opposed to potentially
- * many dozens of times
- */
- m = pmap_extract_and_hold(pmap, va, prot);
- if (m == NULL) {
- faults++;
- continue;
- }
- *pages = m;
- if (flags & VM_HOLD_WRITEABLE)
- vm_page_dirty(m);
- }
- }
- vm_page_unlock_queues();
-
- if (faults == 0)
- return (0);
- /*
- * Pages either have insufficient permissions or are not present
- * trigger a fault where neccessary
- *
- */
- for (va = start; va < pageslen; va += PAGE_SIZE) {
- m = NULL;
- pa = pmap_extract(pmap, va);
- rv = 0;
- if (pa)
- m = PHYS_TO_VM_PAGE(pa);
- if (flags & VM_HOLD_WRITEABLE) {
- if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
- rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
- } else if (m == NULL)
- rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
- if (rv)
- goto error;
- }
- goto retry;
-
-error:
- vm_page_lock_queues();
- for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
- if (*pages)
- vm_page_unhold(*pages);
- vm_page_unlock_queues();
- return (EFAULT);
-}
-#endif
-
-static void
-vm_fault_unhold_pages(vm_page_t *mp, int count)
-{
-
- KASSERT(count >= 0, ("negative count %d", count));
- vm_page_lock_queues();
- while (count--) {
- vm_page_unhold(*mp);
- mp++;
- }
- vm_page_unlock_queues();
-}
-
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
new file mode 100644
index 0000000..8bdcb65
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c
@@ -0,0 +1,735 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/condvar.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <machine/bus.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+
+
+#include <dev/cxgb/cxgb_osdep.h>
+#include <dev/cxgb/sys/mbufq.h>
+
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_offload.h>
+#include <net/route.h>
+
+#include <dev/cxgb/t3cdev.h>
+#include <dev/cxgb/common/cxgb_firmware_exports.h>
+#include <dev/cxgb/common/cxgb_t3_cpl.h>
+#include <dev/cxgb/common/cxgb_tcb.h>
+#include <dev/cxgb/common/cxgb_ctl_defs.h>
+#include <dev/cxgb/cxgb_l2t.h>
+#include <dev/cxgb/cxgb_offload.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+
+#include <dev/cxgb/sys/mvec.h>
+#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
+#include <dev/cxgb/ulp/tom/cxgb_defs.h>
+#include <dev/cxgb/ulp/tom/cxgb_tom.h>
+#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+#include <dev/cxgb/ulp/tom/cxgb_vm.h>
+
+#define MAX_SCHEDULE_TIMEOUT 300
+
+/*
+ * Return the # of page pods needed to accommodate a # of pages.
+ */
+static inline unsigned int
+pages2ppods(unsigned int pages)
+{
+ return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
+}
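+
+/*
+ * Example with hypothetical constants: if PPOD_PAGES == 4 and
+ * NUM_SENTINEL_PPODS == 2, a 9-page buffer needs (9 + 3) / 4 + 2 == 5
+ * page pods.
+ */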
+
+/**
+ * t3_pin_pages - pin a user memory range and prepare it for DDP
+ * @addr: the starting address
+ * @len: the length of the range
+ * @newgl: contains the pages and physical addresses of the pinned range
+ * @gl: an existing gather list, may be %NULL
+ *
+ * Pins the pages in the user-space memory range [addr, addr + len) and
+ * maps them for DMA. Returns a gather list with the pinned pages and
+ * their physical addresses. If @gl is non NULL the pages it describes
+ * are compared against the pages for [addr, addr + len), and if the
+ * existing gather list already covers the range a new list is not
+ * allocated. Returns 0 on success, or a negative errno. On success if
+ * a new gather list was allocated it is returned in @newgl.
+ */
+static int
+t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, vm_offset_t addr,
+ size_t len, struct ddp_gather_list **newgl,
+ const struct ddp_gather_list *gl)
+{
+ int i = 0, err;
+ size_t pg_off;
+ unsigned int npages;
+ struct ddp_gather_list *p;
+
+ /*
+ * XXX need x86 agnostic check
+ */
+ if (addr + len > VM_MAXUSER_ADDRESS)
+ return (EFAULT);
+
+ pg_off = addr & PAGE_MASK;
+ npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
+ M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (p == NULL)
+ return (ENOMEM);
+
+ err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE);
+ if (err)
+ goto free_gl;
+
+ if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
+ gl->dgl_length >= len) {
+ for (i = 0; i < npages; i++)
+ if (p->dgl_pages[i] != gl->dgl_pages[i])
+ goto different_gl;
+ err = 0;
+ goto unpin;
+ }
+
+different_gl:
+ p->dgl_length = len;
+ p->dgl_offset = pg_off;
+ p->dgl_nelem = npages;
+#ifdef NEED_BUSDMA
+ p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off,
+ PAGE_SIZE - pg_off,
+ PCI_DMA_FROMDEVICE) - pg_off;
+ for (i = 1; i < npages; ++i)
+ p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+#endif
+ *newgl = p;
+ return (0);
+unpin:
+ vm_fault_unhold_pages(p->dgl_pages, npages);
+
+free_gl:
+
+ free(p, M_DEVBUF);
+ *newgl = NULL;
+ return (err);
+}
+
+static void
+unmap_ddp_gl(const struct ddp_gather_list *gl)
+{
+#ifdef NEED_BUSDMA
+ int i;
+
+ if (!gl->nelem)
+ return;
+
+ pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset,
+ PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE);
+ for (i = 1; i < gl->nelem; ++i)
+ pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+
+#endif
+}
+
+static void
+ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty)
+{
+ /*
+ * XXX mark pages as dirty before unholding
+ */
+ vm_fault_unhold_pages(gl->dgl_pages, gl->dgl_nelem);
+}
+
+void
+t3_free_ddp_gl(struct ddp_gather_list *gl)
+{
+ unmap_ddp_gl(gl);
+ ddp_gl_free_pages(gl, 0);
+ free(gl, M_DEVBUF);
+}
+
+/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */
+#define MAX_PPODS 64U
+
+/*
+ * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in
+ * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we
+ * try to allocate enough page pods to accommodate the whole buffer, subject to
+ * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page
+ * pods before failing entirely.
+ */
+static int
+alloc_buf1_ppods(struct socket *so, struct ddp_state *p,
+ unsigned long addr, unsigned int len)
+{
+ int err, tag, npages, nppods;
+ struct tom_data *d = TOM_DATA(TOE_DEV(so));
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nppods = min(pages2ppods(npages), MAX_PPODS);
+ nppods = roundup2(nppods, PPOD_CLUSTER_SIZE);
+ err = t3_alloc_ppods(d, nppods, &tag);
+ if (err && nppods > PPOD_CLUSTER_SIZE) {
+ nppods = PPOD_CLUSTER_SIZE;
+ err = t3_alloc_ppods(d, nppods, &tag);
+ }
+ if (err)
+ return (ENOMEM);
+
+ p->ubuf_nppods = nppods;
+ p->ubuf_tag = tag;
+#if NUM_DDP_KBUF == 1
+ t3_set_ddp_tag(so, 1, tag << 6);
+#endif
+ return (0);
+}
+
+/*
+ * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush
+ * won't block indefinitely if there's nothing to place (which should be rare).
+ */
+#define UBUF_OFFSET 1
+
+static __inline unsigned long
+select_ddp_flags(const struct socket *so, int buf_idx,
+ int nonblock, int rcv_flags)
+{
+ if (buf_idx == 1) {
+ if (__predict_false(rcv_flags & MSG_WAITALL))
+ return V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(1) |
+ V_TF_DDP_PUSH_DISABLE_1(1);
+ if (nonblock)
+ return V_TF_DDP_BUF1_FLUSH(1);
+
+ return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so),
+ ddp_push_wait));
+ }
+
+ if (__predict_false(rcv_flags & MSG_WAITALL))
+ return V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(1) |
+ V_TF_DDP_PUSH_DISABLE_0(1);
+ if (nonblock)
+ return V_TF_DDP_BUF0_FLUSH(1);
+
+ return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait));
+}
+
+/*
+ * Reposts the kernel DDP buffer after it has been previously become full and
+ * invalidated. We just need to reset the offset and adjust the DDP flags.
+ * Conveniently, we can set the flags and the offset with a single message.
+ * Note that this function does not set the buffer length. Again conveniently
+ * our kernel buffer is of fixed size. If the length needs to be changed it
+ * needs to be done separately.
+ */
+static void
+t3_repost_kbuf(struct socket *so, unsigned int bufidx, int modulate,
+ int activate, int nonblock)
+{
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+ unsigned long flags;
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset;
+ p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0;
+ p->buf_state[bufidx].gl = p->kbuf[bufidx];
+ p->cur_buf = bufidx;
+ p->kbuf_idx = bufidx;
+
+ flags = select_ddp_flags(so, bufidx, nonblock, 0);
+ if (!bufidx)
+ t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags |
+ V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) |
+ V_TF_DDP_BUF0_VALID(1),
+ V_TF_DDP_BUF0_FLUSH(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) |
+ V_TF_DDP_BUF0_VALID(1) |
+ V_TF_DDP_ACTIVE_BUF(activate), modulate);
+ else
+ t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags |
+ V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) |
+ V_TF_DDP_BUF1_VALID(1) |
+ V_TF_DDP_ACTIVE_BUF(activate),
+ V_TF_DDP_BUF1_FLUSH(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) |
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
+ modulate);
+
+}
+
+/**
+ * setup_uio_ppods - setup HW page pods for a user iovec
+ * @so: the associated socket
+ * @uio: the uio
+ * @oft: additional bytes to map before the start of the buffer
+ *
+ * Pins a user iovec and sets up HW page pods for DDP into it. We allocate
+ * page pods for user buffers on the first call per socket. Afterwards we
+ * limit the buffer length to whatever the existing page pods can accommodate.
+ * Returns 0 on success, with the mapped length stored in @length, or an
+ * error code.
+ *
+ * The current implementation handles iovecs with only one entry.
+ */
+static int
+setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length)
+{
+ int err;
+ unsigned int len;
+ struct ddp_gather_list *gl = NULL;
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+ struct iovec *iov = uio->uio_iov;
+ vm_offset_t addr = (vm_offset_t)iov->iov_base - oft;
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (__predict_false(p->ubuf_nppods == 0)) {
+ err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft);
+ if (err)
+ return (err);
+ }
+
+ len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
+ len -= addr & PAGE_MASK;
+ if (len > M_TCB_RX_DDP_BUF0_LEN)
+ len = M_TCB_RX_DDP_BUF0_LEN;
+ len = min(len, sototcpcb(so)->rcv_wnd - 32768);
+ len = min(len, iov->iov_len + oft);
+
+ if (len <= p->kbuf[0]->dgl_length) {
+ printf("length too short\n");
+ return (EINVAL);
+ }
+
+ err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf);
+ if (err)
+ return (err);
+ if (gl) {
+ if (p->ubuf)
+ t3_free_ddp_gl(p->ubuf);
+ p->ubuf = gl;
+ t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len,
+ gl->dgl_offset, 0);
+ }
+ *length = len;
+ return (0);
+}
+
+/*
+ * Cancel an outstanding user-space DDP buffer, waiting for any in-flight
+ * CPL_GET_TCB replies to drain before returning.
+ */
+void
+t3_cancel_ubuf(struct toepcb *toep)
+{
+ struct ddp_state *p = &toep->tp_ddp_state;
+ int ubuf_pending = t3_ddp_ubuf_pending(toep);
+ struct socket *so = toeptoso(toep);
+ int err = 0, count=0;
+
+ if (p->ubuf == NULL)
+ return;
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ p->cancel_ubuf = 1;
+ while (ubuf_pending && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+#ifdef T3_TRACE
+ T3_TRACE3(TB(p),
+ "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d",
+ p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY),
+ p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY),
+ p->get_tcb_count);
+#endif
+ CTR3(KTR_TOM,
+ "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d",
+ p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY),
+ p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY),
+ p->get_tcb_count);
+ if (p->get_tcb_count == 0)
+ t3_cancel_ddpbuf(toep, p->cur_buf);
+ else
+ CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p SBS_CANTRCVMORE=%d",
+ err, p->get_tcb_count, so->so_rcv.sb_timeo, so,
+ !!(so->so_rcv.sb_state & SBS_CANTRCVMORE));
+
+ while (p->get_tcb_count && !(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+			if ((count & 0xfffffff) == 0)
+ CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d so=%p count=%d",
+ err, p->get_tcb_count, so->so_rcv.sb_timeo, so, count);
+ count++;
+ err = sbwait(&so->so_rcv);
+ }
+ ubuf_pending = t3_ddp_ubuf_pending(toep);
+ }
+ p->cancel_ubuf = 0;
+}
+
+#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \
+ V_TF_DDP_PSH_NO_INVALIDATE1(1) | \
+ V_TF_DDP_BUF1_FLUSH(1) | \
+ V_TF_DDP_BUF0_FLUSH(1) | \
+ V_TF_DDP_PUSH_DISABLE_1(1) | \
+ V_TF_DDP_PUSH_DISABLE_0(1) | \
+ V_TF_DDP_INDICATE_OUT(1))
+
+/*
+ * Post a user buffer as an overlay on top of the current kernel buffer.
+ */
+int
+t3_overlay_ubuf(struct socket *so, const struct uio *uio,
+ int nonblock, int rcv_flags, int modulate, int post_kbuf)
+{
+ int err, len, ubuf_idx;
+ unsigned long flags;
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+
+ if (p->kbuf[0] == NULL) {
+ return (EINVAL);
+ }
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ err = setup_uio_ppods(so, uio, 0, &len);
+ if (err) {
+ return (err);
+ }
+
+ ubuf_idx = p->kbuf_idx;
+ p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP;
+ /* Use existing offset */
+ /* Don't need to update .gl, user buffer isn't copied. */
+ p->cur_buf = ubuf_idx;
+
+ flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags);
+
+ if (post_kbuf) {
+ struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1];
+
+ dbs->cur_offset = 0;
+ dbs->flags = 0;
+ dbs->gl = p->kbuf[ubuf_idx ^ 1];
+ p->kbuf_idx ^= 1;
+ flags |= p->kbuf_idx ?
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) :
+ V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0);
+ }
+
+ if (ubuf_idx == 0) {
+ t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6,
+ len);
+ t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0,
+ flags,
+ OVERLAY_MASK | flags, 1);
+ } else {
+ t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6,
+ len);
+ t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0,
+ flags,
+ OVERLAY_MASK | flags, 1);
+ }
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(so),
+ "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d "
+ " kbuf_idx %d",
+ p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx);
+#endif
+ CTR3(KTR_TOM,
+ "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x",
+ p->ubuf_tag, flags, OVERLAY_MASK);
+ CTR3(KTR_TOM,
+ "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d",
+ ubuf_idx, p->kbuf_idx, post_kbuf);
+
+ return (0);
+}
+
+/*
+ * Clean up, at socket close time, the DDP state that had to survive until
+ * then, such as the DDP buffers. The buffers are already unmapped at this
+ * point, as unmapping needs the PCI device and a socket may close long
+ * after the device is removed.
+ */
+void
+t3_cleanup_ddp(struct toepcb *toep)
+{
+ struct ddp_state *p = &toep->tp_ddp_state;
+ int idx;
+
+	for (idx = 0; idx < NUM_DDP_KBUF; idx++)
+		if (p->kbuf[idx]) {
+			ddp_gl_free_pages(p->kbuf[idx], 0);
+			free(p->kbuf[idx], M_DEVBUF);
+			p->kbuf[idx] = NULL;
+		}
+ if (p->ubuf) {
+ ddp_gl_free_pages(p->ubuf, 0);
+ free(p->ubuf, M_DEVBUF);
+ p->ubuf = NULL;
+ }
+ toep->tp_ulp_mode = 0;
+}
+
+/*
+ * This is a companion to t3_cleanup_ddp() and releases the HW resources
+ * associated with a connection's DDP state, such as the page pods.
+ * It's called when HW is done with a connection. The rest of the state
+ * remains available until both HW and the app are done with the connection.
+ */
+void
+t3_release_ddp_resources(struct toepcb *toep)
+{
+ struct ddp_state *p = &toep->tp_ddp_state;
+ struct tom_data *d = TOM_DATA(toep->tp_toedev);
+ int idx;
+
+	for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
+		if (p->kbuf[idx] == NULL)
+			continue;	/* partially set up, e.g. t3_enter_ddp() failed */
+		t3_free_ppods(d, p->kbuf_tag[idx],
+		    p->kbuf_nppods[idx]);
+		unmap_ddp_gl(p->kbuf[idx]);
+	}
+
+ if (p->ubuf_nppods) {
+ t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods);
+ p->ubuf_nppods = 0;
+ }
+ if (p->ubuf)
+ unmap_ddp_gl(p->ubuf);
+}
+
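t3_release_ddp_resources() and t3_cleanup_ddp() thus split teardown into a hardware phase and a host phase that can run far apart in time: the pods and DMA mappings go back when the tid dies, the wired pages only when the last socket reference drops. A minimal sketch of that ordering, with the call sites purely hypothetical:

	/* Hypothetical call sites; only the relative order matters. */
	static void
	toe_tid_dead(struct toepcb *toep)
	{
		/* HW is finished with the connection: return on-chip state. */
		t3_release_ddp_resources(toep);
	}

	static void
	toe_socket_closed(struct toepcb *toep)
	{
		/* The app is finished too: now the host pages can go. */
		t3_cleanup_ddp(toep);
	}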
+void
+t3_post_kbuf(struct socket *so, int modulate, int nonblock)
+{
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+
+ t3_set_ddp_tag(so, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6);
+ t3_set_ddp_buf(so, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length);
+ t3_repost_kbuf(so, p->cur_buf, modulate, 1, nonblock);
+#ifdef T3_TRACE
+ T3_TRACE1(TIDTB(so),
+ "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf);
+#endif
+ CTR1(KTR_TOM,
+ "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf);
+}
+
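Every tag written to the TCB here (and in t3_overlay_ubuf() and t3_enter_ddp()) is shifted left by 6. The assumption behind that constant: page-pod tags index an array of 64-byte pagepod structures in adapter memory, so the shift turns an index into the byte offset the TCB fields expect:

	#define PPOD_SIZE	64	/* assumed sizeof(struct pagepod) */

	/* Illustrative helper; the driver open-codes "tag << 6". */
	static unsigned int
	ppod_tag_to_tcb_offset(unsigned int tag)
	{
		return (tag << 6);	/* == tag * PPOD_SIZE */
	}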
+/*
+ * Prepare a socket for DDP. Must be called when the socket is known to be
+ * open.
+ */
+int
+t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock)
+{
+ int i, err = ENOMEM;
+ static vm_pindex_t color;
+ unsigned int nppods, kbuf_pages, idx = 0;
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+ struct tom_data *d = TOM_DATA(toep->tp_toedev);
+
+ if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN)
+ return (EINVAL);
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nppods = pages2ppods(kbuf_pages);
+
+ p->kbuf_noinval = !!waitall;
+ p->kbuf_tag[NUM_DDP_KBUF - 1] = -1;
+ for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
+ p->kbuf[idx] =
+		    malloc(sizeof(struct ddp_gather_list) + kbuf_pages *
+			sizeof(vm_page_t), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (p->kbuf[idx] == NULL)
+ goto err;
+ err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]);
+ if (err) {
+ printf("t3_alloc_ppods failed err=%d\n", err);
+ goto err;
+ }
+
+ p->kbuf_nppods[idx] = nppods;
+ p->kbuf[idx]->dgl_length = kbuf_size;
+ p->kbuf[idx]->dgl_offset = 0;
+ p->kbuf[idx]->dgl_nelem = kbuf_pages;
+
+ for (i = 0; i < kbuf_pages; ++i) {
+ p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color,
+ VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO);
+ if (p->kbuf[idx]->dgl_pages[i] == NULL) {
+ p->kbuf[idx]->dgl_nelem = i;
+ printf("failed to allocate kbuf pages\n");
+ goto err;
+ }
+ }
+#ifdef NEED_BUSDMA
+ /*
+ * XXX we'll need this for VT-d or any platform with an iommu :-/
+ *
+ */
+ for (i = 0; i < kbuf_pages; ++i)
+ p->kbuf[idx]->phys_addr[i] =
+ pci_map_page(p->pdev, p->kbuf[idx]->pages[i],
+ 0, PAGE_SIZE, PCI_DMA_FROMDEVICE);
+#endif
+ t3_setup_ppods(so, p->kbuf[idx], nppods, p->kbuf_tag[idx],
+ p->kbuf[idx]->dgl_length, 0, 0);
+ }
+ cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid);
+
+ t3_set_ddp_tag(so, 0, p->kbuf_tag[0] << 6);
+ t3_set_ddp_buf(so, 0, 0, p->kbuf[0]->dgl_length);
+ t3_repost_kbuf(so, 0, 0, 1, nonblock);
+
+ t3_set_rcv_coalesce_enable(so,
+ TOM_TUNABLE(TOE_DEV(so), ddp_rcvcoalesce));
+
+#ifdef T3_TRACE
+ T3_TRACE4(TIDTB(so),
+ "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d",
+ kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]);
+#endif
+ CTR4(KTR_TOM,
+ "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d",
+ kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]);
+ DELAY(100000);
+ cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid);
+ return (0);
+
+err:
+ t3_release_ddp_resources(toep);
+ t3_cleanup_ddp(toep);
+ return (err);
+}
+
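t3_enter_ddp() rounds the kernel buffer up to whole pages and then converts pages to page pods with pages2ppods(). A self-contained model of that arithmetic, assuming the usual T3 geometry of four page-sized entries per pod (the driver's pages2ppods() may also reserve sentinel pods, so treat the constant as illustrative):

	#include <stdio.h>

	#define PAGE_SIZE	4096
	#define PPOD_PAGES	4	/* assumed pages addressed per pod */

	static unsigned int
	pages2ppods_model(unsigned int pages)
	{
		return ((pages + PPOD_PAGES - 1) / PPOD_PAGES);
	}

	int
	main(void)
	{
		unsigned int kbuf_size = 65536;
		unsigned int kbuf_pages = (kbuf_size + PAGE_SIZE - 1) / PAGE_SIZE;

		printf("%u bytes -> %u pages -> %u ppods\n",
		    kbuf_size, kbuf_pages, pages2ppods_model(kbuf_pages));
		return (0);
	}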
+int
+t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len)
+{
+	int err;
+	struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl;
+
+	if (gl->dgl_pages == NULL)
+		panic("pages not set");
+
+	offset += gl->dgl_offset + m->m_cur_offset;
+	KASSERT(len <= gl->dgl_length,
+	    ("len=%d > dgl_length=%d in ddp_copy", len, gl->dgl_length));
+
+	/*
+	 * uiomove_fromphys() selects the page from dgl_pages[] using the
+	 * full byte offset itself, so pass the offset unmasked rather
+	 * than offset & PAGE_MASK.
+	 */
+	err = uiomove_fromphys(gl->dgl_pages, offset, len, uio);
+	return (err);
+}
+
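uiomove_fromphys(9) does the page walk itself: each iteration selects dgl_pages[offset >> PAGE_SHIFT] and copies at most up to the next page boundary, which is why t3_ddp_copy() can pass the whole gather list plus a flat byte offset. A user-space model of that walk over plain buffers:

	#include <stddef.h>
	#include <string.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(PAGE_SIZE - 1)

	/* Copy len bytes starting at byte "offset" out of an array of
	 * page-sized buffers, the way uiomove_fromphys() walks vm_page_t's. */
	static void
	copy_from_pages(char *const pages[], size_t offset, size_t len, char *dst)
	{
		while (len > 0) {
			size_t poff = offset & PAGE_MASK;
			size_t cnt = PAGE_SIZE - poff;

			if (cnt > len)
				cnt = len;
			memcpy(dst, pages[offset >> PAGE_SHIFT] + poff, cnt);
			dst += cnt;
			offset += cnt;
			len -= cnt;
		}
	}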
+/*
+ * Allocate a run of n contiguous page pods.  Returns 0 on success and stores
+ * the allocated tag through *ptag, or ENOMEM if no sufficiently large run of
+ * free pods exists.
+ */
+int
+t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag)
+{
+ unsigned int i, j;
+
+ if (__predict_false(!td->ppod_map)) {
+ printf("ppod_map not set\n");
+ return (EINVAL);
+ }
+
+ mtx_lock(&td->ppod_map_lock);
+	for (i = 0; i + n <= td->nppods; ) {
+		for (j = 0; j < n; ++j)		/* scan ppod_map[i..i+n-1] */
+			if (td->ppod_map[i + j]) {
+				i = i + j + 1;
+				goto next;
+			}
+		memset(&td->ppod_map[i], 1, n);	/* allocate range */
+		mtx_unlock(&td->ppod_map_lock);
+		CTR2(KTR_TOM,
+		    "t3_alloc_ppods: n=%u tag=%u", n, i);
+		*ptag = i;
+		return (0);
+	next: ;
+	}
+	mtx_unlock(&td->ppod_map_lock);
+	return (ENOMEM);
+}
+
+void
+t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n)
+{
+ /* No need to take ppod_lock here */
+ memset(&td->ppod_map[tag], 0, n);
+}
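The pod map is one byte per pod: allocation is a first-fit scan under ppod_map_lock, and free is a plain memset, safe without the lock because owners never free ranges they did not allocate. The same scheme in a self-contained form:

	#include <errno.h>
	#include <string.h>

	#define NPPODS	1024

	static unsigned char ppod_map[NPPODS];	/* one byte per pod */

	/* First-fit: find n contiguous free pods, mark them, return the tag. */
	static int
	alloc_ppods_model(unsigned int n, int *ptag)
	{
		unsigned int i, j;

		for (i = 0; i + n <= NPPODS; ) {
			for (j = 0; j < n; ++j)
				if (ppod_map[i + j]) {
					i = i + j + 1;	/* skip the busy pod */
					goto next;
				}
			memset(&ppod_map[i], 1, n);
			*ptag = i;
			return (0);
	next:		;
		}
		return (ENOMEM);
	}

	static void
	free_ppods_model(unsigned int tag, unsigned int n)
	{
		memset(&ppod_map[tag], 0, n);
	}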
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
index 9077295..8989fd9 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_defs.h
+++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h
@@ -40,6 +40,13 @@ $FreeBSD$
#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket)
#define sototoep(so) (sototcpcb((so))->t_toe)
+#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__)
+#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__)
+
+#define KTR_TOM KTR_SPARE2
+#define KTR_TCB KTR_SPARE3
+
+struct toepcb;
struct listen_ctx;
typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m);
@@ -54,7 +61,8 @@ void t3_init_listen_cpl_handlers(void);
int t3_init_cpl_io(void);
void t3_init_wr_tab(unsigned int wr_len);
uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail);
-void t3_cleanup_rbuf(struct tcpcb *tp);
+void t3_send_rx_modulate(struct toepcb *toep);
+void t3_cleanup_rbuf(struct tcpcb *tp, int copied);
void t3_init_socket_ops(void);
void t3_install_socket_ops(struct socket *so);
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
index a88b26e..acbad6f 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -180,7 +180,6 @@ listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid)
return p;
}
-#if 0
/*
* Given a pointer to a listening socket return its server TID by consulting
* the socket->stid map. Returns -1 if the socket is not in the map.
@@ -191,16 +190,15 @@ listen_hash_find(struct tom_data *d, struct socket *so)
int stid = -1, bucket = listen_hashfn(so);
struct listen_info *p;
- spin_lock(&d->listen_lock);
+ mtx_lock(&d->listen_lock);
for (p = d->listen_hash_tab[bucket]; p; p = p->next)
- if (p->sk == sk) {
+ if (p->so == so) {
stid = p->stid;
break;
}
- spin_unlock(&d->listen_lock);
+ mtx_unlock(&d->listen_lock);
return stid;
}
-#endif
/*
* Delete the listen_info structure for a listening socket. Returns the server
@@ -244,28 +242,24 @@ t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
if (!TOM_TUNABLE(dev, activated))
return;
- printf("start listen\n");
+ if (listen_hash_find(d, so) != -1)
+ return;
- ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT);
+ CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport));
+ ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO);
if (!ctx)
return;
ctx->tom_data = d;
ctx->lso = so;
- ctx->ulp_mode = 0; /* DDP if the default */
+ ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) ? ULP_MODE_TCPDDP : 0;
LIST_INIT(&ctx->synq_head);
stid = cxgb_alloc_stid(d->cdev, d->client, ctx);
if (stid < 0)
goto free_ctx;
-#ifdef notyet
- /*
- * XXX need to mark inpcb as referenced
- */
- sock_hold(sk);
-#endif
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
goto free_stid;
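The now-enabled listen_hash_find() is what makes t3_listen_start() idempotent above: a listener already present in the socket-to-stid hash is simply skipped. A stand-alone model of that bucket walk (the hash function here is illustrative; the driver hashes the socket pointer under listen_lock):

	#include <stddef.h>

	#define LISTEN_HASH_BUCKETS	32

	struct listen_info_model {
		struct listen_info_model *next;
		const void *so;		/* listening socket identity */
		int stid;		/* server tid from the adapter */
	};

	static struct listen_info_model *hash_tab[LISTEN_HASH_BUCKETS];

	static unsigned int
	listen_hashfn_model(const void *so)
	{
		return (((size_t)so >> 10) % LISTEN_HASH_BUCKETS);
	}

	/* Return the stid registered for so, or -1 if absent. */
	static int
	listen_hash_find_model(const void *so)
	{
		struct listen_info_model *p;

		for (p = hash_tab[listen_hashfn_model(so)]; p != NULL; p = p->next)
			if (p->so == so)
				return (p->stid);
		return (-1);
	}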
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
index 9fa42b5..e37c9b1 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
+++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
@@ -1,4 +1,3 @@
-
/**************************************************************************
Copyright (c) 2007, Chelsio Inc.
@@ -86,7 +85,6 @@ struct pagepod {
#define M_PPOD_PGSZ 0x3
#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
-struct pci_dev;
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <machine/bus.h>
@@ -96,8 +94,7 @@ struct ddp_gather_list {
unsigned int dgl_length;
unsigned int dgl_offset;
unsigned int dgl_nelem;
- vm_page_t *dgl_pages;
- bus_addr_t dgl_phys_addr[0];
+ vm_page_t dgl_pages[0];
};
struct ddp_buf_state {
@@ -107,7 +104,6 @@ struct ddp_buf_state {
};
struct ddp_state {
- struct pci_dev *pdev;
struct ddp_buf_state buf_state[2]; /* per buffer state */
int cur_buf;
unsigned short kbuf_noinval;
@@ -119,6 +115,7 @@ struct ddp_state {
int get_tcb_count;
unsigned int kbuf_posted;
int cancel_ubuf;
+ int user_ddp_pending;
unsigned int kbuf_nppods[NUM_DDP_KBUF];
unsigned int kbuf_tag[NUM_DDP_KBUF];
struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */
@@ -132,54 +129,51 @@ enum {
DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was
completed with a segment having the
PSH flag set */
+ DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */
};
-#ifdef notyet
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+
/*
* Returns 1 if a UBUF DMA buffer might be active.
*/
-static inline int t3_ddp_ubuf_pending(struct sock *so)
+static inline int
+t3_ddp_ubuf_pending(struct toepcb *toep)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct ddp_state *p = DDP_STATE(tp);
+ struct ddp_state *p = &toep->tp_ddp_state;
/* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,
* but DDP_STATE() is only valid if the connection actually enabled
* DDP.
*/
- if (!p)
- return 0;
+ if (p->kbuf[0] == NULL)
+ return (0);
return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) ||
(p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY));
}
-#endif
int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
unsigned int nppods, unsigned int tag, unsigned int maxoff,
unsigned int pg_off, unsigned int color);
-int t3_alloc_ppods(struct tom_data *td, unsigned int n);
+int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
-void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl);
-int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len,
- struct ddp_gather_list **newgl,
- const struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to,
- int len);
+void t3_free_ddp_gl(struct ddp_gather_list *gl);
+int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len);
//void t3_repost_kbuf(struct socket *so, int modulate, int activate);
-void t3_post_kbuf(struct socket *so, int modulate);
-int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+void t3_post_kbuf(struct socket *so, int modulate, int nonblock);
+int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock,
int rcv_flags, int modulate, int post_kbuf);
-void t3_cancel_ubuf(struct socket *so);
-int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
- int rcv_flags, int modulate, int post_kbuf);
-int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
-void t3_cleanup_ddp(struct socket *so);
+void t3_cancel_ubuf(struct toepcb *toep);
+int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock,
+ int rcv_flags, int modulate, int post_kbuf);
+int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall, int nonblock);
+void t3_cleanup_ddp(struct toepcb *toep);
void t3_release_ddp_resources(struct toepcb *toep);
-void t3_cancel_ddpbuf(struct socket *so, unsigned int bufidx);
-void t3_overlay_ddpbuf(struct socket *so, unsigned int bufidx, unsigned int tag0,
+void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx);
+void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0,
unsigned int tag1, unsigned int len);
-void t3_setup_ddpbufs(struct socket *so, unsigned int len0, unsigned int offset0,
+void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0,
unsigned int len1, unsigned int offset1,
uint64_t ddp_flags, uint64_t flag_mask, int modulate);
#endif /* T3_DDP_H */
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
deleted file mode 100644
index 2eca099..0000000
--- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_compat.h"
-#include "opt_inet.h"
-#include "opt_inet6.h"
-#include "opt_ipsec.h"
-#include "opt_mac.h"
-#include "opt_tcpdebug.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/callout.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#ifdef INET6
-#include <sys/domain.h>
-#endif
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/protosw.h>
-#include <sys/random.h>
-
-#include <vm/uma.h>
-
-#include <net/route.h>
-#include <net/if.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/ip.h>
-#ifdef INET6
-#include <netinet/ip6.h>
-#endif
-#include <netinet/in_pcb.h>
-#ifdef INET6
-#include <netinet6/in6_pcb.h>
-#endif
-#include <netinet/in_var.h>
-#include <netinet/ip_var.h>
-#ifdef INET6
-#include <netinet6/ip6_var.h>
-#include <netinet6/scope6_var.h>
-#include <netinet6/nd6.h>
-#endif
-#include <netinet/ip_icmp.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
-#ifdef INET6
-#include <netinet6/tcp6_var.h>
-#endif
-#include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif
-#include <netinet6/ip6protosw.h>
-
-#ifdef IPSEC
-#include <netipsec/ipsec.h>
-#include <netipsec/xform.h>
-#ifdef INET6
-#include <netipsec/ipsec6.h>
-#endif
-#include <netipsec/key.h>
-#endif /*IPSEC*/
-
-#include <machine/in_cksum.h>
-#include <sys/md5.h>
-
-#include <security/mac/mac_framework.h>
-
-#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
-
-
-SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "chelsio TOE");
-
-static int tcp_log_debug = 0;
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW,
- &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
-
-static int tcp_tcbhashsize = 0;
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
- &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
-
-static int do_tcpdrain = 1;
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW,
- &do_tcpdrain, 0,
- "Enable tcp_drain routine for extra help when low on mbufs");
-
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD,
- &tcbinfo.ipi_count, 0, "Number of active PCBs");
-
-static int icmp_may_rst = 1;
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
- &icmp_may_rst, 0,
- "Certain ICMP unreachable messages may abort connections in SYN_SENT");
-
-static int tcp_isn_reseed_interval = 0;
-SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
- &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
-
-/*
- * TCP bandwidth limiting sysctls. Note that the default lower bound of
- * 1024 exists only for debugging. A good production default would be
- * something like 6100.
- */
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
- "TCP inflight data limiting");
-
-static int tcp_inflight_enable = 1;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
- &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
-
-static int tcp_inflight_debug = 0;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
- &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
-
-static int tcp_inflight_rttthresh;
-SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
- &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
- "RTT threshold below which inflight will deactivate itself");
-
-static int tcp_inflight_min = 6144;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
- &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
-
-static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
- &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
-
-static int tcp_inflight_stab = 20;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
- &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-
-uma_zone_t sack_hole_zone;
-
-static struct inpcb *tcp_notify(struct inpcb *, int);
-static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno);
-
-/*
- * Target size of TCP PCB hash tables. Must be a power of two.
- *
- * Note that this can be overridden by the kernel environment
- * variable net.inet.tcp.tcbhashsize
- */
-#ifndef TCBHASHSIZE
-#define TCBHASHSIZE 512
-#endif
-
-/*
- * XXX
- * Callouts should be moved into struct tcp directly. They are currently
- * separate because the tcpcb structure is exported to userland for sysctl
- * parsing purposes, which do not know about callouts.
- */
-struct tcpcb_mem {
- struct tcpcb tcb;
- struct tcp_timer tt;
-};
-
-MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
-
-/*
- * Drop a TCP connection, reporting
- * the specified error. If connection is synchronized,
- * then send a RST to peer.
- */
-struct tcpcb *
-cxgb_tcp_drop(struct tcpcb *tp, int errno)
-{
- struct socket *so = tp->t_inpcb->inp_socket;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(tp->t_inpcb);
-
- if (TCPS_HAVERCVDSYN(tp->t_state)) {
- tp->t_state = TCPS_CLOSED;
- (void) tcp_gen_reset(tp);
- tcpstat.tcps_drops++;
- } else
- tcpstat.tcps_conndrops++;
- if (errno == ETIMEDOUT && tp->t_softerror)
- errno = tp->t_softerror;
- so->so_error = errno;
- return (cxgb_tcp_close(tp));
-}
-
-/*
- * Attempt to close a TCP control block, marking it as dropped, and freeing
- * the socket if we hold the only reference.
- */
-struct tcpcb *
-cxgb_tcp_close(struct tcpcb *tp)
-{
- struct inpcb *inp = tp->t_inpcb;
- struct socket *so;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- if (tp->t_state == TCPS_LISTEN)
- tcp_gen_listen_close(tp);
- in_pcbdrop(inp);
- tcpstat.tcps_closed++;
- KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
- so = inp->inp_socket;
- soisdisconnected(so);
- if (inp->inp_vflag & INP_SOCKREF) {
- KASSERT(so->so_state & SS_PROTOREF,
- ("tcp_close: !SS_PROTOREF"));
- inp->inp_vflag &= ~INP_SOCKREF;
- INP_UNLOCK(inp);
- ACCEPT_LOCK();
- SOCK_LOCK(so);
- so->so_state &= ~SS_PROTOREF;
- sofree(so);
- return (NULL);
- }
- return (tp);
-}
-
-/*
- * Notify a tcp user of an asynchronous error;
- * store error as soft error, but wake up user
- * (for now, won't do anything until can select for soft error).
- *
- * Do not wake up user since there currently is no mechanism for
- * reporting soft errors (yet - a kqueue filter may be added).
- */
-static struct inpcb *
-tcp_notify(struct inpcb *inp, int error)
-{
- struct tcpcb *tp;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- if ((inp->inp_vflag & INP_TIMEWAIT) ||
- (inp->inp_vflag & INP_DROPPED))
- return (inp);
-
- tp = intotcpcb(inp);
- KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
-
- /*
- * Ignore some errors if we are hooked up.
- * If connection hasn't completed, has retransmitted several times,
- * and receives a second error, give up now. This is better
- * than waiting a long time to establish a connection that
- * can never complete.
- */
- if (tp->t_state == TCPS_ESTABLISHED &&
- (error == EHOSTUNREACH || error == ENETUNREACH ||
- error == EHOSTDOWN)) {
- return (inp);
- } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
- tp->t_softerror) {
- tp = cxgb_tcp_drop(tp, error);
- if (tp != NULL)
- return (inp);
- else
- return (NULL);
- } else {
- tp->t_softerror = error;
- return (inp);
- }
-#if 0
- wakeup( &so->so_timeo);
- sorwakeup(so);
- sowwakeup(so);
-#endif
-}
-
-void
-cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
-{
- struct ip *ip = vip;
- struct tcphdr *th;
- struct in_addr faddr;
- struct inpcb *inp;
- struct tcpcb *tp;
- struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
- struct icmp *icp;
- struct in_conninfo inc;
- tcp_seq icmp_tcp_seq;
- int mtu;
-
- faddr = ((struct sockaddr_in *)sa)->sin_addr;
- if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
- return;
-
- if (cmd == PRC_MSGSIZE)
- notify = tcp_mtudisc;
- else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
- cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
- notify = cxgb_tcp_drop_syn_sent;
- /*
- * Redirects don't need to be handled up here.
- */
- else if (PRC_IS_REDIRECT(cmd))
- return;
- /*
- * Source quench is depreciated.
- */
- else if (cmd == PRC_QUENCH)
- return;
- /*
- * Hostdead is ugly because it goes linearly through all PCBs.
- * XXX: We never get this from ICMP, otherwise it makes an
- * excellent DoS attack on machines with many connections.
- */
- else if (cmd == PRC_HOSTDEAD)
- ip = NULL;
- else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
- return;
- if (ip != NULL) {
- icp = (struct icmp *)((caddr_t)ip
- - offsetof(struct icmp, icmp_ip));
- th = (struct tcphdr *)((caddr_t)ip
- + (ip->ip_hl << 2));
- INP_INFO_WLOCK(&tcbinfo);
- inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
- ip->ip_src, th->th_sport, 0, NULL);
- if (inp != NULL) {
- INP_LOCK(inp);
- if (!(inp->inp_vflag & INP_TIMEWAIT) &&
- !(inp->inp_vflag & INP_DROPPED) &&
- !(inp->inp_socket == NULL)) {
- icmp_tcp_seq = htonl(th->th_seq);
- tp = intotcpcb(inp);
- if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
- SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
- if (cmd == PRC_MSGSIZE) {
- /*
- * MTU discovery:
- * If we got a needfrag set the MTU
- * in the route to the suggested new
- * value (if given) and then notify.
- */
- bzero(&inc, sizeof(inc));
- inc.inc_flags = 0; /* IPv4 */
- inc.inc_faddr = faddr;
-
- mtu = ntohs(icp->icmp_nextmtu);
- /*
- * If no alternative MTU was
- * proposed, try the next smaller
- * one. ip->ip_len has already
- * been swapped in icmp_input().
- */
- if (!mtu)
- mtu = ip_next_mtu(ip->ip_len,
- 1);
- if (mtu < max(296, (tcp_minmss)
- + sizeof(struct tcpiphdr)))
- mtu = 0;
- if (!mtu)
- mtu = tcp_mssdflt
- + sizeof(struct tcpiphdr);
- /*
- * Only cache the the MTU if it
- * is smaller than the interface
- * or route MTU. tcp_mtudisc()
- * will do right thing by itself.
- */
- if (mtu <= tcp_maxmtu(&inc, NULL))
- tcp_hc_updatemtu(&inc, mtu);
- }
-
- inp = (*notify)(inp, inetctlerrmap[cmd]);
- }
- }
- if (inp != NULL)
- INP_UNLOCK(inp);
- } else {
- inc.inc_fport = th->th_dport;
- inc.inc_lport = th->th_sport;
- inc.inc_faddr = faddr;
- inc.inc_laddr = ip->ip_src;
-#ifdef INET6
- inc.inc_isipv6 = 0;
-#endif
- syncache_unreach(&inc, th);
- }
- INP_INFO_WUNLOCK(&tcbinfo);
- } else
- in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
-}
-
-#ifdef INET6
-void
-tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
-{
- struct tcphdr th;
- struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
- struct ip6_hdr *ip6;
- struct mbuf *m;
- struct ip6ctlparam *ip6cp = NULL;
- const struct sockaddr_in6 *sa6_src = NULL;
- int off;
- struct tcp_portonly {
- u_int16_t th_sport;
- u_int16_t th_dport;
- } *thp;
-
- if (sa->sa_family != AF_INET6 ||
- sa->sa_len != sizeof(struct sockaddr_in6))
- return;
-
- if (cmd == PRC_MSGSIZE)
- notify = tcp_mtudisc;
- else if (!PRC_IS_REDIRECT(cmd) &&
- ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
- return;
- /* Source quench is depreciated. */
- else if (cmd == PRC_QUENCH)
- return;
-
- /* if the parameter is from icmp6, decode it. */
- if (d != NULL) {
- ip6cp = (struct ip6ctlparam *)d;
- m = ip6cp->ip6c_m;
- ip6 = ip6cp->ip6c_ip6;
- off = ip6cp->ip6c_off;
- sa6_src = ip6cp->ip6c_src;
- } else {
- m = NULL;
- ip6 = NULL;
- off = 0; /* fool gcc */
- sa6_src = &sa6_any;
- }
-
- if (ip6 != NULL) {
- struct in_conninfo inc;
- /*
- * XXX: We assume that when IPV6 is non NULL,
- * M and OFF are valid.
- */
-
- /* check if we can safely examine src and dst ports */
- if (m->m_pkthdr.len < off + sizeof(*thp))
- return;
-
- bzero(&th, sizeof(th));
- m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
-
- in6_pcbnotify(&tcbinfo, sa, th.th_dport,
- (struct sockaddr *)ip6cp->ip6c_src,
- th.th_sport, cmd, NULL, notify);
-
- inc.inc_fport = th.th_dport;
- inc.inc_lport = th.th_sport;
- inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
- inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
- inc.inc_isipv6 = 1;
- INP_INFO_WLOCK(&tcbinfo);
- syncache_unreach(&inc, &th);
- INP_INFO_WUNLOCK(&tcbinfo);
- } else
- in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
- 0, cmd, NULL, notify);
-}
-#endif /* INET6 */
-
-
-/*
- * Following is where TCP initial sequence number generation occurs.
- *
- * There are two places where we must use initial sequence numbers:
- * 1. In SYN-ACK packets.
- * 2. In SYN packets.
- *
- * All ISNs for SYN-ACK packets are generated by the syncache. See
- * tcp_syncache.c for details.
- *
- * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
- * depends on this property. In addition, these ISNs should be
- * unguessable so as to prevent connection hijacking. To satisfy
- * the requirements of this situation, the algorithm outlined in
- * RFC 1948 is used, with only small modifications.
- *
- * Implementation details:
- *
- * Time is based off the system timer, and is corrected so that it
- * increases by one megabyte per second. This allows for proper
- * recycling on high speed LANs while still leaving over an hour
- * before rollover.
- *
- * As reading the *exact* system time is too expensive to be done
- * whenever setting up a TCP connection, we increment the time
- * offset in two ways. First, a small random positive increment
- * is added to isn_offset for each connection that is set up.
- * Second, the function tcp_isn_tick fires once per clock tick
- * and increments isn_offset as necessary so that sequence numbers
- * are incremented at approximately ISN_BYTES_PER_SECOND. The
- * random positive increments serve only to ensure that the same
- * exact sequence number is never sent out twice (as could otherwise
- * happen when a port is recycled in less than the system tick
- * interval.)
- *
- * net.inet.tcp.isn_reseed_interval controls the number of seconds
- * between seeding of isn_secret. This is normally set to zero,
- * as reseeding should not be necessary.
- *
- * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
- * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
- * general, this means holding an exclusive (write) lock.
- */
-
-#define ISN_BYTES_PER_SECOND 1048576
-#define ISN_STATIC_INCREMENT 4096
-#define ISN_RANDOM_INCREMENT (4096 - 1)
-
-
-/*
- * When a specific ICMP unreachable message is received and the
- * connection state is SYN-SENT, drop the connection. This behavior
- * is controlled by the icmp_may_rst sysctl.
- */
-static struct inpcb *
-cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno)
-{
- struct tcpcb *tp;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- if ((inp->inp_vflag & INP_TIMEWAIT) ||
- (inp->inp_vflag & INP_DROPPED))
- return (inp);
-
- tp = intotcpcb(inp);
- if (tp->t_state != TCPS_SYN_SENT)
- return (inp);
-
- tp = cxgb_tcp_drop(tp, errno);
- if (tp != NULL)
- return (inp);
- else
- return (NULL);
-}
-
-static int
-cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS)
-{
- /* addrs[0] is a foreign socket, addrs[1] is a local one. */
- struct sockaddr_storage addrs[2];
- struct inpcb *inp;
- struct tcpcb *tp;
- struct tcptw *tw;
- struct sockaddr_in *fin, *lin;
-#ifdef INET6
- struct sockaddr_in6 *fin6, *lin6;
- struct in6_addr f6, l6;
-#endif
- int error;
-
- inp = NULL;
- fin = lin = NULL;
-#ifdef INET6
- fin6 = lin6 = NULL;
-#endif
- error = 0;
-
- if (req->oldptr != NULL || req->oldlen != 0)
- return (EINVAL);
- if (req->newptr == NULL)
- return (EPERM);
- if (req->newlen < sizeof(addrs))
- return (ENOMEM);
- error = SYSCTL_IN(req, &addrs, sizeof(addrs));
- if (error)
- return (error);
-
- switch (addrs[0].ss_family) {
-#ifdef INET6
- case AF_INET6:
- fin6 = (struct sockaddr_in6 *)&addrs[0];
- lin6 = (struct sockaddr_in6 *)&addrs[1];
- if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
- lin6->sin6_len != sizeof(struct sockaddr_in6))
- return (EINVAL);
- if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
- if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
- return (EINVAL);
- in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
- in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
- fin = (struct sockaddr_in *)&addrs[0];
- lin = (struct sockaddr_in *)&addrs[1];
- break;
- }
- error = sa6_embedscope(fin6, ip6_use_defzone);
- if (error)
- return (error);
- error = sa6_embedscope(lin6, ip6_use_defzone);
- if (error)
- return (error);
- break;
-#endif
- case AF_INET:
- fin = (struct sockaddr_in *)&addrs[0];
- lin = (struct sockaddr_in *)&addrs[1];
- if (fin->sin_len != sizeof(struct sockaddr_in) ||
- lin->sin_len != sizeof(struct sockaddr_in))
- return (EINVAL);
- break;
- default:
- return (EINVAL);
- }
- INP_INFO_WLOCK(&tcbinfo);
- switch (addrs[0].ss_family) {
-#ifdef INET6
- case AF_INET6:
- inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port,
- &l6, lin6->sin6_port, 0, NULL);
- break;
-#endif
- case AF_INET:
- inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port,
- lin->sin_addr, lin->sin_port, 0, NULL);
- break;
- }
- if (inp != NULL) {
- INP_LOCK(inp);
- if (inp->inp_vflag & INP_TIMEWAIT) {
- /*
- * XXXRW: There currently exists a state where an
- * inpcb is present, but its timewait state has been
- * discarded. For now, don't allow dropping of this
- * type of inpcb.
- */
- tw = intotw(inp);
- if (tw != NULL)
- tcp_twclose(tw, 0);
- else
- INP_UNLOCK(inp);
- } else if (!(inp->inp_vflag & INP_DROPPED) &&
- !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
- tp = intotcpcb(inp);
- tp = cxgb_tcp_drop(tp, ECONNABORTED);
- if (tp != NULL)
- INP_UNLOCK(inp);
- } else
- INP_UNLOCK(inp);
- } else
- error = ESRCH;
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-
-SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop,
- CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
- 0, cxgb_sysctl_drop, "", "Drop TCP connection");
-
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
deleted file mode 100644
index bd940b2..0000000
--- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
+++ /dev/null
@@ -1,1362 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1988, 1993
- * The Regents of the University of California.
- * Copyright (c) 2006-2007 Robert N. M. Watson
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include "opt_ddb.h"
-#include "opt_inet.h"
-#include "opt_inet6.h"
-#include "opt_tcpdebug.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-#include <sys/mbuf.h>
-#ifdef INET6
-#include <sys/domain.h>
-#endif /* INET6 */
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/protosw.h>
-#include <sys/proc.h>
-#include <sys/jail.h>
-
-#ifdef DDB
-#include <ddb/ddb.h>
-#endif
-
-#include <net/if.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#ifdef INET6
-#include <netinet/ip6.h>
-#endif
-#include <netinet/in_pcb.h>
-#ifdef INET6
-#include <netinet6/in6_pcb.h>
-#endif
-#include <netinet/in_var.h>
-#include <netinet/ip_var.h>
-#ifdef INET6
-#include <netinet6/ip6_var.h>
-#include <netinet6/scope6_var.h>
-#endif
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif
-#include <netinet/tcp_offload.h>
-#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
-
-
-/*
- * TCP protocol interface to socket abstraction.
- */
-static int tcp_attach(struct socket *);
-static int tcp_connect(struct tcpcb *, struct sockaddr *,
- struct thread *td);
-#ifdef INET6
-static int tcp6_connect(struct tcpcb *, struct sockaddr *,
- struct thread *td);
-#endif /* INET6 */
-static void tcp_disconnect(struct tcpcb *);
-static void tcp_usrclosed(struct tcpcb *);
-
-#ifdef TCPDEBUG
-#define TCPDEBUG0 int ostate = 0
-#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
-#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
- tcp_trace(TA_USER, ostate, tp, 0, 0, req)
-#else
-#define TCPDEBUG0
-#define TCPDEBUG1()
-#define TCPDEBUG2(req)
-#endif
-
-/*
- * TCP attaches to socket via pru_attach(), reserving space,
- * and an internet control block.
- */
-static int
-tcp_usr_attach(struct socket *so, int proto, struct thread *td)
-{
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- int error;
- TCPDEBUG0;
-
- inp = sotoinpcb(so);
- KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
- TCPDEBUG1();
-
- error = tcp_attach(so);
- if (error)
- goto out;
-
- if ((so->so_options & SO_LINGER) && so->so_linger == 0)
- so->so_linger = TCP_LINGERTIME;
-
- inp = sotoinpcb(so);
- tp = intotcpcb(inp);
-out:
- TCPDEBUG2(PRU_ATTACH);
- return error;
-}
-
-/*
- * tcp_detach is called when the socket layer loses its final reference
- * to the socket, be it a file descriptor reference, a reference from TCP,
- * etc. At this point, there is only one case in which we will keep around
- * inpcb state: time wait.
- *
- * This function can probably be re-absorbed back into tcp_usr_detach() now
- * that there is a single detach path.
- */
-static void
-tcp_detach(struct socket *so, struct inpcb *inp)
-{
- struct tcpcb *tp;
-#ifdef INET6
- int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
-#endif
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
- KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
-
- tp = intotcpcb(inp);
-
- if (inp->inp_vflag & INP_TIMEWAIT) {
- /*
- * There are two cases to handle: one in which the time wait
- * state is being discarded (INP_DROPPED), and one in which
- * this connection will remain in timewait. In the former,
- * it is time to discard all state (except tcptw, which has
- * already been discarded by the timewait close code, which
- * should be further up the call stack somewhere). In the
- * latter case, we detach from the socket, but leave the pcb
- * present until timewait ends.
- *
- * XXXRW: Would it be cleaner to free the tcptw here?
- */
- if (inp->inp_vflag & INP_DROPPED) {
- KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
- "INP_DROPPED && tp != NULL"));
-#ifdef INET6
- if (isipv6) {
- in6_pcbdetach(inp);
- in6_pcbfree(inp);
- } else {
-#endif
- in_pcbdetach(inp);
- in_pcbfree(inp);
-#ifdef INET6
- }
-#endif
- } else {
-#ifdef INET6
- if (isipv6)
- in6_pcbdetach(inp);
- else
-#endif
- in_pcbdetach(inp);
- INP_UNLOCK(inp);
- }
- } else {
- /*
- * If the connection is not in timewait, we consider two
- * two conditions: one in which no further processing is
- * necessary (dropped || embryonic), and one in which TCP is
- * not yet done, but no longer requires the socket, so the
- * pcb will persist for the time being.
- *
- * XXXRW: Does the second case still occur?
- */
- if (inp->inp_vflag & INP_DROPPED ||
- tp->t_state < TCPS_SYN_SENT) {
- tcp_discardcb(tp);
-#ifdef INET6
- if (isipv6) {
- in6_pcbdetach(inp);
- in6_pcbfree(inp);
- } else {
-#endif
- in_pcbdetach(inp);
- in_pcbfree(inp);
-#ifdef INET6
- }
-#endif
- } else {
-#ifdef INET6
- if (isipv6)
- in6_pcbdetach(inp);
- else
-#endif
- in_pcbdetach(inp);
- }
- }
-}
-
-/*
- * pru_detach() detaches the TCP protocol from the socket.
- * If the protocol state is non-embryonic, then can't
- * do this directly: have to initiate a pru_disconnect(),
- * which may finish later; embryonic TCB's can just
- * be discarded here.
- */
-static void
-tcp_usr_detach(struct socket *so)
-{
- struct inpcb *inp;
-
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
- INP_INFO_WLOCK(&tcbinfo);
- INP_LOCK(inp);
- KASSERT(inp->inp_socket != NULL,
- ("tcp_usr_detach: inp_socket == NULL"));
- tcp_detach(so, inp);
- INP_INFO_WUNLOCK(&tcbinfo);
-}
-
-/*
- * Give the socket an address.
- */
-static int
-tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- struct sockaddr_in *sinp;
-
- sinp = (struct sockaddr_in *)nam;
- if (nam->sa_len != sizeof (*sinp))
- return (EINVAL);
- /*
- * Must check for multicast addresses and disallow binding
- * to them.
- */
- if (sinp->sin_family == AF_INET &&
- IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
- return (EAFNOSUPPORT);
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- error = in_pcbbind(inp, nam, td->td_ucred);
-out:
- TCPDEBUG2(PRU_BIND);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
-
- return (error);
-}
-
-#ifdef INET6
-static int
-tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- struct sockaddr_in6 *sin6p;
-
- sin6p = (struct sockaddr_in6 *)nam;
- if (nam->sa_len != sizeof (*sin6p))
- return (EINVAL);
- /*
- * Must check for multicast addresses and disallow binding
- * to them.
- */
- if (sin6p->sin6_family == AF_INET6 &&
- IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
- return (EAFNOSUPPORT);
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- inp->inp_vflag &= ~INP_IPV4;
- inp->inp_vflag |= INP_IPV6;
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
- inp->inp_vflag |= INP_IPV4;
- else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
- struct sockaddr_in sin;
-
- in6_sin6_2_sin(&sin, sin6p);
- inp->inp_vflag |= INP_IPV4;
- inp->inp_vflag &= ~INP_IPV6;
- error = in_pcbbind(inp, (struct sockaddr *)&sin,
- td->td_ucred);
- goto out;
- }
- }
- error = in6_pcbbind(inp, nam, td->td_ucred);
-out:
- TCPDEBUG2(PRU_BIND);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-#endif /* INET6 */
-
-/*
- * Prepare to accept connections.
- */
-static int
-tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- SOCK_LOCK(so);
- error = solisten_proto_check(so);
- if (error == 0 && inp->inp_lport == 0)
- error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
- if (error == 0) {
- tp->t_state = TCPS_LISTEN;
- solisten_proto(so, backlog);
- tcp_gen_listen_open(tp);
- }
- SOCK_UNLOCK(so);
-
-out:
- TCPDEBUG2(PRU_LISTEN);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-
-#ifdef INET6
-static int
-tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- SOCK_LOCK(so);
- error = solisten_proto_check(so);
- if (error == 0 && inp->inp_lport == 0) {
- inp->inp_vflag &= ~INP_IPV4;
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
- inp->inp_vflag |= INP_IPV4;
- error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
- }
- if (error == 0) {
- tp->t_state = TCPS_LISTEN;
- solisten_proto(so, backlog);
- }
- SOCK_UNLOCK(so);
-
-out:
- TCPDEBUG2(PRU_LISTEN);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-#endif /* INET6 */
-
-/*
- * Initiate connection to peer.
- * Create a template for use in transmissions on this connection.
- * Enter SYN_SENT state, and mark socket as connecting.
- * Start keep-alive timer, and seed output sequence space.
- * Send initial segment on connection.
- */
-static int
-tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- struct sockaddr_in *sinp;
-
- sinp = (struct sockaddr_in *)nam;
- if (nam->sa_len != sizeof (*sinp))
- return (EINVAL);
- /*
- * Must disallow TCP ``connections'' to multicast addresses.
- */
- if (sinp->sin_family == AF_INET
- && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
- return (EAFNOSUPPORT);
- if (jailed(td->td_ucred))
- prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- if ((error = tcp_connect(tp, nam, td)) != 0)
- goto out;
- printf("calling tcp_gen_connect\n");
-
- error = tcp_gen_connect(so, nam);
-out:
- TCPDEBUG2(PRU_CONNECT);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-
-#ifdef INET6
-static int
-tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- struct sockaddr_in6 *sin6p;
-
- TCPDEBUG0;
-
- sin6p = (struct sockaddr_in6 *)nam;
- if (nam->sa_len != sizeof (*sin6p))
- return (EINVAL);
- /*
- * Must disallow TCP ``connections'' to multicast addresses.
- */
- if (sin6p->sin6_family == AF_INET6
- && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
- return (EAFNOSUPPORT);
-
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
- struct sockaddr_in sin;
-
- if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
- error = EINVAL;
- goto out;
- }
-
- in6_sin6_2_sin(&sin, sin6p);
- inp->inp_vflag |= INP_IPV4;
- inp->inp_vflag &= ~INP_IPV6;
- if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
- goto out;
- error = tcp_gen_connect(so, nam);
- goto out;
- }
- inp->inp_vflag &= ~INP_IPV4;
- inp->inp_vflag |= INP_IPV6;
- inp->inp_inc.inc_isipv6 = 1;
- if ((error = tcp6_connect(tp, nam, td)) != 0)
- goto out;
- error = tcp_gen_connect(so, nam);
-
-out:
- TCPDEBUG2(PRU_CONNECT);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-#endif /* INET6 */
-
-/*
- * Initiate disconnect from peer.
- * If connection never passed embryonic stage, just drop;
- * else if don't need to let data drain, then can just drop anyways,
- * else have to begin TCP shutdown process: mark socket disconnecting,
- * drain unread data, state switch to reflect user close, and
- * send segment (e.g. FIN) to peer. Socket will be really disconnected
- * when peer sends FIN and acks ours.
- *
- * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
- */
-static int
-tcp_usr_disconnect(struct socket *so)
-{
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- int error = 0;
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- tcp_disconnect(tp);
-out:
- TCPDEBUG2(PRU_DISCONNECT);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-
-/*
- * Accept a connection. Essentially all the work is
- * done at higher levels; just return the address
- * of the peer, storing through addr.
- */
-static int
-tcp_usr_accept(struct socket *so, struct sockaddr **nam)
-{
- int error = 0;
- struct inpcb *inp = NULL;
- struct tcpcb *tp = NULL;
- struct in_addr addr;
- in_port_t port = 0;
- TCPDEBUG0;
-
- if (so->so_state & SS_ISDISCONNECTED)
- return (ECONNABORTED);
-
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNABORTED;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
-
- /*
- * We inline in_getpeeraddr and COMMON_END here, so that we can
- * copy the data of interest and defer the malloc until after we
- * release the lock.
- */
- port = inp->inp_fport;
- addr = inp->inp_faddr;
-
-out:
- TCPDEBUG2(PRU_ACCEPT);
- INP_UNLOCK(inp);
- if (error == 0)
- *nam = in_sockaddr(port, &addr);
- return error;
-}
-
-#ifdef INET6
-static int
-tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
-{
- struct inpcb *inp = NULL;
- int error = 0;
- struct tcpcb *tp = NULL;
- struct in_addr addr;
- struct in6_addr addr6;
- in_port_t port = 0;
- int v4 = 0;
- TCPDEBUG0;
-
- if (so->so_state & SS_ISDISCONNECTED)
- return (ECONNABORTED);
-
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNABORTED;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
-
- /*
- * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
- * copy the data of interest and defer the malloc until after we
- * release the lock.
- */
- if (inp->inp_vflag & INP_IPV4) {
- v4 = 1;
- port = inp->inp_fport;
- addr = inp->inp_faddr;
- } else {
- port = inp->inp_fport;
- addr6 = inp->in6p_faddr;
- }
-
-out:
- TCPDEBUG2(PRU_ACCEPT);
- INP_UNLOCK(inp);
- if (error == 0) {
- if (v4)
- *nam = in6_v4mapsin6_sockaddr(port, &addr);
- else
- *nam = in6_sockaddr(port, &addr6);
- }
- return error;
-}
-#endif /* INET6 */
-
-/*
- * Mark the connection as being incapable of further output.
- */
-static int
-tcp_usr_shutdown(struct socket *so)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
-
- TCPDEBUG0;
- INP_INFO_WLOCK(&tcbinfo);
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- socantsendmore(so);
- tcp_usrclosed(tp);
- error = tcp_gen_disconnect(tp);
-
-out:
- TCPDEBUG2(PRU_SHUTDOWN);
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
-
- return (error);
-}
-
-/*
- * After a receive, possibly send window update to peer.
- */
-static int
-tcp_usr_rcvd(struct socket *so, int flags)
-{
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- int error = 0;
-
- TCPDEBUG0;
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- tcp_gen_rcvd(tp);
-
-out:
- TCPDEBUG2(PRU_RCVD);
- INP_UNLOCK(inp);
- return (error);
-}
-
-/*
- * Do a send by putting data in output queue and updating urgent
- * marker if URG set. Possibly send more data. Unlike the other
- * pru_*() routines, the mbuf chains are our responsibility. We
- * must either enqueue them or free them. The other pru_* routines
- * generally are caller-frees.
- */
-static int
-tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
- struct sockaddr *nam, struct mbuf *control, struct thread *td)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- int headlocked = 0;
-#ifdef INET6
- int isipv6;
-#endif
- TCPDEBUG0;
-
- /*
- * We require the pcbinfo lock in two cases:
- *
- * (1) An implied connect is taking place, which can result in
- * binding IPs and ports and hence modification of the pcb hash
- * chains.
- *
- * (2) PRUS_EOF is set, resulting in explicit close on the send.
- */
- if ((nam != NULL) || (flags & PRUS_EOF)) {
- INP_INFO_WLOCK(&tcbinfo);
- headlocked = 1;
- }
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- if (control)
- m_freem(control);
- if (m)
- m_freem(m);
- error = ECONNRESET;
- goto out;
- }
-#ifdef INET6
- isipv6 = nam && nam->sa_family == AF_INET6;
-#endif /* INET6 */
- tp = intotcpcb(inp);
- TCPDEBUG1();
- if (control) {
- /* TCP doesn't do control messages (rights, creds, etc) */
- if (control->m_len) {
- m_freem(control);
- if (m)
- m_freem(m);
- error = EINVAL;
- goto out;
- }
- m_freem(control); /* empty control, just free it */
- }
- if (!(flags & PRUS_OOB)) {
- sbappendstream(&so->so_snd, m);
- if (nam && tp->t_state < TCPS_SYN_SENT) {
- /*
- * Do implied connect if not yet connected,
- * initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
- */
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-#ifdef INET6
- if (isipv6)
- error = tcp6_connect(tp, nam, td);
- else
-#endif /* INET6 */
- error = tcp_connect(tp, nam, td);
- if (error)
- goto out;
- tp->snd_wnd = TTCP_CLIENT_SND_WND;
- tcp_mss(tp, -1);
- }
- if (flags & PRUS_EOF) {
- /*
- * Close the send side of the connection after
- * the data is sent.
- */
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- socantsendmore(so);
- tcp_usrclosed(tp);
- }
- if (headlocked) {
- INP_INFO_WUNLOCK(&tcbinfo);
- headlocked = 0;
- }
- if (tp != NULL) {
- if (flags & PRUS_MORETOCOME)
- tp->t_flags |= TF_MORETOCOME;
- error = tcp_gen_send(tp);
- if (flags & PRUS_MORETOCOME)
- tp->t_flags &= ~TF_MORETOCOME;
- }
- } else {
- /*
- * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
- */
- SOCKBUF_LOCK(&so->so_snd);
- if (sbspace(&so->so_snd) < -512) {
- SOCKBUF_UNLOCK(&so->so_snd);
- m_freem(m);
- error = ENOBUFS;
- goto out;
- }
- /*
- * According to RFC961 (Assigned Protocols),
- * the urgent pointer points to the last octet
- * of urgent data. We continue, however,
- * to consider it to indicate the first octet
- * of data past the urgent section.
- * Otherwise, snd_up should be one lower.
- */
- sbappendstream_locked(&so->so_snd, m);
- SOCKBUF_UNLOCK(&so->so_snd);
- if (nam && tp->t_state < TCPS_SYN_SENT) {
- /*
- * Do implied connect if not yet connected,
- * initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
- */
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
-#ifdef INET6
- if (isipv6)
- error = tcp6_connect(tp, nam, td);
- else
-#endif /* INET6 */
- error = tcp_connect(tp, nam, td);
- if (error)
- goto out;
- tp->snd_wnd = TTCP_CLIENT_SND_WND;
- tcp_mss(tp, -1);
- INP_INFO_WUNLOCK(&tcbinfo);
- headlocked = 0;
- } else if (nam) {
- INP_INFO_WUNLOCK(&tcbinfo);
- headlocked = 0;
- }
- tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
- tp->t_flags |= TF_FORCEDATA;
- error = tcp_gen_send(tp);
- tp->t_flags &= ~TF_FORCEDATA;
- }
-out:
- TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
- ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
- INP_UNLOCK(inp);
- if (headlocked)
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
-}
-
-/*
- * Abort the TCP. Drop the connection abruptly.
- */
-static void
-tcp_usr_abort(struct socket *so)
-{
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- TCPDEBUG0;
-
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
-
- INP_INFO_WLOCK(&tcbinfo);
- INP_LOCK(inp);
- KASSERT(inp->inp_socket != NULL,
- ("tcp_usr_abort: inp_socket == NULL"));
-
- /*
- * If we still have full TCP state, and we're not dropped, drop.
- */
- if (!(inp->inp_vflag & INP_TIMEWAIT) &&
- !(inp->inp_vflag & INP_DROPPED)) {
- tp = intotcpcb(inp);
- TCPDEBUG1();
- cxgb_tcp_drop(tp, ECONNABORTED);
- TCPDEBUG2(PRU_ABORT);
- }
- if (!(inp->inp_vflag & INP_DROPPED)) {
- SOCK_LOCK(so);
- so->so_state |= SS_PROTOREF;
- SOCK_UNLOCK(so);
- inp->inp_vflag |= INP_SOCKREF;
- }
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
-}
-
-/*
- * TCP socket is closed. Start friendly disconnect.
- */
-static void
-tcp_usr_close(struct socket *so)
-{
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
- TCPDEBUG0;
-
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
-
- INP_INFO_WLOCK(&tcbinfo);
- INP_LOCK(inp);
- KASSERT(inp->inp_socket != NULL,
- ("tcp_usr_close: inp_socket == NULL"));
-
- /*
- * If we still have full TCP state, and we're not dropped, initiate
- * a disconnect.
- */
- if (!(inp->inp_vflag & INP_TIMEWAIT) &&
- !(inp->inp_vflag & INP_DROPPED)) {
- tp = intotcpcb(inp);
- TCPDEBUG1();
- tcp_disconnect(tp);
- TCPDEBUG2(PRU_CLOSE);
- }
- if (!(inp->inp_vflag & INP_DROPPED)) {
- SOCK_LOCK(so);
- so->so_state |= SS_PROTOREF;
- SOCK_UNLOCK(so);
- inp->inp_vflag |= INP_SOCKREF;
- }
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
-}
-
-/*
- * Receive out-of-band data.
- */
-static int
-tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
-{
- int error = 0;
- struct inpcb *inp;
- struct tcpcb *tp = NULL;
-
- TCPDEBUG0;
- inp = sotoinpcb(so);
- KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
- INP_LOCK(inp);
- if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
- error = ECONNRESET;
- goto out;
- }
- tp = intotcpcb(inp);
- TCPDEBUG1();
- if ((so->so_oobmark == 0 &&
- (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
- so->so_options & SO_OOBINLINE ||
- tp->t_oobflags & TCPOOB_HADDATA) {
- error = EINVAL;
- goto out;
- }
- if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
- error = EWOULDBLOCK;
- goto out;
- }
- m->m_len = 1;
- *mtod(m, caddr_t) = tp->t_iobc;
- if ((flags & MSG_PEEK) == 0)
- tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
-
-out:
- TCPDEBUG2(PRU_RCVOOB);
- INP_UNLOCK(inp);
- return (error);
-}
-
-struct pr_usrreqs cxgb_tcp_usrreqs = {
- .pru_abort = tcp_usr_abort,
- .pru_accept = tcp_usr_accept,
- .pru_attach = tcp_usr_attach,
- .pru_bind = tcp_usr_bind,
- .pru_connect = tcp_usr_connect,
- .pru_control = in_control,
- .pru_detach = tcp_usr_detach,
- .pru_disconnect = tcp_usr_disconnect,
- .pru_listen = tcp_usr_listen,
- .pru_peeraddr = in_getpeeraddr,
- .pru_rcvd = tcp_usr_rcvd,
- .pru_rcvoob = tcp_usr_rcvoob,
- .pru_send = tcp_usr_send,
- .pru_shutdown = tcp_usr_shutdown,
- .pru_sockaddr = in_getsockaddr,
- .pru_sosetlabel = in_pcbsosetlabel,
- .pru_close = tcp_usr_close,
-};
-
-#ifdef INET6
-struct pr_usrreqs cxgb_tcp6_usrreqs = {
- .pru_abort = tcp_usr_abort,
- .pru_accept = tcp6_usr_accept,
- .pru_attach = tcp_usr_attach,
- .pru_bind = tcp6_usr_bind,
- .pru_connect = tcp6_usr_connect,
- .pru_control = in6_control,
- .pru_detach = tcp_usr_detach,
- .pru_disconnect = tcp_usr_disconnect,
- .pru_listen = tcp6_usr_listen,
- .pru_peeraddr = in6_mapped_peeraddr,
- .pru_rcvd = tcp_usr_rcvd,
- .pru_rcvoob = tcp_usr_rcvoob,
- .pru_send = tcp_usr_send,
- .pru_shutdown = tcp_usr_shutdown,
- .pru_sockaddr = in6_mapped_sockaddr,
- .pru_sosetlabel = in_pcbsosetlabel,
- .pru_close = tcp_usr_close,
-};
-#endif /* INET6 */
-
-/*
- * Common subroutine to open a TCP connection to remote host specified
- * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
- * port number if needed. Call in_pcbconnect_setup to do the routing and
- * to choose a local host address (interface). If there is an existing
- * incarnation of the same connection in TIME-WAIT state and if the remote
- * host was sending CC options and if the connection duration was < MSL, then
- * truncate the previous TIME-WAIT state and proceed.
- * Initialize connection parameters and enter SYN-SENT state.
- */
-static int
-tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
-{
- struct inpcb *inp = tp->t_inpcb, *oinp;
- struct socket *so = inp->inp_socket;
- struct in_addr laddr;
- u_short lport;
- int error;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- if (inp->inp_lport == 0) {
- error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
- if (error)
- return error;
- }
-
- /*
- * Cannot simply call in_pcbconnect, because there might be an
- * earlier incarnation of this same connection still in
- * TIME_WAIT state, creating an ADDRINUSE error.
- */
- laddr = inp->inp_laddr;
- lport = inp->inp_lport;
- error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
- &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
- if (error && oinp == NULL)
- return error;
- if (oinp)
- return EADDRINUSE;
- inp->inp_laddr = laddr;
- in_pcbrehash(inp);
-
- /*
- * Compute window scaling to request:
- * Scale to fit into sweet spot. See tcp_syncache.c.
- * XXX: This should move to tcp_output().
- */
- while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
- (TCP_MAXWIN << tp->request_r_scale) < sb_max)
- tp->request_r_scale++;
-
- soisconnecting(so);
- tcpstat.tcps_connattempt++;
- tp->t_state = TCPS_SYN_SENT;
- tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
- tp->iss = tcp_new_isn(tp);
- tp->t_bw_rtseq = tp->iss;
- tcp_sendseqinit(tp);
-
- return 0;
-}
-
-#ifdef INET6
-static int
-tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
-{
- struct inpcb *inp = tp->t_inpcb, *oinp;
- struct socket *so = inp->inp_socket;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
- struct in6_addr *addr6;
- int error;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- if (inp->inp_lport == 0) {
- error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
- if (error)
- return error;
- }
-
- /*
- * Cannot simply call in_pcbconnect, because there might be an
- * earlier incarnation of this same connection still in
- * TIME_WAIT state, creating an ADDRINUSE error.
- * in6_pcbladdr() also handles scope zone IDs.
- */
- error = in6_pcbladdr(inp, nam, &addr6);
- if (error)
- return error;
- oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
- &sin6->sin6_addr, sin6->sin6_port,
- IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
- ? addr6
- : &inp->in6p_laddr,
- inp->inp_lport, 0, NULL);
- if (oinp)
- return EADDRINUSE;
- if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
- inp->in6p_laddr = *addr6;
- inp->in6p_faddr = sin6->sin6_addr;
- inp->inp_fport = sin6->sin6_port;
- /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
- inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
- if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
- inp->in6p_flowinfo |=
- (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
- in_pcbrehash(inp);
-
- /* Compute window scaling to request. */
- while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
- (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
- tp->request_r_scale++;
-
- soisconnecting(so);
- tcpstat.tcps_connattempt++;
- tp->t_state = TCPS_SYN_SENT;
- tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
- tp->iss = tcp_new_isn(tp);
- tp->t_bw_rtseq = tp->iss;
- tcp_sendseqinit(tp);
-
- return 0;
-}
-#endif /* INET6 */
-
-/*
- * tcp_sendspace and tcp_recvspace are the default send and receive window
- * sizes, respectively. These are obsolescent (this information should
- * be set by the route).
- */
-u_long tcp_sendspace = 1024*32;
-SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
- &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
-u_long tcp_recvspace = 1024*64;
-SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
- &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
-
-/*
- * Attach TCP protocol to socket, allocating
- * internet protocol control block, tcp control block,
- * buffer space, and entering LISTEN state if to accept connections.
- */
-static int
-tcp_attach(struct socket *so)
-{
- struct tcpcb *tp;
- struct inpcb *inp;
- int error;
-#ifdef INET6
- int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
-#endif
-
- if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
- error = soreserve(so, tcp_sendspace, tcp_recvspace);
- if (error)
- return (error);
- }
- so->so_rcv.sb_flags |= SB_AUTOSIZE;
- so->so_snd.sb_flags |= SB_AUTOSIZE;
- INP_INFO_WLOCK(&tcbinfo);
- error = in_pcballoc(so, &tcbinfo);
- if (error) {
- INP_INFO_WUNLOCK(&tcbinfo);
- return (error);
- }
- inp = sotoinpcb(so);
-#ifdef INET6
- if (isipv6) {
- inp->inp_vflag |= INP_IPV6;
- inp->in6p_hops = -1; /* use kernel default */
- }
- else
-#endif
- inp->inp_vflag |= INP_IPV4;
- tp = tcp_newtcpcb(inp);
- if (tp == NULL) {
-#ifdef INET6
- if (isipv6) {
- in6_pcbdetach(inp);
- in6_pcbfree(inp);
- } else {
-#endif
- in_pcbdetach(inp);
- in_pcbfree(inp);
-#ifdef INET6
- }
-#endif
- INP_INFO_WUNLOCK(&tcbinfo);
- return (ENOBUFS);
- }
- tp->t_state = TCPS_CLOSED;
- INP_UNLOCK(inp);
- INP_INFO_WUNLOCK(&tcbinfo);
- return (0);
-}
-
-/*
- * Initiate (or continue) disconnect.
- * If embryonic state, just send reset (once).
- * If in ``let data drain'' option and linger null, just drop.
- * Otherwise (hard), mark socket disconnecting and drop
- * current input data; switch states based on user close, and
- * send segment to peer (with FIN).
- */
-static void
-tcp_disconnect(struct tcpcb *tp)
-{
- struct inpcb *inp = tp->t_inpcb;
- struct socket *so = inp->inp_socket;
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(inp);
-
- /*
- * Neither tcp_close() nor tcp_drop() should return NULL, as the
- * socket is still open.
- */
- if (tp->t_state < TCPS_ESTABLISHED) {
- tp = cxgb_tcp_close(tp);
- KASSERT(tp != NULL,
- ("tcp_disconnect: tcp_close() returned NULL"));
- } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
- tp = cxgb_tcp_drop(tp, 0);
- KASSERT(tp != NULL,
- ("tcp_disconnect: tcp_drop() returned NULL"));
- } else {
- soisdisconnecting(so);
- sbflush(&so->so_rcv);
- tcp_usrclosed(tp);
- if (!(inp->inp_vflag & INP_DROPPED))
- tcp_gen_disconnect(tp);
- }
-}
-
-/*
- * User issued close, and wish to trail through shutdown states:
- * if never received SYN, just forget it. If got a SYN from peer,
- * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
- * If already got a FIN from peer, then almost done; go to LAST_ACK
- * state. In all other cases, have already sent FIN to peer (e.g.
- * after PRU_SHUTDOWN), and just have to play tedious game waiting
- * for peer to send FIN or not respond to keep-alives, etc.
- * We can let the user exit from the close as soon as the FIN is acked.
- */
-static void
-tcp_usrclosed(struct tcpcb *tp)
-{
-
- INP_INFO_WLOCK_ASSERT(&tcbinfo);
- INP_LOCK_ASSERT(tp->t_inpcb);
-
- switch (tp->t_state) {
- case TCPS_LISTEN:
- tcp_gen_listen_close(tp);
- case TCPS_CLOSED:
- tp->t_state = TCPS_CLOSED;
- tp = cxgb_tcp_close(tp);
- /*
- * tcp_close() should never return NULL here as the socket is
- * still open.
- */
- KASSERT(tp != NULL,
- ("tcp_usrclosed: tcp_close() returned NULL"));
- break;
-
- case TCPS_SYN_SENT:
- case TCPS_SYN_RECEIVED:
- tp->t_flags |= TF_NEEDFIN;
- break;
-
- case TCPS_ESTABLISHED:
- tp->t_state = TCPS_FIN_WAIT_1;
- break;
-
- case TCPS_CLOSE_WAIT:
- tp->t_state = TCPS_LAST_ACK;
- break;
- }
- if (tp->t_state >= TCPS_FIN_WAIT_2) {
- soisdisconnected(tp->t_inpcb->inp_socket);
- /* Prevent the connection hanging in FIN_WAIT_2 forever. */
- if (tp->t_state == TCPS_FIN_WAIT_2) {
- int timeout;
-
- timeout = (tcp_fast_finwait2_recycle) ?
- tcp_finwait2_timeout : tcp_maxidle;
- tcp_timer_activate(tp, TT_2MSL, timeout);
- }
- }
-}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
index a078bee..8a9c498 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
+++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h
@@ -30,45 +30,49 @@
#ifndef CXGB_TOEPCB_H_
#define CXGB_TOEPCB_H_
#include <sys/bus.h>
+#include <sys/condvar.h>
#include <dev/cxgb/sys/mbufq.h>
struct toepcb {
- struct toedev *tp_toedev;
- struct l2t_entry *tp_l2t;
- pr_ctloutput_t *tp_ctloutput;
- unsigned int tp_tid;
- int tp_wr_max;
- int tp_wr_avail;
- int tp_wr_unacked;
- int tp_delack_mode;
- int tp_mtu_idx;
- int tp_ulp_mode;
- int tp_qset_idx;
- int tp_mss_clamp;
- int tp_qset;
- int tp_flags;
- int tp_enqueued_bytes;
- int tp_page_count;
- int tp_state;
-
- tcp_seq tp_iss;
- tcp_seq tp_delack_seq;
- tcp_seq tp_rcv_wup;
- tcp_seq tp_copied_seq;
- uint64_t tp_write_seq;
-
- volatile int tp_refcount;
- vm_page_t *tp_pages;
+ struct toedev *tp_toedev;
+ struct l2t_entry *tp_l2t;
+ pr_ctloutput_t *tp_ctloutput;
+ unsigned int tp_tid;
+ int tp_wr_max;
+ int tp_wr_avail;
+ int tp_wr_unacked;
+ int tp_delack_mode;
+ int tp_mtu_idx;
+ int tp_ulp_mode;
+ int tp_qset_idx;
+ int tp_mss_clamp;
+ int tp_qset;
+ int tp_flags;
+ int tp_enqueued_bytes;
+ int tp_page_count;
+ int tp_state;
+
+ tcp_seq tp_iss;
+ tcp_seq tp_delack_seq;
+ tcp_seq tp_rcv_wup;
+ tcp_seq tp_copied_seq;
+ uint64_t tp_write_seq;
+
+ volatile int tp_refcount;
+ vm_page_t *tp_pages;
- struct tcpcb *tp_tp;
- struct mbuf *tp_m_last;
- bus_dma_tag_t tp_tx_dmat;
- bus_dmamap_t tp_dmamap;
-
- LIST_ENTRY(toepcb) synq_entry;
- struct mbuf_head wr_list;
- struct mbuf_head out_of_order_queue;
- struct ddp_state tp_ddp_state;
+ struct tcpcb *tp_tp;
+ struct mbuf *tp_m_last;
+ bus_dma_tag_t tp_tx_dmat;
+ bus_dma_tag_t tp_rx_dmat;
+ bus_dmamap_t tp_dmamap;
+
+ LIST_ENTRY(toepcb) synq_entry;
+ struct mbuf_head wr_list;
+ struct mbuf_head out_of_order_queue;
+ struct ddp_state tp_ddp_state;
+ struct cv tp_cv;
+
};
static inline void
@@ -95,7 +99,7 @@ enqueue_wr(struct toepcb *toep, struct mbuf *m)
}
static inline struct mbuf *
-peek_wr(struct toepcb *toep)
+peek_wr(const struct toepcb *toep)
{
return (mbufq_peek(&toep->wr_list));
@@ -108,5 +112,10 @@ dequeue_wr(struct toepcb *toep)
return (mbufq_dequeue(&toep->wr_list));
}
+#define wr_queue_walk(toep, m) \
+ for (m = peek_wr(toep); m; m = m->m_nextpkt)
+
+
+
#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
index b5b87b7..4015cd3 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
@@ -34,11 +34,13 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
+#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/eventhandler.h>
#include <sys/mbuf.h>
#include <sys/module.h>
+#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -90,16 +92,20 @@ static TAILQ_HEAD(, tom_data) cxgb_list;
static struct mtx cxgb_list_lock;
static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry);
+static void cxgb_register_listeners(void);
+
/*
* Handlers for each CPL opcode
*/
-static cxgb_cpl_handler_func tom_cpl_handlers[NUM_CPL_CMDS];
+static cxgb_cpl_handler_func tom_cpl_handlers[256];
+
static eventhandler_tag listen_tag;
static struct offload_id t3_toe_id_tab[] = {
{ TOE_ID_CHELSIO_T3, 0 },
{ TOE_ID_CHELSIO_T3B, 0 },
+ { TOE_ID_CHELSIO_T3C, 0 },
{ 0 }
};
@@ -138,7 +144,7 @@ toepcb_alloc(void)
{
struct toepcb *toep;
- toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT);
+ toep = malloc(sizeof(struct toepcb), M_DEVBUF, M_NOWAIT|M_ZERO);
if (toep == NULL)
return (NULL);
@@ -150,8 +156,8 @@ toepcb_alloc(void)
void
toepcb_init(struct toepcb *toep)
{
- bzero(toep, sizeof(*toep));
toep->tp_refcount = 1;
+ cv_init(&toep->tp_cv, "toep cv");
}
void
@@ -164,12 +170,9 @@ void
toepcb_release(struct toepcb *toep)
{
if (toep->tp_refcount == 1) {
- printf("doing final toepcb free\n");
-
free(toep, M_DEVBUF);
return;
}
-
atomic_add_acq_int(&toep->tp_refcount, -1);
}
@@ -179,13 +182,30 @@ toepcb_release(struct toepcb *toep)
static void
t3cdev_add(struct tom_data *t)
{
- printf("t3cdev_add\n");
-
mtx_lock(&cxgb_list_lock);
TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
mtx_unlock(&cxgb_list_lock);
}
+static inline int
+cdev2type(struct t3cdev *cdev)
+{
+ int type = 0;
+
+ switch (cdev->type) {
+ case T3A:
+ type = TOE_ID_CHELSIO_T3;
+ break;
+ case T3B:
+ type = TOE_ID_CHELSIO_T3B;
+ break;
+ case T3C:
+ type = TOE_ID_CHELSIO_T3C;
+ break;
+ }
+ return (type);
+}
+
/*
* Allocate a TOM data structure,
* initialize its cpl_handlers
@@ -200,11 +220,7 @@ t3c_tom_add(struct t3cdev *cdev)
struct toedev *tdev;
struct adap_ports *port_info;
- printf("%s called\n", __FUNCTION__);
-
-
t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
-
if (t == NULL)
return;
@@ -224,8 +240,7 @@ t3c_tom_add(struct t3cdev *cdev)
/* Register TCP offload device */
tdev = &t->tdev;
- tdev->tod_ttid = (cdev->type == T3A ?
- TOE_ID_CHELSIO_T3 : TOE_ID_CHELSIO_T3B);
+ tdev->tod_ttid = cdev2type(cdev);
tdev->tod_lldev = cdev->lldev;
if (register_toedev(tdev, "toe%d")) {
@@ -234,13 +249,11 @@ t3c_tom_add(struct t3cdev *cdev)
}
TOM_DATA(tdev) = t;
- printf("nports=%d\n", port_info->nports);
for (i = 0; i < port_info->nports; i++) {
struct ifnet *ifp = port_info->lldevs[i];
TOEDEV(ifp) = tdev;
- printf("enabling toe on %p\n", ifp);
-
+ CTR1(KTR_TOM, "enabling toe on %p", ifp);
ifp->if_capabilities |= IFCAP_TOE4;
ifp->if_capenable |= IFCAP_TOE4;
}
@@ -251,6 +264,7 @@ t3c_tom_add(struct t3cdev *cdev)
/* Activate TCP offload device */
activate_offload(tdev);
+ cxgb_register_listeners();
return;
out_free_all:
@@ -269,8 +283,8 @@ static int
do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name,
- *mtod(m, unsigned int *));
-
+ 0xFF & *mtod(m, unsigned int *));
+ kdb_backtrace();
return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG);
}
@@ -282,7 +296,7 @@ do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
void
t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h)
{
- if (opcode < NUM_CPL_CMDS)
+ if (opcode < 256)
tom_cpl_handlers[opcode] = h ? h : do_bad_cpl;
else
log(LOG_ERR, "Chelsio T3 TOM: handler registration for "
@@ -327,7 +341,7 @@ init_cpl_handlers(void)
{
int i;
- for (i = 0; i < NUM_CPL_CMDS; ++i)
+ for (i = 0; i < 256; ++i)
tom_cpl_handlers[i] = do_bad_cpl;
t3_init_listen_cpl_handlers();
@@ -349,7 +363,7 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
#endif
t3_init_tunables(t);
mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF);
-
+ CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry);
/* Adjust TOE activation for this module */
t->conf.activated = activated;
@@ -374,19 +388,14 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
t->ddp_ulimit = ddp.ulimit;
t->pdev = ddp.pdev;
t->rx_page_size = rx_page_info.page_size;
-#ifdef notyet
/* OK if this fails, we just can't do DDP */
t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE;
- t->ppod_map = t3_alloc_mem(t->nppods);
-#endif
+ t->ppod_map = malloc(t->nppods, M_DEVBUF, M_WAITOK|M_ZERO);
-#if 0
- spin_lock_init(&t->ppod_map_lock);
- tom_proc_init(dev);
-#ifdef CONFIG_SYSCTL
- t->sysctl = t3_sysctl_register(dev, &t->conf);
-#endif
-#endif
+ mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF);
+
+
+ t3_sysctl_register(cdev->adapter, &t->conf);
return (0);
}
@@ -411,11 +420,8 @@ cxgb_toe_listen_stop(void *unused, struct tcpcb *tp)
mtx_lock(&cxgb_list_lock);
TAILQ_FOREACH(p, &cxgb_list, entry) {
- if (tp->t_state == TCPS_LISTEN) {
- printf("stopping listen on port=%d\n",
- ntohs(tp->t_inpcb->inp_lport));
+ if (tp->t_state == TCPS_LISTEN)
t3_listen_stop(&p->tdev, so, p->cdev);
- }
}
mtx_unlock(&cxgb_list_lock);
}
@@ -439,23 +445,12 @@ cxgb_register_listeners(void)
static int
t3_tom_init(void)
{
-
-#if 0
- struct socket *sock;
- err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (err < 0) {
- printk(KERN_ERR "Could not create TCP socket, error %d\n", err);
- return err;
- }
-
- t3_def_state_change = sock->sk->sk_state_change;
- t3_def_data_ready = sock->sk->sk_data_ready;
- t3_def_error_report = sock->sk->sk_error_report;
- sock_release(sock);
-#endif
init_cpl_handlers();
- if (t3_init_cpl_io() < 0)
+ if (t3_init_cpl_io() < 0) {
+ log(LOG_ERR,
+ "Unable to initialize cpl io ops\n");
return -1;
+ }
t3_init_socket_ops();
/* Register with the TOE device layer. */
@@ -466,7 +461,6 @@ t3_tom_init(void)
return -1;
}
INP_INFO_WLOCK(&tcbinfo);
-
INP_INFO_WUNLOCK(&tcbinfo);
mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
@@ -477,10 +471,8 @@ t3_tom_init(void)
TAILQ_INIT(&cxgb_list);
/* Register to offloading devices */
- printf("setting add to %p\n", t3c_tom_add);
t3c_tom_client.add = t3c_tom_add;
cxgb_register_client(&t3c_tom_client);
- cxgb_register_listeners();
return (0);
}
@@ -491,8 +483,6 @@ t3_tom_load(module_t mod, int cmd, void *arg)
switch (cmd) {
case MOD_LOAD:
- printf("wheeeeee ...\n");
-
t3_tom_init();
break;
case MOD_QUIESCE:
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
index 8d60bbd..bcda2c3 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.h
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h
@@ -138,6 +138,8 @@ struct listen_ctx {
void t3_init_tunables(struct tom_data *t);
+void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p);
+
static __inline struct mbuf *
m_gethdr_nofail(int len)
{
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
index 7219922..b4ff748 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/cxgb_offload.h>
+#include <dev/cxgb/cxgb_include.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
@@ -82,7 +83,7 @@ static struct tom_tunables default_tunable_vals = {
.delack = 1,
.max_conn = -1,
.soft_backlog_limit = 0,
- .ddp = 0,
+ .ddp = 1,
.ddp_thres = 14 * 4096,
.ddp_copy_limit = 13 * 4096,
.ddp_push_wait = 1,
@@ -96,7 +97,8 @@ static struct tom_tunables default_tunable_vals = {
.activated = 1,
};
-void t3_init_tunables(struct tom_data *t)
+void
+t3_init_tunables(struct tom_data *t)
{
t->conf = default_tunable_vals;
@@ -104,3 +106,15 @@ void t3_init_tunables(struct tom_data *t)
t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk;
t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs;
}
+
+void
+t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p)
+{
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid_list *children;
+
+ ctx = device_get_sysctl_ctx(sc->dev);
+ children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
+
+}
+
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c
new file mode 100644
index 0000000..7036005
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c
@@ -0,0 +1,180 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/condvar.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <dev/cxgb/ulp/tom/cxgb_vm.h>
+
+#define TRACE_ENTER printf("%s:%s entered", __FUNCTION__, __FILE__)
+#define TRACE_EXIT printf("%s:%s:%d exited", __FUNCTION__, __FILE__, __LINE__)
+
+/*
+ * This routine takes a user address range and does the following:
+ * - validate that the user has access to those pages (flags indicates read or write) - if not, fail
+ * - validate that count is enough to hold the range's number of pages - if not, fail
+ * - fault in any non-resident pages
+ * - if the hold is for writing, force a write fault on any COWed pages
+ * - if the hold is for writing, mark all pages as dirty
+ * - hold all pages
+ * - return number of pages in count
+ */
+int
+vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags)
+{
+
+ vm_offset_t end, va;
+ vm_paddr_t pa;
+ int faults, rv;
+
+ struct thread *td;
+ vm_map_t map;
+ pmap_t pmap;
+ vm_page_t m, *pages;
+ vm_prot_t prot;
+
+
+ /*
+ * Check that the virtual address range is legal.
+ * This check is somewhat bogus, as on some architectures the
+ * kernel and user do not share VA space - however, all FreeBSD
+ * architectures appear to define VM_MAXUSER_ADDRESS.
+ */
+ end = addr + (count * PAGE_SIZE);
+ if (end > VM_MAXUSER_ADDRESS) {
+ printf("bad address passed\n");
+ return (EFAULT);
+ }
+
+ td = curthread;
+ map = &td->td_proc->p_vmspace->vm_map;
+ pmap = &td->td_proc->p_vmspace->vm_pmap;
+ pages = mp;
+
+ prot = VM_PROT_READ;
+ prot |= (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : 0;
+ bzero(pages, sizeof(vm_page_t *) * count);
+retry:
+
+ /*
+ * First, optimistically assume that all pages are resident (and
+ * R/W if held for write); if so, just mark the pages held (and
+ * dirty if for write) and return.
+ */
+ vm_page_lock_queues();
+ for (pages = mp, faults = 0, va = addr; va < end; va += PAGE_SIZE, pages++) {
+ /*
+ * Assure that we only hold the page once
+ */
+ if (*pages == NULL) {
+ /*
+ * The page queue mutex is recursable, so this is OK.
+ * It would be nice to have an unlocked version of
+ * pmap_extract_and_hold() so the pmap lock were acquired
+ * once rather than potentially many dozens of times.
+ */
+ m = pmap_extract_and_hold(pmap, va, prot);
+ if (m == NULL) {
+ faults++;
+ continue;
+ }
+
+ *pages = m;
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
+ }
+ }
+ vm_page_unlock_queues();
+
+ if (faults == 0) {
+ return (0);
+ }
+
+ /*
+ * Pages either have insufficient permissions or are not present;
+ * trigger a fault where necessary.
+ */
+ for (va = addr; va < end; va += PAGE_SIZE) {
+ m = NULL;
+ pa = pmap_extract(pmap, va);
+ rv = 0;
+ if (pa)
+ m = PHYS_TO_VM_PAGE(pa);
+ if (flags & VM_HOLD_WRITEABLE) {
+ if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
+ rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
+ } else if (m == NULL)
+ rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
+ if (rv) {
+ printf("vm_fault bad return rv=%d va=0x%zx\n", rv, va);
+
+ goto error;
+ }
+ }
+
+ goto retry;
+
+error:
+ vm_page_lock_queues();
+ for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
+ if (*pages)
+ vm_page_unhold(*pages);
+ vm_page_unlock_queues();
+ return (EFAULT);
+}
+
+void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+ KASSERT(count >= 0, ("negative count %d", count));
+ vm_page_lock_queues();
+ while (count--) {
+ vm_page_unhold(*mp);
+ mp++;
+ }
+ vm_page_unlock_queues();
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
new file mode 100644
index 0000000..29418b6
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_VM_H_
+#define CXGB_VM_H_
+
+#define VM_HOLD_WRITEABLE 0x1
+
+int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags);
+void vm_fault_unhold_pages(vm_page_t *mp, int count);
+
+#endif