diff options
author | np <np@FreeBSD.org> | 2012-11-29 19:10:04 +0000 |
---|---|---|
committer | np <np@FreeBSD.org> | 2012-11-29 19:10:04 +0000 |
commit | aedb69184652bffb10d55a856fbfa2e5dcb61177 (patch) | |
tree | 0083e3574fad3e505bdf6548c0e47d8d25a3c03f | |
parent | 58e590440b4a1ca9d0fd818f1d8464be99faaec1 (diff) | |
download | FreeBSD-src-aedb69184652bffb10d55a856fbfa2e5dcb61177.zip FreeBSD-src-aedb69184652bffb10d55a856fbfa2e5dcb61177.tar.gz |
cxgbe/tom: Add a flag to indicate that the L2 table entry for an
embryonic connection has been setup and never attempt to abort a tid
before this is done. This fixes a bad race where a listening socket is
closed when the driver is in the middle of step (b) here. The symptom
of this were "ARP miss" errors from the driver followed by tid leaks.
A hardware-offloaded passive open works this way:
a) A SYN "hits" the TCAM entry for a server tid and the chip delivers it
to the queue associated with the server tid (say, queue A). It waits
for a response from the driver telling it what to do.
b) The driver decides it is ok to proceed. It adds the new tid to the
list of embryonic connections associated with the server tid and then
hands off the SYN to the kernel's syncache to make sure that the kernel
okays it too. If it does then the driver provides an L2 table entry,
queue id (say, queue B), etc. and instructs the chip to send the SYN/ACK
response.
c) The chip delivers a status to queue B depending on how the third step
of the 3-way handshake goes. The driver removes the tid from its list
of embryonic connections and either expands the syncache entry or
destroys the tid. In any case all subsequent messages for the new tid
will be delivered to queue B, not queue A. Anything running in queue B
knows that the L2 entry has long been setup and the new flag is of no
interest from here on. If the listener is closed it will deal with
so_comp as normal.
MFC after: 1 week
-rw-r--r-- | sys/dev/cxgbe/tom/t4_cpl_io.c | 48 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_listen.c | 102 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom.h | 1 |
3 files changed, 89 insertions, 62 deletions
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index e3f0296..6e56b29 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -786,6 +786,29 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) KASSERT(opcode == CPL_PEER_CLOSE, ("%s: unexpected opcode 0x%x", __func__, opcode)); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + + if (__predict_false(toep->flags & TPF_SYNQE)) { +#ifdef INVARIANTS + struct synq_entry *synqe = (void *)toep; + + INP_WLOCK(synqe->lctx->inp); + if (synqe->flags & TPF_SYNQE_HAS_L2TE) { + KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, + ("%s: listen socket closed but tid %u not aborted.", + __func__, tid)); + } else { + /* + * do_pass_accept_req is still running and will + * eventually take care of this tid. + */ + } + INP_WUNLOCK(synqe->lctx->inp); +#endif + CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, + toep, toep->flags); + return (0); + } + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); INP_INFO_WLOCK(&V_tcbinfo); @@ -1092,13 +1115,24 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) int len; if (__predict_false(toep->flags & TPF_SYNQE)) { - /* - * do_pass_establish failed and must be attempting to abort the - * synqe's tid. Meanwhile, the T4 has sent us data for such a - * connection. - */ - KASSERT(toep->flags & TPF_ABORT_SHUTDOWN, - ("%s: synqe and tid isn't being aborted.", __func__)); +#ifdef INVARIANTS + struct synq_entry *synqe = (void *)toep; + + INP_WLOCK(synqe->lctx->inp); + if (synqe->flags & TPF_SYNQE_HAS_L2TE) { + KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, + ("%s: listen socket closed but tid %u not aborted.", + __func__, tid)); + } else { + /* + * do_pass_accept_req is still running and will + * eventually take care of this tid. + */ + } + INP_WUNLOCK(synqe->lctx->inp); +#endif + CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid, + toep, toep->flags); m_freem(m); return (0); } diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 84cd1a2..e5c457d 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -282,8 +282,8 @@ send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) INP_WLOCK_ASSERT(synqe->lctx->inp); - CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s", - __func__, synqe, synqe->tid, + CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", + __func__, synqe, synqe->flags, synqe->tid, synqe->flags & TPF_ABORT_SHUTDOWN ? " (abort already in progress)" : ""); if (synqe->flags & TPF_ABORT_SHUTDOWN) @@ -501,8 +501,10 @@ t4_listen_stop(struct toedev *tod, struct tcpcb *tp) * socket's so_comp. It doesn't know about the connections on the synq * so we need to take care of those. */ - TAILQ_FOREACH(synqe, &lctx->synq, link) - send_reset_synqe(tod, synqe); + TAILQ_FOREACH(synqe, &lctx->synq, link) { + if (synqe->flags & TPF_SYNQE_HAS_L2TE) + send_reset_synqe(tod, synqe); + } destroy_server(sc, lctx); return (0); @@ -1131,8 +1133,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, synqe->lctx = lctx; synqe->syn = m; m = NULL; - refcount_init(&synqe->refcnt, 1); /* 1 so that it is held for the - duration of this function */ + refcount_init(&synqe->refcnt, 0); synqe->l2e_idx = e->idx; synqe->rcv_bufsize = rx_credits; atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr); @@ -1155,46 +1156,9 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, * If we replied during syncache_add (synqe->wr has been consumed), * good. Otherwise, set it to 0 so that further syncache_respond * attempts by the kernel will be ignored. - * - * The extra hold on the synqe makes sure that it is still around, even - * if the listener has been dropped and the synqe was aborted and the - * reply to the abort has removed and released the synqe from the synq - * list. */ if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) { - INP_WLOCK(inp); - if (__predict_false(inp->inp_flags & INP_DROPPED)) { - /* listener closed. synqe must have been aborted. */ - KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, - ("%s: listener %p closed but synqe %p not aborted", - __func__, inp, synqe)); - - CTR5(KTR_CXGBE, - "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED", - __func__, stid, tid, lctx, synqe); - INP_WUNLOCK(inp); - free(wr, M_CXGBE); - release_synqe(synqe); /* about to exit function */ - return (__LINE__); - } - - /* - * synqe aborted before TOM replied to PASS_ACCEPT_REQ. But - * that can only happen if the listener was closed and we just - * checked for that. - */ - KASSERT(!(synqe->flags & TPF_ABORT_SHUTDOWN), - ("%s: synqe %p aborted, but listener %p not dropped.", - __func__, synqe, inp)); - - /* Yank the synqe out of the lctx synq. */ - TAILQ_REMOVE(&lctx->synq, synqe, link); - release_synqe(synqe); /* removed from synq list */ - inp = release_lctx(sc, lctx); - if (inp) - INP_WUNLOCK(inp); - /* * syncache may or may not have a hold on the synqe, which may * or may not be stashed in the original SYN mbuf passed to us. @@ -1205,13 +1169,39 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, m->m_pkthdr.rcvif = ifp; remove_tid(sc, synqe->tid); - release_synqe(synqe); /* about to exit function */ free(wr, M_CXGBE); + + /* Yank the synqe out of the lctx synq. */ + INP_WLOCK(inp); + TAILQ_REMOVE(&lctx->synq, synqe, link); + release_synqe(synqe); /* removed from synq list */ + inp = release_lctx(sc, lctx); + if (inp) + INP_WUNLOCK(inp); + REJECT_PASS_ACCEPT(); } - release_synqe(synqe); /* about to exit function */ + CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK", __func__, stid, tid, lctx, synqe); + + INP_WLOCK(inp); + synqe->flags |= TPF_SYNQE_HAS_L2TE; + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* + * Listening socket closed but tod_listen_stop did not abort + * this tid because there was no L2T entry for the tid at that + * time. Abort it now. The reply to the abort will clean up. + */ + CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, ABORT", + __func__, stid, tid, lctx, synqe); + send_reset_synqe(tod, synqe); + INP_WUNLOCK(inp); + + return (__LINE__); + } + INP_WUNLOCK(inp); + return (0); reject: CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, @@ -1293,15 +1283,12 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); if (__predict_false(inp->inp_flags & INP_DROPPED)) { - /* - * The listening socket has closed. The TOM must have aborted - * all the embryonic connections (including this one) that were - * on the lctx's synq. do_abort_rpl for the tid is responsible - * for cleaning up. - */ - KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, - ("%s: listen socket dropped but tid %u not aborted.", - __func__, tid)); + + if (synqe->flags & TPF_SYNQE_HAS_L2TE) { + KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, + ("%s: listen socket closed but tid %u not aborted.", + __func__, tid)); + } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1321,7 +1308,12 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT); if (toep == NULL) { reset: - /* The reply to this abort will perform final cleanup */ + /* + * The reply to this abort will perform final cleanup. There is + * no need to check for HAS_L2TE here. We can be here only if + * we responded to the PASS_ACCEPT_REQ, and our response had the + * L2T idx. + */ send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index ccf5081..0704856 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -67,6 +67,7 @@ enum { TPF_SYNQE_NEEDFREE = (1 << 9), /* synq_entry was malloc'd separately */ TPF_SYNQE_TCPDDP = (1 << 10), /* ulp_mode TCPDDP in toepcb */ TPF_SYNQE_EXPANDED = (1 << 11), /* toepcb ready, tid context updated */ + TPF_SYNQE_HAS_L2TE = (1 << 12), /* we've replied to PASS_ACCEPT_REQ */ }; enum { |