path: root/sys/dev/cxgbe
author      np <np@FreeBSD.org>  2012-06-19 07:34:13 +0000
committer   np <np@FreeBSD.org>  2012-06-19 07:34:13 +0000
commit      67d5f1a727273d8e141e96c429114dff9fb06ec3 (patch)
tree        9255a545bbd49a0458ed8850371b4fe6ed2cd01f /sys/dev/cxgbe
parent      27063437e23a5e5e7debf9144ee974d21b6a6774 (diff)
- Updated TOE support in the kernel.
- Stateful TCP offload drivers for Terminator 3 and 4 (T3 and T4) ASICs.
  These are available as t3_tom and t4_tom modules that augment cxgb(4)
  and cxgbe(4) respectively.  The cxgb/cxgbe drivers continue to work as
  usual with or without these extra features.

- iWARP driver for Terminator 3 ASIC (kernel verbs).  T4 iWARP in the
  works and will follow soon.

Build-tested with make universe.

30s overview
============
What interfaces support TCP offload?  Look for TOE4 and/or TOE6 in the
capabilities of an interface:
# ifconfig -m | grep TOE

Enable/disable TCP offload on an interface (just like any other ifnet
capability):
# ifconfig cxgbe0 toe
# ifconfig cxgbe0 -toe

Which connections are offloaded?  Look for toe4 and/or toe6 in the
output of netstat and sockstat:
# netstat -np tcp | grep toe
# sockstat -46c | grep toe

Reviewed by:    bz, gnn
Sponsored by:   Chelsio communications.
MFC after:      ~3 months (after 9.1, and after ensuring MFC is feasible)
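
Internally, the patch retires t4_mgmt_tx() and the mbuf-backed work-request
path in favor of struct wrqe and the inline helpers alloc_wrqe(), wrtod(),
free_wrqe() and t4_wrq_tx() added to adapter.h.  The following is a minimal,
simplified sketch of the new pattern, modeled on t4_write_l2e() and
set_filter_wr() in the hunks below; send_l2t_write_example is a hypothetical
function and the request fields are abbreviated for illustration.

static int
send_l2t_write_example(struct adapter *sc, struct l2t_entry *e)
{
	struct wrqe *wr;
	struct cpl_l2t_write_req *req;

	/* Backed by malloc(M_CXGBE, M_NOWAIT); can fail under memory pressure. */
	wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
	if (wr == NULL)
		return (ENOMEM);
	req = wrtod(wr);	/* pointer to the 16-byte aligned payload area */

	INIT_TP_WR(req, 0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
	req->vlan = htons(e->vlan);
	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));

	/*
	 * Enqueues under the wrq lock; the wrqe is freed by the SGE code
	 * once it has been copied into tx descriptors, or held on the
	 * wrq's wr_list if descriptors are temporarily unavailable.
	 */
	t4_wrq_tx(sc, wr);
	return (0);
}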
Diffstat (limited to 'sys/dev/cxgbe')
-rw-r--r--  sys/dev/cxgbe/adapter.h           103
-rw-r--r--  sys/dev/cxgbe/common/t4_hw.c        2
-rw-r--r--  sys/dev/cxgbe/offload.h            19
-rw-r--r--  sys/dev/cxgbe/t4_l2t.c            563
-rw-r--r--  sys/dev/cxgbe/t4_l2t.h             55
-rw-r--r--  sys/dev/cxgbe/t4_main.c           213
-rw-r--r--  sys/dev/cxgbe/t4_sge.c            128
-rw-r--r--  sys/dev/cxgbe/tom/t4_connect.c    377
-rw-r--r--  sys/dev/cxgbe/tom/t4_cpl_io.c    1276
-rw-r--r--  sys/dev/cxgbe/tom/t4_listen.c    1362
-rw-r--r--  sys/dev/cxgbe/tom/t4_tom.c        755
-rw-r--r--  sys/dev/cxgbe/tom/t4_tom.h        248
-rw-r--r--  sys/dev/cxgbe/tom/t4_tom_l2t.c    405
-rw-r--r--  sys/dev/cxgbe/tom/t4_tom_l2t.h     53
14 files changed, 4840 insertions, 719 deletions
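
The offload.h and t4_main.c hunks below also rework the upper-layer driver
(ULD) interface: the old attach/detach hooks that returned a per-ULD softc
are replaced by activate/deactivate callbacks dispatched through
t4_activate_uld()/t4_deactivate_uld() by uld_id.  A hedged sketch of how a
module such as t4_tom would plug into the new interface follows; the
my_tom_* names are illustrative, not the actual t4_tom code.

static int
my_tom_activate(struct adapter *sc)
{
	/*
	 * Allocate per-adapter TOM state, point sc->tom_softc at it,
	 * set TOM_INIT_DONE in sc->flags, and register CPL handlers.
	 */
	return (0);
}

static int
my_tom_deactivate(struct adapter *sc)
{
	/* Refuse while connections are still offloaded; otherwise tear down. */
	return (0);
}

static struct uld_info my_uld_info = {
	.uld_id = ULD_TOM,
	.activate = my_tom_activate,
	.deactivate = my_tom_deactivate,
};

/* At MOD_LOAD: t4_register_uld(&my_uld_info);
 * At MOD_UNLOAD: t4_unregister_uld(&my_uld_info); */

Per the toe_capability() change in t4_main.c, t4_activate_uld(sc, ULD_TOM) is
invoked the first time TOE is enabled on any port of an adapter, so the
module only needs to register itself at load time.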
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index 6be75bc..ba5335a 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -157,6 +157,7 @@ enum {
INTR_DIRECT = (1 << 2), /* direct interrupts for everything */
MASTER_PF = (1 << 3),
ADAP_SYSCTL_CTX = (1 << 4),
+ TOM_INIT_DONE = (1 << 5),
CXGBE_BUSY = (1 << 9),
@@ -199,7 +200,7 @@ struct port_info {
int first_txq; /* index of first tx queue */
int nrxq; /* # of rx queues */
int first_rxq; /* index of first rx queue */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
int nofldtxq; /* # of offload tx queues */
int first_ofld_txq; /* index of first offload tx queue */
int nofldrxq; /* # of offload rx queues */
@@ -213,6 +214,8 @@ struct port_info {
struct link_config link_cfg;
struct port_stats stats;
+ eventhandler_tag vlan_c;
+
struct callout tick;
struct sysctl_ctx_list ctx; /* from ifconfig up to driver detach */
@@ -296,7 +299,7 @@ struct sge_iq {
enum {
EQ_CTRL = 1,
EQ_ETH = 2,
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
EQ_OFLD = 3,
#endif
@@ -422,14 +425,36 @@ struct sge_rxq {
} __aligned(CACHE_LINE_SIZE);
-#ifndef TCP_OFFLOAD_DISABLE
+static inline struct sge_rxq *
+iq_to_rxq(struct sge_iq *iq)
+{
+
+ return (member2struct(sge_rxq, iq, iq));
+}
+
+
+#ifdef TCP_OFFLOAD
/* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */
struct sge_ofld_rxq {
struct sge_iq iq; /* MUST be first */
struct sge_fl fl; /* MUST follow iq */
} __aligned(CACHE_LINE_SIZE);
+
+static inline struct sge_ofld_rxq *
+iq_to_ofld_rxq(struct sge_iq *iq)
+{
+
+ return (member2struct(sge_ofld_rxq, iq, iq));
+}
#endif
+struct wrqe {
+ STAILQ_ENTRY(wrqe) link;
+ struct sge_wrq *wrq;
+ int wr_len;
+ uint64_t wr[] __aligned(16);
+};
+
/*
* wrq: SGE egress queue that is given prebuilt work requests. Both the control
* and offload tx queues are of this type.
@@ -438,8 +463,9 @@ struct sge_wrq {
struct sge_eq eq; /* MUST be first */
struct adapter *adapter;
- struct mbuf *head; /* held up due to lack of descriptors */
- struct mbuf *tail; /* valid only if head is valid */
+
+ /* List of WRs held up due to lack of tx descriptors */
+ STAILQ_HEAD(, wrqe) wr_list;
/* stats for common events first */
@@ -457,7 +483,7 @@ struct sge {
int nrxq; /* total # of Ethernet rx queues */
int ntxq; /* total # of Ethernet tx tx queues */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
int nofldrxq; /* total # of TOE rx queues */
int nofldtxq; /* total # of TOE tx queues */
#endif
@@ -469,7 +495,7 @@ struct sge {
struct sge_wrq *ctrlq; /* Control queues */
struct sge_txq *txq; /* NIC tx queues */
struct sge_rxq *rxq; /* NIC rx queues */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
struct sge_wrq *ofld_txq; /* TOE tx queues */
struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */
#endif
@@ -483,6 +509,7 @@ struct sge {
struct rss_header;
typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *,
struct mbuf *);
+typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *);
struct adapter {
SLIST_ENTRY(adapter) link;
@@ -519,15 +546,15 @@ struct adapter {
uint8_t chan_map[NCHAN];
uint32_t filter_mode;
-#ifndef TCP_OFFLOAD_DISABLE
- struct uld_softc tom;
+#ifdef TCP_OFFLOAD
+ void *tom_softc; /* (struct tom_data *) */
struct tom_tunables tt;
#endif
struct l2t_data *l2t; /* L2 table */
struct tid_info tids;
int open_device_map;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
int offload_map;
#endif
int flags;
@@ -554,7 +581,8 @@ struct adapter {
TAILQ_HEAD(, sge_fl) sfl;
struct callout sfl_callout;
- cpl_handler_t cpl_handler[256] __aligned(CACHE_LINE_SIZE);
+ an_handler_t an_handler __aligned(CACHE_LINE_SIZE);
+ cpl_handler_t cpl_handler[256];
};
#define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock)
@@ -609,82 +637,96 @@ struct adapter {
static inline uint32_t
t4_read_reg(struct adapter *sc, uint32_t reg)
{
+
return bus_space_read_4(sc->bt, sc->bh, reg);
}
static inline void
t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val)
{
+
bus_space_write_4(sc->bt, sc->bh, reg, val);
}
static inline uint64_t
t4_read_reg64(struct adapter *sc, uint32_t reg)
{
+
return t4_bus_space_read_8(sc->bt, sc->bh, reg);
}
static inline void
t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
{
+
t4_bus_space_write_8(sc->bt, sc->bh, reg, val);
}
static inline void
t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val)
{
+
*val = pci_read_config(sc->dev, reg, 1);
}
static inline void
t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val)
{
+
pci_write_config(sc->dev, reg, val, 1);
}
static inline void
t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val)
{
+
*val = pci_read_config(sc->dev, reg, 2);
}
static inline void
t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val)
{
+
pci_write_config(sc->dev, reg, val, 2);
}
static inline void
t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val)
{
+
*val = pci_read_config(sc->dev, reg, 4);
}
static inline void
t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val)
{
+
pci_write_config(sc->dev, reg, val, 4);
}
static inline struct port_info *
adap2pinfo(struct adapter *sc, int idx)
{
+
return (sc->port[idx]);
}
static inline void
t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[])
{
+
bcopy(hw_addr, sc->port[idx]->hw_addr, ETHER_ADDR_LEN);
}
static inline bool is_10G_port(const struct port_info *pi)
{
+
return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0);
}
static inline int tx_resume_threshold(struct sge_eq *eq)
{
+
return (eq->qsize / 4);
}
@@ -698,6 +740,7 @@ void t4_os_portmod_changed(const struct adapter *, int);
void t4_os_link_changed(struct adapter *, int, int);
void t4_iterate(void (*)(struct adapter *, void *), void *);
int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t);
+int t4_register_an_handler(struct adapter *, an_handler_t);
/* t4_sge.c */
void t4_sge_modload(void);
@@ -714,21 +757,45 @@ void t4_intr_all(void *);
void t4_intr(void *);
void t4_intr_err(void *);
void t4_intr_evt(void *);
-int t4_mgmt_tx(struct adapter *, struct mbuf *);
-int t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct mbuf *);
+void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *);
void t4_update_fl_bufsize(struct ifnet *);
int can_resume_tx(struct sge_eq *);
-static inline int t4_wrq_tx(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m)
+static inline struct wrqe *
+alloc_wrqe(int wr_len, struct sge_wrq *wrq)
{
- int rc;
+ int len = offsetof(struct wrqe, wr) + wr_len;
+ struct wrqe *wr;
+
+ wr = malloc(len, M_CXGBE, M_NOWAIT);
+ if (__predict_false(wr == NULL))
+ return (NULL);
+ wr->wr_len = wr_len;
+ wr->wrq = wrq;
+ return (wr);
+}
+
+static inline void *
+wrtod(struct wrqe *wr)
+{
+ return (&wr->wr[0]);
+}
+
+static inline void
+free_wrqe(struct wrqe *wr)
+{
+ free(wr, M_CXGBE);
+}
+
+static inline void
+t4_wrq_tx(struct adapter *sc, struct wrqe *wr)
+{
+ struct sge_wrq *wrq = wr->wrq;
TXQ_LOCK(wrq);
- rc = t4_wrq_tx_locked(sc, wrq, m);
+ t4_wrq_tx_locked(sc, wrq, wr);
TXQ_UNLOCK(wrq);
- return (rc);
}
-
#endif
diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c
index 6f4dd8d..f629cbe 100644
--- a/sys/dev/cxgbe/common/t4_hw.c
+++ b/sys/dev/cxgbe/common/t4_hw.c
@@ -27,6 +27,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+
#include "common.h"
#include "t4_regs.h"
#include "t4_regs_values.h"
diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h
index f6ada9d..1ae9f1f 100644
--- a/sys/dev/cxgbe/offload.h
+++ b/sys/dev/cxgbe/offload.h
@@ -31,12 +31,6 @@
#ifndef __T4_OFFLOAD_H__
#define __T4_OFFLOAD_H__
-/* XXX: flagrant misuse of mbuf fields (during tx by TOM) */
-#define MBUF_EQ(m) (*((void **)(&(m)->m_pkthdr.rcvif)))
-/* These have to work for !M_PKTHDR so we use a field from m_hdr. */
-#define MBUF_TX_CREDITS(m) ((m)->m_hdr.pad[0])
-#define MBUF_DMA_MAPPED(m) ((m)->m_hdr.pad[1])
-
#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \
(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
@@ -119,7 +113,7 @@ struct t4_virt_res { /* virtualized HW resources */
struct t4_range ocq;
};
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
enum {
ULD_TOM = 1,
};
@@ -130,13 +124,8 @@ struct uld_info {
SLIST_ENTRY(uld_info) link;
int refcount;
int uld_id;
- int (*attach)(struct adapter *, void **);
- int (*detach)(void *);
-};
-
-struct uld_softc {
- struct uld_info *uld;
- void *softc;
+ int (*activate)(struct adapter *);
+ int (*deactivate)(struct adapter *);
};
struct tom_tunables {
@@ -148,6 +137,8 @@ struct tom_tunables {
int t4_register_uld(struct uld_info *);
int t4_unregister_uld(struct uld_info *);
+int t4_activate_uld(struct adapter *, int);
+int t4_deactivate_uld(struct adapter *, int);
#endif
#endif
diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c
index 55491cd..8373c32 100644
--- a/sys/dev/cxgbe/t4_l2t.c
+++ b/sys/dev/cxgbe/t4_l2t.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2011 Chelsio Communications, Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,16 +38,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/sbuf.h>
-#include <net/if.h>
-#include <net/if_types.h>
-#include <net/ethernet.h>
-#include <net/if_vlan_var.h>
-#include <net/if_dl.h>
-#include <net/if_llatbl.h>
-#include <net/route.h>
#include <netinet/in.h>
-#include <netinet/in_var.h>
-#include <netinet/if_ether.h>
#include "common/common.h"
#include "common/jhash.h"
@@ -72,42 +63,11 @@ __FBSDID("$FreeBSD$");
* lifetime of an L2T entry is fully contained in the lifetime of the TOE.
*/
-/* identifies sync vs async L2T_WRITE_REQs */
-#define S_SYNC_WR 12
-#define V_SYNC_WR(x) ((x) << S_SYNC_WR)
-#define F_SYNC_WR V_SYNC_WR(1)
-
-enum {
- L2T_STATE_VALID, /* entry is up to date */
- L2T_STATE_STALE, /* entry may be used but needs revalidation */
- L2T_STATE_RESOLVING, /* entry needs address resolution */
- L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */
-
- /* when state is one of the below the entry is not hashed */
- L2T_STATE_SWITCHING, /* entry is being used by a switching filter */
- L2T_STATE_UNUSED /* entry not in use */
-};
-
-struct l2t_data {
- struct rwlock lock;
- volatile int nfree; /* number of free entries */
- struct l2t_entry *rover;/* starting point for next allocation */
- struct l2t_entry l2tab[L2T_SIZE];
-};
-
-static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *,
- struct mbuf *);
-
-#define VLAN_NONE 0xfff
-#define SA(x) ((struct sockaddr *)(x))
-#define SIN(x) ((struct sockaddr_in *)(x))
-#define SINADDR(x) (SIN(x)->sin_addr.s_addr)
-
/*
* Allocate a free L2T entry. Must be called with l2t_data.lock held.
*/
-static struct l2t_entry *
-alloc_l2e(struct l2t_data *d)
+struct l2t_entry *
+t4_alloc_l2e(struct l2t_data *d)
{
struct l2t_entry *end, *e, **p;
@@ -121,7 +81,8 @@ alloc_l2e(struct l2t_data *d)
if (atomic_load_acq_int(&e->refcnt) == 0)
goto found;
- for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ;
+ for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e)
+ continue;
found:
d->rover = e + 1;
atomic_subtract_int(&d->nfree, 1);
@@ -148,19 +109,18 @@ found:
* Write an L2T entry. Must be called with the entry locked.
* The write may be synchronous or asynchronous.
*/
-static int
-write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
+int
+t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
{
- struct mbuf *m;
+ struct wrqe *wr;
struct cpl_l2t_write_req *req;
mtx_assert(&e->lock, MA_OWNED);
- if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
+ wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
+ if (wr == NULL)
return (ENOMEM);
-
- req = mtod(m, struct cpl_l2t_write_req *);
- m->m_pkthdr.len = m->m_len = sizeof(*req);
+ req = wrtod(wr);
INIT_TP_WR(req, 0);
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx |
@@ -170,7 +130,7 @@ write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
req->vlan = htons(e->vlan);
memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
- t4_mgmt_tx(sc, m);
+ t4_wrq_tx(sc, wr);
if (sync && e->state != L2T_STATE_SWITCHING)
e->state = L2T_STATE_SYNC_WRITE;
@@ -189,7 +149,7 @@ t4_l2t_alloc_switching(struct l2t_data *d)
struct l2t_entry *e;
rw_rlock(&d->lock);
- e = alloc_l2e(d);
+ e = t4_alloc_l2e(d);
if (e) {
mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
e->state = L2T_STATE_SWITCHING;
@@ -214,7 +174,7 @@ t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan,
e->lport = port;
memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN);
mtx_lock(&e->lock);
- rc = write_l2e(sc, e, 0);
+ rc = t4_write_l2e(sc, e, 0);
mtx_unlock(&e->lock);
return (rc);
}
@@ -234,10 +194,13 @@ t4_init_l2t(struct adapter *sc, int flags)
rw_init(&d->lock, "L2T");
for (i = 0; i < L2T_SIZE; i++) {
- d->l2tab[i].idx = i;
- d->l2tab[i].state = L2T_STATE_UNUSED;
- mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF);
- atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
+ struct l2t_entry *e = &d->l2tab[i];
+
+ e->idx = i;
+ e->state = L2T_STATE_UNUSED;
+ mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF);
+ STAILQ_INIT(&e->wr_list);
+ atomic_store_rel_int(&e->refcnt, 0);
}
sc->l2t = d;
@@ -259,6 +222,24 @@ t4_free_l2t(struct l2t_data *d)
return (0);
}
+int
+do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(rpl);
+ unsigned int idx = tid & (L2T_SIZE - 1);
+
+ if (__predict_false(rpl->status != CPL_ERR_NONE)) {
+ log(LOG_ERR,
+ "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
+ rpl->status, idx);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
#ifdef SBUF_DRAIN
static inline unsigned int
vlan_prio(const struct l2t_entry *e)
@@ -273,7 +254,7 @@ l2e_state(const struct l2t_entry *e)
case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */
case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */
case L2T_STATE_SYNC_WRITE: return 'W';
- case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R';
+ case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A';
case L2T_STATE_SWITCHING: return 'X';
default: return 'U';
}
@@ -311,20 +292,20 @@ sysctl_l2t(SYSCTL_HANDLER_ARGS)
"Ethernet address VLAN/P LP State Users Port");
header = 1;
}
- if (e->state == L2T_STATE_SWITCHING || e->v6)
+ if (e->state == L2T_STATE_SWITCHING)
ip[0] = 0;
else
snprintf(ip, sizeof(ip), "%s",
- inet_ntoa(*(struct in_addr *)&e->addr[0]));
+ inet_ntoa(*(struct in_addr *)&e->addr));
- /* XXX: accessing lle probably not safe? */
+ /* XXX: e->ifp may not be around */
sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d"
" %u %2u %c %5u %s",
e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2],
e->dmac[3], e->dmac[4], e->dmac[5],
e->vlan & 0xfff, vlan_prio(e), e->lport,
l2e_state(e), atomic_load_acq_int(&e->refcnt),
- e->lle ? e->lle->lle_tbl->llt_ifp->if_xname : "");
+ e->ifp->if_xname);
skip:
mtx_unlock(&e->lock);
}
@@ -335,459 +316,3 @@ skip:
return (rc);
}
#endif
-
-#ifndef TCP_OFFLOAD_DISABLE
-static inline void
-l2t_hold(struct l2t_data *d, struct l2t_entry *e)
-{
- if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */
- atomic_subtract_int(&d->nfree, 1);
-}
-
-/*
- * To avoid having to check address families we do not allow v4 and v6
- * neighbors to be on the same hash chain. We keep v4 entries in the first
- * half of available hash buckets and v6 in the second.
- */
-enum {
- L2T_SZ_HALF = L2T_SIZE / 2,
- L2T_HASH_MASK = L2T_SZ_HALF - 1
-};
-
-static inline unsigned int
-arp_hash(const uint32_t *key, int ifindex)
-{
- return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK;
-}
-
-static inline unsigned int
-ipv6_hash(const uint32_t *key, int ifindex)
-{
- uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3];
-
- return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK);
-}
-
-static inline unsigned int
-addr_hash(const uint32_t *addr, int addr_len, int ifindex)
-{
- return addr_len == 4 ? arp_hash(addr, ifindex) :
- ipv6_hash(addr, ifindex);
-}
-
-/*
- * Checks if an L2T entry is for the given IP/IPv6 address. It does not check
- * whether the L2T entry and the address are of the same address family.
- * Callers ensure an address is only checked against L2T entries of the same
- * family, something made trivial by the separation of IP and IPv6 hash chains
- * mentioned above. Returns 0 if there's a match,
- */
-static inline int
-addreq(const struct l2t_entry *e, const uint32_t *addr)
-{
- if (e->v6)
- return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) |
- (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]);
- return e->addr[0] ^ addr[0];
-}
-
-/*
- * Add a packet to an L2T entry's queue of packets awaiting resolution.
- * Must be called with the entry's lock held.
- */
-static inline void
-arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
-{
- mtx_assert(&e->lock, MA_OWNED);
-
- KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__));
- if (e->arpq_head)
- e->arpq_tail->m_nextpkt = m;
- else
- e->arpq_head = m;
- e->arpq_tail = m;
-}
-
-static inline void
-send_pending(struct adapter *sc, struct l2t_entry *e)
-{
- struct mbuf *m, *next;
-
- mtx_assert(&e->lock, MA_OWNED);
-
- for (m = e->arpq_head; m; m = next) {
- next = m->m_nextpkt;
- m->m_nextpkt = NULL;
- t4_wrq_tx(sc, MBUF_EQ(m), m);
- }
- e->arpq_head = e->arpq_tail = NULL;
-}
-
-#ifdef INET
-/*
- * Looks up and fills up an l2t_entry's lle. We grab all the locks that we need
- * ourself, and update e->state at the end if e->lle was successfully filled.
- *
- * The lle passed in comes from arpresolve and is ignored as it does not appear
- * to be of much use.
- */
-static int
-l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused)
-{
- int rc = 0;
- struct sockaddr_in sin;
- struct ifnet *ifp = e->ifp;
- struct llentry *lle;
-
- bzero(&sin, sizeof(struct sockaddr_in));
- if (e->v6)
- panic("%s: IPv6 L2 resolution not supported yet.", __func__);
-
- sin.sin_family = AF_INET;
- sin.sin_len = sizeof(struct sockaddr_in);
- memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
-
- mtx_assert(&e->lock, MA_NOTOWNED);
- KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__));
-
- IF_AFDATA_LOCK(ifp);
- lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin));
- IF_AFDATA_UNLOCK(ifp);
- if (!LLE_IS_VALID(lle))
- return (ENOMEM);
- if (!(lle->la_flags & LLE_VALID)) {
- rc = EINVAL;
- goto done;
- }
-
- LLE_ADDREF(lle);
-
- mtx_lock(&e->lock);
- if (e->state == L2T_STATE_RESOLVING) {
- KASSERT(e->lle == NULL, ("%s: lle already valid", __func__));
- e->lle = lle;
- memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
- write_l2e(sc, e, 1);
- } else {
- KASSERT(e->lle == lle, ("%s: lle changed", __func__));
- LLE_REMREF(lle);
- }
- mtx_unlock(&e->lock);
-done:
- LLE_WUNLOCK(lle);
- return (rc);
-}
-#endif
-
-int
-t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e)
-{
-#ifndef INET
- return (EINVAL);
-#else
- struct llentry *lle = NULL;
- struct sockaddr_in sin;
- struct ifnet *ifp = e->ifp;
-
- if (e->v6)
- panic("%s: IPv6 L2 resolution not supported yet.", __func__);
-
- bzero(&sin, sizeof(struct sockaddr_in));
- sin.sin_family = AF_INET;
- sin.sin_len = sizeof(struct sockaddr_in);
- memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
-
-again:
- switch (e->state) {
- case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
- if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
- l2t_fill_lle(sc, e, lle);
-
- /* Fall through */
-
- case L2T_STATE_VALID: /* fast-path, send the packet on */
- return t4_wrq_tx(sc, MBUF_EQ(m), m);
-
- case L2T_STATE_RESOLVING:
- case L2T_STATE_SYNC_WRITE:
- mtx_lock(&e->lock);
- if (e->state != L2T_STATE_SYNC_WRITE &&
- e->state != L2T_STATE_RESOLVING) {
- /* state changed by the time we got here */
- mtx_unlock(&e->lock);
- goto again;
- }
- arpq_enqueue(e, m);
- mtx_unlock(&e->lock);
-
- if (e->state == L2T_STATE_RESOLVING &&
- arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
- l2t_fill_lle(sc, e, lle);
- }
-
- return (0);
-#endif
-}
-
-/*
- * Called when an L2T entry has no more users. The entry is left in the hash
- * table since it is likely to be reused but we also bump nfree to indicate
- * that the entry can be reallocated for a different neighbor. We also drop
- * the existing neighbor reference in case the neighbor is going away and is
- * waiting on our reference.
- *
- * Because entries can be reallocated to other neighbors once their ref count
- * drops to 0 we need to take the entry's lock to avoid races with a new
- * incarnation.
- */
-static void
-t4_l2e_free(struct l2t_entry *e)
-{
- struct llentry *lle = NULL;
- struct l2t_data *d;
-
- mtx_lock(&e->lock);
- if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */
- lle = e->lle;
- e->lle = NULL;
- /*
- * Don't need to worry about the arpq, an L2T entry can't be
- * released if any packets are waiting for resolution as we
- * need to be able to communicate with the device to close a
- * connection.
- */
- }
- mtx_unlock(&e->lock);
-
- d = container_of(e, struct l2t_data, l2tab[e->idx]);
- atomic_add_int(&d->nfree, 1);
-
- if (lle)
- LLE_FREE(lle);
-}
-
-void
-t4_l2t_release(struct l2t_entry *e)
-{
- if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
- t4_l2e_free(e);
-}
-
-static int
-do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss,
- struct mbuf *m)
-{
- struct adapter *sc = iq->adapter;
- const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
- unsigned int tid = GET_TID(rpl);
- unsigned int idx = tid & (L2T_SIZE - 1);
-
- if (__predict_false(rpl->status != CPL_ERR_NONE)) {
- log(LOG_ERR,
- "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
- rpl->status, idx);
- return (EINVAL);
- }
-
- if (tid & F_SYNC_WR) {
- struct l2t_entry *e = &sc->l2t->l2tab[idx];
-
- mtx_lock(&e->lock);
- if (e->state != L2T_STATE_SWITCHING) {
- send_pending(sc, e);
- e->state = L2T_STATE_VALID;
- }
- mtx_unlock(&e->lock);
- }
-
- return (0);
-}
-
-/*
- * Reuse an L2T entry that was previously used for the same next hop.
- */
-static void
-reuse_entry(struct l2t_entry *e)
-{
- struct llentry *lle;
-
- mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
- lle = e->lle;
- if (lle) {
- KASSERT(lle->la_flags & LLE_VALID,
- ("%s: invalid lle stored in l2t_entry", __func__));
-
- if (lle->la_expire >= time_uptime)
- e->state = L2T_STATE_STALE;
- else
- e->state = L2T_STATE_VALID;
- } else
- e->state = L2T_STATE_RESOLVING;
- mtx_unlock(&e->lock);
-}
-
-/*
- * The TOE wants an L2 table entry that it can use to reach the next hop over
- * the specified port. Produce such an entry - create one if needed.
- *
- * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on
- * top of the real cxgbe interface.
- */
-struct l2t_entry *
-t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
-{
- struct l2t_entry *e;
- struct l2t_data *d = pi->adapter->l2t;
- int addr_len;
- uint32_t *addr;
- int hash;
- struct sockaddr_in6 *sin6;
- unsigned int smt_idx = pi->port_id;
-
- if (sa->sa_family == AF_INET) {
- addr = (uint32_t *)&SINADDR(sa);
- addr_len = sizeof(SINADDR(sa));
- } else if (sa->sa_family == AF_INET6) {
- sin6 = (struct sockaddr_in6 *)sa;
- addr = (uint32_t *)&sin6->sin6_addr.s6_addr;
- addr_len = sizeof(sin6->sin6_addr.s6_addr);
- } else
- return (NULL);
-
-#ifndef VLAN_TAG
- if (ifp->if_type == IFT_L2VLAN)
- return (NULL);
-#endif
-
- hash = addr_hash(addr, addr_len, ifp->if_index);
-
- rw_wlock(&d->lock);
- for (e = d->l2tab[hash].first; e; e = e->next) {
- if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){
- l2t_hold(d, e);
- if (atomic_load_acq_int(&e->refcnt) == 1)
- reuse_entry(e);
- goto done;
- }
- }
-
- /* Need to allocate a new entry */
- e = alloc_l2e(d);
- if (e) {
- mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
- e->state = L2T_STATE_RESOLVING;
- memcpy(e->addr, addr, addr_len);
- e->ifindex = ifp->if_index;
- e->smt_idx = smt_idx;
- e->ifp = ifp;
- e->hash = hash;
- e->lport = pi->lport;
- e->v6 = (addr_len == 16);
- e->lle = NULL;
- atomic_store_rel_int(&e->refcnt, 1);
-#ifdef VLAN_TAG
- if (ifp->if_type == IFT_L2VLAN)
- VLAN_TAG(ifp, &e->vlan);
- else
- e->vlan = VLAN_NONE;
-#endif
- e->next = d->l2tab[hash].first;
- d->l2tab[hash].first = e;
- mtx_unlock(&e->lock);
- }
-done:
- rw_wunlock(&d->lock);
- return e;
-}
-
-/*
- * Called when the host's neighbor layer makes a change to some entry that is
- * loaded into the HW L2 table.
- */
-void
-t4_l2t_update(struct adapter *sc, struct llentry *lle)
-{
- struct l2t_entry *e;
- struct l2t_data *d = sc->l2t;
- struct sockaddr *sa = L3_ADDR(lle);
- struct llentry *old_lle = NULL;
- uint32_t *addr = (uint32_t *)&SINADDR(sa);
- struct ifnet *ifp = lle->lle_tbl->llt_ifp;
- int hash = addr_hash(addr, sizeof(*addr), ifp->if_index);
-
- KASSERT(d != NULL, ("%s: no L2 table", __func__));
- LLE_WLOCK_ASSERT(lle);
- KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED,
- ("%s: entry neither valid nor deleted.", __func__));
-
- rw_rlock(&d->lock);
- for (e = d->l2tab[hash].first; e; e = e->next) {
- if (!addreq(e, addr) && e->ifp == ifp) {
- mtx_lock(&e->lock);
- if (atomic_load_acq_int(&e->refcnt))
- goto found;
- e->state = L2T_STATE_STALE;
- mtx_unlock(&e->lock);
- break;
- }
- }
- rw_runlock(&d->lock);
-
- /* The TOE has no interest in this LLE */
- return;
-
- found:
- rw_runlock(&d->lock);
-
- if (atomic_load_acq_int(&e->refcnt)) {
-
- /* Entry is referenced by at least 1 offloaded connection. */
-
- /* Handle deletes first */
- if (lle->la_flags & LLE_DELETED) {
- if (lle == e->lle) {
- e->lle = NULL;
- e->state = L2T_STATE_RESOLVING;
- LLE_REMREF(lle);
- }
- goto done;
- }
-
- if (lle != e->lle) {
- old_lle = e->lle;
- LLE_ADDREF(lle);
- e->lle = lle;
- }
-
- if (e->state == L2T_STATE_RESOLVING ||
- memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) {
-
- /* unresolved -> resolved; or dmac changed */
-
- memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
- write_l2e(sc, e, 1);
- } else {
-
- /* +ve reinforcement of a valid or stale entry */
-
- }
-
- e->state = L2T_STATE_VALID;
-
- } else {
- /*
- * Entry was used previously but is unreferenced right now.
- * e->lle has been released and NULL'd out by t4_l2t_free, or
- * l2t_release is about to call t4_l2t_free and do that.
- *
- * Either way this is of no interest to us.
- */
- }
-
-done:
- mtx_unlock(&e->lock);
- if (old_lle)
- LLE_FREE(old_lle);
-}
-
-#endif
diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h
index 5dfce83..0303885 100644
--- a/sys/dev/cxgbe/t4_l2t.h
+++ b/sys/dev/cxgbe/t4_l2t.h
@@ -30,8 +30,25 @@
#ifndef __T4_L2T_H
#define __T4_L2T_H
+/* identifies sync vs async L2T_WRITE_REQs */
+#define S_SYNC_WR 12
+#define V_SYNC_WR(x) ((x) << S_SYNC_WR)
+#define F_SYNC_WR V_SYNC_WR(1)
+
enum { L2T_SIZE = 4096 }; /* # of L2T entries */
+enum {
+ L2T_STATE_VALID, /* entry is up to date */
+ L2T_STATE_STALE, /* entry may be used but needs revalidation */
+ L2T_STATE_RESOLVING, /* entry needs address resolution */
+ L2T_STATE_FAILED, /* failed to resolve */
+ L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */
+
+ /* when state is one of the below the entry is not hashed */
+ L2T_STATE_SWITCHING, /* entry is being used by a switching filter */
+ L2T_STATE_UNUSED /* entry not in use */
+};
+
/*
* Each L2T entry plays multiple roles. First of all, it keeps state for the
* corresponding entry of the HW L2 table and maintains a queue of offload
@@ -43,39 +60,49 @@ enum { L2T_SIZE = 4096 }; /* # of L2T entries */
struct l2t_entry {
uint16_t state; /* entry state */
uint16_t idx; /* entry index */
- uint32_t addr[4]; /* next hop IP or IPv6 address */
+ uint32_t addr; /* next hop IP address */
struct ifnet *ifp; /* outgoing interface */
uint16_t smt_idx; /* SMT index */
uint16_t vlan; /* VLAN TCI (id: 0-11, prio: 13-15) */
- int ifindex; /* interface index */
- struct llentry *lle; /* llentry for next hop */
struct l2t_entry *first; /* start of hash chain */
struct l2t_entry *next; /* next l2t_entry on chain */
- struct mbuf *arpq_head; /* list of mbufs awaiting resolution */
- struct mbuf *arpq_tail;
+ STAILQ_HEAD(, wrqe) wr_list; /* list of WRs awaiting resolution */
struct mtx lock;
volatile int refcnt; /* entry reference count */
uint16_t hash; /* hash bucket the entry is on */
- uint8_t v6; /* whether entry is for IPv6 */
uint8_t lport; /* associated offload logical port */
uint8_t dmac[ETHER_ADDR_LEN]; /* next hop's MAC address */
};
+struct l2t_data {
+ struct rwlock lock;
+ volatile int nfree; /* number of free entries */
+ struct l2t_entry *rover;/* starting point for next allocation */
+ struct l2t_entry l2tab[L2T_SIZE];
+};
+
+
int t4_init_l2t(struct adapter *, int);
int t4_free_l2t(struct l2t_data *);
+struct l2t_entry *t4_alloc_l2e(struct l2t_data *);
struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *);
int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t,
uint8_t, uint8_t *);
-void t4_l2t_release(struct l2t_entry *);
+int t4_write_l2e(struct adapter *, struct l2t_entry *, int);
+int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *);
+
+static inline void
+t4_l2t_release(struct l2t_entry *e)
+{
+ struct l2t_data *d = container_of(e, struct l2t_data, l2tab[e->idx]);
+
+ if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
+ atomic_add_int(&d->nfree, 1);
+}
+
+
#ifdef SBUF_DRAIN
int sysctl_l2t(SYSCTL_HANDLER_ARGS);
#endif
-#ifndef TCP_OFFLOAD_DISABLE
-struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
- struct sockaddr *);
-int t4_l2t_send(struct adapter *, struct mbuf *, struct l2t_entry *);
-void t4_l2t_update(struct adapter *, struct llentry *);
-#endif
-
#endif /* __T4_L2T_H */
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index 874a6ad..a91363b 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -119,9 +119,13 @@ static void cxgbe_media_status(struct ifnet *, struct ifmediareq *);
MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4 Ethernet driver and services");
+/*
+ * Correct lock order when you need to acquire multiple locks is t4_list_lock,
+ * then ADAPTER_LOCK, then t4_uld_list_lock.
+ */
static struct mtx t4_list_lock;
static SLIST_HEAD(, adapter) t4_list;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static struct mtx t4_uld_list_lock;
static SLIST_HEAD(, uld_info) t4_uld_list;
#endif
@@ -149,7 +153,7 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g);
static int t4_nrxq1g = -1;
TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
#define NOFLDTXQ_10G 8
static int t4_nofldtxq10g = -1;
TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g);
@@ -237,7 +241,7 @@ struct intrs_and_queues {
int nrxq10g; /* # of NIC rxq's for each 10G port */
int ntxq1g; /* # of NIC txq's for each 1G port */
int nrxq1g; /* # of NIC rxq's for each 1G port */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
int nofldtxq10g; /* # of TOE txq's for each 10G port */
int nofldrxq10g; /* # of TOE rxq's for each 10G port */
int nofldtxq1g; /* # of TOE txq's for each 1G port */
@@ -297,8 +301,10 @@ static void reg_block_dump(struct adapter *, uint8_t *, unsigned int,
unsigned int);
static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *);
static void cxgbe_tick(void *);
+static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t);
static int cpl_not_handled(struct sge_iq *, const struct rss_header *,
struct mbuf *);
+static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *);
static int t4_sysctls(struct adapter *);
static int cxgbe_sysctls(struct port_info *);
static int sysctl_int_array(SYSCTL_HANDLER_ARGS);
@@ -342,10 +348,8 @@ static int filter_rpl(struct sge_iq *, const struct rss_header *,
struct mbuf *);
static int get_sge_context(struct adapter *, struct t4_sge_context *);
static int read_card_mem(struct adapter *, struct t4_mem_range *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int toe_capability(struct port_info *, int);
-static int activate_uld(struct adapter *, int, struct uld_softc *);
-static int deactivate_uld(struct uld_softc *);
#endif
static int t4_mod_event(module_t, int, void *);
@@ -368,8 +372,12 @@ struct t4_pciids {
{0x440a, 4, "Chelsio T404-BT"},
};
-#ifndef TCP_OFFLOAD_DISABLE
-/* This is used in service_iq() to get to the fl associated with an iq. */
+#ifdef TCP_OFFLOAD
+/*
+ * service_iq() has an iq and needs the fl. Offset of fl from the iq should be
+ * exactly the same for both rxq and ofld_rxq.
+ */
+CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq));
CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
#endif
@@ -401,7 +409,7 @@ t4_attach(device_t dev)
int rc = 0, i, n10g, n1g, rqidx, tqidx;
struct intrs_and_queues iaq;
struct sge *s;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
int ofld_rqidx, ofld_tqidx;
#endif
@@ -436,6 +444,7 @@ t4_attach(device_t dev)
goto done; /* error message displayed already */
memset(sc->chan_map, 0xff, sizeof(sc->chan_map));
+ sc->an_handler = an_not_handled;
for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++)
sc->cpl_handler[i] = cpl_not_handled;
t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl);
@@ -595,7 +604,7 @@ t4_attach(device_t dev)
s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */
s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g;
@@ -631,7 +640,7 @@ t4_attach(device_t dev)
* tx queues that each port should get.
*/
rqidx = tqidx = 0;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
ofld_rqidx = ofld_tqidx = 0;
#endif
for_each_port(sc, i) {
@@ -653,7 +662,7 @@ t4_attach(device_t dev)
rqidx += pi->nrxq;
tqidx += pi->ntxq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
pi->first_ofld_rxq = ofld_rqidx;
pi->first_ofld_txq = ofld_tqidx;
@@ -761,7 +770,7 @@ t4_detach(device_t dev)
if (sc->l2t)
t4_free_l2t(sc->l2t);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
free(sc->sge.ofld_rxq, M_CXGBE);
free(sc->sge.ofld_txq, M_CXGBE);
#endif
@@ -832,7 +841,7 @@ cxgbe_attach(device_t dev)
ifp->if_qflush = cxgbe_qflush;
ifp->if_capabilities = T4_CAP;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(pi->adapter))
ifp->if_capabilities |= IFCAP_TOE4;
#endif
@@ -844,9 +853,12 @@ cxgbe_attach(device_t dev)
cxgbe_media_status);
build_medialist(pi);
+ pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp,
+ EVENTHANDLER_PRI_ANY);
+
ether_ifattach(ifp, pi->hw_addr);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(pi->adapter)) {
device_printf(dev,
"%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n",
@@ -876,6 +888,9 @@ cxgbe_detach(device_t dev)
SET_BUSY(sc);
ADAPTER_UNLOCK(sc);
+ if (pi->vlan_c)
+ EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c);
+
PORT_LOCK(pi);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
callout_stop(&pi->tick);
@@ -1042,7 +1057,7 @@ fail:
}
#endif
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (mask & IFCAP_TOE) {
int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE;
@@ -1292,7 +1307,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g,
iaq->ntxq1g = t4_ntxq1g;
iaq->nrxq10g = nrxq10g = t4_nrxq10g;
iaq->nrxq1g = nrxq1g = t4_nrxq1g;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
iaq->nofldtxq10g = t4_nofldtxq10g;
iaq->nofldtxq1g = t4_nofldtxq1g;
iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g;
@@ -1364,7 +1379,7 @@ restart:
n++;
}
iaq->nrxq10g = min(n, nrxq10g);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
iaq->nofldrxq10g = min(n, nofldrxq10g);
#endif
}
@@ -1379,7 +1394,7 @@ restart:
n++;
}
iaq->nrxq1g = min(n, nrxq1g);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
iaq->nofldrxq1g = min(n, nofldrxq1g);
#endif
}
@@ -1392,7 +1407,7 @@ restart:
* Least desirable option: one interrupt vector for everything.
*/
iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
iaq->nofldrxq10g = iaq->nofldrxq1g = 1;
#endif
@@ -2305,7 +2320,7 @@ adapter_full_init(struct adapter *sc)
struct irq *irq;
struct port_info *pi;
struct sge_rxq *rxq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
struct sge_ofld_rxq *ofld_rxq;
#endif
@@ -2369,7 +2384,7 @@ adapter_full_init(struct adapter *sc)
for_each_port(sc, p) {
pi = sc->port[p];
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
/*
* Skip over the NIC queues if they aren't taking direct
* interrupts.
@@ -2386,7 +2401,7 @@ adapter_full_init(struct adapter *sc)
rid++;
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
/*
* Skip over the offload queues if they aren't taking
* direct interrupts.
@@ -2494,7 +2509,7 @@ port_full_uninit(struct port_info *pi)
int i;
struct sge_rxq *rxq;
struct sge_txq *txq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
struct sge_ofld_rxq *ofld_rxq;
struct sge_wrq *ofld_txq;
#endif
@@ -2507,7 +2522,7 @@ port_full_uninit(struct port_info *pi)
quiesce_eq(sc, &txq->eq);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_txq(pi, i, ofld_txq) {
quiesce_eq(sc, &ofld_txq->eq);
}
@@ -2518,7 +2533,7 @@ port_full_uninit(struct port_info *pi)
quiesce_fl(sc, &rxq->fl);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_rxq(pi, i, ofld_rxq) {
quiesce_iq(sc, &ofld_rxq->iq);
quiesce_fl(sc, &ofld_rxq->fl);
@@ -2892,14 +2907,27 @@ cxgbe_tick(void *arg)
PORT_UNLOCK(pi);
}
+static void
+cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid)
+{
+ struct ifnet *vlan;
+
+ if (arg != ifp)
+ return;
+
+ vlan = VLAN_DEVAT(ifp, vid);
+ VLAN_SETCOOKIE(vlan, ifp);
+}
+
static int
cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
+
#ifdef INVARIANTS
- panic("%s: opcode %02x on iq %p with payload %p",
+ panic("%s: opcode 0x%02x on iq %p with payload %p",
__func__, rss->opcode, iq, m);
#else
- log(LOG_ERR, "%s: opcode %02x on iq %p with payload %p",
+ log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p",
__func__, rss->opcode, iq, m);
m_freem(m);
#endif
@@ -2922,6 +2950,31 @@ t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h)
}
static int
+an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
+{
+
+#ifdef INVARIANTS
+ panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
+#else
+ log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)",
+ __func__, iq, ctrl);
+#endif
+ return (EDOOFUS);
+}
+
+int
+t4_register_an_handler(struct adapter *sc, an_handler_t h)
+{
+ uintptr_t *loc, new;
+
+ new = h ? (uintptr_t)h : (uintptr_t)an_not_handled;
+ loc = (uintptr_t *) &sc->an_handler;
+ atomic_store_rel_ptr(loc, new);
+
+ return (0);
+}
+
+static int
t4_sysctls(struct adapter *sc)
{
struct sysctl_ctx_list *ctx;
@@ -3072,7 +3125,7 @@ t4_sysctls(struct adapter *sc)
sysctl_tx_rate, "A", "Tx rate");
#endif
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
/*
* dev.t4nex.X.toe.
@@ -3125,7 +3178,7 @@ cxgbe_sysctls(struct port_info *pi)
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD,
&pi->first_txq, 0, "index of first tx queue");
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(pi->adapter)) {
SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD,
&pi->nofldrxq, 0,
@@ -4543,7 +4596,7 @@ set_filter_mode(struct adapter *sc, uint32_t mode)
goto done;
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (sc->offload_map) {
rc = EBUSY;
goto done;
@@ -4734,7 +4787,7 @@ static int
set_filter_wr(struct adapter *sc, int fidx)
{
struct filter_entry *f = &sc->tids.ftid_tab[fidx];
- struct mbuf *m;
+ struct wrqe *wr;
struct fw_filter_wr *fwr;
unsigned int ftid;
@@ -4755,12 +4808,11 @@ set_filter_wr(struct adapter *sc, int fidx)
ftid = sc->tids.ftid_base + fidx;
- m = m_gethdr(M_NOWAIT, MT_DATA);
- if (m == NULL)
+ wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
+ if (wr == NULL)
return (ENOMEM);
- fwr = mtod(m, struct fw_filter_wr *);
- m->m_len = m->m_pkthdr.len = sizeof(*fwr);
+ fwr = wrtod(wr);
bzero(fwr, sizeof (*fwr));
fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR));
@@ -4830,7 +4882,7 @@ set_filter_wr(struct adapter *sc, int fidx)
f->pending = 1;
sc->tids.ftids_in_use++;
- t4_mgmt_tx(sc, m);
+ t4_wrq_tx(sc, wr);
return (0);
}
@@ -4838,7 +4890,7 @@ static int
del_filter_wr(struct adapter *sc, int fidx)
{
struct filter_entry *f = &sc->tids.ftid_tab[fidx];
- struct mbuf *m;
+ struct wrqe *wr;
struct fw_filter_wr *fwr;
unsigned int ftid;
@@ -4846,18 +4898,16 @@ del_filter_wr(struct adapter *sc, int fidx)
ftid = sc->tids.ftid_base + fidx;
- m = m_gethdr(M_NOWAIT, MT_DATA);
- if (m == NULL)
+ wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
+ if (wr == NULL)
return (ENOMEM);
-
- fwr = mtod(m, struct fw_filter_wr *);
- m->m_len = m->m_pkthdr.len = sizeof(*fwr);
+ fwr = wrtod(wr);
bzero(fwr, sizeof (*fwr));
t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id);
f->pending = 1;
- t4_mgmt_tx(sc, m);
+ t4_wrq_tx(sc, wr);
return (0);
}
@@ -5215,7 +5265,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
return (rc);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int
toe_capability(struct port_info *pi, int enable)
{
@@ -5228,13 +5278,28 @@ toe_capability(struct port_info *pi, int enable)
return (ENODEV);
if (enable) {
+ if (!(sc->flags & FULL_INIT_DONE)) {
+ log(LOG_WARNING,
+ "You must enable a cxgbe interface first\n");
+ return (EAGAIN);
+ }
+
if (isset(&sc->offload_map, pi->port_id))
return (0);
- if (sc->offload_map == 0) {
- rc = activate_uld(sc, ULD_TOM, &sc->tom);
+ if (!(sc->flags & TOM_INIT_DONE)) {
+ rc = t4_activate_uld(sc, ULD_TOM);
+ if (rc == EAGAIN) {
+ log(LOG_WARNING,
+ "You must kldload t4_tom.ko before trying "
+ "to enable TOE on a cxgbe interface.\n");
+ }
if (rc != 0)
return (rc);
+ KASSERT(sc->tom_softc != NULL,
+ ("%s: TOM activated but softc NULL", __func__));
+ KASSERT(sc->flags & TOM_INIT_DONE,
+ ("%s: TOM activated but flag not set", __func__));
}
setbit(&sc->offload_map, pi->port_id);
@@ -5242,15 +5307,9 @@ toe_capability(struct port_info *pi, int enable)
if (!isset(&sc->offload_map, pi->port_id))
return (0);
+ KASSERT(sc->flags & TOM_INIT_DONE,
+ ("%s: TOM never initialized?", __func__));
clrbit(&sc->offload_map, pi->port_id);
-
- if (sc->offload_map == 0) {
- rc = deactivate_uld(&sc->tom);
- if (rc != 0) {
- setbit(&sc->offload_map, pi->port_id);
- return (rc);
- }
- }
}
return (0);
@@ -5305,8 +5364,8 @@ done:
return (rc);
}
-static int
-activate_uld(struct adapter *sc, int id, struct uld_softc *usc)
+int
+t4_activate_uld(struct adapter *sc, int id)
{
int rc = EAGAIN;
struct uld_info *ui;
@@ -5315,13 +5374,9 @@ activate_uld(struct adapter *sc, int id, struct uld_softc *usc)
SLIST_FOREACH(ui, &t4_uld_list, link) {
if (ui->uld_id == id) {
- rc = ui->attach(sc, &usc->softc);
- if (rc == 0) {
- KASSERT(usc->softc != NULL,
- ("%s: ULD %d has no state", __func__, id));
+ rc = ui->activate(sc);
+ if (rc == 0)
ui->refcount++;
- usc->uld = ui;
- }
goto done;
}
}
@@ -5331,25 +5386,21 @@ done:
return (rc);
}
-static int
-deactivate_uld(struct uld_softc *usc)
+int
+t4_deactivate_uld(struct adapter *sc, int id)
{
- int rc;
+ int rc = EINVAL;
+ struct uld_info *ui;
mtx_lock(&t4_uld_list_lock);
- if (usc->uld == NULL || usc->softc == NULL) {
- rc = EINVAL;
- goto done;
- }
-
- rc = usc->uld->detach(usc->softc);
- if (rc == 0) {
- KASSERT(usc->uld->refcount > 0,
- ("%s: ULD has bad refcount", __func__));
- usc->uld->refcount--;
- usc->uld = NULL;
- usc->softc = NULL;
+ SLIST_FOREACH(ui, &t4_uld_list, link) {
+ if (ui->uld_id == id) {
+ rc = ui->deactivate(sc);
+ if (rc == 0)
+ ui->refcount--;
+ goto done;
+ }
}
done:
mtx_unlock(&t4_uld_list_lock);
@@ -5379,7 +5430,7 @@ tweak_tunables(void)
if (t4_nrxq1g < 1)
t4_nrxq1g = min(nc, NRXQ_1G);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (t4_nofldtxq10g < 1)
t4_nofldtxq10g = min(nc, NOFLDTXQ_10G);
@@ -5426,7 +5477,7 @@ t4_mod_event(module_t mod, int cmd, void *arg)
t4_sge_modload();
mtx_init(&t4_list_lock, "T4 adapters", 0, MTX_DEF);
SLIST_INIT(&t4_list);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
mtx_init(&t4_uld_list_lock, "T4 ULDs", 0, MTX_DEF);
SLIST_INIT(&t4_uld_list);
#endif
@@ -5434,7 +5485,7 @@ t4_mod_event(module_t mod, int cmd, void *arg)
break;
case MOD_UNLOAD:
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
mtx_lock(&t4_uld_list_lock);
if (!SLIST_EMPTY(&t4_uld_list)) {
rc = EBUSY;
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
index 8f39f10..92c9212 100644
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/kernel.h>
+#include <sys/kdb.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
@@ -51,7 +52,6 @@ __FBSDID("$FreeBSD$");
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_msg.h"
-#include "t4_l2t.h"
struct fl_buf_info {
int size;
@@ -115,14 +115,14 @@ static int free_mgmtq(struct adapter *);
static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int,
struct sysctl_oid *);
static int free_rxq(struct port_info *, struct sge_rxq *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int,
struct sysctl_oid *);
static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *);
#endif
static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *);
#endif
static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *);
@@ -397,7 +397,7 @@ first_vector(struct port_info *pi)
if (i == pi->port_id)
break;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (sc->flags & INTR_DIRECT)
rc += pi->nrxq + pi->nofldrxq;
else
@@ -434,7 +434,7 @@ port_intr_iq(struct port_info *pi, int idx)
if (sc->intr_count == 1)
return (&sc->sge.fwq);
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (sc->flags & INTR_DIRECT) {
idx %= pi->nrxq + pi->nofldrxq;
@@ -475,19 +475,20 @@ t4_setup_port_queues(struct port_info *pi)
struct sge_rxq *rxq;
struct sge_txq *txq;
struct sge_wrq *ctrlq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
struct sge_ofld_rxq *ofld_rxq;
struct sge_wrq *ofld_txq;
+ struct sysctl_oid *oid2 = NULL;
#endif
char name[16];
struct adapter *sc = pi->adapter;
- struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev), *oid2 = NULL;
+ struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
NULL, "rx queues");
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
if (is_offload(sc)) {
oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq",
CTLFLAG_RD, NULL,
@@ -515,7 +516,7 @@ t4_setup_port_queues(struct port_info *pi)
init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name);
if (sc->flags & INTR_DIRECT
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
|| (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq)
#endif
) {
@@ -527,7 +528,7 @@ t4_setup_port_queues(struct port_info *pi)
}
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_rxq(pi, i, ofld_rxq) {
snprintf(name, sizeof(name), "%s ofld_rxq%d-iq",
@@ -567,7 +568,7 @@ t4_setup_port_queues(struct port_info *pi)
j++;
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_rxq(pi, i, ofld_rxq) {
if (ofld_rxq->iq.flags & IQ_INTR)
continue;
@@ -603,7 +604,7 @@ t4_setup_port_queues(struct port_info *pi)
j++;
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq",
CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
for_each_ofld_txq(pi, i, ofld_txq) {
@@ -655,7 +656,7 @@ t4_teardown_port_queues(struct port_info *pi)
struct adapter *sc = pi->adapter;
struct sge_rxq *rxq;
struct sge_txq *txq;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
struct sge_ofld_rxq *ofld_rxq;
struct sge_wrq *ofld_txq;
#endif
@@ -677,7 +678,7 @@ t4_teardown_port_queues(struct port_info *pi)
free_txq(pi, txq);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_txq(pi, i, ofld_txq) {
free_wrq(sc, ofld_txq);
}
@@ -693,7 +694,7 @@ t4_teardown_port_queues(struct port_info *pi)
free_rxq(pi, rxq);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_rxq(pi, i, ofld_rxq) {
if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
free_ofld_rxq(pi, ofld_rxq);
@@ -709,7 +710,7 @@ t4_teardown_port_queues(struct port_info *pi)
free_rxq(pi, rxq);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
for_each_ofld_rxq(pi, i, ofld_rxq) {
if (ofld_rxq->iq.flags & IQ_INTR)
free_ofld_rxq(pi, ofld_rxq);
@@ -775,7 +776,7 @@ static int
service_iq(struct sge_iq *iq, int budget)
{
struct sge_iq *q;
- struct sge_rxq *rxq = (void *)iq; /* Use iff iq is part of rxq */
+ struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */
struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */
struct adapter *sc = iq->adapter;
struct rsp_ctrl *ctrl;
@@ -862,7 +863,8 @@ service_iq(struct sge_iq *iq, int budget)
break;
default:
- panic("%s: rsp_type %u", __func__, rsp_type);
+ sc->an_handler(iq, ctrl);
+ break;
}
iq_next(iq);
@@ -1076,42 +1078,33 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
return (0);
}
-int
-t4_mgmt_tx(struct adapter *sc, struct mbuf *m)
-{
- return t4_wrq_tx(sc, &sc->sge.mgmtq, m);
-}
-
/*
* Doesn't fail. Holds on to work requests it can't send right away.
*/
-int
-t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0)
+void
+t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
{
struct sge_eq *eq = &wrq->eq;
int can_reclaim;
caddr_t dst;
- struct mbuf *wr, *next;
TXQ_LOCK_ASSERT_OWNED(wrq);
+#ifdef TCP_OFFLOAD
KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD ||
(eq->flags & EQ_TYPEMASK) == EQ_CTRL,
("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
+#else
+ KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL,
+ ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK));
+#endif
- if (__predict_true(m0 != NULL)) {
- if (wrq->head)
- wrq->tail->m_nextpkt = m0;
- else
- wrq->head = m0;
- while (m0->m_nextpkt)
- m0 = m0->m_nextpkt;
- wrq->tail = m0;
- }
+ if (__predict_true(wr != NULL))
+ STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
can_reclaim = reclaimable(eq);
if (__predict_false(eq->flags & EQ_STALLED)) {
if (can_reclaim < tx_resume_threshold(eq))
- return (0);
+ return;
eq->flags &= ~EQ_STALLED;
eq->unstalled++;
}
@@ -1120,39 +1113,34 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0)
if (__predict_false(eq->cidx >= eq->cap))
eq->cidx -= eq->cap;
- for (wr = wrq->head; wr; wr = next) {
+ while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) {
int ndesc;
- struct mbuf *m;
- next = wr->m_nextpkt;
- wr->m_nextpkt = NULL;
+ if (__predict_false(wr->wr_len < 0 ||
+ wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) {
- M_ASSERTPKTHDR(wr);
- KASSERT(wr->m_pkthdr.len > 0 && (wr->m_pkthdr.len & 0x7) == 0,
- ("%s: work request len %d.", __func__, wr->m_pkthdr.len));
-
- if (wr->m_pkthdr.len > SGE_MAX_WR_LEN) {
#ifdef INVARIANTS
- panic("%s: oversized work request", __func__);
-#else
- log(LOG_ERR, "%s: %s work request too long (%d)",
- device_get_nameunit(sc->dev), __func__,
- wr->m_pkthdr.len);
- m_freem(wr);
- continue;
+ panic("%s: work request with length %d", __func__,
+ wr->wr_len);
#endif
+#ifdef KDB
+ kdb_backtrace();
+#endif
+ log(LOG_ERR, "%s: %s work request with length %d",
+ device_get_nameunit(sc->dev), __func__, wr->wr_len);
+ STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
+ free_wrqe(wr);
+ continue;
}
- ndesc = howmany(wr->m_pkthdr.len, EQ_ESIZE);
+ ndesc = howmany(wr->wr_len, EQ_ESIZE);
if (eq->avail < ndesc) {
- wr->m_nextpkt = next;
wrq->no_desc++;
break;
}
dst = (void *)&eq->desc[eq->pidx];
- for (m = wr; m; m = m->m_next)
- copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
+ copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len);
eq->pidx += ndesc;
eq->avail -= ndesc;
@@ -1164,7 +1152,8 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0)
ring_eq_db(sc, eq);
wrq->tx_wrs++;
- m_freem(wr);
+ STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
+ free_wrqe(wr);
if (eq->avail < 8) {
can_reclaim = reclaimable(eq);
@@ -1178,20 +1167,11 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0)
if (eq->pending)
ring_eq_db(sc, eq);
- if (wr == NULL)
- wrq->head = wrq->tail = NULL;
- else {
- wrq->head = wr;
-
- KASSERT(wrq->tail->m_nextpkt == NULL,
- ("%s: wrq->tail grew a tail of its own", __func__));
-
+ if (wr != NULL) {
eq->flags |= EQ_STALLED;
if (callout_pending(&eq->tx_callout) == 0)
callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
}
-
- return (0);
}
/* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */
@@ -1792,6 +1772,7 @@ alloc_mgmtq(struct adapter *sc)
static int
free_mgmtq(struct adapter *sc)
{
+
return free_wrq(sc, &sc->sge.mgmtq);
}
@@ -1885,7 +1866,7 @@ free_rxq(struct port_info *pi, struct sge_rxq *rxq)
return (rc);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int
alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq,
int intr_idx, int idx, struct sysctl_oid *oid)
@@ -2031,7 +2012,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
return (rc);
}
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
static int
ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
{
@@ -2103,7 +2084,7 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq)
rc = eth_eq_alloc(sc, pi, eq);
break;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
case EQ_OFLD:
rc = ofld_eq_alloc(sc, pi, eq);
break;
@@ -2141,7 +2122,7 @@ free_eq(struct adapter *sc, struct sge_eq *eq)
eq->cntxt_id);
break;
-#ifndef TCP_OFFLOAD_DISABLE
+#ifdef TCP_OFFLOAD
case EQ_OFLD:
rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
eq->cntxt_id);
@@ -2183,6 +2164,7 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq,
return (rc);
wrq->adapter = sc;
+ STAILQ_INIT(&wrq->wr_list);
SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
&wrq->eq.cntxt_id, 0, "SGE context id of the queue");
@@ -3179,7 +3161,7 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to)
static inline void
copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
{
- if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) {
+ if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) {
bcopy(from, *to, len);
(*to) += len;
} else {
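
The hunks above replace the old mbuf-based work-request queue with the new
wrqe scheme, and the TOM files added below use it everywhere in the same
allocate/fill/send sequence.  A minimal sketch of that sequence, modelled on
send_rx_credits() further down (error handling trimmed, CPL type chosen only
for illustration):

	struct wrqe *wr;
	struct cpl_rx_data_ack *req;

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);	/* sized for the CPL */
	if (wr == NULL)
		return (0);				/* out of WR memory */
	req = wrtod(wr);				/* pointer to the WR body */

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(F_RX_DACK_CHANGE | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);	/* the tx path copies the WR and frees the wrqe */
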
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
new file mode 100644
index 0000000..bc59171
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -0,0 +1,377 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* atid services */
+static int alloc_atid(struct adapter *, void *);
+static void *lookup_atid(struct adapter *, int);
+static void free_atid(struct adapter *, int);
+
+static int
+alloc_atid(struct adapter *sc, void *ctx)
+{
+ struct tid_info *t = &sc->tids;
+ int atid = -1;
+
+ mtx_lock(&t->atid_lock);
+ if (t->afree) {
+ union aopen_entry *p = t->afree;
+
+ atid = p - t->atid_tab;
+ t->afree = p->next;
+ p->data = ctx;
+ t->atids_in_use++;
+ }
+ mtx_unlock(&t->atid_lock);
+ return (atid);
+}
+
+static void *
+lookup_atid(struct adapter *sc, int atid)
+{
+ struct tid_info *t = &sc->tids;
+
+ return (t->atid_tab[atid].data);
+}
+
+static void
+free_atid(struct adapter *sc, int atid)
+{
+ struct tid_info *t = &sc->tids;
+ union aopen_entry *p = &t->atid_tab[atid];
+
+ mtx_lock(&t->atid_lock);
+ p->next = t->afree;
+ t->afree = p;
+ t->atids_in_use--;
+ mtx_unlock(&t->atid_lock);
+}
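
The atid table above keeps its free slots on an intrusive free list: a free
slot's storage doubles as the list link, so allocation and release are O(1)
with no extra memory.  A self-contained sketch of the same pattern (names are
illustrative, not from the driver; initialization that links all slots onto
the free list is omitted, and the driver additionally serializes access with
atid_lock):

	#include <stddef.h>

	union id_entry {
		union id_entry *next;	/* valid while the slot is free */
		void *data;		/* caller's context while in use */
	};

	struct id_table {
		union id_entry *free_head;
		union id_entry slot[128];
	};

	static int
	id_alloc(struct id_table *t, void *ctx)
	{
		union id_entry *p = t->free_head;

		if (p == NULL)
			return (-1);		/* table exhausted */
		t->free_head = p->next;
		p->data = ctx;
		return (p - t->slot);		/* the index is the id */
	}

	static void
	id_free(struct id_table *t, int id)
	{
		union id_entry *p = &t->slot[id];

		p->next = t->free_head;		/* push the slot back */
		t->free_head = p;
	}
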
+
+/*
+ * Active open succeeded.
+ */
+static int
+do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_act_establish *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ unsigned int atid = G_TID_TID(ntohl(cpl->tos_atid));
+ struct toepcb *toep = lookup_atid(sc, atid);
+ struct inpcb *inp = toep->inp;
+
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
+
+ CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid);
+ free_atid(sc, atid);
+
+ INP_WLOCK(inp);
+ toep->tid = tid;
+ insert_tid(sc, tid, toep);
+ if (inp->inp_flags & INP_DROPPED) {
+
+ /* socket closed by the kernel before hw told us it connected */
+
+ send_flowc_wr(toep, NULL);
+ send_reset(sc, toep, be32toh(cpl->snd_isn));
+ goto done;
+ }
+
+ make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+done:
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+static inline int
+act_open_has_tid(unsigned int status)
+{
+
+ return (status != CPL_ERR_TCAM_FULL &&
+ status != CPL_ERR_TCAM_PARITY &&
+ status != CPL_ERR_CONN_EXIST &&
+ status != CPL_ERR_ARP_MISS);
+}
+
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
+ */
+static inline int
+act_open_rpl_status_to_errno(int status)
+{
+
+ switch (status) {
+ case CPL_ERR_CONN_RESET:
+ return (ECONNREFUSED);
+ case CPL_ERR_ARP_MISS:
+ return (EHOSTUNREACH);
+ case CPL_ERR_CONN_TIMEDOUT:
+ return (ETIMEDOUT);
+ case CPL_ERR_TCAM_FULL:
+ return (ENOMEM);
+ case CPL_ERR_CONN_EXIST:
+ log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+ return (EADDRINUSE);
+ default:
+ return (EIO);
+ }
+}
+
+static int
+do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1);
+ unsigned int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status)));
+ unsigned int status = G_AOPEN_STATUS(be32toh(cpl->atid_status));
+ struct toepcb *toep = lookup_atid(sc, atid);
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toedev *tod = &toep->td->tod;
+
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
+
+ CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status);
+
+ /* Ignore negative advice */
+ if (status == CPL_ERR_RTX_NEG_ADVICE)
+ return (0);
+
+ free_atid(sc, atid);
+ toep->tid = -1;
+
+ if (status && act_open_has_tid(status))
+ release_tid(sc, GET_TID(cpl), toep->ctrlq);
+
+ if (status == CPL_ERR_TCAM_FULL) {
+ INP_WLOCK(inp);
+ toe_connect_failed(tod, tp, EAGAIN);
+ final_cpl_received(toep); /* unlocks inp */
+ } else {
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(status));
+ final_cpl_received(toep); /* unlocks inp */
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ }
+
+ return (0);
+}
+
+/*
+ * Options2 for active open.
+ */
+static uint32_t
+calc_opt2a(struct socket *so)
+{
+ struct tcpcb *tp = so_sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ struct port_info *pi = toep->port;
+ struct adapter *sc = pi->adapter;
+ uint32_t opt2 = 0;
+
+ if (tp->t_flags & TF_SACK_PERMIT)
+ opt2 |= F_SACK_EN;
+
+ if (tp->t_flags & TF_REQ_TSTMP)
+ opt2 |= F_TSTAMPS_EN;
+
+ if (tp->t_flags & TF_REQ_SCALE)
+ opt2 |= F_WND_SCALE_EN;
+
+ if (V_tcp_do_ecn)
+ opt2 |= F_CCTRL_ECN;
+
+ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
+ opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
+ opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
+
+ return (htobe32(opt2));
+}
+
+
+void
+t4_init_connect_cpl_handlers(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish);
+ t4_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl);
+}
+
+/*
+ * active open (soconnect).
+ *
+ * State of affairs on entry:
+ * soisconnecting (so_state |= SS_ISCONNECTING)
+ * tcbinfo not locked (This has changed - used to be WLOCKed)
+ * inp WLOCKed
+ * tp->t_state = TCPS_SYN_SENT
+ * rtalloc1, RT_UNLOCK on rt.
+ */
+int
+t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
+ struct sockaddr *nam)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct toepcb *toep = NULL;
+ struct wrqe *wr = NULL;
+ struct cpl_act_open_req *cpl;
+ struct l2t_entry *e = NULL;
+ struct ifnet *rt_ifp = rt->rt_ifp;
+ struct port_info *pi;
+ int atid = -1, mtu_idx, rscale, qid_atid, rc = ENOMEM;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+
+ INP_WLOCK_ASSERT(inp);
+
+ if (nam->sa_family != AF_INET)
+ CXGBE_UNIMPLEMENTED("IPv6 connect");
+
+ if (rt_ifp->if_type == IFT_ETHER)
+ pi = rt_ifp->if_softc;
+ else if (rt_ifp->if_type == IFT_L2VLAN) {
+ struct ifnet *ifp = VLAN_COOKIE(rt_ifp);
+
+ pi = ifp->if_softc;
+ } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG)
+ return (ENOSYS); /* XXX: implement lagg support */
+ else
+ return (ENOTSUP);
+
+ toep = alloc_toepcb(pi, -1, -1, M_NOWAIT);
+ if (toep == NULL)
+ goto failed;
+
+ atid = alloc_atid(sc, toep);
+ if (atid < 0)
+ goto failed;
+
+ e = t4_l2t_get(pi, rt_ifp,
+ rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam);
+ if (e == NULL)
+ goto failed;
+
+ wr = alloc_wrqe(sizeof(*cpl), toep->ctrlq);
+ if (wr == NULL)
+ goto failed;
+ cpl = wrtod(wr);
+
+ toep->tid = atid;
+ toep->l2te = e;
+ toep->ulp_mode = ULP_MODE_NONE;
+ SOCKBUF_LOCK(&so->so_rcv);
+	/* rx_credits is opt0's rcv_bufsiz initially; assumes its normal meaning later */
+ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ offload_socket(so, toep);
+
+ /*
+ * The kernel sets request_r_scale based on sb_max whereas we need to
+ * take hardware's MAX_RCV_WND into account too. This is normally a
+ * no-op as MAX_RCV_WND is much larger than the default sb_max.
+ */
+ if (tp->t_flags & TF_REQ_SCALE)
+ rscale = tp->request_r_scale = select_rcv_wscale();
+ else
+ rscale = 0;
+ mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
+ qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | atid;
+
+ INIT_TP_WR(cpl, 0);
+ OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid));
+ inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
+ &cpl->peer_port);
+ cpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, toep->rx_credits,
+ toep->ulp_mode);
+ cpl->params = select_ntuple(pi, e, sc->filter_mode);
+ cpl->opt2 = calc_opt2a(so);
+
+ CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__,
+ toep->tid, tcpstates[tp->t_state], toep, inp);
+
+ rc = t4_l2t_send(sc, wr, e);
+ if (rc == 0) {
+ toepcb_set_flag(toep, TPF_CPL_PENDING);
+ return (0);
+ }
+
+ undo_offload_socket(so);
+failed:
+ CTR5(KTR_CXGBE, "%s: FAILED, atid %d, toep %p, l2te %p, wr %p",
+ __func__, atid, toep, e, wr);
+
+ if (e)
+ t4_l2t_release(e);
+ if (wr)
+ free_wrqe(wr);
+ if (atid >= 0)
+ free_atid(sc, atid);
+ if (toep)
+ free_toepcb(toep);
+
+ return (rc);
+}
+#endif
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
new file mode 100644
index 0000000..161fc12
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -0,0 +1,1276 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sglist.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+VNET_DECLARE(int, tcp_do_autosndbuf);
+#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
+VNET_DECLARE(int, tcp_autosndbuf_inc);
+#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
+VNET_DECLARE(int, tcp_autosndbuf_max);
+#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
+
+void
+send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
+{
+ struct wrqe *wr;
+ struct fw_flowc_wr *flowc;
+ unsigned int nparams = ftxp ? 8 : 4, flowclen;
+ struct port_info *pi = toep->port;
+ struct adapter *sc = pi->adapter;
+ unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
+ struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+ KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+ ("%s: flowc for tid %u sent already", __func__, toep->tid));
+
+ CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
+
+ flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+ wr = alloc_wrqe(roundup(flowclen, 16), toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ flowc = wrtod(wr);
+ memset(flowc, 0, wr->wr_len);
+
+ flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+ V_FW_FLOWC_WR_NPARAMS(nparams));
+ flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+ V_FW_WR_FLOWID(toep->tid));
+
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+ flowc->mnemval[0].val = htobe32(pfvf);
+ flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+ flowc->mnemval[1].val = htobe32(pi->tx_chan);
+ flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+ flowc->mnemval[2].val = htobe32(pi->tx_chan);
+ flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+ flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
+ if (ftxp) {
+ uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
+
+ flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+ flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
+ flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+ flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
+ flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+ flowc->mnemval[6].val = htobe32(sndbuf);
+ flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+ flowc->mnemval[7].val = htobe32(ftxp->mss);
+ }
+
+ txsd->tx_credits = howmany(flowclen, 16);
+ txsd->plen = 0;
+ KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
+ ("%s: not enough credits (%d)", __func__, toep->tx_credits));
+ toep->tx_credits -= txsd->tx_credits;
+ if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
+ toep->txsd_pidx = 0;
+ toep->txsd_avail--;
+
+ toepcb_set_flag(toep, TPF_FLOWC_WR_SENT);
+ t4_wrq_tx(sc, wr);
+}
+
+void
+send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
+{
+ struct wrqe *wr;
+ struct cpl_abort_req *req;
+ int tid = toep->tid;
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */
+
+ INP_WLOCK_ASSERT(inp);
+
+ CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
+ __func__, toep->tid,
+ inp->inp_flags & INP_DROPPED ? "inp dropped" :
+ tcpstates[tp->t_state],
+ toep->flags, inp->inp_flags,
+ toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ?
+ " (abort already in progress)" : "");
+
+ if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+ return; /* abort already in progress */
+
+ toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+
+ KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+ ("%s: flowc_wr not sent for tid %d.", __func__, tid));
+
+ wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ req = wrtod(wr);
+
+ INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
+ if (inp->inp_flags & INP_DROPPED)
+ req->rsvd0 = htobe32(snd_nxt);
+ else
+ req->rsvd0 = htobe32(tp->snd_nxt);
+ req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT);
+ req->cmd = CPL_ABORT_SEND_RST;
+
+ /*
+ * XXX: What's the correct way to tell that the inp hasn't been detached
+ * from its socket? Should I even be flushing the snd buffer here?
+ */
+ if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+ struct socket *so = inp->inp_socket;
+
+ if (so != NULL) /* because I'm not sure. See comment above */
+ sbflush(&so->so_snd);
+ }
+
+ t4_l2t_send(sc, wr, toep->l2te);
+}
+
+/*
+ * Called when a connection is established to translate the TCP options
+ * reported by HW to FreeBSD's native format.
+ */
+static void
+assign_rxopt(struct tcpcb *tp, unsigned int opt)
+{
+ struct toepcb *toep = tp->t_toe;
+ struct adapter *sc = td_adapter(toep->td);
+
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;
+
+ if (G_TCPOPT_TSTAMP(opt)) {
+ tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */
+ tp->ts_recent = 0; /* hmmm */
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
+ }
+
+ if (G_TCPOPT_SACK(opt))
+ tp->t_flags |= TF_SACK_PERMIT; /* should already be set */
+ else
+ tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */
+
+ if (G_TCPOPT_WSCALE_OK(opt))
+ tp->t_flags |= TF_RCVD_SCALE;
+
+ /* Doing window scaling? */
+ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
+ (TF_RCVD_SCALE | TF_REQ_SCALE)) {
+ tp->rcv_scale = tp->request_r_scale;
+ tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
+ }
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to TCPS_ESTABLISHED.
+ *
+ * The ISNs are from after the exchange of SYNs, i.e., the true ISN + 1.
+ */
+void
+make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
+ uint16_t opt)
+{
+ struct inpcb *inp = toep->inp;
+ struct socket *so = inp->inp_socket;
+ struct tcpcb *tp = intotcpcb(inp);
+ long bufsize;
+ uint32_t iss = be32toh(snd_isn) - 1; /* true ISS */
+ uint32_t irs = be32toh(rcv_isn) - 1; /* true IRS */
+ uint16_t tcpopt = be16toh(opt);
+ struct flowc_tx_params ftxp;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(tp->t_state == TCPS_SYN_SENT ||
+ tp->t_state == TCPS_SYN_RECEIVED,
+ ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
+
+ CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p",
+ __func__, toep->tid, toep, inp);
+
+ tp->t_state = TCPS_ESTABLISHED;
+ tp->t_starttime = ticks;
+ TCPSTAT_INC(tcps_connects);
+
+ tp->irs = irs;
+ tcp_rcvseqinit(tp);
+ tp->rcv_wnd = toep->rx_credits << 10;
+ tp->rcv_adv += tp->rcv_wnd;
+ tp->last_ack_sent = tp->rcv_nxt;
+
+ /*
+ * If we were unable to send all rx credits via opt0, save the remainder
+ * in rx_credits so that they can be handed over with the next credit
+ * update.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ bufsize = select_rcv_wnd(so);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ toep->rx_credits = bufsize - tp->rcv_wnd;
+
+ tp->iss = iss;
+ tcp_sendseqinit(tp);
+ tp->snd_una = iss + 1;
+ tp->snd_nxt = iss + 1;
+ tp->snd_max = iss + 1;
+
+ assign_rxopt(tp, tcpopt);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
+ bufsize = V_tcp_autosndbuf_max;
+ else
+ bufsize = sbspace(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ ftxp.snd_nxt = tp->snd_nxt;
+ ftxp.rcv_nxt = tp->rcv_nxt;
+ ftxp.snd_space = bufsize;
+ ftxp.mss = tp->t_maxseg;
+ send_flowc_wr(toep, &ftxp);
+
+ soisconnected(so);
+}
+
+static int
+send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
+{
+ struct wrqe *wr;
+ struct cpl_rx_data_ack *req;
+ uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+
+ wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
+ if (wr == NULL)
+ return (0);
+ req = wrtod(wr);
+
+ INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
+ req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
+
+ t4_wrq_tx(sc, wr);
+ return (credits);
+}
+
+void
+t4_rcvd(struct toedev *tod, struct tcpcb *tp)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+ struct sockbuf *so_rcv = &so->so_rcv;
+ struct toepcb *toep = tp->t_toe;
+ int must_send;
+
+ INP_WLOCK_ASSERT(inp);
+
+ SOCKBUF_LOCK(so_rcv);
+ KASSERT(toep->enqueued >= so_rcv->sb_cc,
+ ("%s: so_rcv->sb_cc > enqueued", __func__));
+ toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
+ toep->enqueued = so_rcv->sb_cc;
+ SOCKBUF_UNLOCK(so_rcv);
+
+ must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
+ if (must_send || toep->rx_credits >= 15 * 1024) {
+ int credits;
+
+ credits = send_rx_credits(sc, toep, toep->rx_credits);
+ toep->rx_credits -= credits;
+ tp->rcv_wnd += credits;
+ tp->rcv_adv += credits;
+ }
+}
+
+/*
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message.
+ */
+static int
+close_conn(struct adapter *sc, struct toepcb *toep)
+{
+ struct wrqe *wr;
+ struct cpl_close_con_req *req;
+ unsigned int tid = toep->tid;
+
+ CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
+ toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : "");
+
+ if (toepcb_flag(toep, TPF_FIN_SENT))
+ return (0);
+
+ KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+ ("%s: flowc_wr not sent for tid %u.", __func__, tid));
+
+ wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ req = wrtod(wr);
+
+ req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
+ V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
+ req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
+ V_FW_WR_FLOWID(tid));
+ req->wr.wr_lo = cpu_to_be64(0);
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+ req->rsvd = 0;
+
+ toepcb_set_flag(toep, TPF_FIN_SENT);
+ toepcb_clr_flag(toep, TPF_SEND_FIN);
+ t4_l2t_send(sc, wr, toep->l2te);
+
+ return (0);
+}
+
+#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
+#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
+
+/* Maximum amount of immediate data we could stuff in a WR */
+static inline int
+max_imm_payload(int tx_credits)
+{
+ const int n = 2; /* Use only up to 2 desc for imm. data WR */
+
+ KASSERT(tx_credits >= 0 &&
+ tx_credits <= MAX_OFLD_TX_CREDITS,
+ ("%s: %d credits", __func__, tx_credits));
+
+ if (tx_credits < MIN_OFLD_TX_CREDITS)
+ return (0);
+
+ if (tx_credits >= (n * EQ_ESIZE) / 16)
+ return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
+ else
+ return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
+}
+
+/* Maximum number of SGL entries we could stuff in a WR */
+static inline int
+max_dsgl_nsegs(int tx_credits)
+{
+ int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
+ int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
+
+ KASSERT(tx_credits >= 0 &&
+ tx_credits <= MAX_OFLD_TX_CREDITS,
+ ("%s: %d credits", __func__, tx_credits));
+
+ if (tx_credits < MIN_OFLD_TX_CREDITS)
+ return (0);
+
+ nseg += 2 * (sge_pair_credits * 16 / 24);
+ if ((sge_pair_credits * 16) % 24 == 16)
+ nseg++;
+
+ return (nseg);
+}
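
A rough worked example of the two helpers above, assuming the usual 16-byte
tx credit, a 64-byte tx descriptor (EQ_ESIZE) and the 16-byte
fw_ofld_tx_data_wr header noted in the OFLD_SGL_LEN comment further down
(these sizes are for illustration only; the headers are authoritative):

	/*
	 * tx_credits = 10, i.e. 160 bytes of WR space.
	 *
	 * Immediate data: the WR is capped at n = 2 descriptors (128 bytes),
	 * so max_imm_payload(10) = 2 * 64 - 16 = 112 bytes of payload.
	 *
	 * DSGL: MIN_OFLD_TX_CREDITS = howmany(16 + 1, 16) = 2, leaving
	 * 8 credits (128 bytes) for SGL entries.  One entry fits in the
	 * 16-byte ulptx_sgl and every further pair takes a 24-byte
	 * ulptx_sge_pair, so max_dsgl_nsegs(10) = 1 + 2 * (128 / 24) = 11
	 * (128 % 24 == 8, so no odd leftover entry).
	 */
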
+
+static inline void
+write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
+ unsigned int plen, uint8_t credits, int more_to_come)
+{
+ struct fw_ofld_tx_data_wr *txwr = dst;
+ int shove = !more_to_come;
+ int compl = 1;
+
+ /*
+ * We always request completion notifications from the firmware. The
+ * only exception is when we know we'll get more data to send shortly
+ * and that we'll have some tx credits remaining to transmit that data.
+ */
+ if (more_to_come && toep->tx_credits - credits >= MIN_OFLD_TX_CREDITS)
+ compl = 0;
+
+ txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
+ V_FW_WR_COMPL(compl) | V_FW_WR_IMMDLEN(immdlen));
+ txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
+ V_FW_WR_LEN16(credits));
+ txwr->tunnel_to_proxy =
+ htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) |
+ V_FW_OFLD_TX_DATA_WR_URGENT(0) | /* XXX */
+ V_FW_OFLD_TX_DATA_WR_SHOVE(shove));
+ txwr->plen = htobe32(plen);
+}
+
+/*
+ * Generate a DSGL from a starting mbuf. The total number of segments and the
+ * maximum segments in any one mbuf are provided.
+ */
+static void
+write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
+{
+ struct mbuf *m;
+ struct ulptx_sgl *usgl = dst;
+ int i, j, rc;
+ struct sglist sg;
+ struct sglist_seg segs[n];
+
+ KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
+
+ sglist_init(&sg, n, segs);
+ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
+ V_ULPTX_NSGE(nsegs));
+
+ i = -1;
+ for (m = start; m != stop; m = m->m_next) {
+ rc = sglist_append(&sg, mtod(m, void *), m->m_len);
+ if (__predict_false(rc != 0))
+ panic("%s: sglist_append %d", __func__, rc);
+
+ for (j = 0; j < sg.sg_nseg; i++, j++) {
+ if (i < 0) {
+ usgl->len0 = htobe32(segs[j].ss_len);
+ usgl->addr0 = htobe64(segs[j].ss_paddr);
+ } else {
+ usgl->sge[i / 2].len[i & 1] =
+ htobe32(segs[j].ss_len);
+ usgl->sge[i / 2].addr[i & 1] =
+ htobe64(segs[j].ss_paddr);
+ }
+#ifdef INVARIANTS
+ nsegs--;
+#endif
+ }
+ sglist_reset(&sg);
+ }
+ if (i & 1)
+ usgl->sge[i / 2].len[1] = htobe32(0);
+ KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
+ __func__, nsegs, start, stop));
+}
+
+/*
+ * Max number of SGL entries an offload tx work request can have. This is 41
+ * (1 + 40) for a full 512B work request.
+ * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
+ */
+#define OFLD_SGL_LEN (41)
+
+/*
+ * Send data and/or a FIN to the peer.
+ *
+ * The socket's so_snd buffer consists of a stream of data starting with sb_mb
+ * and linked together with m_next. sb_sndptr, if set, is the last mbuf that
+ * was transmitted.
+ */
+static void
+t4_push_frames(struct adapter *sc, struct toepcb *toep)
+{
+ struct mbuf *sndptr, *m, *sb_sndptr;
+ struct fw_ofld_tx_data_wr *txwr;
+ struct wrqe *wr;
+ unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp = intotcpcb(inp);
+ struct socket *so = inp->inp_socket;
+ struct sockbuf *sb = &so->so_snd;
+ int tx_credits;
+ struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+ ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
+
+ if (toep->ulp_mode != ULP_MODE_NONE)
+ CXGBE_UNIMPLEMENTED("ulp_mode");
+
+ /*
+ * This function doesn't resume by itself. Someone else must clear the
+ * flag and call this function.
+ */
+ if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED)))
+ return;
+
+ do {
+ tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
+ max_imm = max_imm_payload(tx_credits);
+ max_nsegs = max_dsgl_nsegs(tx_credits);
+
+ SOCKBUF_LOCK(sb);
+ sb_sndptr = sb->sb_sndptr;
+ sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
+ plen = 0;
+ nsegs = 0;
+ max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
+ for (m = sndptr; m != NULL; m = m->m_next) {
+ int n = sglist_count(mtod(m, void *), m->m_len);
+
+ nsegs += n;
+ plen += m->m_len;
+
+ /* This mbuf sent us _over_ the nsegs limit, back out */
+ if (plen > max_imm && nsegs > max_nsegs) {
+ nsegs -= n;
+ plen -= m->m_len;
+ if (plen == 0) {
+ /* Too few credits */
+ toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+ SOCKBUF_UNLOCK(sb);
+ return;
+ }
+ break;
+ }
+
+ if (max_nsegs_1mbuf < n)
+ max_nsegs_1mbuf = n;
+ sb_sndptr = m; /* new sb->sb_sndptr if all goes well */
+
+ /* This mbuf put us right at the max_nsegs limit */
+ if (plen > max_imm && nsegs == max_nsegs) {
+ m = m->m_next;
+ break;
+ }
+ }
+
+ if (sb->sb_flags & SB_AUTOSIZE &&
+ V_tcp_do_autosndbuf &&
+ sb->sb_hiwat < V_tcp_autosndbuf_max &&
+ sbspace(sb) < sb->sb_hiwat / 8 * 7) {
+ int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
+ V_tcp_autosndbuf_max);
+
+ if (!sbreserve_locked(sb, newsize, so, NULL))
+ sb->sb_flags &= ~SB_AUTOSIZE;
+ else {
+ sowwakeup_locked(so); /* room available */
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ goto unlocked;
+ }
+ }
+ SOCKBUF_UNLOCK(sb);
+unlocked:
+
+ /* nothing to send */
+ if (plen == 0) {
+ KASSERT(m == NULL,
+ ("%s: nothing to send, but m != NULL", __func__));
+ break;
+ }
+
+ if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT)))
+ panic("%s: excess tx.", __func__);
+
+ if (plen <= max_imm) {
+
+ /* Immediate data tx */
+
+ wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16),
+ toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX: how will we recover from this? */
+ toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+ return;
+ }
+ txwr = wrtod(wr);
+ credits = howmany(wr->wr_len, 16);
+ write_tx_wr(txwr, toep, plen, plen, credits,
+ tp->t_flags & TF_MORETOCOME);
+ m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
+ } else {
+ int wr_len;
+
+ /* DSGL tx */
+
+ wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
+ ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
+ wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq);
+ if (wr == NULL) {
+ /* XXX: how will we recover from this? */
+ toepcb_set_flag(toep, TPF_TX_SUSPENDED);
+ return;
+ }
+ txwr = wrtod(wr);
+ credits = howmany(wr_len, 16);
+ write_tx_wr(txwr, toep, 0, plen, credits,
+ tp->t_flags & TF_MORETOCOME);
+ write_tx_sgl(txwr + 1, sndptr, m, nsegs,
+ max_nsegs_1mbuf);
+ if (wr_len & 0xf) {
+ uint64_t *pad = (uint64_t *)
+ ((uintptr_t)txwr + wr_len);
+ *pad = 0;
+ }
+ }
+
+ KASSERT(toep->tx_credits >= credits,
+ ("%s: not enough credits", __func__));
+
+ toep->tx_credits -= credits;
+
+ tp->snd_nxt += plen;
+ tp->snd_max += plen;
+
+ SOCKBUF_LOCK(sb);
+ KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
+ sb->sb_sndptr = sb_sndptr;
+ SOCKBUF_UNLOCK(sb);
+
+ toepcb_set_flag(toep, TPF_TX_DATA_SENT);
+
+ KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
+ txsd->plen = plen;
+ txsd->tx_credits = credits;
+ txsd++;
+ if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
+ toep->txsd_pidx = 0;
+ txsd = &toep->txsd[0];
+ }
+ toep->txsd_avail--;
+
+ t4_l2t_send(sc, wr, toep->l2te);
+ } while (m != NULL);
+
+ /* Send a FIN if requested, but only if there's no more data to send */
+ if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN))
+ close_conn(sc, toep);
+}
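
For reference, the wr_len computed in the DSGL branch of t4_push_frames()
above packs every SGL entry after the first into 24-byte ulptx_sge_pairs:
12 bytes per entry, with an odd final entry padded out to 16 bytes.  Two
worked values for the pair term, as a sketch:

	/*
	 * nsegs = 5: ((3 * 4) / 2 + (4 & 1)) * 8 = 48 = 4 entries * 12 bytes
	 * nsegs = 4: ((3 * 3) / 2 + (3 & 1)) * 8 = 40 = 3 * 12, padded to 40
	 */
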
+
+int
+t4_tod_output(struct toedev *tod, struct tcpcb *tp)
+{
+ struct adapter *sc = tod->tod_softc;
+#ifdef INVARIANTS
+ struct inpcb *inp = tp->t_inpcb;
+#endif
+ struct toepcb *toep = tp->t_toe;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+ ("%s: inp %p dropped.", __func__, inp));
+ KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+ t4_push_frames(sc, toep);
+
+ return (0);
+}
+
+int
+t4_send_fin(struct toedev *tod, struct tcpcb *tp)
+{
+ struct adapter *sc = tod->tod_softc;
+#ifdef INVARIANTS
+ struct inpcb *inp = tp->t_inpcb;
+#endif
+ struct toepcb *toep = tp->t_toe;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+ ("%s: inp %p dropped.", __func__, inp));
+ KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+ toepcb_set_flag(toep, TPF_SEND_FIN);
+ t4_push_frames(sc, toep);
+
+ return (0);
+}
+
+int
+t4_send_rst(struct toedev *tod, struct tcpcb *tp)
+{
+ struct adapter *sc = tod->tod_softc;
+#if defined(INVARIANTS)
+ struct inpcb *inp = tp->t_inpcb;
+#endif
+ struct toepcb *toep = tp->t_toe;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT((inp->inp_flags & INP_DROPPED) == 0,
+ ("%s: inp %p dropped.", __func__, inp));
+ KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+
+ /* hmmmm */
+ KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
+ ("%s: flowc for tid %u [%s] not sent already",
+ __func__, toep->tid, tcpstates[tp->t_state]));
+
+ send_reset(sc, toep, 0);
+ return (0);
+}
+
+/*
+ * Peer has sent us a FIN.
+ */
+static int
+do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_peer_close *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp = NULL;
+ struct socket *so = NULL;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_PEER_CLOSE,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ tp = intotcpcb(inp);
+
+ CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
+ tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);
+
+ if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+ goto done;
+
+ so = inp->inp_socket;
+
+ socantrcvmore(so);
+ tp->rcv_nxt++; /* FIN */
+ KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
+ ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
+ be32toh(cpl->rcv_nxt)));
+
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ tp->t_starttime = ticks;
+ /* FALLTHROUGH */
+
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_CLOSE_WAIT;
+ break;
+
+ case TCPS_FIN_WAIT_1:
+ tp->t_state = TCPS_CLOSING;
+ break;
+
+ case TCPS_FIN_WAIT_2:
+ tcp_twstart(tp);
+ INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ INP_WLOCK(inp);
+ final_cpl_received(toep);
+ return (0);
+
+ default:
+ log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
+ __func__, tid, tp->t_state);
+ }
+done:
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (0);
+}
+
+/*
+ * Peer has ACK'd our FIN.
+ */
+static int
+do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp = NULL;
+ struct socket *so = NULL;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_CLOSE_CON_RPL,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ tp = intotcpcb(inp);
+
+ CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
+ __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
+
+ if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
+ goto done;
+
+ so = inp->inp_socket;
+ tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */
+
+ switch (tp->t_state) {
+ case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */
+ tcp_twstart(tp);
+release:
+ INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ INP_WLOCK(inp);
+ final_cpl_received(toep); /* no more CPLs expected */
+
+ return (0);
+ case TCPS_LAST_ACK:
+ if (tcp_close(tp))
+ INP_WUNLOCK(inp);
+ goto release;
+
+ case TCPS_FIN_WAIT_1:
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ soisdisconnected(so);
+ tp->t_state = TCPS_FIN_WAIT_2;
+ break;
+
+ default:
+ log(LOG_ERR,
+ "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
+ __func__, tid, tcpstates[tp->t_state]);
+ }
+done:
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (0);
+}
+
+void
+send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
+ int rst_status)
+{
+ struct wrqe *wr;
+ struct cpl_abort_rpl *cpl;
+
+ wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ cpl = wrtod(wr);
+
+ INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
+ cpl->cmd = rst_status;
+
+ t4_wrq_tx(sc, wr);
+}
+
+static int
+abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
+{
+ switch (abort_reason) {
+ case CPL_ERR_BAD_SYN:
+ case CPL_ERR_CONN_RESET:
+ return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
+ case CPL_ERR_XMIT_TIMEDOUT:
+ case CPL_ERR_PERSIST_TIMEDOUT:
+ case CPL_ERR_FINWAIT2_TIMEDOUT:
+ case CPL_ERR_KEEPALIVE_TIMEDOUT:
+ return (ETIMEDOUT);
+ default:
+ return (EIO);
+ }
+}
+
+/*
+ * TCP RST from the peer, timeout, or some other such critical error.
+ */
+static int
+do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct sge_wrq *ofld_txq = toep->ofld_txq;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct socket *so;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_ABORT_REQ_RSS,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+
+ if (toepcb_flag(toep, TPF_SYNQE))
+ return (do_abort_req_synqe(iq, rss, m));
+
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
+ cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) {
+ CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
+ __func__, cpl->status, tid, toep->flags);
+ return (0); /* Ignore negative advice */
+ }
+
+ inp = toep->inp;
+ INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */
+ INP_WLOCK(inp);
+
+ tp = intotcpcb(inp);
+ so = inp->inp_socket;
+
+ CTR6(KTR_CXGBE,
+ "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
+ __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags,
+ cpl->status);
+
+ /*
+ * If we'd initiated an abort earlier the reply to it is responsible for
+ * cleaning up resources. Otherwise we tear everything down right here
+ * right now. We owe the T4 a CPL_ABORT_RPL no matter what.
+ */
+ if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) {
+ INP_WUNLOCK(inp);
+ goto done;
+ }
+ toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);
+
+ so_error_set(so, abort_status_to_errno(tp, cpl->status));
+ tp = tcp_close(tp);
+ if (tp == NULL)
+ INP_WLOCK(inp); /* re-acquire */
+
+ final_cpl_received(toep);
+done:
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
+ return (0);
+}
+
+/*
+ * Reply to the CPL_ABORT_REQ (send_reset)
+ */
+static int
+do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp = toep->inp;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_ABORT_RPL_RSS,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+
+ if (toepcb_flag(toep, TPF_SYNQE))
+ return (do_abort_rpl_synqe(iq, rss, m));
+
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
+ __func__, tid, toep, inp, cpl->status);
+
+ KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+ ("%s: wasn't expecting abort reply", __func__));
+
+ INP_WLOCK(inp);
+ final_cpl_received(toep);
+
+ return (0);
+}
+
+static int
+do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_rx_data *cpl = mtod(m, const void *);
+ unsigned int tid = GET_TID(cpl);
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp;
+ struct socket *so;
+ struct sockbuf *so_rcv;
+
+ if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
+ /*
+		 * do_pass_establish must have failed and is now aborting the
+		 * synqe's tid.  Meanwhile, the T4 has sent us data for such a
+		 * connection.
+ */
+ KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+ ("%s: synqe and tid isn't being aborted.", __func__));
+ m_freem(m);
+ return (0);
+ }
+
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ /* strip off CPL header */
+ m_adj(m, sizeof(*cpl));
+
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+ CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
+ __func__, tid, m->m_pkthdr.len, inp->inp_flags);
+ INP_WUNLOCK(inp);
+ m_freem(m);
+ return (0);
+ }
+
+ tp = intotcpcb(inp);
+
+#ifdef INVARIANTS
+ if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) {
+ log(LOG_ERR,
+ "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n",
+ __func__, be32toh(cpl->seq), toep->tid, tp->rcv_nxt);
+ }
+#endif
+
+ tp->rcv_nxt += m->m_pkthdr.len;
+ KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
+ ("%s: negative window size", __func__));
+ tp->rcv_wnd -= m->m_pkthdr.len;
+ tp->t_rcvtime = ticks;
+
+ so = inp_inpcbtosocket(inp);
+ so_rcv = &so->so_rcv;
+ SOCKBUF_LOCK(so_rcv);
+
+ if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
+ CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
+ __func__, tid, m->m_pkthdr.len);
+ m_freem(m);
+ SOCKBUF_UNLOCK(so_rcv);
+ INP_WUNLOCK(inp);
+
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ tp = tcp_drop(tp, ECONNRESET);
+ if (tp)
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+
+ return (0);
+ }
+
+ /* receive buffer autosize */
+ if (so_rcv->sb_flags & SB_AUTOSIZE &&
+ V_tcp_do_autorcvbuf &&
+ so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
+ m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) {
+ unsigned int hiwat = so_rcv->sb_hiwat;
+ unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
+ V_tcp_autorcvbuf_max);
+
+ if (!sbreserve_locked(so_rcv, newsize, so, NULL))
+ so_rcv->sb_flags &= ~SB_AUTOSIZE;
+ else
+ toep->rx_credits += newsize - hiwat;
+ }
+ toep->enqueued += m->m_pkthdr.len;
+ sbappendstream_locked(so_rcv, m);
+ sorwakeup_locked(so);
+ SOCKBUF_UNLOCK_ASSERT(so_rcv);
+
+ INP_WUNLOCK(inp);
+ return (0);
+}
+
+#define S_CPL_FW4_ACK_OPCODE 24
+#define M_CPL_FW4_ACK_OPCODE 0xff
+#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE)
+#define G_CPL_FW4_ACK_OPCODE(x) \
+ (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE)
+
+#define S_CPL_FW4_ACK_FLOWID 0
+#define M_CPL_FW4_ACK_FLOWID 0xffffff
+#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID)
+#define G_CPL_FW4_ACK_FLOWID(x) \
+ (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID)
+
+#define S_CPL_FW4_ACK_CR 24
+#define M_CPL_FW4_ACK_CR 0xff
+#define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR)
+#define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR)
+
+#define S_CPL_FW4_ACK_SEQVAL 0
+#define M_CPL_FW4_ACK_SEQVAL 0x1
+#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL)
+#define G_CPL_FW4_ACK_SEQVAL(x) \
+ (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL)
+#define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U)
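
These follow the driver's usual S_/M_/V_/G_ convention: S_* is the bit
offset of the field, M_* its mask, V_*(x) places a value in the field and
G_*(x) extracts it.  A small self-consistent fragment (the flowid value is
made up):

	uint32_t ot = htobe32(V_CPL_FW4_ACK_OPCODE(CPL_FW4_ACK) |
	    V_CPL_FW4_ACK_FLOWID(0x1234));
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(ot));	/* 0x1234 */
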
+
+static int
+do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
+ unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
+ struct toepcb *toep = lookup_tid(sc, tid);
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct socket *so;
+ uint8_t credits = cpl->credits;
+ struct ofld_tx_sdesc *txsd;
+ int plen;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ /*
+ * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
+ * now this comes back carrying the credits for the flowc.
+ */
+ if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
+ KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
+ ("%s: credits for a synq entry %p", __func__, toep));
+ return (0);
+ }
+
+ inp = toep->inp;
+
+ KASSERT(opcode == CPL_FW4_ACK,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ INP_WLOCK(inp);
+
+ if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) {
+ INP_WUNLOCK(inp);
+ return (0);
+ }
+
+ KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+ ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
+
+ tp = intotcpcb(inp);
+
+ if (cpl->seq_vld) {
+ tcp_seq snd_una = be32toh(cpl->snd_una);
+
+#ifdef INVARIANTS
+ if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
+ log(LOG_ERR,
+ "%s: unexpected seq# %x for TID %u, snd_una %x\n",
+ __func__, snd_una, toep->tid, tp->snd_una);
+ }
+#endif
+
+ if (tp->snd_una != snd_una) {
+ tp->snd_una = snd_una;
+ tp->ts_recent_age = tcp_ts_getticks();
+ }
+ }
+
+ so = inp->inp_socket;
+ txsd = &toep->txsd[toep->txsd_cidx];
+ plen = 0;
+ while (credits) {
+ KASSERT(credits >= txsd->tx_credits,
+ ("%s: too many (or partial) credits", __func__));
+ credits -= txsd->tx_credits;
+ toep->tx_credits += txsd->tx_credits;
+ plen += txsd->plen;
+ txsd++;
+ toep->txsd_avail++;
+ KASSERT(toep->txsd_avail <= toep->txsd_total,
+ ("%s: txsd avail > total", __func__));
+ if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
+ txsd = &toep->txsd[0];
+ toep->txsd_cidx = 0;
+ }
+ }
+
+ if (plen > 0) {
+ struct sockbuf *sb = &so->so_snd;
+
+ SOCKBUF_LOCK(sb);
+ sbdrop_locked(sb, plen);
+ sowwakeup_locked(so);
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ }
+
+ /* XXX */
+ if ((toepcb_flag(toep, TPF_TX_SUSPENDED) &&
+ toep->tx_credits >= MIN_OFLD_TX_CREDITS) ||
+ toep->tx_credits == toep->txsd_total *
+ howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) {
+ toepcb_clr_flag(toep, TPF_TX_SUSPENDED);
+ t4_push_frames(sc, toep);
+ }
+ INP_WUNLOCK(inp);
+
+ return (0);
+}
+
+void
+t4_init_cpl_io_handlers(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close);
+ t4_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl);
+ t4_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req);
+ t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl);
+ t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data);
+ t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack);
+}
+#endif
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
new file mode 100644
index 0000000..895e57a
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -0,0 +1,1362 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/refcount.h>
+#include <sys/domain.h>
+#include <sys/fnv_hash.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* stid services */
+static int alloc_stid(struct adapter *, void *);
+static void *lookup_stid(struct adapter *, int);
+static void free_stid(struct adapter *, int);
+
+/* lctx services */
+static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
+ struct port_info *);
+static int free_lctx(struct adapter *, struct listen_ctx *);
+static void hold_lctx(struct listen_ctx *);
+static void listen_hash_add(struct adapter *, struct listen_ctx *);
+static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
+static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
+static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
+
+static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *);
+static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
+static void send_reset_synqe(struct toedev *, struct synq_entry *);
+
+/* XXX: won't work for IPv6 */
+static int
+alloc_stid(struct adapter *sc, void *ctx)
+{
+ struct tid_info *t = &sc->tids;
+ int stid = -1;
+
+ mtx_lock(&t->stid_lock);
+ if (t->sfree) {
+ union serv_entry *p = t->sfree;
+
+ stid = p - t->stid_tab;
+ stid += t->stid_base;
+ t->sfree = p->next;
+ p->data = ctx;
+ t->stids_in_use++;
+ }
+ mtx_unlock(&t->stid_lock);
+ return (stid);
+}
+
+static void *
+lookup_stid(struct adapter *sc, int stid)
+{
+ struct tid_info *t = &sc->tids;
+
+ return (t->stid_tab[stid - t->stid_base].data);
+}
+
+static void
+free_stid(struct adapter *sc, int stid)
+{
+ struct tid_info *t = &sc->tids;
+ union serv_entry *p = &t->stid_tab[stid - t->stid_base];
+
+ mtx_lock(&t->stid_lock);
+ p->next = t->sfree;
+ t->sfree = p;
+ t->stids_in_use--;
+ mtx_unlock(&t->stid_lock);
+}
+
+static struct listen_ctx *
+alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
+{
+ struct listen_ctx *lctx;
+
+ INP_WLOCK_ASSERT(inp);
+
+ lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
+ if (lctx == NULL)
+ return (NULL);
+
+ lctx->stid = alloc_stid(sc, lctx);
+ if (lctx->stid < 0) {
+ free(lctx, M_CXGBE);
+ return (NULL);
+ }
+
+ lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
+ lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
+ refcount_init(&lctx->refcount, 1);
+ TAILQ_INIT(&lctx->synq);
+
+ lctx->inp = inp;
+ in_pcbref(inp);
+
+ return (lctx);
+}
+
+/* Don't call this directly, use release_lctx instead */
+static int
+free_lctx(struct adapter *sc, struct listen_ctx *lctx)
+{
+ struct inpcb *inp = lctx->inp;
+
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(lctx->refcount == 0,
+ ("%s: refcount %d", __func__, lctx->refcount));
+ KASSERT(TAILQ_EMPTY(&lctx->synq),
+ ("%s: synq not empty.", __func__));
+ KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
+
+ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
+ __func__, lctx->stid, lctx, lctx->inp);
+
+ free_stid(sc, lctx->stid);
+ free(lctx, M_CXGBE);
+
+ return (in_pcbrele_wlocked(inp));
+}
+
+static void
+hold_lctx(struct listen_ctx *lctx)
+{
+
+ refcount_acquire(&lctx->refcount);
+}
+
+static inline uint32_t
+listen_hashfn(void *key, u_long mask)
+{
+
+ return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
+}
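
Note that the FNV hash above is computed over the pointer value itself
(&key, sizeof(key)): the listening socket's inp address is the lookup key.
A small illustrative use, assuming listen_mask is a power-of-two bucket
count minus one (as the masking requires):

	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
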
+
+/*
+ * Add a listen_ctx entry to the listen hash table.
+ */
+static void
+listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
+{
+ struct tom_data *td = sc->tom_softc;
+ int bucket = listen_hashfn(lctx->inp, td->listen_mask);
+
+ mtx_lock(&td->lctx_hash_lock);
+ LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
+ td->lctx_count++;
+ mtx_unlock(&td->lctx_hash_lock);
+}
+
+/*
+ * Look for the listening socket's context entry in the hash and return it.
+ */
+static struct listen_ctx *
+listen_hash_find(struct adapter *sc, struct inpcb *inp)
+{
+ struct tom_data *td = sc->tom_softc;
+ int bucket = listen_hashfn(inp, td->listen_mask);
+ struct listen_ctx *lctx;
+
+ mtx_lock(&td->lctx_hash_lock);
+ LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
+ if (lctx->inp == inp)
+ break;
+ }
+ mtx_unlock(&td->lctx_hash_lock);
+
+ return (lctx);
+}
+
+/*
+ * Removes the listen_ctx structure for inp from the hash and returns it.
+ */
+static struct listen_ctx *
+listen_hash_del(struct adapter *sc, struct inpcb *inp)
+{
+ struct tom_data *td = sc->tom_softc;
+ int bucket = listen_hashfn(inp, td->listen_mask);
+ struct listen_ctx *lctx, *l;
+
+ mtx_lock(&td->lctx_hash_lock);
+ LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
+ if (lctx->inp == inp) {
+ LIST_REMOVE(lctx, link);
+ td->lctx_count--;
+ break;
+ }
+ }
+ mtx_unlock(&td->lctx_hash_lock);
+
+ return (lctx);
+}
+
+/*
+ * Releases a hold on the lctx. Must be called with the listening socket's inp
+ * locked. The inp may be freed by this function and it returns NULL to
+ * indicate this.
+ */
+static struct inpcb *
+release_lctx(struct adapter *sc, struct listen_ctx *lctx)
+{
+ struct inpcb *inp = lctx->inp;
+ int inp_freed = 0;
+
+ INP_WLOCK_ASSERT(inp);
+ if (refcount_release(&lctx->refcount))
+ inp_freed = free_lctx(sc, lctx);
+
+ return (inp_freed ? NULL : inp);
+}
+
+static void
+send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct mbuf *m = synqe->syn;
+ struct ifnet *ifp = m->m_pkthdr.rcvif;
+ struct port_info *pi = ifp->if_softc;
+ struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+ struct wrqe *wr;
+ struct fw_flowc_wr *flowc;
+ struct cpl_abort_req *req;
+ int txqid, rxqid, flowclen;
+ struct sge_wrq *ofld_txq;
+ struct sge_ofld_rxq *ofld_rxq;
+ const int nparams = 4;
+ unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
+
+ INP_WLOCK_ASSERT(synqe->lctx->inp);
+
+ CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s",
+ __func__, synqe, synqe->tid,
+ synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ?
+ " (abort already in progress)" : "");
+ if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN))
+ return; /* abort already in progress */
+ synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN);
+
+ get_qids_from_mbuf(m, &txqid, &rxqid);
+ ofld_txq = &sc->sge.ofld_txq[txqid];
+ ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+
+ /* The wrqe will have two WRs - a flowc followed by an abort_req */
+ flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
+
+ wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ flowc = wrtod(wr);
+ req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE));
+
+ /* First the flowc ... */
+ memset(flowc, 0, wr->wr_len);
+ flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+ V_FW_FLOWC_WR_NPARAMS(nparams));
+ flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
+ V_FW_WR_FLOWID(synqe->tid));
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+ flowc->mnemval[0].val = htobe32(pfvf);
+ flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+ flowc->mnemval[1].val = htobe32(pi->tx_chan);
+ flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+ flowc->mnemval[2].val = htobe32(pi->tx_chan);
+ flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+ flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
+ synqe_set_flag(synqe, TPF_FLOWC_WR_SENT);
+
+ /* ... then ABORT request */
+ INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
+ req->rsvd0 = 0; /* don't have a snd_nxt */
+ req->rsvd1 = 1; /* no data sent yet */
+ req->cmd = CPL_ABORT_SEND_RST;
+
+ t4_l2t_send(sc, wr, e);
+}
+
+static int
+create_server(struct adapter *sc, struct listen_ctx *lctx)
+{
+ struct wrqe *wr;
+ struct cpl_pass_open_req *req;
+ struct in_conninfo *inc = &lctx->inp->inp_inc;
+
+ wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
+ if (wr == NULL) {
+ log(LOG_ERR, "%s: allocation failure", __func__);
+ return (ENOMEM);
+ }
+ req = wrtod(wr);
+
+ INIT_TP_WR(req, 0);
+ OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
+ req->local_port = inc->inc_lport;
+ req->peer_port = 0;
+ req->local_ip = inc->inc_laddr.s_addr;
+ req->peer_ip = 0;
+ req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
+ req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
+ F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
+
+ t4_wrq_tx(sc, wr);
+ return (0);
+}
+
+static int
+destroy_server(struct adapter *sc, struct listen_ctx *lctx)
+{
+ struct wrqe *wr;
+ struct cpl_close_listsvr_req *req;
+
+ wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
+ if (wr == NULL) {
+ /* XXX */
+ panic("%s: allocation failure.", __func__);
+ }
+ req = wrtod(wr);
+
+ INIT_TP_WR(req, 0);
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
+ lctx->stid));
+ req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
+ req->rsvd = htobe16(0);
+
+ t4_wrq_tx(sc, wr);
+ return (0);
+}
+
+/*
+ * Start a listening server by sending a passive open request to HW.
+ *
+ * Can't take the adapter lock here, so access to sc->flags,
+ * sc->open_device_map, sc->offload_map, and if_capenable is all race prone.
+ */
+int
+t4_listen_start(struct toedev *tod, struct tcpcb *tp)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct port_info *pi;
+ struct inpcb *inp = tp->t_inpcb;
+ struct listen_ctx *lctx;
+ int i;
+
+ INP_WLOCK_ASSERT(inp);
+
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ return (0);
+
+#if 0
+ ADAPTER_LOCK(sc);
+ if (IS_BUSY(sc)) {
+ log(LOG_ERR, "%s: listen request ignored, %s is busy",
+ __func__, device_get_nameunit(sc->dev));
+ goto done;
+ }
+
+ KASSERT(sc->flags & TOM_INIT_DONE,
+ ("%s: TOM not initialized", __func__));
+#endif
+
+ if ((sc->open_device_map & sc->offload_map) == 0)
+ goto done; /* no port that's UP with IFCAP_TOE enabled */
+
+ /*
+ * Find a running port with IFCAP_TOE4. We'll use the first such port's
+ * queues to send the passive open and receive the reply to it.
+ *
+ * XXX: need a way to mark a port in use by offload. if_cxgbe should
+ * then reject any attempt to bring down such a port (and maybe reject
+ * attempts to disable IFCAP_TOE on that port too?).
+ */
+ for_each_port(sc, i) {
+ if (isset(&sc->open_device_map, i) &&
+ sc->port[i]->ifp->if_capenable & IFCAP_TOE4)
+ break;
+ }
+ KASSERT(i < sc->params.nports,
+ ("%s: no running port with TOE capability enabled.", __func__));
+ pi = sc->port[i];
+
+ if (listen_hash_find(sc, inp) != NULL)
+ goto done; /* already setup */
+
+ lctx = alloc_lctx(sc, inp, pi);
+ if (lctx == NULL) {
+ log(LOG_ERR,
+ "%s: listen request ignored, %s couldn't allocate lctx\n",
+ __func__, device_get_nameunit(sc->dev));
+ goto done;
+ }
+ listen_hash_add(sc, lctx);
+
+ CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__,
+ lctx->stid, tcpstates[tp->t_state], lctx, inp);
+
+ if (create_server(sc, lctx) != 0) {
+ log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
+ device_get_nameunit(sc->dev));
+ (void) listen_hash_del(sc, inp);
+ inp = release_lctx(sc, lctx);
+ /* can't be freed, host stack has a reference */
+ KASSERT(inp != NULL, ("%s: inp freed", __func__));
+ goto done;
+ }
+ lctx->flags |= LCTX_RPL_PENDING;
+done:
+#if 0
+ ADAPTER_UNLOCK(sc);
+#endif
+ return (0);
+}
+
+int
+t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
+{
+ struct listen_ctx *lctx;
+ struct adapter *sc = tod->tod_softc;
+ struct inpcb *inp = tp->t_inpcb;
+ struct synq_entry *synqe;
+
+ INP_WLOCK_ASSERT(inp);
+
+ lctx = listen_hash_del(sc, inp);
+ if (lctx == NULL)
+ return (ENOENT); /* no hardware listener for this inp */
+
+ CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
+ lctx, lctx->flags);
+
+ /*
+ * If the reply to the PASS_OPEN is still pending we'll wait for it to
+ * arrive and clean up when it does.
+ */
+ if (lctx->flags & LCTX_RPL_PENDING) {
+ KASSERT(TAILQ_EMPTY(&lctx->synq),
+ ("%s: synq not empty.", __func__));
+ return (EINPROGRESS);
+ }
+
+ /*
+ * The host stack will abort all the connections on the listening
+ * socket's so_comp. It doesn't know about the connections on the synq,
+ * so we need to take care of those ourselves.
+ */
+ TAILQ_FOREACH(synqe, &lctx->synq, link)
+ send_reset_synqe(tod, synqe);
+
+ destroy_server(sc, lctx);
+ return (0);
+}
+
+static inline void
+hold_synqe(struct synq_entry *synqe)
+{
+
+ refcount_acquire(&synqe->refcnt);
+}
+
+static inline void
+release_synqe(struct synq_entry *synqe)
+{
+
+ if (refcount_release(&synqe->refcnt)) {
+ int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE);
+
+ m_freem(synqe->syn);
+ if (needfree)
+ free(synqe, M_CXGBE);
+ }
+}
+
+void
+t4_syncache_added(struct toedev *tod __unused, void *arg)
+{
+ struct synq_entry *synqe = arg;
+
+ hold_synqe(synqe);
+}
+
+void
+t4_syncache_removed(struct toedev *tod __unused, void *arg)
+{
+ struct synq_entry *synqe = arg;
+
+ release_synqe(synqe);
+}
+
+/* XXX */
+extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+
+int
+t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct synq_entry *synqe = arg;
+ struct wrqe *wr;
+ struct l2t_entry *e;
+ struct tcpopt to;
+ struct ip *ip = mtod(m, struct ip *);
+ struct tcphdr *th = (void *)(ip + 1);
+
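+ /*
+ * The CPL_PASS_ACCEPT_RPL for this embryonic connection was built and
+ * stashed in synqe->wr by do_pass_accept_req. The read-and-clear below
+ * ensures at most one syncache_respond attempt (the kernel may retry on
+ * retransmitted SYNs) actually consumes it; later attempts find NULL and
+ * bail with EALREADY.
+ */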
+ wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
+ if (wr == NULL)
+ return (EALREADY);
+
+ bzero(&to, sizeof(to));
+ tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
+ TO_SYN);
+
+ /* save these for later */
+ synqe->iss = be32toh(th->th_seq);
+ synqe->ts = to.to_tsval;
+
+ e = &sc->l2t->l2tab[synqe->l2e_idx];
+ t4_l2t_send(sc, wr, e);
+
+ m_freem(m); /* don't need this any more */
+ return (0);
+}
+
+static int
+do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
+ int stid = GET_TID(cpl);
+ unsigned int status = cpl->status;
+ struct listen_ctx *lctx = lookup_stid(sc, stid);
+ struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_PASS_OPEN_RPL,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+ INP_WLOCK(inp);
+
+ CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
+ __func__, stid, status, lctx->flags);
+
+ lctx->flags &= ~LCTX_RPL_PENDING;
+
+ if (status != CPL_ERR_NONE)
+ log(LOG_ERR, "listener with stid %u failed: %d", stid, status);
+
+#ifdef INVARIANTS
+ /*
+ * If the inp has been dropped (listening socket closed) then
+ * listen_stop must have run and taken the inp out of the hash.
+ */
+ if (inp->inp_flags & INP_DROPPED) {
+ KASSERT(listen_hash_del(sc, inp) == NULL,
+ ("%s: inp %p still in listen hash", __func__, inp));
+ }
+#endif
+
+ if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
+ if (release_lctx(sc, lctx) != NULL)
+ INP_WUNLOCK(inp);
+ return (status);
+ }
+
+ /*
+ * Listening socket stopped listening earlier and now the chip tells us
+ * it has started the hardware listener. Stop it; the lctx will be
+ * released in do_close_server_rpl.
+ */
+ if (inp->inp_flags & INP_DROPPED) {
+ destroy_server(sc, lctx);
+ INP_WUNLOCK(inp);
+ return (status);
+ }
+
+ /*
+ * Failed to start hardware listener. Take inp out of the hash and
+ * release our reference on it. An error message has been logged
+ * already.
+ */
+ if (status != CPL_ERR_NONE) {
+ listen_hash_del(sc, inp);
+ if (release_lctx(sc, lctx) != NULL)
+ INP_WUNLOCK(inp);
+ return (status);
+ }
+
+ /* hardware listener open for business */
+
+ INP_WUNLOCK(inp);
+ return (status);
+}
+
+static int
+do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
+ int stid = GET_TID(cpl);
+ unsigned int status = cpl->status;
+ struct listen_ctx *lctx = lookup_stid(sc, stid);
+ struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+ CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
+
+ if (status != CPL_ERR_NONE) {
+ log(LOG_ERR, "%s: failed (%u) to close listener for stid %u",
+ __func__, status, stid);
+ return (status);
+ }
+
+ INP_WLOCK(inp);
+ inp = release_lctx(sc, lctx);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+
+ return (status);
+}
+
+static void
+done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
+{
+ struct listen_ctx *lctx = synqe->lctx;
+ struct inpcb *inp = lctx->inp;
+ struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
+ struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+
+ INP_WLOCK_ASSERT(inp);
+
+ TAILQ_REMOVE(&lctx->synq, synqe, link);
+ inp = release_lctx(sc, lctx);
+ if (inp)
+ INP_WUNLOCK(inp);
+ remove_tid(sc, synqe->tid);
+ release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
+ t4_l2t_release(e);
+ release_synqe(synqe); /* removed from synq list */
+}
+
+int
+do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct synq_entry *synqe = lookup_tid(sc, tid);
+ struct listen_ctx *lctx = synqe->lctx;
+ struct inpcb *inp = lctx->inp;
+ int txqid;
+ struct sge_wrq *ofld_txq;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_ABORT_REQ_RSS,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
+ __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
+
+ if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
+ cpl->status == CPL_ERR_PERSIST_NEG_ADVICE)
+ return (0); /* Ignore negative advice */
+
+ INP_WLOCK(inp);
+
+ get_qids_from_mbuf(synqe->syn, &txqid, NULL);
+ ofld_txq = &sc->sge.ofld_txq[txqid];
+
+ /*
+ * If we'd initiated an abort earlier, the reply to it is responsible for
+ * cleaning up resources. Otherwise we tear everything down right here and
+ * now. We owe the T4 a CPL_ABORT_RPL no matter what.
+ */
+ if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) {
+ INP_WUNLOCK(inp);
+ goto done;
+ }
+
+ done_with_synqe(sc, synqe);
+ /* inp lock released by done_with_synqe */
+done:
+ send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
+ return (0);
+}
+
+int
+do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(cpl);
+ struct synq_entry *synqe = lookup_tid(sc, tid);
+ struct listen_ctx *lctx = synqe->lctx;
+ struct inpcb *inp = lctx->inp;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_ABORT_RPL_RSS,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
+
+ CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
+ __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
+
+ INP_WLOCK(inp);
+ KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+ ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
+ __func__, synqe, synqe->flags));
+
+ done_with_synqe(sc, synqe);
+ /* inp lock released by done_with_synqe */
+
+ return (0);
+}
+
+void
+t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct synq_entry *synqe = arg;
+#ifdef INVARIANTS
+ struct inpcb *inp = sotoinpcb(so);
+#endif
+ struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
+ struct toepcb *toep = *(struct toepcb **)(cpl + 1);
+
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(synqe_flag(synqe, TPF_SYNQE),
+ ("%s: %p not a synq_entry?", __func__, arg));
+
+ offload_socket(so, toep);
+ make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+ toepcb_set_flag(toep, TPF_CPL_PENDING);
+ update_tid(sc, synqe->tid, toep);
+}
+
+static inline void
+save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
+{
+ uint32_t txqid, rxqid;
+
+ txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
+ rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
+
+ m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
+}
+
+static inline void
+get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
+{
+
+ if (txqid)
+ *txqid = m->m_pkthdr.flowid >> 16;
+ if (rxqid)
+ *rxqid = m->m_pkthdr.flowid & 0xffff;
+}
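+
+/*
+ * The offload tx and rx queue ids chosen for an embryonic connection are
+ * packed into the SYN mbuf's flowid: txqid in the upper 16 bits, rxqid in the
+ * lower 16. They are picked (at random, within the port's offload queue
+ * range) when the SYN arrives and reused for the rest of the connection setup.
+ */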
+
+/*
+ * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
+ * store some state temporarily.
+ */
+static struct synq_entry *
+mbuf_to_synqe(struct mbuf *m)
+{
+ int len = roundup(sizeof (struct synq_entry), 8);
+ int tspace = M_TRAILINGSPACE(m);
+ struct synq_entry *synqe = NULL;
+
+ if (tspace < len) {
+ synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
+ if (synqe == NULL)
+ return (NULL);
+ } else
+ synqe = (void *)(m->m_data + m->m_len + tspace - len);
+
+ synqe->flags = 0;
+ synqe_set_flag(synqe, TPF_SYNQE);
+ if (tspace < len)
+ synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE);
+
+ return (synqe);
+}
+
+static void
+t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
+{
+ bzero(to, sizeof(*to));
+
+ if (t4opt->mss) {
+ to->to_flags |= TOF_MSS;
+ to->to_mss = be16toh(t4opt->mss);
+ }
+
+ if (t4opt->wsf) {
+ to->to_flags |= TOF_SCALE;
+ to->to_wscale = t4opt->wsf;
+ }
+
+ if (t4opt->tstamp)
+ to->to_flags |= TOF_TS;
+
+ if (t4opt->sack)
+ to->to_flags |= TOF_SACKPERM;
+}
+
+/*
+ * Options2 for passive open.
+ */
+static uint32_t
+calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
+ const struct tcp_options *tcpopt, struct tcphdr *th)
+{
+ uint32_t opt2 = 0;
+ struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+
+ if (V_tcp_do_rfc1323) {
+ if (tcpopt->tstamp)
+ opt2 |= F_TSTAMPS_EN;
+ if (tcpopt->sack)
+ opt2 |= F_SACK_EN;
+ if (tcpopt->wsf > 0)
+ opt2 |= F_WND_SCALE_EN;
+ }
+
+ if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
+ opt2 |= F_CCTRL_ECN;
+
+ opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
+ opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
+ opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
+
+ return htobe32(opt2);
+}
+
+/* XXX: duplication. */
+static inline void
+tcp_fields_to_host(struct tcphdr *th)
+{
+
+ th->th_seq = ntohl(th->th_seq);
+ th->th_ack = ntohl(th->th_ack);
+ th->th_win = ntohs(th->th_win);
+ th->th_urp = ntohs(th->th_urp);
+}
+
+static void
+pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
+ struct tcphdr *th)
+{
+ const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
+ const struct ether_header *eh;
+ unsigned int hlen = be32toh(cpl->hdr_len);
+ const struct ip *ip;
+ const struct tcphdr *tcp;
+
+ eh = (const void *)(cpl + 1);
+ ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
+ tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen));
+
+ if (inc) {
+ bzero(inc, sizeof(*inc));
+ inc->inc_faddr = ip->ip_src;
+ inc->inc_laddr = ip->ip_dst;
+ inc->inc_fport = tcp->th_sport;
+ inc->inc_lport = tcp->th_dport;
+ if (ip->ip_v == 6)
+ inc->inc_flags |= INC_ISIPV6;
+ }
+
+ if (th) {
+ bcopy(tcp, th, sizeof(*th));
+ tcp_fields_to_host(th); /* just like tcp_input */
+ }
+}
+
+#define REJECT_PASS_ACCEPT() do { \
+ reject_reason = __LINE__; \
+ goto reject; \
+} while (0)
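+
+/*
+ * reject_reason records the __LINE__ of the check that failed.
+ * do_pass_accept_req returns this nonzero value so a rejected offload can be
+ * traced back to the exact test, and the reject path hands the SYN to the
+ * host stack via if_input so the connection is still handled in software.
+ */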
+
+/*
+ * The context associated with a tid entry via insert_tid could be a synq_entry
+ * or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
+ */
+CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
+
+/*
+ * Incoming SYN on a listening socket.
+ *
+ * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
+ * etc.
+ */
+static int
+do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ struct toedev *tod;
+ const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
+ struct cpl_pass_accept_rpl *rpl;
+ struct wrqe *wr;
+ unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
+ unsigned int tid = GET_TID(cpl);
+ struct listen_ctx *lctx = lookup_stid(sc, stid);
+ struct inpcb *inp;
+ struct socket *so;
+ struct in_conninfo inc;
+ struct tcphdr th;
+ struct tcpopt to;
+ struct port_info *pi;
+ struct ifnet *ifp, *ifp_vlan = NULL;
+ struct l2t_entry *e = NULL;
+ struct rtentry *rt;
+ struct sockaddr_in nam;
+ int rscale, mtu_idx, rx_credits, rxqid;
+ struct synq_entry *synqe = NULL;
+ int reject_reason;
+ uint16_t vid;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+
+ CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
+ lctx);
+
+ pass_accept_req_to_protohdrs(m, &inc, &th);
+ t4opt_to_tcpopt(&cpl->tcpopt, &to);
+
+ pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
+ ifp = pi->ifp;
+ m->m_pkthdr.rcvif = ifp;
+ tod = TOEDEV(ifp);
+
+ /*
+ * Don't offload if the interface that received the SYN doesn't have
+ * IFCAP_TOE enabled.
+ */
+ if ((ifp->if_capenable & IFCAP_TOE4) == 0)
+ REJECT_PASS_ACCEPT();
+
+ /* Don't offload IPv6 connections. XXX: add IPv6 support */
+ if (inc.inc_flags & INC_ISIPV6)
+ REJECT_PASS_ACCEPT();
+
+ /*
+ * Don't offload if the SYN had a VLAN tag and the vid doesn't match
+ * anything on this interface.
+ */
+ vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
+ if (vid != 0xfff) {
+ ifp_vlan = VLAN_DEVAT(ifp, vid);
+ if (ifp_vlan == NULL)
+ REJECT_PASS_ACCEPT();
+ }
+
+ /*
+ * Don't offload if the peer requested a TCP option that's not known to
+ * the silicon.
+ */
+ if (cpl->tcpopt.unknown)
+ REJECT_PASS_ACCEPT();
+
+ /*
+ * Don't offload if the outgoing interface for the route back to the
+ * peer is not the same as the interface that received the SYN.
+ * XXX: too restrictive.
+ */
+ nam.sin_len = sizeof(nam);
+ nam.sin_family = AF_INET;
+ nam.sin_addr = inc.inc_faddr;
+ rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
+ if (rt == NULL)
+ REJECT_PASS_ACCEPT();
+ else {
+ struct sockaddr *nexthop;
+
+ RT_UNLOCK(rt);
+ nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
+ (struct sockaddr *)&nam;
+ if (rt->rt_ifp == ifp ||
+ (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan))
+ e = t4_l2t_get(pi, rt->rt_ifp, nexthop);
+ RTFREE(rt);
+ if (e == NULL)
+ REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */
+ }
+
+ synqe = mbuf_to_synqe(m);
+ if (synqe == NULL)
+ REJECT_PASS_ACCEPT();
+
+ wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
+ if (wr == NULL)
+ REJECT_PASS_ACCEPT();
+ rpl = wrtod(wr);
+
+ INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check, syncache_add */
+
+ /* Don't offload if the 4-tuple is already in use */
+ if (toe_4tuple_check(&inc, &th, ifp) != 0) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ free(wr, M_CXGBE);
+ REJECT_PASS_ACCEPT();
+ }
+
+ inp = lctx->inp; /* listening socket, not owned by TOE */
+ INP_WLOCK(inp);
+
+ /* Don't offload if the listening socket has closed */
+ if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+ /*
+ * The listening socket has closed. The reply from the TOE to
+ * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
+ * resources tied to this listen context.
+ */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ free(wr, M_CXGBE);
+ REJECT_PASS_ACCEPT();
+ }
+ so = inp->inp_socket;
+
+ mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
+ rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
+ SOCKBUF_LOCK(&so->so_rcv);
+ /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+ rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ save_qids_in_mbuf(m, pi);
+ get_qids_from_mbuf(m, NULL, &rxqid);
+
+ INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
+ rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
+ ULP_MODE_NONE);
+ rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);
+
+ synqe->tid = tid;
+ synqe->lctx = lctx;
+ synqe->syn = m;
+ m = NULL;
+ /* 1 so that it is held for the duration of this function */
+ refcount_init(&synqe->refcnt, 1);
+ synqe->l2e_idx = e->idx;
+ synqe->rcv_bufsize = rx_credits;
+ atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
+
+ insert_tid(sc, tid, synqe);
+ TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
+ hold_synqe(synqe); /* hold for the duration it's in the synq */
+ hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */
+
+ /*
+ * If all goes well t4_syncache_respond will get called during
+ * syncache_add. Also note that syncache_add releases both pcbinfo and
+ * pcb locks.
+ */
+ toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
+ INP_UNLOCK_ASSERT(inp); /* ok to assert, we have a ref on the inp */
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+
+ /*
+ * If we replied during syncache_add (synqe->wr has been consumed),
+ * good. Otherwise, set it to 0 so that further syncache_respond
+ * attempts by the kernel will be ignored.
+ *
+ * The extra hold on the synqe makes sure that it is still around, even
+ * if the listener has been dropped, the synqe was aborted, and the reply
+ * to the abort has already removed and released the synqe from the synq
+ * list.
+ */
+ if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
+
+ INP_WLOCK(inp);
+ if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+ /* listener closed. synqe must have been aborted. */
+ KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+ ("%s: listener %p closed but synqe %p not aborted",
+ __func__, inp, synqe));
+
+ CTR5(KTR_CXGBE,
+ "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
+ __func__, stid, tid, lctx, synqe);
+ INP_WUNLOCK(inp);
+ free(wr, M_CXGBE);
+ release_synqe(synqe); /* about to exit function */
+ return (__LINE__);
+ }
+
+ /*
+ * synqe aborted before TOM replied to PASS_ACCEPT_REQ. But
+ * that can only happen if the listener was closed and we just
+ * checked for that.
+ */
+ KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+ ("%s: synqe %p aborted, but listener %p not dropped.",
+ __func__, synqe, inp));
+
+ /* Yank the synqe out of the lctx synq. */
+ TAILQ_REMOVE(&lctx->synq, synqe, link);
+ release_synqe(synqe); /* removed from synq list */
+ inp = release_lctx(sc, lctx);
+ if (inp)
+ INP_WUNLOCK(inp);
+
+ /*
+ * syncache may or may not have a hold on the synqe, which may
+ * or may not be stashed in the original SYN mbuf passed to us.
+ * Just copy it over instead of dealing with all possibilities.
+ */
+ m = m_dup(synqe->syn, M_DONTWAIT);
+ if (m)
+ m->m_pkthdr.rcvif = ifp;
+
+ release_synqe(synqe); /* about to exit function */
+ free(wr, M_CXGBE);
+ REJECT_PASS_ACCEPT();
+ }
+ release_synqe(synqe); /* about to exit function */
+ CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
+ __func__, stid, tid, lctx, synqe);
+ return (0);
+reject:
+ CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
+ reject_reason);
+
+ if (e)
+ t4_l2t_release(e);
+ release_tid(sc, tid, lctx->ctrlq);
+
+ if (__predict_true(m != NULL)) {
+ m_adj(m, sizeof(*cpl));
+ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m->m_pkthdr.csum_data = 0xffff;
+ ifp->if_input(ifp, m);
+ }
+
+ return (reject_reason);
+}
+
+static void
+synqe_to_protohdrs(struct synq_entry *synqe,
+ const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
+ struct tcphdr *th, struct tcpopt *to)
+{
+ uint16_t tcp_opt = be16toh(cpl->tcp_opt);
+
+ /* start off with the original SYN */
+ pass_accept_req_to_protohdrs(synqe->syn, inc, th);
+
+ /* modify parts to make it look like the ACK to our SYN|ACK */
+ th->th_flags = TH_ACK;
+ th->th_ack = synqe->iss + 1;
+ th->th_seq = be32toh(cpl->rcv_isn);
+ bzero(to, sizeof(*to));
+ if (G_TCPOPT_TSTAMP(tcp_opt)) {
+ to->to_flags |= TOF_TS;
+ to->to_tsecr = synqe->ts;
+ }
+}
+
+static int
+do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ struct port_info *pi;
+ struct ifnet *ifp;
+ const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
+#if defined(KTR) || defined(INVARIANTS)
+ unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
+#endif
+ unsigned int tid = GET_TID(cpl);
+ struct synq_entry *synqe = lookup_tid(sc, tid);
+ struct listen_ctx *lctx = synqe->lctx;
+ struct inpcb *inp = lctx->inp;
+ struct socket *so;
+ struct tcphdr th;
+ struct tcpopt to;
+ struct in_conninfo inc;
+ struct toepcb *toep;
+ u_int txqid, rxqid;
+#ifdef INVARIANTS
+ unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
+#endif
+
+ KASSERT(opcode == CPL_PASS_ESTABLISH,
+ ("%s: unexpected opcode 0x%x", __func__, opcode));
+ KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+ KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
+ KASSERT(synqe_flag(synqe, TPF_SYNQE),
+ ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
+
+ INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */
+ INP_WLOCK(inp);
+
+ CTR6(KTR_CXGBE,
+ "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
+ __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
+
+ if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+ /*
+ * The listening socket has closed. The TOM must have aborted
+ * all the embryonic connections (including this one) that were
+ * on the lctx's synq. do_abort_rpl for the tid is responsible
+ * for cleaning up.
+ */
+ KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
+ ("%s: listen socket dropped but tid %u not aborted.",
+ __func__, tid));
+
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (0);
+ }
+
+ ifp = synqe->syn->m_pkthdr.rcvif;
+ pi = ifp->if_softc;
+ KASSERT(pi->adapter == sc,
+ ("%s: pi %p, sc %p mismatch", __func__, pi, sc));
+
+ get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
+ KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
+ ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
+ (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
+
+ toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
+ if (toep == NULL) {
+reset:
+ /* The reply to this abort will perform final cleanup */
+ send_reset_synqe(TOEDEV(ifp), synqe);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (0);
+ }
+ toep->tid = tid;
+ toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
+ toep->ulp_mode = ULP_MODE_NONE;
+ /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+ toep->rx_credits = synqe->rcv_bufsize;
+
+ so = inp->inp_socket;
+ KASSERT(so != NULL, ("%s: socket is NULL", __func__));
+
+ /* Come up with something that syncache_expand should be ok with. */
+ synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);
+
+ /*
+ * No more need for anything in the mbuf that carried the
+ * CPL_PASS_ACCEPT_REQ. Stash the CPL_PASS_ESTABLISH and the toep pointer
+ * there instead. XXX: bad form, but I don't want to increase the size of
+ * synqe.
+ */
+ m = synqe->syn;
+ KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
+ ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
+ bcopy(cpl, mtod(m, void *), sizeof(*cpl));
+ *(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
+
+ if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
+ free_toepcb(toep);
+ goto reset;
+ }
+
+ /* Done with the synqe */
+ TAILQ_REMOVE(&lctx->synq, synqe, link);
+ inp = release_lctx(sc, lctx);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ release_synqe(synqe);
+
+ return (0);
+}
+
+void
+t4_init_listen_cpl_handlers(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+ t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+ t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+ t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
+}
+#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
new file mode 100644
index 0000000..c6e9a1f
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#ifdef TCP_OFFLOAD
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* Module ops */
+static int t4_tom_mod_load(void);
+static int t4_tom_mod_unload(void);
+static int t4_tom_modevent(module_t, int, void *);
+
+/* ULD ops and helpers */
+static int t4_tom_activate(struct adapter *);
+static int t4_tom_deactivate(struct adapter *);
+
+static struct uld_info tom_uld_info = {
+ .uld_id = ULD_TOM,
+ .activate = t4_tom_activate,
+ .deactivate = t4_tom_deactivate,
+};
+
+static void queue_tid_release(struct adapter *, int);
+static void release_offload_resources(struct toepcb *);
+static int alloc_tid_tabs(struct tid_info *);
+static void free_tid_tabs(struct tid_info *);
+static void free_tom_data(struct adapter *, struct tom_data *);
+
+struct toepcb *
+alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags)
+{
+ struct adapter *sc = pi->adapter;
+ struct toepcb *toep;
+ int tx_credits, txsd_total, len;
+
+ /*
+ * The firmware counts tx work request credits in units of 16 bytes.
+ * Reserve room for an ABORT_REQ so the driver never has to worry
+ * about tx credits if it wants to abort a connection.
+ */
+ tx_credits = sc->params.ofldq_wr_cred;
+ tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
+
+ /*
+ * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
+ * immediate payload, and the firmware counts tx work request credits in
+ * units of 16 bytes. Calculate the maximum number of work requests
+ * possible; that also bounds the number of tx software descriptors needed.
+ */
+ txsd_total = tx_credits /
+ howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);
+
+ if (txqid < 0)
+ txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
+ KASSERT(txqid >= pi->first_ofld_txq &&
+ txqid < pi->first_ofld_txq + pi->nofldtxq,
+ ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi,
+ pi->first_ofld_txq, pi->nofldtxq));
+
+ if (rxqid < 0)
+ rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
+ KASSERT(rxqid >= pi->first_ofld_rxq &&
+ rxqid < pi->first_ofld_rxq + pi->nofldrxq,
+ ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi,
+ pi->first_ofld_rxq, pi->nofldrxq));
+
+ len = offsetof(struct toepcb, txsd) +
+ txsd_total * sizeof(struct ofld_tx_sdesc);
+
+ toep = malloc(len, M_CXGBE, M_ZERO | flags);
+ if (toep == NULL)
+ return (NULL);
+
+ toep->td = sc->tom_softc;
+ toep->port = pi;
+ toep->tx_credits = tx_credits;
+ toep->ofld_txq = &sc->sge.ofld_txq[txqid];
+ toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+ toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
+ toep->txsd_total = txsd_total;
+ toep->txsd_avail = txsd_total;
+ toep->txsd_pidx = 0;
+ toep->txsd_cidx = 0;
+
+ return (toep);
+}
+
+void
+free_toepcb(struct toepcb *toep)
+{
+
+ KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+ ("%s: attached to an inpcb", __func__));
+ KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+ ("%s: CPL pending", __func__));
+
+ free(toep, M_CXGBE);
+}
+
+/*
+ * Set up the socket for TCP offload.
+ */
+void
+offload_socket(struct socket *so, struct toepcb *toep)
+{
+ struct tom_data *td = toep->td;
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct sockbuf *sb;
+
+ INP_WLOCK_ASSERT(inp);
+
+ /* Update socket */
+ sb = &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ sb->sb_flags |= SB_NOCOALESCE;
+ SOCKBUF_UNLOCK(sb);
+ sb = &so->so_rcv;
+ SOCKBUF_LOCK(sb);
+ sb->sb_flags |= SB_NOCOALESCE;
+ SOCKBUF_UNLOCK(sb);
+
+ /* Update TCP PCB */
+ tp->tod = &td->tod;
+ tp->t_toe = toep;
+ tp->t_flags |= TF_TOE;
+
+ /* Install an extra hold on inp */
+ toep->inp = inp;
+ toepcb_set_flag(toep, TPF_ATTACHED);
+ in_pcbref(inp);
+
+ /* Add the TOE PCB to the active list */
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+ mtx_unlock(&td->toep_list_lock);
+}
+
+/* This is _not_ the normal way to "unoffload" a socket. */
+void
+undo_offload_socket(struct socket *so)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ struct toepcb *toep = tp->t_toe;
+ struct tom_data *td = toep->td;
+ struct sockbuf *sb;
+
+ INP_WLOCK_ASSERT(inp);
+
+ sb = &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ sb->sb_flags &= ~SB_NOCOALESCE;
+ SOCKBUF_UNLOCK(sb);
+ sb = &so->so_rcv;
+ SOCKBUF_LOCK(sb);
+ sb->sb_flags &= ~SB_NOCOALESCE;
+ SOCKBUF_UNLOCK(sb);
+
+ tp->tod = NULL;
+ tp->t_toe = NULL;
+ tp->t_flags &= ~TF_TOE;
+
+ toep->inp = NULL;
+ toepcb_clr_flag(toep, TPF_ATTACHED);
+ if (in_pcbrele_wlocked(inp))
+ panic("%s: inp freed.", __func__);
+
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_REMOVE(&td->toep_list, toep, link);
+ mtx_unlock(&td->toep_list_lock);
+}
+
+static void
+release_offload_resources(struct toepcb *toep)
+{
+ struct tom_data *td = toep->td;
+ struct adapter *sc = td_adapter(td);
+ int tid = toep->tid;
+
+ KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+ ("%s: %p has CPL pending.", __func__, toep));
+ KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+ ("%s: %p is still attached.", __func__, toep));
+
+ CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)",
+ __func__, toep, tid, toep->l2te);
+
+ if (toep->l2te)
+ t4_l2t_release(toep->l2te);
+
+ if (tid >= 0) {
+ remove_tid(sc, tid);
+ release_tid(sc, tid, toep->ctrlq);
+ }
+
+ mtx_lock(&td->toep_list_lock);
+ TAILQ_REMOVE(&td->toep_list, toep, link);
+ mtx_unlock(&td->toep_list_lock);
+
+ free_toepcb(toep);
+}
+
+/*
+ * The kernel is done with the TCP PCB and this is our opportunity to unhook the
+ * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no
+ * pending CPL) then it is time to release all resources tied to the toepcb.
+ *
+ * Also gets called when an offloaded active open fails and the TOM wants the
+ * kernel to take the TCP PCB back.
+ */
+static void
+t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
+{
+#if defined(KTR) || defined(INVARIANTS)
+ struct inpcb *inp = tp->t_inpcb;
+#endif
+ struct toepcb *toep = tp->t_toe;
+
+ INP_WLOCK_ASSERT(inp);
+
+ KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
+ KASSERT(toepcb_flag(toep, TPF_ATTACHED),
+ ("%s: not attached", __func__));
+
+#ifdef KTR
+ if (tp->t_state == TCPS_SYN_SENT) {
+ CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
+ __func__, toep->tid, toep, toep->flags, inp,
+ inp->inp_flags);
+ } else {
+ CTR6(KTR_CXGBE,
+ "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
+ toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
+ inp->inp_flags);
+ }
+#endif
+
+ tp->t_toe = NULL;
+ tp->t_flags &= ~TF_TOE;
+ toepcb_clr_flag(toep, TPF_ATTACHED);
+
+ if (toepcb_flag(toep, TPF_CPL_PENDING) == 0)
+ release_offload_resources(toep);
+}
+
+/*
+ * The TOE driver will not receive any more CPLs for the tid associated with the
+ * toepcb; release the hold on the inpcb.
+ */
+void
+final_cpl_received(struct toepcb *toep)
+{
+ struct inpcb *inp = toep->inp;
+
+ KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
+ INP_WLOCK_ASSERT(inp);
+ KASSERT(toepcb_flag(toep, TPF_CPL_PENDING),
+ ("%s: CPL not pending already?", __func__));
+
+ CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
+ __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
+
+ toep->inp = NULL;
+ toepcb_clr_flag(toep, TPF_CPL_PENDING);
+
+ if (toepcb_flag(toep, TPF_ATTACHED) == 0)
+ release_offload_resources(toep);
+
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
+}
+
+void
+insert_tid(struct adapter *sc, int tid, void *ctx)
+{
+ struct tid_info *t = &sc->tids;
+
+ t->tid_tab[tid] = ctx;
+ atomic_add_int(&t->tids_in_use, 1);
+}
+
+void *
+lookup_tid(struct adapter *sc, int tid)
+{
+ struct tid_info *t = &sc->tids;
+
+ return (t->tid_tab[tid]);
+}
+
+void
+update_tid(struct adapter *sc, int tid, void *ctx)
+{
+ struct tid_info *t = &sc->tids;
+
+ t->tid_tab[tid] = ctx;
+}
+
+void
+remove_tid(struct adapter *sc, int tid)
+{
+ struct tid_info *t = &sc->tids;
+
+ t->tid_tab[tid] = NULL;
+ atomic_subtract_int(&t->tids_in_use, 1);
+}
+
+void
+release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
+{
+ struct wrqe *wr;
+ struct cpl_tid_release *req;
+
+ wr = alloc_wrqe(sizeof(*req), ctrlq);
+ if (wr == NULL) {
+ queue_tid_release(sc, tid); /* defer */
+ return;
+ }
+ req = wrtod(wr);
+
+ INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);
+
+ t4_wrq_tx(sc, wr);
+}
+
+static void
+queue_tid_release(struct adapter *sc, int tid)
+{
+
+ CXGBE_UNIMPLEMENTED("deferred tid release");
+}
+
+/*
+ * What mtu_idx to use, given a 4-tuple and/or an MSS cap
+ */
+int
+find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
+{
+ unsigned short *mtus = &sc->params.mtus[0];
+ int i = 0, mss;
+
+ KASSERT(inc != NULL || pmss > 0,
+ ("%s: at least one of inc/pmss must be specified", __func__));
+
+ mss = inc ? tcp_mssopt(inc) : pmss;
+ if (pmss > 0 && mss > pmss)
+ mss = pmss;
+
+ while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
+ ++i;
+
+ return (i);
+}
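+
+/*
+ * Worked example (assuming the hardware MTU table has a 1500 byte entry): a
+ * peer MSS of 1460 gives mss + 40 = 1500, the 40 covering the IP and TCP
+ * headers, so the index of that 1500-byte entry is returned.
+ */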
+
+/*
+ * Determine the receive window size for a socket.
+ */
+u_long
+select_rcv_wnd(struct socket *so)
+{
+ unsigned long wnd;
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ wnd = sbspace(&so->so_rcv);
+ if (wnd < MIN_RCV_WND)
+ wnd = MIN_RCV_WND;
+
+ return min(wnd, MAX_RCV_WND);
+}
+
+int
+select_rcv_wscale(void)
+{
+ int wscale = 0;
+ unsigned long space = sb_max;
+
+ if (space > MAX_RCV_WND)
+ space = MAX_RCV_WND;
+
+ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
+ wscale++;
+
+ return (wscale);
+}
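+
+/*
+ * For example, with a kern.ipc.maxsockbuf (sb_max) of 2MB the loop above
+ * stops at wscale = 6: 65535 << 5 is still below 2097152 but 65535 << 6 is
+ * not. (2MB is only an assumption here; sb_max is tunable.)
+ */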
+
+extern int always_keepalive;
+#define VIID_SMACIDX(v) (((unsigned int)(v) & 0x7f) << 1)
+
+/*
+ * The socket 'so' could be a listening socket too.
+ */
+uint64_t
+calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e,
+ int mtu_idx, int rscale, int rx_credits, int ulp_mode)
+{
+ uint64_t opt0;
+
+ KASSERT(rx_credits <= M_RCV_BUFSIZ,
+ ("%s: rcv_bufsiz too high", __func__));
+
+ opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
+ V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
+
+ if (so != NULL) {
+ struct inpcb *inp = sotoinpcb(so);
+ struct tcpcb *tp = intotcpcb(inp);
+ int keepalive = always_keepalive ||
+ so_options_get(so) & SO_KEEPALIVE;
+
+ opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
+ opt0 |= V_KEEP_ALIVE(keepalive != 0);
+ }
+
+ if (e != NULL)
+ opt0 |= V_L2T_IDX(e->idx);
+
+ if (pi != NULL) {
+ opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid));
+ opt0 |= V_TX_CHAN(pi->tx_chan);
+ }
+
+ return htobe64(opt0);
+}
+
+#define FILTER_SEL_WIDTH_P_FC (3 + 1)
+#define FILTER_SEL_WIDTH_VIN_P_FC (6 + 7 + FILTER_SEL_WIDTH_P_FC)
+#define FILTER_SEL_WIDTH_TAG_P_FC (3 + FILTER_SEL_WIDTH_VIN_P_FC)
+#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
+#define VLAN_NONE 0xfff
+#define FILTER_SEL_VLAN_NONE 0xffff
+
+uint32_t
+select_ntuple(struct port_info *pi, struct l2t_entry *e, uint32_t filter_mode)
+{
+ uint16_t viid = pi->viid;
+ uint32_t ntuple = 0;
+
+ if (filter_mode == HW_TPL_FR_MT_PR_IV_P_FC) {
+ if (e->vlan == VLAN_NONE)
+ ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
+ else {
+ ntuple |= e->vlan << FILTER_SEL_WIDTH_P_FC;
+ ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+ }
+ ntuple |= e->lport << S_PORT;
+ ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+ } else if (filter_mode == HW_TPL_FR_MT_PR_OV_P_FC) {
+ ntuple |= G_FW_VIID_VIN(viid) << FILTER_SEL_WIDTH_P_FC;
+ ntuple |= G_FW_VIID_PFN(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
+ ntuple |= G_FW_VIID_VIVLD(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
+ ntuple |= e->lport << S_PORT;
+ ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
+ }
+
+ return (htobe32(ntuple));
+}
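+
+/*
+ * Note that the layout of the compressed filter tuple depends on the ingress
+ * filter mode programmed into TP; only the two modes above are handled, and
+ * any other filter_mode yields an all-zero tuple.
+ */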
+
+static int
+alloc_tid_tabs(struct tid_info *t)
+{
+ size_t size;
+ unsigned int i;
+
+ size = t->ntids * sizeof(*t->tid_tab) +
+ t->natids * sizeof(*t->atid_tab) +
+ t->nstids * sizeof(*t->stid_tab);
+
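+ /*
+ * A single contiguous allocation backs all three tables: atid_tab and
+ * stid_tab are carved out of the memory right after tid_tab, and their
+ * free lists are threaded through the unused entries below.
+ */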
+ t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
+ if (t->tid_tab == NULL)
+ return (ENOMEM);
+
+ mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
+ t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
+ t->afree = t->atid_tab;
+ t->atids_in_use = 0;
+ for (i = 1; i < t->natids; i++)
+ t->atid_tab[i - 1].next = &t->atid_tab[i];
+ t->atid_tab[t->natids - 1].next = NULL;
+
+ mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
+ t->stid_tab = (union serv_entry *)&t->atid_tab[t->natids];
+ t->sfree = t->stid_tab;
+ t->stids_in_use = 0;
+ for (i = 1; i < t->nstids; i++)
+ t->stid_tab[i - 1].next = &t->stid_tab[i];
+ t->stid_tab[t->nstids - 1].next = NULL;
+
+ atomic_store_rel_int(&t->tids_in_use, 0);
+
+ return (0);
+}
+
+static void
+free_tid_tabs(struct tid_info *t)
+{
+ KASSERT(t->tids_in_use == 0,
+ ("%s: %d tids still in use.", __func__, t->tids_in_use));
+ KASSERT(t->atids_in_use == 0,
+ ("%s: %d atids still in use.", __func__, t->atids_in_use));
+ KASSERT(t->stids_in_use == 0,
+ ("%s: %d stids still in use.", __func__, t->stids_in_use));
+
+ free(t->tid_tab, M_CXGBE);
+ t->tid_tab = NULL;
+
+ if (mtx_initialized(&t->atid_lock))
+ mtx_destroy(&t->atid_lock);
+ if (mtx_initialized(&t->stid_lock))
+ mtx_destroy(&t->stid_lock);
+}
+
+static void
+free_tom_data(struct adapter *sc, struct tom_data *td)
+{
+ KASSERT(TAILQ_EMPTY(&td->toep_list),
+ ("%s: TOE PCB list is not empty.", __func__));
+ KASSERT(td->lctx_count == 0,
+ ("%s: lctx hash table is not empty.", __func__));
+
+ t4_uninit_l2t_cpl_handlers(sc);
+
+ if (td->listen_mask != 0)
+ hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
+
+ if (mtx_initialized(&td->lctx_hash_lock))
+ mtx_destroy(&td->lctx_hash_lock);
+ if (mtx_initialized(&td->toep_list_lock))
+ mtx_destroy(&td->toep_list_lock);
+
+ free_tid_tabs(&sc->tids);
+ free(td, M_CXGBE);
+}
+
+/*
+ * Ground control to Major TOM
+ * Commencing countdown, engines on
+ */
+static int
+t4_tom_activate(struct adapter *sc)
+{
+ struct tom_data *td;
+ struct toedev *tod;
+ int i, rc;
+
+ ADAPTER_LOCK_ASSERT_OWNED(sc); /* for sc->flags */
+
+ /* per-adapter softc for TOM */
+ td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
+ if (td == NULL)
+ return (ENOMEM);
+
+ /* List of TOE PCBs and associated lock */
+ mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
+ TAILQ_INIT(&td->toep_list);
+
+ /* Listen context */
+ mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
+ td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
+ &td->listen_mask, HASH_NOWAIT);
+
+ /* TID tables */
+ rc = alloc_tid_tabs(&sc->tids);
+ if (rc != 0)
+ goto done;
+
+ /* CPL handlers */
+ t4_init_connect_cpl_handlers(sc);
+ t4_init_l2t_cpl_handlers(sc);
+ t4_init_listen_cpl_handlers(sc);
+ t4_init_cpl_io_handlers(sc);
+
+ /* toedev ops */
+ tod = &td->tod;
+ init_toedev(tod);
+ tod->tod_softc = sc;
+ tod->tod_connect = t4_connect;
+ tod->tod_listen_start = t4_listen_start;
+ tod->tod_listen_stop = t4_listen_stop;
+ tod->tod_rcvd = t4_rcvd;
+ tod->tod_output = t4_tod_output;
+ tod->tod_send_rst = t4_send_rst;
+ tod->tod_send_fin = t4_send_fin;
+ tod->tod_pcb_detach = t4_pcb_detach;
+ tod->tod_l2_update = t4_l2_update;
+ tod->tod_syncache_added = t4_syncache_added;
+ tod->tod_syncache_removed = t4_syncache_removed;
+ tod->tod_syncache_respond = t4_syncache_respond;
+ tod->tod_offload_socket = t4_offload_socket;
+
+ for_each_port(sc, i)
+ TOEDEV(sc->port[i]->ifp) = &td->tod;
+
+ sc->tom_softc = td;
+ sc->flags |= TOM_INIT_DONE;
+ register_toedev(sc->tom_softc);
+
+done:
+ if (rc != 0)
+ free_tom_data(sc, td);
+ return (rc);
+}
+
+static int
+t4_tom_deactivate(struct adapter *sc)
+{
+ int rc = 0;
+ struct tom_data *td = sc->tom_softc;
+
+ ADAPTER_LOCK_ASSERT_OWNED(sc); /* for sc->flags */
+
+ if (td == NULL)
+ return (0); /* XXX. KASSERT? */
+
+ if (sc->offload_map != 0)
+ return (EBUSY); /* at least one port has IFCAP_TOE enabled */
+
+ mtx_lock(&td->toep_list_lock);
+ if (!TAILQ_EMPTY(&td->toep_list))
+ rc = EBUSY;
+ mtx_unlock(&td->toep_list_lock);
+
+ mtx_lock(&td->lctx_hash_lock);
+ if (td->lctx_count > 0)
+ rc = EBUSY;
+ mtx_unlock(&td->lctx_hash_lock);
+
+ if (rc == 0) {
+ unregister_toedev(sc->tom_softc);
+ free_tom_data(sc, td);
+ sc->tom_softc = NULL;
+ sc->flags &= ~TOM_INIT_DONE;
+ }
+
+ return (rc);
+}
+
+static int
+t4_tom_mod_load(void)
+{
+ int rc;
+
+ rc = t4_register_uld(&tom_uld_info);
+ if (rc != 0)
+ t4_tom_mod_unload();
+
+ return (rc);
+}
+
+static void
+tom_uninit(struct adapter *sc, void *arg __unused)
+{
+ /* Try to free resources (works only if no port has IFCAP_TOE) */
+ ADAPTER_LOCK(sc);
+ if (sc->flags & TOM_INIT_DONE)
+ t4_deactivate_uld(sc, ULD_TOM);
+ ADAPTER_UNLOCK(sc);
+}
+
+static int
+t4_tom_mod_unload(void)
+{
+ t4_iterate(tom_uninit, NULL);
+
+ if (t4_unregister_uld(&tom_uld_info) == EBUSY)
+ return (EBUSY);
+
+ return (0);
+}
+#endif /* TCP_OFFLOAD */
+
+static int
+t4_tom_modevent(module_t mod, int cmd, void *arg)
+{
+ int rc = 0;
+
+#ifdef TCP_OFFLOAD
+ switch (cmd) {
+ case MOD_LOAD:
+ rc = t4_tom_mod_load();
+ break;
+
+ case MOD_UNLOAD:
+ rc = t4_tom_mod_unload();
+ break;
+
+ default:
+ rc = EINVAL;
+ }
+#else
+ printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
+ rc = EOPNOTSUPP;
+#endif
+ return (rc);
+}
+
+static moduledata_t t4_tom_moddata = {
+ "t4_tom",
+ t4_tom_modevent,
+ 0
+};
+
+MODULE_VERSION(t4_tom, 1);
+MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
+MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
+DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
new file mode 100644
index 0000000..4e171e7
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -0,0 +1,248 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __T4_TOM_H__
+#define __T4_TOM_H__
+
+#define KTR_CXGBE KTR_SPARE3
+#define LISTEN_HASH_SIZE 32
+
+/*
+ * Min receive window. We want it to be large enough to accommodate receive
+ * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
+ */
+#define MIN_RCV_WND (24 * 1024U)
+
+/*
+ * Max receive window supported by HW in bytes. Only a small part of it can
+ * be set through opt0; the rest needs to be set through RX_DATA_ACK.
+ */
+#define MAX_RCV_WND ((1U << 27) - 1)
+
+/* TOE PCB flags */
+enum {
+ TPF_ATTACHED, /* a tcpcb refers to this toepcb */
+ TPF_FLOWC_WR_SENT, /* firmware flow context WR sent */
+ TPF_TX_DATA_SENT, /* some data sent */
+ TPF_TX_SUSPENDED, /* tx suspended for lack of resources */
+ TPF_SEND_FIN, /* send FIN after sending all pending data */
+ TPF_FIN_SENT, /* FIN has been sent */
+ TPF_ABORT_SHUTDOWN, /* connection abort is in progress */
+ TPF_CPL_PENDING, /* haven't received the last CPL */
+ TPF_SYNQE, /* synq_entry, not really a toepcb */
+ TPF_SYNQE_NEEDFREE, /* synq_entry was allocated externally */
+};
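+
+/*
+ * Note: the TPF_* values above are bit positions, not masks. They are used
+ * with setbit/clrbit/isset (see the toepcb_flag helpers below), and the same
+ * numbering is shared with synq_entry's flags (see the CTASSERT in
+ * t4_listen.c).
+ */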
+
+struct ofld_tx_sdesc {
+ uint32_t plen; /* payload length */
+ uint8_t tx_credits; /* firmware tx credits (unit is 16B) */
+};
+
+struct toepcb {
+ TAILQ_ENTRY(toepcb) link; /* toep_list */
+ unsigned int flags; /* miscellaneous flags */
+ struct tom_data *td;
+ struct inpcb *inp; /* backpointer to host stack's PCB */
+ struct port_info *port; /* physical port */
+ struct sge_wrq *ofld_txq;
+ struct sge_ofld_rxq *ofld_rxq;
+ struct sge_wrq *ctrlq;
+ struct l2t_entry *l2te; /* L2 table entry used by this connection */
+ int tid; /* Connection identifier */
+ unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */
+ unsigned int enqueued; /* # of bytes added to so_rcv (not yet read) */
+ int rx_credits; /* rx credits (in bytes) to be returned to hw */
+
+ unsigned int ulp_mode; /* ULP mode */
+
+ /* Tx software descriptor */
+ uint8_t txsd_total;
+ uint8_t txsd_pidx;
+ uint8_t txsd_cidx;
+ uint8_t txsd_avail;
+ struct ofld_tx_sdesc txsd[];
+};
+
+struct flowc_tx_params {
+ uint32_t snd_nxt;
+ uint32_t rcv_nxt;
+ unsigned int snd_space;
+ unsigned int mss;
+};
+
+static inline int
+toepcb_flag(struct toepcb *toep, int flag)
+{
+
+ return isset(&toep->flags, flag);
+}
+
+static inline void
+toepcb_set_flag(struct toepcb *toep, int flag)
+{
+
+ setbit(&toep->flags, flag);
+}
+
+static inline void
+toepcb_clr_flag(struct toepcb *toep, int flag)
+{
+
+ clrbit(&toep->flags, flag);
+}
+
+/*
+ * Compressed state for a listener's embryonic connections. It barely fits in
+ * 64B; try not to grow it further.
+ */
+struct synq_entry {
+ TAILQ_ENTRY(synq_entry) link; /* listen_ctx's synq link */
+ int flags; /* same as toepcb's tp_flags */
+ int tid;
+ struct listen_ctx *lctx; /* backpointer to listen ctx */
+ struct mbuf *syn;
+ uint32_t iss;
+ uint32_t ts;
+ volatile uintptr_t wr;
+ volatile u_int refcnt;
+ uint16_t l2e_idx;
+ uint16_t rcv_bufsize;
+};
+
+static inline int
+synqe_flag(struct synq_entry *synqe, int flag)
+{
+
+ return isset(&synqe->flags, flag);
+}
+
+static inline void
+synqe_set_flag(struct synq_entry *synqe, int flag)
+{
+
+ setbit(&synqe->flags, flag);
+}
+
+static inline void
+synqe_clr_flag(struct synq_entry *synqe, int flag)
+{
+
+ clrbit(&synqe->flags, flag);
+}
+
+/* listen_ctx flags */
+#define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */
+
+struct listen_ctx {
+ LIST_ENTRY(listen_ctx) link; /* listen hash linkage */
+ volatile int refcount;
+ int stid;
+ int flags;
+ struct inpcb *inp; /* listening socket's inp */
+ struct sge_wrq *ctrlq;
+ struct sge_ofld_rxq *ofld_rxq;
+ TAILQ_HEAD(, synq_entry) synq;
+};
+
+struct tom_data {
+ struct toedev tod;
+
+ /* toepcb's associated with this TOE device */
+ struct mtx toep_list_lock;
+ TAILQ_HEAD(, toepcb) toep_list;
+
+ LIST_HEAD(, listen_ctx) *listen_hash;
+ u_long listen_mask;
+ int lctx_count; /* # of lctx in the hash table */
+ struct mtx lctx_hash_lock;
+};
+
+static inline struct tom_data *
+tod_td(struct toedev *tod)
+{
+
+ return (member2struct(tom_data, tod, tod));
+}
+
+static inline struct adapter *
+td_adapter(struct tom_data *td)
+{
+
+ return (td->tod.tod_softc);
+}
+
+/* t4_tom.c */
+struct toepcb *alloc_toepcb(struct port_info *, int, int, int);
+void free_toepcb(struct toepcb *);
+void offload_socket(struct socket *, struct toepcb *);
+void undo_offload_socket(struct socket *);
+void final_cpl_received(struct toepcb *);
+void insert_tid(struct adapter *, int, void *);
+void *lookup_tid(struct adapter *, int);
+void update_tid(struct adapter *, int, void *);
+void remove_tid(struct adapter *, int);
+void release_tid(struct adapter *, int, struct sge_wrq *);
+int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
+u_long select_rcv_wnd(struct socket *);
+int select_rcv_wscale(void);
+uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *,
+ int, int, int, int);
+uint32_t select_ntuple(struct port_info *, struct l2t_entry *, uint32_t);
+
+/* t4_connect.c */
+void t4_init_connect_cpl_handlers(struct adapter *);
+int t4_connect(struct toedev *, struct socket *, struct rtentry *,
+ struct sockaddr *);
+
+/* t4_listen.c */
+void t4_init_listen_cpl_handlers(struct adapter *);
+int t4_listen_start(struct toedev *, struct tcpcb *);
+int t4_listen_stop(struct toedev *, struct tcpcb *);
+void t4_syncache_added(struct toedev *, void *);
+void t4_syncache_removed(struct toedev *, void *);
+int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
+int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
+ struct mbuf *);
+int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
+ struct mbuf *);
+void t4_offload_socket(struct toedev *, void *, struct socket *);
+
+/* t4_cpl_io.c */
+void t4_init_cpl_io_handlers(struct adapter *);
+void send_abort_rpl(struct adapter *, struct sge_wrq *, int, int);
+void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
+void send_reset(struct adapter *, struct toepcb *, uint32_t);
+void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
+void t4_rcvd(struct toedev *, struct tcpcb *);
+int t4_tod_output(struct toedev *, struct tcpcb *);
+int t4_send_fin(struct toedev *, struct tcpcb *);
+int t4_send_rst(struct toedev *, struct tcpcb *);
+
+#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c
new file mode 100644
index 0000000..ffe64c5
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c
@@ -0,0 +1,405 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sbuf.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/toecore.h>
+
+#include "common/common.h"
+#include "common/jhash.h"
+#include "common/t4_msg.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+#define VLAN_NONE 0xfff
+
+#define SA(x) ((struct sockaddr *)(x))
+#define SIN(x) ((struct sockaddr_in *)(x))
+#define SINADDR(x) (SIN(x)->sin_addr.s_addr)
+
+static inline void
+l2t_hold(struct l2t_data *d, struct l2t_entry *e)
+{
+ if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */
+ atomic_subtract_int(&d->nfree, 1);
+}
+
+static inline unsigned int
+arp_hash(const uint32_t key, int ifindex)
+{
+	return (jhash_2words(key, ifindex, 0) & (L2T_SIZE - 1));
+}
+
+/*
+ * Add a WR to an L2T entry's queue of work requests awaiting resolution.
+ * Must be called with the entry's lock held.
+ */
+static inline void
+arpq_enqueue(struct l2t_entry *e, struct wrqe *wr)
+{
+ mtx_assert(&e->lock, MA_OWNED);
+
+ STAILQ_INSERT_TAIL(&e->wr_list, wr, link);
+}
+
+static inline void
+send_pending(struct adapter *sc, struct l2t_entry *e)
+{
+ struct wrqe *wr;
+
+ mtx_assert(&e->lock, MA_OWNED);
+
+ while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
+ STAILQ_REMOVE_HEAD(&e->wr_list, link);
+ t4_wrq_tx(sc, wr);
+ }
+}
+
+static void
+resolution_failed_for_wr(struct wrqe *wr)
+{
+	log(LOG_ERR, "%s: leaked work request %p, wr_len %d\n", __func__, wr,
+	    wr->wr_len);
+
+ /* free(wr, M_CXGBE); */
+}
+
+static void
+resolution_failed(struct l2t_entry *e)
+{
+ struct wrqe *wr;
+
+ mtx_assert(&e->lock, MA_OWNED);
+
+ while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
+ STAILQ_REMOVE_HEAD(&e->wr_list, link);
+ resolution_failed_for_wr(wr);
+ }
+}
+
+static void
+update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr,
+ uint16_t vtag)
+{
+
+ mtx_assert(&e->lock, MA_OWNED);
+
+	/*
+	 * The entry may be in active use (e->refcnt > 0) or not.  We update
+	 * it even when it's not as this simplifies the case where we decide to
+	 * reuse the entry later.
+	 */
+
+ if (lladdr == NULL &&
+ (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) {
+ /*
+ * Never got a valid L2 address for this one. Just mark it as
+ * failed instead of removing it from the hash (for which we'd
+ * need to wlock the table).
+ */
+ e->state = L2T_STATE_FAILED;
+ resolution_failed(e);
+ return;
+
+ } else if (lladdr == NULL) {
+
+ /* Valid or already-stale entry was deleted (or expired) */
+
+ KASSERT(e->state == L2T_STATE_VALID ||
+ e->state == L2T_STATE_STALE,
+ ("%s: lladdr NULL, state %d", __func__, e->state));
+
+ e->state = L2T_STATE_STALE;
+
+ } else {
+
+ if (e->state == L2T_STATE_RESOLVING ||
+ e->state == L2T_STATE_FAILED ||
+ memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) {
+
+ /* unresolved -> resolved; or dmac changed */
+
+ memcpy(e->dmac, lladdr, ETHER_ADDR_LEN);
+ e->vlan = vtag;
+ t4_write_l2e(sc, e, 1);
+ }
+ e->state = L2T_STATE_VALID;
+ }
+}
+
+static int
+resolve_entry(struct adapter *sc, struct l2t_entry *e)
+{
+ struct tom_data *td = sc->tom_softc;
+ struct toedev *tod = &td->tod;
+ struct sockaddr_in sin = {0};
+ uint8_t dmac[ETHER_ADDR_LEN];
+ uint16_t vtag = VLAN_NONE;
+ int rc;
+
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(struct sockaddr_in);
+ SINADDR(&sin) = e->addr;
+
+ rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag);
+ if (rc == EWOULDBLOCK)
+ return (rc);
+
+ mtx_lock(&e->lock);
+ update_entry(sc, e, rc == 0 ? dmac : NULL, vtag);
+ mtx_unlock(&e->lock);
+
+ return (rc);
+}
+
+int
+t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
+{
+
+again:
+ switch (e->state) {
+ case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
+
+ if (resolve_entry(sc, e) != EWOULDBLOCK)
+ goto again; /* entry updated, re-examine state */
+
+ /* Fall through */
+
+ case L2T_STATE_VALID: /* fast-path, send the packet on */
+
+ t4_wrq_tx(sc, wr);
+ return (0);
+
+ case L2T_STATE_RESOLVING:
+ case L2T_STATE_SYNC_WRITE:
+
+ mtx_lock(&e->lock);
+ if (e->state != L2T_STATE_SYNC_WRITE &&
+ e->state != L2T_STATE_RESOLVING) {
+ /* state changed by the time we got here */
+ mtx_unlock(&e->lock);
+ goto again;
+ }
+ arpq_enqueue(e, wr);
+ mtx_unlock(&e->lock);
+
+ if (resolve_entry(sc, e) == EWOULDBLOCK)
+ break;
+
+ mtx_lock(&e->lock);
+ if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list))
+ send_pending(sc, e);
+ if (e->state == L2T_STATE_FAILED)
+ resolution_failed(e);
+ mtx_unlock(&e->lock);
+ break;
+
+ case L2T_STATE_FAILED:
+ resolution_failed_for_wr(wr);
+ return (EHOSTUNREACH);
+ }
+
+ return (0);
+}
+
+/*
+ * CPL_L2T_WRITE_RPL handler used while TOM is loaded.  It runs the base
+ * driver's handler first and then, if the reply is for a synchronous write,
+ * transmits any work requests that were waiting for the L2T entry to be
+ * written to the hardware.
+ */
+static int
+do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss,
+ struct mbuf *m)
+{
+ struct adapter *sc = iq->adapter;
+ const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
+ unsigned int tid = GET_TID(rpl);
+ unsigned int idx = tid & (L2T_SIZE - 1);
+ int rc;
+
+ rc = do_l2t_write_rpl(iq, rss, m);
+ if (rc != 0)
+ return (rc);
+
+ if (tid & F_SYNC_WR) {
+ struct l2t_entry *e = &sc->l2t->l2tab[idx];
+
+ mtx_lock(&e->lock);
+ if (e->state != L2T_STATE_SWITCHING) {
+ send_pending(sc, e);
+ e->state = L2T_STATE_VALID;
+ }
+ mtx_unlock(&e->lock);
+ }
+
+ return (0);
+}
+
+void
+t4_init_l2t_cpl_handlers(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl2);
+}
+
+void
+t4_uninit_l2t_cpl_handlers(struct adapter *sc)
+{
+
+ t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
+}
+
+/*
+ * The TOE wants an L2 table entry that it can use to reach the next hop over
+ * the specified port. Produce such an entry - create one if needed.
+ *
+ * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on
+ * top of the real cxgbe interface.
+ */
+struct l2t_entry *
+t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
+{
+ struct l2t_entry *e;
+ struct l2t_data *d = pi->adapter->l2t;
+ uint32_t addr = SINADDR(sa);
+ int hash = arp_hash(addr, ifp->if_index);
+ unsigned int smt_idx = pi->port_id;
+
+ if (sa->sa_family != AF_INET)
+ return (NULL); /* XXX: no IPv6 support right now */
+
+#ifndef VLAN_TAG
+ if (ifp->if_type == IFT_L2VLAN)
+ return (NULL);
+#endif
+
+ rw_wlock(&d->lock);
+ for (e = d->l2tab[hash].first; e; e = e->next) {
+ if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) {
+ l2t_hold(d, e);
+ goto done;
+ }
+ }
+
+ /* Need to allocate a new entry */
+ e = t4_alloc_l2e(d);
+ if (e) {
+ mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
+ e->next = d->l2tab[hash].first;
+ d->l2tab[hash].first = e;
+
+ e->state = L2T_STATE_RESOLVING;
+ e->addr = addr;
+ e->ifp = ifp;
+ e->smt_idx = smt_idx;
+ e->hash = hash;
+ e->lport = pi->lport;
+ atomic_store_rel_int(&e->refcnt, 1);
+#ifdef VLAN_TAG
+ if (ifp->if_type == IFT_L2VLAN)
+ VLAN_TAG(ifp, &e->vlan);
+ else
+ e->vlan = VLAN_NONE;
+#endif
+ mtx_unlock(&e->lock);
+ }
+done:
+ rw_wunlock(&d->lock);
+	return (e);
+}
+
+/*
+ * Called when the host's ARP layer makes a change to some entry that is loaded
+ * into the HW L2 table.
+ */
+void
+t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
+ uint8_t *lladdr, uint16_t vtag)
+{
+ struct adapter *sc = tod->tod_softc;
+ struct l2t_entry *e;
+ struct l2t_data *d = sc->l2t;
+ uint32_t addr = SINADDR(sa);
+ int hash = arp_hash(addr, ifp->if_index);
+
+ KASSERT(d != NULL, ("%s: no L2 table", __func__));
+
+ rw_rlock(&d->lock);
+ for (e = d->l2tab[hash].first; e; e = e->next) {
+ if (e->addr == addr && e->ifp == ifp) {
+ mtx_lock(&e->lock);
+ if (atomic_load_acq_int(&e->refcnt))
+ goto found;
+ e->state = L2T_STATE_STALE;
+ mtx_unlock(&e->lock);
+ break;
+ }
+ }
+ rw_runlock(&d->lock);
+
+ /*
+ * This is of no interest to us. We've never had an offloaded
+ * connection to this destination, and we aren't attempting one right
+ * now.
+ */
+ return;
+
+found:
+ rw_runlock(&d->lock);
+
+ KASSERT(e->state != L2T_STATE_UNUSED,
+ ("%s: unused entry in the hash.", __func__));
+
+ update_entry(sc, e, lladdr, vtag);
+ mtx_unlock(&e->lock);
+}
+#endif
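
t4_tom_l2t.c hashes the next hop by (IPv4 address, interface index) with jhash_2words(), chains entries per bucket under the table's rwlock, and parks work requests on an entry that is still L2T_STATE_RESOLVING; once resolution completes, update_entry() writes the MAC to the hardware and send_pending() transmits the parked requests in order. The sketch below models only that bucket-plus-pending-queue pattern in plain userland C; all names are invented, and it deliberately omits the locking, reference counting, and hardware write that the real code performs.

/*
 * Hypothetical userland model of an L2T hash bucket with a queue of
 * requests parked until address resolution completes (illustration only;
 * not the driver's implementation).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum l2_state { RESOLVING, VALID, FAILED };

struct pending {
	const char *wr;			/* stand-in for a work request */
	struct pending *next;
};

struct l2e {
	uint32_t addr;			/* peer IPv4 address */
	int ifindex;			/* egress interface */
	enum l2_state state;
	uint8_t dmac[6];		/* resolved link-layer address */
	struct pending *head, *tail;	/* FIFO of parked requests */
	struct l2e *next;		/* hash-bucket chain */
};

static struct l2e *bucket;		/* one bucket is enough for the demo */

static struct l2e *
l2_get(uint32_t addr, int ifindex)
{
	struct l2e *e;

	for (e = bucket; e != NULL; e = e->next)
		if (e->addr == addr && e->ifindex == ifindex)
			return (e);	/* reuse an existing entry */
	e = calloc(1, sizeof(*e));
	e->addr = addr;
	e->ifindex = ifindex;
	e->state = RESOLVING;
	e->next = bucket;
	bucket = e;
	return (e);
}

static void
l2_send(struct l2e *e, const char *wr)
{
	struct pending *p;

	if (e->state == VALID) {
		printf("tx now: %s\n", wr);	/* fast path */
		return;
	}
	p = calloc(1, sizeof(*p));
	p->wr = wr;
	if (e->tail != NULL)
		e->tail->next = p;
	else
		e->head = p;
	e->tail = p;			/* park until resolution finishes */
}

static void
l2_resolved(struct l2e *e, const uint8_t dmac[6])
{
	struct pending *p;

	memcpy(e->dmac, dmac, 6);
	e->state = VALID;
	while ((p = e->head) != NULL) {	/* flush everything that was parked */
		e->head = p->next;
		printf("tx pending: %s\n", p->wr);
		free(p);
	}
	e->tail = NULL;
}

int
main(void)
{
	uint8_t mac[6] = { 0x00, 0x07, 0x43, 0x11, 0x22, 0x33 };
	struct l2e *e = l2_get(0x0a000001, 1);

	l2_send(e, "wr1");		/* queued: entry still resolving */
	l2_resolved(e, mac);		/* resolution done: queue flushed */
	l2_send(e, "wr2");		/* now takes the fast path */
	return (0);
}
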
diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.h b/sys/dev/cxgbe/tom/t4_tom_l2t.h
new file mode 100644
index 0000000..3d76735
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef __T4_TOM_L2T_H
+#define __T4_TOM_L2T_H
+
+#include "t4_l2t.h"
+
+int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *);
+struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
+ struct sockaddr *);
+void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *,
+ uint8_t *, uint16_t);
+void t4_init_l2t_cpl_handlers(struct adapter *);
+void t4_uninit_l2t_cpl_handlers(struct adapter *);
+
+static inline int
+t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
+{
+ if (__predict_true(e->state == L2T_STATE_VALID)) {
+ t4_wrq_tx(sc, wr);
+ return (0);
+ } else
+ return (t4_l2t_send_slow(sc, wr, e));
+}
+
+#endif /* __T4_TOM_L2T_H */
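
The inline t4_l2t_send() above keeps only the L2T_STATE_VALID case in the caller, annotated with __predict_true, and hands every other state to t4_l2t_send_slow() in t4_tom_l2t.c, so the common case costs one well-predicted branch plus the queue write. A minimal standalone sketch of that fast-path/slow-path split follows, assuming a GCC/Clang toolchain; the predict_true macro is a stand-in for the kernel's __predict_true and all other names are invented.

/*
 * Minimal sketch of an inline fast path with a slow-path fallback
 * (illustration only; not the driver's code).
 */
#include <stdio.h>

/* Stand-in for the kernel's __predict_true branch-prediction hint. */
#define predict_true(x)	__builtin_expect(!!(x), 1)

enum state { STATE_VALID, STATE_RESOLVING };

static int
send_slow(enum state st, const char *wr)
{
	/* The rare cases: park the request, kick off resolution, etc. */
	printf("slow path (state %d): %s\n", (int)st, wr);
	return (0);
}

static inline int
send_fast(enum state st, const char *wr)
{
	if (predict_true(st == STATE_VALID)) {
		printf("fast path: %s\n", wr);	/* common case stays inline */
		return (0);
	}
	return (send_slow(st, wr));
}

int
main(void)
{
	send_fast(STATE_VALID, "wr1");
	send_fast(STATE_RESOLVING, "wr2");
	return (0);
}
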