diff options
author | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
---|---|---|
committer | np <np@FreeBSD.org> | 2012-06-19 07:34:13 +0000 |
commit | 67d5f1a727273d8e141e96c429114dff9fb06ec3 (patch) | |
tree | 9255a545bbd49a0458ed8850371b4fe6ed2cd01f /sys/dev/cxgbe | |
parent | 27063437e23a5e5e7debf9144ee974d21b6a6774 (diff) | |
download | FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.zip FreeBSD-src-67d5f1a727273d8e141e96c429114dff9fb06ec3.tar.gz |
- Updated TOE support in the kernel.
- Stateful TCP offload drivers for Terminator 3 and 4 (T3 and T4) ASICs.
These are available as t3_tom and t4_tom modules that augment cxgb(4)
and cxgbe(4) respectively. The cxgb/cxgbe drivers continue to work as
usual with or without these extra features.
- iWARP driver for Terminator 3 ASIC (kernel verbs). T4 iWARP is in the
works and will follow soon.
Build-tested with make universe.
30-second overview
============
What interfaces support TCP offload? Look for TOE4 and/or TOE6 in the
capabilities of an interface:
# ifconfig -m | grep TOE
Enable/disable TCP offload on an interface (just like any other ifnet
capability):
# ifconfig cxgbe0 toe
# ifconfig cxgbe0 -toe
Which connections are offloaded? Look for toe4 and/or toe6 in the
output of netstat and sockstat:
# netstat -np tcp | grep toe
# sockstat -46c | grep toe
Reviewed by: bz, gnn
Sponsored by: Chelsio Communications.
MFC after: ~3 months (after 9.1, and after ensuring MFC is feasible)
Diffstat (limited to 'sys/dev/cxgbe')
-rw-r--r-- | sys/dev/cxgbe/adapter.h | 103 | ||||
-rw-r--r-- | sys/dev/cxgbe/common/t4_hw.c | 2 | ||||
-rw-r--r-- | sys/dev/cxgbe/offload.h | 19 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_l2t.c | 563 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_l2t.h | 55 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_main.c | 213 | ||||
-rw-r--r-- | sys/dev/cxgbe/t4_sge.c | 128 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_connect.c | 377 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_cpl_io.c | 1276 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_listen.c | 1362 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom.c | 755 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom.h | 248 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom_l2t.c | 405 | ||||
-rw-r--r-- | sys/dev/cxgbe/tom/t4_tom_l2t.h | 53 |
14 files changed, 4840 insertions, 719 deletions
diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 6be75bc..ba5335a 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -157,6 +157,7 @@ enum { INTR_DIRECT = (1 << 2), /* direct interrupts for everything */ MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), + TOM_INIT_DONE = (1 << 5), CXGBE_BUSY = (1 << 9), @@ -199,7 +200,7 @@ struct port_info { int first_txq; /* index of first tx queue */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ @@ -213,6 +214,8 @@ struct port_info { struct link_config link_cfg; struct port_stats stats; + eventhandler_tag vlan_c; + struct callout tick; struct sysctl_ctx_list ctx; /* from ifconfig up to driver detach */ @@ -296,7 +299,7 @@ struct sge_iq { enum { EQ_CTRL = 1, EQ_ETH = 2, -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD EQ_OFLD = 3, #endif @@ -422,14 +425,36 @@ struct sge_rxq { } __aligned(CACHE_LINE_SIZE); -#ifndef TCP_OFFLOAD_DISABLE +static inline struct sge_rxq * +iq_to_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_rxq, iq, iq)); +} + + +#ifdef TCP_OFFLOAD /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ } __aligned(CACHE_LINE_SIZE); + +static inline struct sge_ofld_rxq * +iq_to_ofld_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_ofld_rxq, iq, iq)); +} #endif +struct wrqe { + STAILQ_ENTRY(wrqe) link; + struct sge_wrq *wrq; + int wr_len; + uint64_t wr[] __aligned(16); +}; + /* * wrq: SGE egress queue that is given prebuilt work requests. Both the control * and offload tx queues are of this type. 
@@ -438,8 +463,9 @@ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; - struct mbuf *head; /* held up due to lack of descriptors */ - struct mbuf *tail; /* valid only if head is valid */ + + /* List of WRs held up due to lack of tx descriptors */ + STAILQ_HEAD(, wrqe) wr_list; /* stats for common events first */ @@ -457,7 +483,7 @@ struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx tx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ #endif @@ -469,7 +495,7 @@ struct sge { struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_wrq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ #endif @@ -483,6 +509,7 @@ struct sge { struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); +typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); struct adapter { SLIST_ENTRY(adapter) link; @@ -519,15 +546,15 @@ struct adapter { uint8_t chan_map[NCHAN]; uint32_t filter_mode; -#ifndef TCP_OFFLOAD_DISABLE - struct uld_softc tom; +#ifdef TCP_OFFLOAD + void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; #endif struct l2t_data *l2t; /* L2 table */ struct tid_info tids; int open_device_map; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int offload_map; #endif int flags; @@ -554,7 +581,8 @@ struct adapter { TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; - cpl_handler_t cpl_handler[256] __aligned(CACHE_LINE_SIZE); + an_handler_t an_handler __aligned(CACHE_LINE_SIZE); + cpl_handler_t cpl_handler[256]; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) @@ -609,82 +637,96 @@ struct adapter { static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { + return 
bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { + bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { + return t4_bus_space_read_8(sc->bt, sc->bh, reg); } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { + t4_bus_space_write_8(sc->bt, sc->bh, reg, val); } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { + *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { + pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { + *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { + pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { + *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { + pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { + return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[]) { + bcopy(hw_addr, sc->port[idx]->hw_addr, ETHER_ADDR_LEN); } static inline bool is_10G_port(const struct port_info *pi) { + return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0); } static inline int tx_resume_threshold(struct sge_eq *eq) { + return (eq->qsize / 4); } @@ -698,6 +740,7 @@ void t4_os_portmod_changed(const struct adapter *, int); void t4_os_link_changed(struct adapter *, int, int); void t4_iterate(void (*)(struct adapter *, void *), void *); int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t); +int t4_register_an_handler(struct adapter *, an_handler_t); /* 
t4_sge.c */ void t4_sge_modload(void); @@ -714,21 +757,45 @@ void t4_intr_all(void *); void t4_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); -int t4_mgmt_tx(struct adapter *, struct mbuf *); -int t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct mbuf *); +void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *); void t4_update_fl_bufsize(struct ifnet *); int can_resume_tx(struct sge_eq *); -static inline int t4_wrq_tx(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m) +static inline struct wrqe * +alloc_wrqe(int wr_len, struct sge_wrq *wrq) { - int rc; + int len = offsetof(struct wrqe, wr) + wr_len; + struct wrqe *wr; + + wr = malloc(len, M_CXGBE, M_NOWAIT); + if (__predict_false(wr == NULL)) + return (NULL); + wr->wr_len = wr_len; + wr->wrq = wrq; + return (wr); +} + +static inline void * +wrtod(struct wrqe *wr) +{ + return (&wr->wr[0]); +} + +static inline void +free_wrqe(struct wrqe *wr) +{ + free(wr, M_CXGBE); +} + +static inline void +t4_wrq_tx(struct adapter *sc, struct wrqe *wr) +{ + struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); - rc = t4_wrq_tx_locked(sc, wrq, m); + t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); - return (rc); } - #endif diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index 6f4dd8d..f629cbe 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -27,6 +27,8 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include "common.h" #include "t4_regs.h" #include "t4_regs_values.h" diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index f6ada9d..1ae9f1f 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -31,12 +31,6 @@ #ifndef __T4_OFFLOAD_H__ #define __T4_OFFLOAD_H__ -/* XXX: flagrant misuse of mbuf fields (during tx by TOM) */ -#define MBUF_EQ(m) (*((void **)(&(m)->m_pkthdr.rcvif))) -/* These have to work for !M_PKTHDR so we use 
a field from m_hdr. */ -#define MBUF_TX_CREDITS(m) ((m)->m_hdr.pad[0]) -#define MBUF_DMA_MAPPED(m) ((m)->m_hdr.pad[1]) - #define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ (w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ @@ -119,7 +113,7 @@ struct t4_virt_res { /* virtualized HW resources */ struct t4_range ocq; }; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD enum { ULD_TOM = 1, }; @@ -130,13 +124,8 @@ struct uld_info { SLIST_ENTRY(uld_info) link; int refcount; int uld_id; - int (*attach)(struct adapter *, void **); - int (*detach)(void *); -}; - -struct uld_softc { - struct uld_info *uld; - void *softc; + int (*activate)(struct adapter *); + int (*deactivate)(struct adapter *); }; struct tom_tunables { @@ -148,6 +137,8 @@ struct tom_tunables { int t4_register_uld(struct uld_info *); int t4_unregister_uld(struct uld_info *); +int t4_activate_uld(struct adapter *, int); +int t4_deactivate_uld(struct adapter *, int); #endif #endif diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 55491cd..8373c32 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 Chelsio Communications, Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,16 +38,7 @@ __FBSDID("$FreeBSD$"); #include <sys/rwlock.h> #include <sys/socket.h> #include <sys/sbuf.h> -#include <net/if.h> -#include <net/if_types.h> -#include <net/ethernet.h> -#include <net/if_vlan_var.h> -#include <net/if_dl.h> -#include <net/if_llatbl.h> -#include <net/route.h> #include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> #include "common/common.h" #include "common/jhash.h" @@ -72,42 +63,11 @@ __FBSDID("$FreeBSD$"); * lifetime of an L2T entry is fully contained in the lifetime of the TOE. 
*/ -/* identifies sync vs async L2T_WRITE_REQs */ -#define S_SYNC_WR 12 -#define V_SYNC_WR(x) ((x) << S_SYNC_WR) -#define F_SYNC_WR V_SYNC_WR(1) - -enum { - L2T_STATE_VALID, /* entry is up to date */ - L2T_STATE_STALE, /* entry may be used but needs revalidation */ - L2T_STATE_RESOLVING, /* entry needs address resolution */ - L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ - - /* when state is one of the below the entry is not hashed */ - L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ - L2T_STATE_UNUSED /* entry not in use */ -}; - -struct l2t_data { - struct rwlock lock; - volatile int nfree; /* number of free entries */ - struct l2t_entry *rover;/* starting point for next allocation */ - struct l2t_entry l2tab[L2T_SIZE]; -}; - -static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, - struct mbuf *); - -#define VLAN_NONE 0xfff -#define SA(x) ((struct sockaddr *)(x)) -#define SIN(x) ((struct sockaddr_in *)(x)) -#define SINADDR(x) (SIN(x)->sin_addr.s_addr) - /* * Allocate a free L2T entry. Must be called with l2t_data.lock held. */ -static struct l2t_entry * -alloc_l2e(struct l2t_data *d) +struct l2t_entry * +t4_alloc_l2e(struct l2t_data *d) { struct l2t_entry *end, *e, **p; @@ -121,7 +81,8 @@ alloc_l2e(struct l2t_data *d) if (atomic_load_acq_int(&e->refcnt) == 0) goto found; - for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ; + for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) + continue; found: d->rover = e + 1; atomic_subtract_int(&d->nfree, 1); @@ -148,19 +109,18 @@ found: * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. 
*/ -static int -write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) +int +t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) { - struct mbuf *m; + struct wrqe *wr; struct cpl_l2t_write_req *req; mtx_assert(&e->lock, MA_OWNED); - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - req = mtod(m, struct cpl_l2t_write_req *); - m->m_pkthdr.len = m->m_len = sizeof(*req); + req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx | @@ -170,7 +130,7 @@ write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); if (sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; @@ -189,7 +149,7 @@ t4_l2t_alloc_switching(struct l2t_data *d) struct l2t_entry *e; rw_rlock(&d->lock); - e = alloc_l2e(d); + e = t4_alloc_l2e(d); if (e) { mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ e->state = L2T_STATE_SWITCHING; @@ -214,7 +174,7 @@ t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan, e->lport = port; memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); mtx_lock(&e->lock); - rc = write_l2e(sc, e, 0); + rc = t4_write_l2e(sc, e, 0); mtx_unlock(&e->lock); return (rc); } @@ -234,10 +194,13 @@ t4_init_l2t(struct adapter *sc, int flags) rw_init(&d->lock, "L2T"); for (i = 0; i < L2T_SIZE; i++) { - d->l2tab[i].idx = i; - d->l2tab[i].state = L2T_STATE_UNUSED; - mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF); - atomic_store_rel_int(&d->l2tab[i].refcnt, 0); + struct l2t_entry *e = &d->l2tab[i]; + + e->idx = i; + e->state = L2T_STATE_UNUSED; + mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF); + STAILQ_INIT(&e->wr_list); + atomic_store_rel_int(&e->refcnt, 0); } sc->l2t = d; @@ -259,6 +222,24 @@ t4_free_l2t(struct l2t_data *d) return (0); } +int +do_l2t_write_rpl(struct sge_iq *iq, const 
struct rss_header *rss, + struct mbuf *m) +{ + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + + if (__predict_false(rpl->status != CPL_ERR_NONE)) { + log(LOG_ERR, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, idx); + return (EINVAL); + } + + return (0); +} + #ifdef SBUF_DRAIN static inline unsigned int vlan_prio(const struct l2t_entry *e) @@ -273,7 +254,7 @@ l2e_state(const struct l2t_entry *e) case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */ case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */ case L2T_STATE_SYNC_WRITE: return 'W'; - case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R'; + case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A'; case L2T_STATE_SWITCHING: return 'X'; default: return 'U'; } @@ -311,20 +292,20 @@ sysctl_l2t(SYSCTL_HANDLER_ARGS) "Ethernet address VLAN/P LP State Users Port"); header = 1; } - if (e->state == L2T_STATE_SWITCHING || e->v6) + if (e->state == L2T_STATE_SWITCHING) ip[0] = 0; else snprintf(ip, sizeof(ip), "%s", - inet_ntoa(*(struct in_addr *)&e->addr[0])); + inet_ntoa(*(struct in_addr *)&e->addr)); - /* XXX: accessing lle probably not safe? */ + /* XXX: e->ifp may not be around */ sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d" " %u %2u %c %5u %s", e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5], e->vlan & 0xfff, vlan_prio(e), e->lport, l2e_state(e), atomic_load_acq_int(&e->refcnt), - e->lle ? 
e->lle->lle_tbl->llt_ifp->if_xname : ""); + e->ifp->if_xname); skip: mtx_unlock(&e->lock); } @@ -335,459 +316,3 @@ skip: return (rc); } #endif - -#ifndef TCP_OFFLOAD_DISABLE -static inline void -l2t_hold(struct l2t_data *d, struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ - atomic_subtract_int(&d->nfree, 1); -} - -/* - * To avoid having to check address families we do not allow v4 and v6 - * neighbors to be on the same hash chain. We keep v4 entries in the first - * half of available hash buckets and v6 in the second. - */ -enum { - L2T_SZ_HALF = L2T_SIZE / 2, - L2T_HASH_MASK = L2T_SZ_HALF - 1 -}; - -static inline unsigned int -arp_hash(const uint32_t *key, int ifindex) -{ - return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK; -} - -static inline unsigned int -ipv6_hash(const uint32_t *key, int ifindex) -{ - uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3]; - - return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK); -} - -static inline unsigned int -addr_hash(const uint32_t *addr, int addr_len, int ifindex) -{ - return addr_len == 4 ? arp_hash(addr, ifindex) : - ipv6_hash(addr, ifindex); -} - -/* - * Checks if an L2T entry is for the given IP/IPv6 address. It does not check - * whether the L2T entry and the address are of the same address family. - * Callers ensure an address is only checked against L2T entries of the same - * family, something made trivial by the separation of IP and IPv6 hash chains - * mentioned above. Returns 0 if there's a match, - */ -static inline int -addreq(const struct l2t_entry *e, const uint32_t *addr) -{ - if (e->v6) - return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) | - (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]); - return e->addr[0] ^ addr[0]; -} - -/* - * Add a packet to an L2T entry's queue of packets awaiting resolution. - * Must be called with the entry's lock held. 
- */ -static inline void -arpq_enqueue(struct l2t_entry *e, struct mbuf *m) -{ - mtx_assert(&e->lock, MA_OWNED); - - KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__)); - if (e->arpq_head) - e->arpq_tail->m_nextpkt = m; - else - e->arpq_head = m; - e->arpq_tail = m; -} - -static inline void -send_pending(struct adapter *sc, struct l2t_entry *e) -{ - struct mbuf *m, *next; - - mtx_assert(&e->lock, MA_OWNED); - - for (m = e->arpq_head; m; m = next) { - next = m->m_nextpkt; - m->m_nextpkt = NULL; - t4_wrq_tx(sc, MBUF_EQ(m), m); - } - e->arpq_head = e->arpq_tail = NULL; -} - -#ifdef INET -/* - * Looks up and fills up an l2t_entry's lle. We grab all the locks that we need - * ourself, and update e->state at the end if e->lle was successfully filled. - * - * The lle passed in comes from arpresolve and is ignored as it does not appear - * to be of much use. - */ -static int -l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused) -{ - int rc = 0; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - struct llentry *lle; - - bzero(&sin, sizeof(struct sockaddr_in)); - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - - mtx_assert(&e->lock, MA_NOTOWNED); - KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__)); - - IF_AFDATA_LOCK(ifp); - lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin)); - IF_AFDATA_UNLOCK(ifp); - if (!LLE_IS_VALID(lle)) - return (ENOMEM); - if (!(lle->la_flags & LLE_VALID)) { - rc = EINVAL; - goto done; - } - - LLE_ADDREF(lle); - - mtx_lock(&e->lock); - if (e->state == L2T_STATE_RESOLVING) { - KASSERT(e->lle == NULL, ("%s: lle already valid", __func__)); - e->lle = lle; - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - KASSERT(e->lle == lle, ("%s: lle changed", __func__)); - LLE_REMREF(lle); - } - 
mtx_unlock(&e->lock); -done: - LLE_WUNLOCK(lle); - return (rc); -} -#endif - -int -t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e) -{ -#ifndef INET - return (EINVAL); -#else - struct llentry *lle = NULL; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - bzero(&sin, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - -again: - switch (e->state) { - case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - - /* Fall through */ - - case L2T_STATE_VALID: /* fast-path, send the packet on */ - return t4_wrq_tx(sc, MBUF_EQ(m), m); - - case L2T_STATE_RESOLVING: - case L2T_STATE_SYNC_WRITE: - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SYNC_WRITE && - e->state != L2T_STATE_RESOLVING) { - /* state changed by the time we got here */ - mtx_unlock(&e->lock); - goto again; - } - arpq_enqueue(e, m); - mtx_unlock(&e->lock); - - if (e->state == L2T_STATE_RESOLVING && - arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - } - - return (0); -#endif -} - -/* - * Called when an L2T entry has no more users. The entry is left in the hash - * table since it is likely to be reused but we also bump nfree to indicate - * that the entry can be reallocated for a different neighbor. We also drop - * the existing neighbor reference in case the neighbor is going away and is - * waiting on our reference. - * - * Because entries can be reallocated to other neighbors once their ref count - * drops to 0 we need to take the entry's lock to avoid races with a new - * incarnation. 
- */ -static void -t4_l2e_free(struct l2t_entry *e) -{ - struct llentry *lle = NULL; - struct l2t_data *d; - - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ - lle = e->lle; - e->lle = NULL; - /* - * Don't need to worry about the arpq, an L2T entry can't be - * released if any packets are waiting for resolution as we - * need to be able to communicate with the device to close a - * connection. - */ - } - mtx_unlock(&e->lock); - - d = container_of(e, struct l2t_data, l2tab[e->idx]); - atomic_add_int(&d->nfree, 1); - - if (lle) - LLE_FREE(lle); -} - -void -t4_l2t_release(struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, -1) == 1) - t4_l2e_free(e); -} - -static int -do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, - struct mbuf *m) -{ - struct adapter *sc = iq->adapter; - const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); - unsigned int tid = GET_TID(rpl); - unsigned int idx = tid & (L2T_SIZE - 1); - - if (__predict_false(rpl->status != CPL_ERR_NONE)) { - log(LOG_ERR, - "Unexpected L2T_WRITE_RPL status %u for entry %u\n", - rpl->status, idx); - return (EINVAL); - } - - if (tid & F_SYNC_WR) { - struct l2t_entry *e = &sc->l2t->l2tab[idx]; - - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SWITCHING) { - send_pending(sc, e); - e->state = L2T_STATE_VALID; - } - mtx_unlock(&e->lock); - } - - return (0); -} - -/* - * Reuse an L2T entry that was previously used for the same next hop. 
- */ -static void -reuse_entry(struct l2t_entry *e) -{ - struct llentry *lle; - - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - lle = e->lle; - if (lle) { - KASSERT(lle->la_flags & LLE_VALID, - ("%s: invalid lle stored in l2t_entry", __func__)); - - if (lle->la_expire >= time_uptime) - e->state = L2T_STATE_STALE; - else - e->state = L2T_STATE_VALID; - } else - e->state = L2T_STATE_RESOLVING; - mtx_unlock(&e->lock); -} - -/* - * The TOE wants an L2 table entry that it can use to reach the next hop over - * the specified port. Produce such an entry - create one if needed. - * - * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on - * top of the real cxgbe interface. - */ -struct l2t_entry * -t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) -{ - struct l2t_entry *e; - struct l2t_data *d = pi->adapter->l2t; - int addr_len; - uint32_t *addr; - int hash; - struct sockaddr_in6 *sin6; - unsigned int smt_idx = pi->port_id; - - if (sa->sa_family == AF_INET) { - addr = (uint32_t *)&SINADDR(sa); - addr_len = sizeof(SINADDR(sa)); - } else if (sa->sa_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)sa; - addr = (uint32_t *)&sin6->sin6_addr.s6_addr; - addr_len = sizeof(sin6->sin6_addr.s6_addr); - } else - return (NULL); - -#ifndef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - return (NULL); -#endif - - hash = addr_hash(addr, addr_len, ifp->if_index); - - rw_wlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){ - l2t_hold(d, e); - if (atomic_load_acq_int(&e->refcnt) == 1) - reuse_entry(e); - goto done; - } - } - - /* Need to allocate a new entry */ - e = alloc_l2e(d); - if (e) { - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - e->state = L2T_STATE_RESOLVING; - memcpy(e->addr, addr, addr_len); - e->ifindex = ifp->if_index; - e->smt_idx = smt_idx; - e->ifp = ifp; - e->hash = hash; - e->lport = pi->lport; - e->v6 = (addr_len == 16); 
- e->lle = NULL; - atomic_store_rel_int(&e->refcnt, 1); -#ifdef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - VLAN_TAG(ifp, &e->vlan); - else - e->vlan = VLAN_NONE; -#endif - e->next = d->l2tab[hash].first; - d->l2tab[hash].first = e; - mtx_unlock(&e->lock); - } -done: - rw_wunlock(&d->lock); - return e; -} - -/* - * Called when the host's neighbor layer makes a change to some entry that is - * loaded into the HW L2 table. - */ -void -t4_l2t_update(struct adapter *sc, struct llentry *lle) -{ - struct l2t_entry *e; - struct l2t_data *d = sc->l2t; - struct sockaddr *sa = L3_ADDR(lle); - struct llentry *old_lle = NULL; - uint32_t *addr = (uint32_t *)&SINADDR(sa); - struct ifnet *ifp = lle->lle_tbl->llt_ifp; - int hash = addr_hash(addr, sizeof(*addr), ifp->if_index); - - KASSERT(d != NULL, ("%s: no L2 table", __func__)); - LLE_WLOCK_ASSERT(lle); - KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED, - ("%s: entry neither valid nor deleted.", __func__)); - - rw_rlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp) { - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt)) - goto found; - e->state = L2T_STATE_STALE; - mtx_unlock(&e->lock); - break; - } - } - rw_runlock(&d->lock); - - /* The TOE has no interest in this LLE */ - return; - - found: - rw_runlock(&d->lock); - - if (atomic_load_acq_int(&e->refcnt)) { - - /* Entry is referenced by at least 1 offloaded connection. 
*/ - - /* Handle deletes first */ - if (lle->la_flags & LLE_DELETED) { - if (lle == e->lle) { - e->lle = NULL; - e->state = L2T_STATE_RESOLVING; - LLE_REMREF(lle); - } - goto done; - } - - if (lle != e->lle) { - old_lle = e->lle; - LLE_ADDREF(lle); - e->lle = lle; - } - - if (e->state == L2T_STATE_RESOLVING || - memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) { - - /* unresolved -> resolved; or dmac changed */ - - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - - /* +ve reinforcement of a valid or stale entry */ - - } - - e->state = L2T_STATE_VALID; - - } else { - /* - * Entry was used previously but is unreferenced right now. - * e->lle has been released and NULL'd out by t4_l2t_free, or - * l2t_release is about to call t4_l2t_free and do that. - * - * Either way this is of no interest to us. - */ - } - -done: - mtx_unlock(&e->lock); - if (old_lle) - LLE_FREE(old_lle); -} - -#endif diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 5dfce83..0303885 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -30,8 +30,25 @@ #ifndef __T4_L2T_H #define __T4_L2T_H +/* identifies sync vs async L2T_WRITE_REQs */ +#define S_SYNC_WR 12 +#define V_SYNC_WR(x) ((x) << S_SYNC_WR) +#define F_SYNC_WR V_SYNC_WR(1) + enum { L2T_SIZE = 4096 }; /* # of L2T entries */ +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_FAILED, /* failed to resolve */ + L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ + + /* when state is one of the below the entry is not hashed */ + L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ + L2T_STATE_UNUSED /* entry not in use */ +}; + /* * Each L2T entry plays multiple roles. 
First of all, it keeps state for the * corresponding entry of the HW L2 table and maintains a queue of offload @@ -43,39 +60,49 @@ enum { L2T_SIZE = 4096 }; /* # of L2T entries */ struct l2t_entry { uint16_t state; /* entry state */ uint16_t idx; /* entry index */ - uint32_t addr[4]; /* next hop IP or IPv6 address */ + uint32_t addr; /* next hop IP address */ struct ifnet *ifp; /* outgoing interface */ uint16_t smt_idx; /* SMT index */ uint16_t vlan; /* VLAN TCI (id: 0-11, prio: 13-15) */ - int ifindex; /* interface index */ - struct llentry *lle; /* llentry for next hop */ struct l2t_entry *first; /* start of hash chain */ struct l2t_entry *next; /* next l2t_entry on chain */ - struct mbuf *arpq_head; /* list of mbufs awaiting resolution */ - struct mbuf *arpq_tail; + STAILQ_HEAD(, wrqe) wr_list; /* list of WRs awaiting resolution */ struct mtx lock; volatile int refcnt; /* entry reference count */ uint16_t hash; /* hash bucket the entry is on */ - uint8_t v6; /* whether entry is for IPv6 */ uint8_t lport; /* associated offload logical port */ uint8_t dmac[ETHER_ADDR_LEN]; /* next hop's MAC address */ }; +struct l2t_data { + struct rwlock lock; + volatile int nfree; /* number of free entries */ + struct l2t_entry *rover;/* starting point for next allocation */ + struct l2t_entry l2tab[L2T_SIZE]; +}; + + int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); +struct l2t_entry *t4_alloc_l2e(struct l2t_data *); struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); -void t4_l2t_release(struct l2t_entry *); +int t4_write_l2e(struct adapter *, struct l2t_entry *, int); +int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); + +static inline void +t4_l2t_release(struct l2t_entry *e) +{ + struct l2t_data *d = container_of(e, struct l2t_data, l2tab[e->idx]); + + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) + 
atomic_add_int(&d->nfree, 1); +} + + #ifdef SBUF_DRAIN int sysctl_l2t(SYSCTL_HANDLER_ARGS); #endif -#ifndef TCP_OFFLOAD_DISABLE -struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *, - struct sockaddr *); -int t4_l2t_send(struct adapter *, struct mbuf *, struct l2t_entry *); -void t4_l2t_update(struct adapter *, struct llentry *); -#endif - #endif /* __T4_L2T_H */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 874a6ad..a91363b 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -119,9 +119,13 @@ static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4 Ethernet driver and services"); +/* + * Correct lock order when you need to acquire multiple locks is t4_list_lock, + * then ADAPTER_LOCK, then t4_uld_list_lock. + */ static struct mtx t4_list_lock; static SLIST_HEAD(, adapter) t4_list; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static struct mtx t4_uld_list_lock; static SLIST_HEAD(, uld_info) t4_uld_list; #endif @@ -149,7 +153,7 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); @@ -237,7 +241,7 @@ struct intrs_and_queues { int nrxq10g; /* # of NIC rxq's for each 10G port */ int ntxq1g; /* # of NIC txq's for each 1G port */ int nrxq1g; /* # of NIC rxq's for each 1G port */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq10g; /* # of TOE txq's for each 10G port */ int nofldrxq10g; /* # of TOE rxq's for each 10G port */ int nofldtxq1g; /* # of TOE txq's for each 1G port */ @@ -297,8 +301,10 @@ static void reg_block_dump(struct adapter *, uint8_t *, unsigned int, unsigned int); static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void cxgbe_tick(void *); +static void cxgbe_vlan_config(void *, 
struct ifnet *, uint16_t); static int cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); +static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *); static int t4_sysctls(struct adapter *); static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); @@ -342,10 +348,8 @@ static int filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int read_card_mem(struct adapter *, struct t4_mem_range *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); -static int activate_uld(struct adapter *, int, struct uld_softc *); -static int deactivate_uld(struct uld_softc *); #endif static int t4_mod_event(module_t, int, void *); @@ -368,8 +372,12 @@ struct t4_pciids { {0x440a, 4, "Chelsio T404-BT"}, }; -#ifndef TCP_OFFLOAD_DISABLE -/* This is used in service_iq() to get to the fl associated with an iq. */ +#ifdef TCP_OFFLOAD +/* + * service_iq() has an iq and needs the fl. Offset of fl from the iq should be + * exactly the same for both rxq and ofld_rxq. 
+ */ +CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif @@ -401,7 +409,7 @@ t4_attach(device_t dev) int rc = 0, i, n10g, n1g, rqidx, tqidx; struct intrs_and_queues iaq; struct sge *s; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif @@ -436,6 +444,7 @@ t4_attach(device_t dev) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); + sc->an_handler = an_not_handled; for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++) sc->cpl_handler[i] = cpl_not_handled; t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl); @@ -595,7 +604,7 @@ t4_attach(device_t dev) s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; @@ -631,7 +640,7 @@ t4_attach(device_t dev) * tx queues that each port should get. 
*/ rqidx = tqidx = 0; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif for_each_port(sc, i) { @@ -653,7 +662,7 @@ t4_attach(device_t dev) rqidx += pi->nrxq; tqidx += pi->ntxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; pi->first_ofld_txq = ofld_tqidx; @@ -761,7 +770,7 @@ t4_detach(device_t dev) if (sc->l2t) t4_free_l2t(sc->l2t); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif @@ -832,7 +841,7 @@ cxgbe_attach(device_t dev) ifp->if_qflush = cxgbe_qflush; ifp->if_capabilities = T4_CAP; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) ifp->if_capabilities |= IFCAP_TOE4; #endif @@ -844,9 +853,12 @@ cxgbe_attach(device_t dev) cxgbe_media_status); build_medialist(pi); + pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, + EVENTHANDLER_PRI_ANY); + ether_ifattach(ifp, pi->hw_addr); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { device_printf(dev, "%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n", @@ -876,6 +888,9 @@ cxgbe_detach(device_t dev) SET_BUSY(sc); ADAPTER_UNLOCK(sc); + if (pi->vlan_c) + EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c); + PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&pi->tick); @@ -1042,7 +1057,7 @@ fail: } #endif } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; @@ -1292,7 +1307,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; @@ -1364,7 +1379,7 @@ restart: n++; } iaq->nrxq10g = min(n, nrxq10g); -#ifndef TCP_OFFLOAD_DISABLE 
+#ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif } @@ -1379,7 +1394,7 @@ restart: n++; } iaq->nrxq1g = min(n, nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif } @@ -1392,7 +1407,7 @@ restart: * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif @@ -2305,7 +2320,7 @@ adapter_full_init(struct adapter *sc) struct irq *irq; struct port_info *pi; struct sge_rxq *rxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif @@ -2369,7 +2384,7 @@ adapter_full_init(struct adapter *sc) for_each_port(sc, p) { pi = sc->port[p]; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the NIC queues if they aren't taking direct * interrupts. @@ -2386,7 +2401,7 @@ adapter_full_init(struct adapter *sc) rid++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the offload queues if they aren't taking * direct interrupts. 
@@ -2494,7 +2509,7 @@ port_full_uninit(struct port_info *pi) int i; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -2507,7 +2522,7 @@ port_full_uninit(struct port_info *pi) quiesce_eq(sc, &txq->eq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { quiesce_eq(sc, &ofld_txq->eq); } @@ -2518,7 +2533,7 @@ port_full_uninit(struct port_info *pi) quiesce_fl(sc, &rxq->fl); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); @@ -2892,14 +2907,27 @@ cxgbe_tick(void *arg) PORT_UNLOCK(pi); } +static void +cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) +{ + struct ifnet *vlan; + + if (arg != ifp) + return; + + vlan = VLAN_DEVAT(ifp, vid); + VLAN_SETCOOKIE(vlan, ifp); +} + static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { + #ifdef INVARIANTS - panic("%s: opcode %02x on iq %p with payload %p", + panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else - log(LOG_ERR, "%s: opcode %02x on iq %p with payload %p", + log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); m_freem(m); #endif @@ -2922,6 +2950,31 @@ t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) } static int +an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) +{ + +#ifdef INVARIANTS + panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); +#else + log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)", + __func__, iq, ctrl); +#endif + return (EDOOFUS); +} + +int +t4_register_an_handler(struct adapter *sc, an_handler_t h) +{ + uintptr_t *loc, new; + + new = h ? 
(uintptr_t)h : (uintptr_t)an_not_handled; + loc = (uintptr_t *) &sc->an_handler; + atomic_store_rel_ptr(loc, new); + + return (0); +} + +static int t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; @@ -3072,7 +3125,7 @@ t4_sysctls(struct adapter *sc) sysctl_tx_rate, "A", "Tx rate"); #endif -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. @@ -3125,7 +3178,7 @@ cxgbe_sysctls(struct port_info *pi) SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &pi->nofldrxq, 0, @@ -4543,7 +4596,7 @@ set_filter_mode(struct adapter *sc, uint32_t mode) goto done; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->offload_map) { rc = EBUSY; goto done; @@ -4734,7 +4787,7 @@ static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4755,12 +4808,11 @@ set_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); @@ -4830,7 +4882,7 @@ set_filter_wr(struct adapter *sc, int fidx) f->pending = 1; sc->tids.ftids_in_use++; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -4838,7 +4890,7 @@ static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4846,18 +4898,16 @@ del_filter_wr(struct adapter *sc, int fidx) ftid = 
sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -5215,7 +5265,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *pi, int enable) { @@ -5228,13 +5278,28 @@ toe_capability(struct port_info *pi, int enable) return (ENODEV); if (enable) { + if (!(sc->flags & FULL_INIT_DONE)) { + log(LOG_WARNING, + "You must enable a cxgbe interface first\n"); + return (EAGAIN); + } + if (isset(&sc->offload_map, pi->port_id)) return (0); - if (sc->offload_map == 0) { - rc = activate_uld(sc, ULD_TOM, &sc->tom); + if (!(sc->flags & TOM_INIT_DONE)) { + rc = t4_activate_uld(sc, ULD_TOM); + if (rc == EAGAIN) { + log(LOG_WARNING, + "You must kldload t4_tom.ko before trying " + "to enable TOE on a cxgbe interface.\n"); + } if (rc != 0) return (rc); + KASSERT(sc->tom_softc != NULL, + ("%s: TOM activated but softc NULL", __func__)); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); @@ -5242,15 +5307,9 @@ toe_capability(struct port_info *pi, int enable) if (!isset(&sc->offload_map, pi->port_id)) return (0); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); - - if (sc->offload_map == 0) { - rc = deactivate_uld(&sc->tom); - if (rc != 0) { - setbit(&sc->offload_map, pi->port_id); - return (rc); - } - } } return (0); @@ -5305,8 +5364,8 @@ done: return (rc); } -static int -activate_uld(struct adapter *sc, int id, struct uld_softc *usc) +int 
+t4_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; @@ -5315,13 +5374,9 @@ activate_uld(struct adapter *sc, int id, struct uld_softc *usc) SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { - rc = ui->attach(sc, &usc->softc); - if (rc == 0) { - KASSERT(usc->softc != NULL, - ("%s: ULD %d has no state", __func__, id)); + rc = ui->activate(sc); + if (rc == 0) ui->refcount++; - usc->uld = ui; - } goto done; } } @@ -5331,25 +5386,21 @@ done: return (rc); } -static int -deactivate_uld(struct uld_softc *usc) +int +t4_deactivate_uld(struct adapter *sc, int id) { - int rc; + int rc = EINVAL; + struct uld_info *ui; mtx_lock(&t4_uld_list_lock); - if (usc->uld == NULL || usc->softc == NULL) { - rc = EINVAL; - goto done; - } - - rc = usc->uld->detach(usc->softc); - if (rc == 0) { - KASSERT(usc->uld->refcount > 0, - ("%s: ULD has bad refcount", __func__)); - usc->uld->refcount--; - usc->uld = NULL; - usc->softc = NULL; + SLIST_FOREACH(ui, &t4_uld_list, link) { + if (ui->uld_id == id) { + rc = ui->deactivate(sc); + if (rc == 0) + ui->refcount--; + goto done; + } } done: mtx_unlock(&t4_uld_list_lock); @@ -5379,7 +5430,7 @@ tweak_tunables(void) if (t4_nrxq1g < 1) t4_nrxq1g = min(nc, NRXQ_1G); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); @@ -5426,7 +5477,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) t4_sge_modload(); mtx_init(&t4_list_lock, "T4 adapters", 0, MTX_DEF); SLIST_INIT(&t4_list); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_init(&t4_uld_list_lock, "T4 ULDs", 0, MTX_DEF); SLIST_INIT(&t4_uld_list); #endif @@ -5434,7 +5485,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) break; case MOD_UNLOAD: -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_lock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 8f39f10..92c9212 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ 
b/sys/dev/cxgbe/t4_sge.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/kernel.h> +#include <sys/kdb.h> #include <sys/malloc.h> #include <sys/queue.h> #include <sys/taskqueue.h> @@ -51,7 +52,6 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" -#include "t4_l2t.h" struct fl_buf_info { int size; @@ -115,14 +115,14 @@ static int free_mgmtq(struct adapter *); static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct port_info *, struct sge_rxq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *); @@ -397,7 +397,7 @@ first_vector(struct port_info *pi) if (i == pi->port_id) break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) rc += pi->nrxq + pi->nofldrxq; else @@ -434,7 +434,7 @@ port_intr_iq(struct port_info *pi, int idx) if (sc->intr_count == 1) return (&sc->sge.fwq); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) { idx %= pi->nrxq + pi->nofldrxq; @@ -475,19 +475,20 @@ t4_setup_port_queues(struct port_info *pi) struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; + struct sysctl_oid *oid2 = NULL; #endif char name[16]; struct adapter *sc = pi->adapter; - struct sysctl_oid *oid = 
device_get_sysctl_tree(pi->dev), *oid2 = NULL; + struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, @@ -515,7 +516,7 @@ t4_setup_port_queues(struct port_info *pi) init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name); if (sc->flags & INTR_DIRECT -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq) #endif ) { @@ -527,7 +528,7 @@ t4_setup_port_queues(struct port_info *pi) } } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { snprintf(name, sizeof(name), "%s ofld_rxq%d-iq", @@ -567,7 +568,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) continue; @@ -603,7 +604,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(pi, i, ofld_txq) { @@ -655,7 +656,7 @@ t4_teardown_port_queues(struct port_info *pi) struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -677,7 +678,7 @@ t4_teardown_port_queues(struct port_info *pi) free_txq(pi, txq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { free_wrq(sc, ofld_txq); } @@ -693,7 +694,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if 
((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(pi, ofld_rxq); @@ -709,7 +710,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(pi, ofld_rxq); @@ -775,7 +776,7 @@ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; - struct sge_rxq *rxq = (void *)iq; /* Use iff iq is part of rxq */ + struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct rsp_ctrl *ctrl; @@ -862,7 +863,8 @@ service_iq(struct sge_iq *iq, int budget) break; default: - panic("%s: rsp_type %u", __func__, rsp_type); + sc->an_handler(iq, ctrl); + break; } iq_next(iq); @@ -1076,42 +1078,33 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) return (0); } -int -t4_mgmt_tx(struct adapter *sc, struct mbuf *m) -{ - return t4_wrq_tx(sc, &sc->sge.mgmtq, m); -} - /* * Doesn't fail. Holds on to work requests it can't send right away. 
*/ -int -t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) +void +t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { struct sge_eq *eq = &wrq->eq; int can_reclaim; caddr_t dst; - struct mbuf *wr, *next; TXQ_LOCK_ASSERT_OWNED(wrq); +#ifdef TCP_OFFLOAD KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || (eq->flags & EQ_TYPEMASK) == EQ_CTRL, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#else + KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, + ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#endif - if (__predict_true(m0 != NULL)) { - if (wrq->head) - wrq->tail->m_nextpkt = m0; - else - wrq->head = m0; - while (m0->m_nextpkt) - m0 = m0->m_nextpkt; - wrq->tail = m0; - } + if (__predict_true(wr != NULL)) + STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); can_reclaim = reclaimable(eq); if (__predict_false(eq->flags & EQ_STALLED)) { if (can_reclaim < tx_resume_threshold(eq)) - return (0); + return; eq->flags &= ~EQ_STALLED; eq->unstalled++; } @@ -1120,39 +1113,34 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; - for (wr = wrq->head; wr; wr = next) { + while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { int ndesc; - struct mbuf *m; - next = wr->m_nextpkt; - wr->m_nextpkt = NULL; + if (__predict_false(wr->wr_len < 0 || + wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { - M_ASSERTPKTHDR(wr); - KASSERT(wr->m_pkthdr.len > 0 && (wr->m_pkthdr.len & 0x7) == 0, - ("%s: work request len %d.", __func__, wr->m_pkthdr.len)); - - if (wr->m_pkthdr.len > SGE_MAX_WR_LEN) { #ifdef INVARIANTS - panic("%s: oversized work request", __func__); -#else - log(LOG_ERR, "%s: %s work request too long (%d)", - device_get_nameunit(sc->dev), __func__, - wr->m_pkthdr.len); - m_freem(wr); - continue; + panic("%s: work request with length %d", __func__, + wr->wr_len); #endif +#ifdef KDB + kdb_backtrace(); +#endif + log(LOG_ERR, "%s: %s work request 
with length %d", + device_get_nameunit(sc->dev), __func__, wr->wr_len); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); + continue; } - ndesc = howmany(wr->m_pkthdr.len, EQ_ESIZE); + ndesc = howmany(wr->wr_len, EQ_ESIZE); if (eq->avail < ndesc) { - wr->m_nextpkt = next; wrq->no_desc++; break; } dst = (void *)&eq->desc[eq->pidx]; - for (m = wr; m; m = m->m_next) - copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); + copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); eq->pidx += ndesc; eq->avail -= ndesc; @@ -1164,7 +1152,8 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) ring_eq_db(sc, eq); wrq->tx_wrs++; - m_freem(wr); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); if (eq->avail < 8) { can_reclaim = reclaimable(eq); @@ -1178,20 +1167,11 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (eq->pending) ring_eq_db(sc, eq); - if (wr == NULL) - wrq->head = wrq->tail = NULL; - else { - wrq->head = wr; - - KASSERT(wrq->tail->m_nextpkt == NULL, - ("%s: wrq->tail grew a tail of its own", __func__)); - + if (wr != NULL) { eq->flags |= EQ_STALLED; if (callout_pending(&eq->tx_callout) == 0) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); } - - return (0); } /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */ @@ -1792,6 +1772,7 @@ alloc_mgmtq(struct adapter *sc) static int free_mgmtq(struct adapter *sc) { + return free_wrq(sc, &sc->sge.mgmtq); } @@ -1885,7 +1866,7 @@ free_rxq(struct port_info *pi, struct sge_rxq *rxq) return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) @@ -2031,7 +2012,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { @@ -2103,7 +2084,7 
@@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) rc = eth_eq_alloc(sc, pi, eq); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, pi, eq); break; @@ -2141,7 +2122,7 @@ free_eq(struct adapter *sc, struct sge_eq *eq) eq->cntxt_id); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); @@ -2183,6 +2164,7 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, return (rc); wrq->adapter = sc; + STAILQ_INIT(&wrq->wr_list); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); @@ -3179,7 +3161,7 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { - if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) { + if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { bcopy(from, *to, len); (*to) += len; } else { diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c new file mode 100644 index 0000000..bc59171 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +/* atid services */ +static int alloc_atid(struct adapter *, void *); +static void *lookup_atid(struct adapter *, int); +static void free_atid(struct adapter *, int); + +static int +alloc_atid(struct adapter *sc, void *ctx) +{ + struct tid_info *t = &sc->tids; + int atid = -1; + + mtx_lock(&t->atid_lock); + if (t->afree) { + union aopen_entry *p = t->afree; + + atid = p - t->atid_tab; + t->afree = p->next; + p->data = ctx; 
+ t->atids_in_use++; + } + mtx_unlock(&t->atid_lock); + return (atid); +} + +static void * +lookup_atid(struct adapter *sc, int atid) +{ + struct tid_info *t = &sc->tids; + + return (t->atid_tab[atid].data); +} + +static void +free_atid(struct adapter *sc, int atid) +{ + struct tid_info *t = &sc->tids; + union aopen_entry *p = &t->atid_tab[atid]; + + mtx_lock(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + mtx_unlock(&t->atid_lock); +} + +/* + * Active open failed. + */ +static int +do_act_establish(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_act_establish *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + unsigned int atid = G_TID_TID(ntohl(cpl->tos_atid)); + struct toepcb *toep = lookup_atid(sc, atid); + struct inpcb *inp = toep->inp; + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); + free_atid(sc, atid); + + INP_WLOCK(inp); + toep->tid = tid; + insert_tid(sc, tid, toep); + if (inp->inp_flags & INP_DROPPED) { + + /* socket closed by the kernel before hw told us it connected */ + + send_flowc_wr(toep, NULL); + send_reset(sc, toep, be32toh(cpl->snd_isn)); + goto done; + } + + make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); +done: + INP_WUNLOCK(inp); + return (0); +} + +static inline int +act_open_has_tid(unsigned int status) +{ + + return (status != CPL_ERR_TCAM_FULL && + status != CPL_ERR_TCAM_PARITY && + status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS); +} + +/* + * Convert an ACT_OPEN_RPL status to an errno. 
+ */ +static inline int +act_open_rpl_status_to_errno(int status) +{ + + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static int +do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); + unsigned int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); + unsigned int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); + struct toepcb *toep = lookup_atid(sc, atid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp = intotcpcb(inp); + struct toedev *tod = &toep->td->tod; + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); + + /* Ignore negative advice */ + if (status == CPL_ERR_RTX_NEG_ADVICE) + return (0); + + free_atid(sc, atid); + toep->tid = -1; + + if (status && act_open_has_tid(status)) + release_tid(sc, GET_TID(cpl), toep->ctrlq); + + if (status == CPL_ERR_TCAM_FULL) { + INP_WLOCK(inp); + toe_connect_failed(tod, tp, EAGAIN); + final_cpl_received(toep); /* unlocks inp */ + } else { + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(status)); + final_cpl_received(toep); /* unlocks inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); + } + + return (0); +} + +/* + * Options2 for active open. 
+ */ +static uint32_t +calc_opt2a(struct socket *so) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct port_info *pi = toep->port; + struct adapter *sc = pi->adapter; + uint32_t opt2 = 0; + + if (tp->t_flags & TF_SACK_PERMIT) + opt2 |= F_SACK_EN; + + if (tp->t_flags & TF_REQ_TSTMP) + opt2 |= F_TSTAMPS_EN; + + if (tp->t_flags & TF_REQ_SCALE) + opt2 |= F_WND_SCALE_EN; + + if (V_tcp_do_ecn) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); + + return (htobe32(opt2)); +} + + +void +t4_init_connect_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); + t4_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); +} + +/* + * active open (soconnect). + * + * State of affairs on entry: + * soisconnecting (so_state |= SS_ISCONNECTING) + * tcbinfo not locked (This has changed - used to be WLOCKed) + * inp WLOCKed + * tp->t_state = TCPS_SYN_SENT + * rtalloc1, RT_UNLOCK on rt. 
+ */ +int +t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, + struct sockaddr *nam) +{ + struct adapter *sc = tod->tod_softc; + struct toepcb *toep = NULL; + struct wrqe *wr = NULL; + struct cpl_act_open_req *cpl; + struct l2t_entry *e = NULL; + struct ifnet *rt_ifp = rt->rt_ifp; + struct port_info *pi; + int atid = -1, mtu_idx, rscale, qid_atid, rc = ENOMEM; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + + INP_WLOCK_ASSERT(inp); + + if (nam->sa_family != AF_INET) + CXGBE_UNIMPLEMENTED("IPv6 connect"); + + if (rt_ifp->if_type == IFT_ETHER) + pi = rt_ifp->if_softc; + else if (rt_ifp->if_type == IFT_L2VLAN) { + struct ifnet *ifp = VLAN_COOKIE(rt_ifp); + + pi = ifp->if_softc; + } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) + return (ENOSYS); /* XXX: implement lagg support */ + else + return (ENOTSUP); + + toep = alloc_toepcb(pi, -1, -1, M_NOWAIT); + if (toep == NULL) + goto failed; + + atid = alloc_atid(sc, toep); + if (atid < 0) + goto failed; + + e = t4_l2t_get(pi, rt_ifp, + rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); + if (e == NULL) + goto failed; + + wr = alloc_wrqe(sizeof(*cpl), toep->ctrlq); + if (wr == NULL) + goto failed; + cpl = wrtod(wr); + + toep->tid = atid; + toep->l2te = e; + toep->ulp_mode = ULP_MODE_NONE; + SOCKBUF_LOCK(&so->so_rcv); + /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ + toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + SOCKBUF_UNLOCK(&so->so_rcv); + + offload_socket(so, toep); + + /* + * The kernel sets request_r_scale based on sb_max whereas we need to + * take hardware's MAX_RCV_WND into account too. This is normally a + * no-op as MAX_RCV_WND is much larger than the default sb_max. 
+ */ + if (tp->t_flags & TF_REQ_SCALE) + rscale = tp->request_r_scale = select_rcv_wscale(); + else + rscale = 0; + mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); + qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | atid; + + INIT_TP_WR(cpl, 0); + OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); + inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, + &cpl->peer_port); + cpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, toep->rx_credits, + toep->ulp_mode); + cpl->params = select_ntuple(pi, e, sc->filter_mode); + cpl->opt2 = calc_opt2a(so); + + CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, + toep->tid, tcpstates[tp->t_state], toep, inp); + + rc = t4_l2t_send(sc, wr, e); + if (rc == 0) { + toepcb_set_flag(toep, TPF_CPL_PENDING); + return (0); + } + + undo_offload_socket(so); +failed: + CTR5(KTR_CXGBE, "%s: FAILED, atid %d, toep %p, l2te %p, wr %p", + __func__, atid, toep, e, wr); + + if (e) + t4_l2t_release(e); + if (wr) + free_wrqe(wr); + if (atid >= 0) + free_atid(sc, atid); + if (toep) + free_toepcb(toep); + + return (rc); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c new file mode 100644 index 0000000..161fc12 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -0,0 +1,1276 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sglist.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +VNET_DECLARE(int, tcp_do_autosndbuf); +#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) +VNET_DECLARE(int, tcp_autosndbuf_inc); +#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) +VNET_DECLARE(int, tcp_autosndbuf_max); +#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +VNET_DECLARE(int, tcp_do_autorcvbuf); +#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) +VNET_DECLARE(int, tcp_autorcvbuf_inc); +#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) +VNET_DECLARE(int, 
tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

/*
 * Send a FLOWC work request to program the connection's flow context in the
 * hardware.  This is the first WR on a new tid (TPF_FLOWC_WR_SENT guards
 * against a repeat).  With ftxp the WR carries 8 parameters (adds snd_nxt,
 * rcv_nxt, send buffer size, MSS); without it only the 4 basic ones (pf/vf,
 * channel, port, rx queue id).  Consumes tx credits and one txsd slot.
 */
void
send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	unsigned int nparams = ftxp ? 8 : 4, flowclen;
	struct port_info *pi = toep->port;
	struct adapter *sc = pi->adapter;
	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

	CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id);
	if (ftxp) {
		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);

		flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
		flowc->mnemval[4].val = htobe32(ftxp->snd_nxt);
		flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
		flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt);
		flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
		flowc->mnemval[6].val = htobe32(sndbuf);
		flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
		flowc->mnemval[7].val = htobe32(ftxp->mss);
	}

	/* Account for the credits and the txsd slot this WR consumes. */
	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	toepcb_set_flag(toep, TPF_FLOWC_WR_SENT);
	t4_wrq_tx(sc, wr);
}

/*
 * Abort the connection with a CPL_ABORT_REQ (hardware sends an RST).  No-op
 * if an abort is already in progress; otherwise TPF_ABORT_SHUTDOWN is set
 * here.  snd_nxt is used only when the inp has been dropped (the tcpcb can no
 * longer be trusted); otherwise tp->snd_nxt is used.
 */
void
send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
{
	struct wrqe *wr;
	struct cpl_abort_req *req;
	int tid = toep->tid;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */

	INP_WLOCK_ASSERT(inp);

	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
	    __func__, toep->tid,
	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
	    tcpstates[tp->t_state],
	    toep->flags, inp->inp_flags,
	    toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ?
	    " (abort already in progress)" : "");

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		return;	/* abort already in progress */

	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);

	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
	if (inp->inp_flags & INP_DROPPED)
		req->rsvd0 = htobe32(snd_nxt);
	else
		req->rsvd0 = htobe32(tp->snd_nxt);
	req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT);
	req->cmd = CPL_ABORT_SEND_RST;

	/*
	 * XXX: What's the correct way to tell that the inp hasn't been detached
	 * from its socket?  Should I even be flushing the snd buffer here?
	 */
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		struct socket *so = inp->inp_socket;

		if (so != NULL)	/* because I'm not sure.  See comment above */
			sbflush(&so->so_snd);
	}

	t4_l2t_send(sc, wr, toep->l2te);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* 40: presumably fixed IPv4 (20) + TCP (20) headers — TODO confirm */
	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCPS_ESTABLISHED.
 *
 * The ISNs are from after the exchange of SYNs.  i.e., the true ISN + 1.
 */
void
make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn,
    uint16_t opt)
{
	struct inpcb *inp = toep->inp;
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	long bufsize;
	uint32_t iss = be32toh(snd_isn) - 1;	/* true ISS */
	uint32_t irs = be32toh(rcv_isn) - 1;	/* true IRS */
	uint16_t tcpopt = be16toh(opt);
	struct flowc_tx_params ftxp;

	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state == TCPS_SYN_SENT ||
	    tp->t_state == TCPS_SYN_RECEIVED,
	    ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));

	CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p",
	    __func__, toep->tid, toep, inp);

	tp->t_state = TCPS_ESTABLISHED;
	tp->t_starttime = ticks;
	TCPSTAT_INC(tcps_connects);

	/* Initialize the receive side from the true IRS. */
	tp->irs = irs;
	tcp_rcvseqinit(tp);
	tp->rcv_wnd = toep->rx_credits << 10;
	tp->rcv_adv += tp->rcv_wnd;
	tp->last_ack_sent = tp->rcv_nxt;

	/*
	 * If we were unable to send all rx credits via opt0, save the remainder
	 * in rx_credits so that they can be handed over with the next credit
	 * update.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	bufsize = select_rcv_wnd(so);
	SOCKBUF_UNLOCK(&so->so_rcv);
	toep->rx_credits = bufsize - tp->rcv_wnd;

	/* Initialize the send side from the true ISS. */
	tp->iss = iss;
	tcp_sendseqinit(tp);
	tp->snd_una = iss + 1;
	tp->snd_nxt = iss + 1;
	tp->snd_max = iss + 1;

	assign_rxopt(tp, tcpopt);

	/* Pick the send buffer size advertised to the hardware. */
	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
		bufsize = V_tcp_autosndbuf_max;
	else
		bufsize = sbspace(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);

	ftxp.snd_nxt = tp->snd_nxt;
	ftxp.rcv_nxt = tp->rcv_nxt;
	ftxp.snd_space = bufsize;
	ftxp.mss = tp->t_maxseg;
	send_flowc_wr(toep, &ftxp);

	soisconnected(so);
}

/*
 * Return rx credits to the hardware with a CPL_RX_DATA_ACK.  Returns the
 * number of credits actually sent, or 0 if the work request could not be
 * allocated.
 */
static int
send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
{
	struct wrqe *wr;
	struct cpl_rx_data_ack *req;
	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);

	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
	if (wr == NULL)
		return (0);
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
	req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));

	t4_wrq_tx(sc, wr);
	return (credits);
}

/*
 * Called by the kernel when the application has read data out of the socket's
 * receive buffer.  Space freed in the buffer is converted to rx credits and
 * returned to the hardware once enough has accumulated.
 */
void
t4_rcvd(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct sockbuf *so_rcv = &so->so_rcv;
	struct toepcb *toep = tp->t_toe;
	int must_send;

	INP_WLOCK_ASSERT(inp);

	SOCKBUF_LOCK(so_rcv);
	KASSERT(toep->enqueued >= so_rcv->sb_cc,
	    ("%s: so_rcv->sb_cc > enqueued", __func__));
	toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
	toep->enqueued = so_rcv->sb_cc;
	SOCKBUF_UNLOCK(so_rcv);

	/* Send an update if the window is nearly closed or enough is pending. */
	must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
	if (must_send || toep->rx_credits >= 15 * 1024) {
		int credits;

		credits = send_rx_credits(sc, toep, toep->rx_credits);
		toep->rx_credits -= credits;
		tp->rcv_wnd += credits;
		tp->rcv_adv += credits;
	}
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.
 */
static int
close_conn(struct adapter *sc, struct toepcb *toep)
{
	struct wrqe *wr;
	struct cpl_close_con_req *req;
	unsigned int tid = toep->tid;

	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
	    toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : "");

	/* Only one FIN per connection. */
	if (toepcb_flag(toep, TPF_FIN_SENT))
		return (0);

	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));

	wr = alloc_wrqe(sizeof(*req), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	req = wrtod(wr);

	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
	    V_FW_WR_FLOWID(tid));
	req->wr.wr_lo = cpu_to_be64(0);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;

	toepcb_set_flag(toep, TPF_FIN_SENT);
	toepcb_clr_flag(toep, TPF_SEND_FIN);
	t4_l2t_send(sc, wr, toep->l2te);

	return (0);
}

#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))

/* Maximum amount of immediate data we could stuff in a WR */
static inline int
max_imm_payload(int tx_credits)
{
	const int n = 2;	/* Use only up to 2 desc for imm. data WR */

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	if (tx_credits >= (n * EQ_ESIZE) / 16)
		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
	else
		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
}

/* Maximum number of SGL entries we could stuff in a WR */
static inline int
max_dsgl_nsegs(int tx_credits)
{
	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;

	KASSERT(tx_credits >= 0 &&
	    tx_credits <= MAX_OFLD_TX_CREDITS,
	    ("%s: %d credits", __func__, tx_credits));

	if (tx_credits < MIN_OFLD_TX_CREDITS)
		return (0);

	/* Each additional 24B ulptx_sge_pair holds 2 more segments. */
	nseg += 2 * (sge_pair_credits * 16 / 24);
	if ((sge_pair_credits * 16) % 24 == 16)
		nseg++;

	return (nseg);
}

/*
 * Fill in the header of an offload tx data work request.  immdlen is the
 * number of immediate payload bytes that follow the header (0 for DSGL tx),
 * plen the total payload length, and credits the cost of the WR in 16B units.
 */
static inline void
write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
    unsigned int plen, uint8_t credits, int more_to_come)
{
	struct fw_ofld_tx_data_wr *txwr = dst;
	int shove = !more_to_come;
	int compl = 1;

	/*
	 * We always request completion notifications from the firmware.  The
	 * only exception is when we know we'll get more data to send shortly
	 * and that we'll have some tx credits remaining to transmit that data.
	 */
	if (more_to_come && toep->tx_credits - credits >= MIN_OFLD_TX_CREDITS)
		compl = 0;

	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
	    V_FW_WR_COMPL(compl) | V_FW_WR_IMMDLEN(immdlen));
	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
	    V_FW_WR_LEN16(credits));
	txwr->tunnel_to_proxy =
	    htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) |
		V_FW_OFLD_TX_DATA_WR_URGENT(0) |	/* XXX */
		V_FW_OFLD_TX_DATA_WR_SHOVE(shove));
	txwr->plen = htobe32(plen);
}

/*
 * Generate a DSGL from a starting mbuf.  The total number of segments and the
 * maximum segments in any one mbuf are provided.
 */
static void
write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
{
	struct mbuf *m;
	struct ulptx_sgl *usgl = dst;
	int i, j, rc;
	struct sglist sg;
	struct sglist_seg segs[n];

	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));

	sglist_init(&sg, n, segs);
	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
	    V_ULPTX_NSGE(nsegs));

	/*
	 * i indexes the flattened SGL: the first entry (i == -1 initially)
	 * lives in the ulptx_sgl itself, the rest in the trailing sge[] pairs.
	 */
	i = -1;
	for (m = start; m != stop; m = m->m_next) {
		rc = sglist_append(&sg, mtod(m, void *), m->m_len);
		if (__predict_false(rc != 0))
			panic("%s: sglist_append %d", __func__, rc);

		for (j = 0; j < sg.sg_nseg; i++, j++) {
			if (i < 0) {
				usgl->len0 = htobe32(segs[j].ss_len);
				usgl->addr0 = htobe64(segs[j].ss_paddr);
			} else {
				usgl->sge[i / 2].len[i & 1] =
				    htobe32(segs[j].ss_len);
				usgl->sge[i / 2].addr[i & 1] =
				    htobe64(segs[j].ss_paddr);
			}
#ifdef INVARIANTS
			nsegs--;
#endif
		}
		sglist_reset(&sg);
	}
	if (i & 1)
		usgl->sge[i / 2].len[1] = htobe32(0);	/* zero the odd tail */
	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
	    __func__, nsegs, start, stop));
}

/*
 * Max number of SGL entries an offload tx work request can have.  This is 41
 * (1 + 40) for a full 512B work request.
 * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
 */
#define OFLD_SGL_LEN (41)

/*
 * Send data and/or a FIN to the peer.
 *
 * The socket's so_snd buffer consists of a stream of data starting with sb_mb
 * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
 * was transmitted.
 */
static void
t4_push_frames(struct adapter *sc, struct toepcb *toep)
{
	struct mbuf *sndptr, *m, *sb_sndptr;
	struct fw_ofld_tx_data_wr *txwr;
	struct wrqe *wr;
	unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct socket *so = inp->inp_socket;
	struct sockbuf *sb = &so->so_snd;
	int tx_credits;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	INP_WLOCK_ASSERT(inp);
	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

	if (toep->ulp_mode != ULP_MODE_NONE)
		CXGBE_UNIMPLEMENTED("ulp_mode");

	/*
	 * This function doesn't resume by itself.  Someone else must clear the
	 * flag and call this function.
	 */
	if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED)))
		return;

	/* Each loop iteration builds and sends one work request. */
	do {
		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
		max_imm = max_imm_payload(tx_credits);
		max_nsegs = max_dsgl_nsegs(tx_credits);

		SOCKBUF_LOCK(sb);
		sb_sndptr = sb->sb_sndptr;
		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
		plen = 0;
		nsegs = 0;
		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
		for (m = sndptr; m != NULL; m = m->m_next) {
			int n = sglist_count(mtod(m, void *), m->m_len);

			nsegs += n;
			plen += m->m_len;

			/* This mbuf sent us _over_ the nsegs limit, back out */
			if (plen > max_imm && nsegs > max_nsegs) {
				nsegs -= n;
				plen -= m->m_len;
				if (plen == 0) {
					/* Too few credits */
					toepcb_set_flag(toep, TPF_TX_SUSPENDED);
					SOCKBUF_UNLOCK(sb);
					return;
				}
				break;
			}

			if (max_nsegs_1mbuf < n)
				max_nsegs_1mbuf = n;
			sb_sndptr = m; /* new sb->sb_sndptr if all goes well */

			/* This mbuf put us right at the max_nsegs limit */
			if (plen > max_imm && nsegs == max_nsegs) {
				m = m->m_next;
				break;
			}
		}

		/* Grow the send buffer if autosizing is on and it is filling. */
		if (sb->sb_flags & SB_AUTOSIZE &&
		    V_tcp_do_autosndbuf &&
		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
		    sbspace(sb) < sb->sb_hiwat / 8 * 7) {
			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
			    V_tcp_autosndbuf_max);

			if (!sbreserve_locked(sb, newsize, so, NULL))
				sb->sb_flags &= ~SB_AUTOSIZE;
			else {
				sowwakeup_locked(so);	/* room available */
				SOCKBUF_UNLOCK_ASSERT(sb);
				goto unlocked;
			}
		}
		SOCKBUF_UNLOCK(sb);
unlocked:

		/* nothing to send */
		if (plen == 0) {
			KASSERT(m == NULL,
			    ("%s: nothing to send, but m != NULL", __func__));
			break;
		}

		if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT)))
			panic("%s: excess tx.", __func__);

		if (plen <= max_imm) {

			/* Immediate data tx */

			wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16),
			    toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr->wr_len, 16);
			write_tx_wr(txwr, toep, plen, plen, credits,
			    tp->t_flags & TF_MORETOCOME);
			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
		} else {
			int wr_len;

			/* DSGL tx */

			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
			wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq);
			if (wr == NULL) {
				/* XXX: how will we recover from this? */
				toepcb_set_flag(toep, TPF_TX_SUSPENDED);
				return;
			}
			txwr = wrtod(wr);
			credits = howmany(wr_len, 16);
			write_tx_wr(txwr, toep, 0, plen, credits,
			    tp->t_flags & TF_MORETOCOME);
			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
			    max_nsegs_1mbuf);
			if (wr_len & 0xf) {
				uint64_t *pad = (uint64_t *)
				    ((uintptr_t)txwr + wr_len);
				*pad = 0;
			}
		}

		KASSERT(toep->tx_credits >= credits,
		    ("%s: not enough credits", __func__));

		toep->tx_credits -= credits;

		tp->snd_nxt += plen;
		tp->snd_max += plen;

		SOCKBUF_LOCK(sb);
		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
		sb->sb_sndptr = sb_sndptr;
		SOCKBUF_UNLOCK(sb);

		toepcb_set_flag(toep, TPF_TX_DATA_SENT);

		/* Record the WR in the tx descriptor ring for completions. */
		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
		txsd->plen = plen;
		txsd->tx_credits = credits;
		txsd++;
		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
			toep->txsd_pidx = 0;
			txsd = &toep->txsd[0];
		}
		toep->txsd_avail--;

		t4_l2t_send(sc, wr, toep->l2te);
	} while (m != NULL);

	/* Send a FIN if requested, but only if there's no more data to send */
	if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN))
		close_conn(sc, toep);
}

/*
 * TOE callback: transmit whatever is pending in the socket's send buffer.
 */
int
t4_tod_output(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	t4_push_frames(sc, toep);

	return (0);
}

/*
 * TOE callback: request a FIN.  The FIN itself goes out only after all
 * pending data has been transmitted (see t4_push_frames).
 */
int
t4_send_fin(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#ifdef INVARIANTS
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	toepcb_set_flag(toep, TPF_SEND_FIN);
	t4_push_frames(sc, toep);

	return (0);
}

/*
 * TOE callback: abort the connection (RST to the peer).
 */
int
t4_send_rst(struct toedev *tod, struct tcpcb *tp)
{
	struct adapter *sc = tod->tod_softc;
#if defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("%s: inp %p dropped.", __func__, inp));
	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

	/* hmmmm */
	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
	    ("%s: flowc for tid %u [%s] not sent already",
	    __func__, toep->tid, tcpstates[tp->t_state]));

	send_reset(sc, toep, 0);
	return (0);
}

/*
 * Peer has sent us a FIN.
 */
static int
do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PEER_CLOSE,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_INFO_WLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__,
	    tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags, inp);

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		goto done;

	so = inp->inp_socket;

	/* Account for the FIN and run the TCP state machine. */
	socantrcvmore(so);
	tp->rcv_nxt++;	/* FIN */
	KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
	    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
	    be32toh(cpl->rcv_nxt)));

	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */

	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;

	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;

	case TCPS_FIN_WAIT_2:
		/* tcp_twstart() returns with the inp unlocked. */
		tcp_twstart(tp);
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
		INP_INFO_WUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		final_cpl_received(toep);
		return (0);

	default:
		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
		    __func__, tid, tp->t_state);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	return (0);
}

/*
 * Peer has ACK'd our FIN.
 */
static int
do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_CLOSE_CON_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_INFO_WLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);

	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
		goto done;

	so = inp->inp_socket;
	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */

	switch (tp->t_state) {
	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
		/* tcp_twstart() returns with the inp unlocked. */
		tcp_twstart(tp);
release:
		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the  inp */
		INP_INFO_WUNLOCK(&V_tcbinfo);

		INP_WLOCK(inp);
		final_cpl_received(toep);	/* no more CPLs expected */

		return (0);
	case TCPS_LAST_ACK:
		if (tcp_close(tp))
			INP_WUNLOCK(inp);
		goto release;

	case TCPS_FIN_WAIT_1:
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			soisdisconnected(so);
		tp->t_state = TCPS_FIN_WAIT_2;
		break;

	default:
		log(LOG_ERR,
		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
		    __func__, tid, tcpstates[tp->t_state]);
	}
done:
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	return (0);
}

/*
 * Acknowledge an abort request from the hardware with a CPL_ABORT_RPL.
 */
void
send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid,
    int rst_status)
{
	struct wrqe *wr;
	struct cpl_abort_rpl *cpl;

	wr = alloc_wrqe(sizeof(*cpl), ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	cpl = wrtod(wr);

	INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
	cpl->cmd = rst_status;

	t4_wrq_tx(sc, wr);
}

/*
 * Map a hardware abort reason to the errno reported to the application.
 */
static int
abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
{
	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
	case CPL_ERR_CONN_RESET:
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

/*
 * TCP RST from the peer, timeout, or some other such critical error.
 */
static int
do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct sge_wrq *ofld_txq = toep->ofld_txq;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toepcb_flag(toep, TPF_SYNQE))
		return (do_abort_req_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
	    cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) {
		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
		    __func__, cpl->status, tid, toep->flags);
		return (0);	/* Ignore negative advice */
	}

	inp = toep->inp;
	INP_INFO_WLOCK(&V_tcbinfo);	/* for tcp_close */
	INP_WLOCK(inp);

	tp = intotcpcb(inp);
	so = inp->inp_socket;

	CTR6(KTR_CXGBE,
	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
	    __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags,
	    cpl->status);

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources.  Otherwise we tear everything down right here
	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		goto done;
	}
	toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN);

	so_error_set(so, abort_status_to_errno(tp, cpl->status));
	tp = tcp_close(tp);
	if (tp == NULL)
		INP_WLOCK(inp);	/* re-acquire */

	final_cpl_received(toep);
done:
	INP_INFO_WUNLOCK(&V_tcbinfo);
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
	return (0);
}

/*
 * Reply to the CPL_ABORT_REQ (send_reset)
 */
static int
do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));

	if (toepcb_flag(toep, TPF_SYNQE))
		return (do_abort_rpl_synqe(iq, rss, m));

	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
	    __func__, tid, toep, inp, cpl->status);

	/* The abort we initiated has been acknowledged; finish teardown. */
	KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN),
	    ("%s: wasn't expecting abort reply", __func__));

	INP_WLOCK(inp);
	final_cpl_received(toep);

	return (0);
}

/*
 * CPL_RX_DATA: new payload delivered by the hardware for an offloaded
 * connection.
 */
static int
do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data *cpl = mtod(m, const void *);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
struct socket *so; + struct sockbuf *so_rcv; + + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + /* + * do_pass_establish failed and must be attempting to abort the + * synqe's tid. Meanwhile, the T4 has sent us data for such a + * connection. + */ + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: synqe and tid isn't being aborted.", __func__)); + m_freem(m); + return (0); + } + + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + /* strip off CPL header */ + m_adj(m, sizeof(*cpl)); + + INP_WLOCK(inp); + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { + CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, m->m_pkthdr.len, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + + tp = intotcpcb(inp); + +#ifdef INVARIANTS + if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", + __func__, be32toh(cpl->seq), toep->tid, tp->rcv_nxt); + } +#endif + + tp->rcv_nxt += m->m_pkthdr.len; + KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, + ("%s: negative window size", __func__)); + tp->rcv_wnd -= m->m_pkthdr.len; + tp->t_rcvtime = ticks; + + so = inp_inpcbtosocket(inp); + so_rcv = &so->so_rcv; + SOCKBUF_LOCK(so_rcv); + + if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", + __func__, tid, m->m_pkthdr.len); + m_freem(m); + SOCKBUF_UNLOCK(so_rcv); + INP_WUNLOCK(inp); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = tcp_drop(tp, ECONNRESET); + if (tp) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (0); + } + + /* receive buffer autosize */ + if (so_rcv->sb_flags & SB_AUTOSIZE && + V_tcp_do_autorcvbuf && + so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && + m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) { + unsigned int hiwat = so_rcv->sb_hiwat; + unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); + + if (!sbreserve_locked(so_rcv, newsize, 
so, NULL)) + so_rcv->sb_flags &= ~SB_AUTOSIZE; + else + toep->rx_credits += newsize - hiwat; + } + toep->enqueued += m->m_pkthdr.len; + sbappendstream_locked(so_rcv, m); + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(so_rcv); + + INP_WUNLOCK(inp); + return (0); +} + +#define S_CPL_FW4_ACK_OPCODE 24 +#define M_CPL_FW4_ACK_OPCODE 0xff +#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) +#define G_CPL_FW4_ACK_OPCODE(x) \ + (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) + +#define S_CPL_FW4_ACK_FLOWID 0 +#define M_CPL_FW4_ACK_FLOWID 0xffffff +#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) +#define G_CPL_FW4_ACK_FLOWID(x) \ + (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) + +#define S_CPL_FW4_ACK_CR 24 +#define M_CPL_FW4_ACK_CR 0xff +#define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) +#define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) + +#define S_CPL_FW4_ACK_SEQVAL 0 +#define M_CPL_FW4_ACK_SEQVAL 0x1 +#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) +#define G_CPL_FW4_ACK_SEQVAL(x) \ + (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) +#define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) + +static int +do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); + unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; + uint8_t credits = cpl->credits; + struct ofld_tx_sdesc *txsd; + int plen; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + /* + * Very unusual case: we'd sent a flowc + abort_req for a synq entry and + * now this comes back carrying the credits for the flowc. 
+ */ + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: credits for a synq entry %p", __func__, toep)); + return (0); + } + + inp = toep->inp; + + KASSERT(opcode == CPL_FW4_ACK, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + INP_WLOCK(inp); + + if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) { + INP_WUNLOCK(inp); + return (0); + } + + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); + + tp = intotcpcb(inp); + + if (cpl->seq_vld) { + tcp_seq snd_una = be32toh(cpl->snd_una); + +#ifdef INVARIANTS + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, snd_una %x\n", + __func__, snd_una, toep->tid, tp->snd_una); + } +#endif + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = tcp_ts_getticks(); + } + } + + so = inp->inp_socket; + txsd = &toep->txsd[toep->txsd_cidx]; + plen = 0; + while (credits) { + KASSERT(credits >= txsd->tx_credits, + ("%s: too many (or partial) credits", __func__)); + credits -= txsd->tx_credits; + toep->tx_credits += txsd->tx_credits; + plen += txsd->plen; + txsd++; + toep->txsd_avail++; + KASSERT(toep->txsd_avail <= toep->txsd_total, + ("%s: txsd avail > total", __func__)); + if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { + txsd = &toep->txsd[0]; + toep->txsd_cidx = 0; + } + } + + if (plen > 0) { + struct sockbuf *sb = &so->so_snd; + + SOCKBUF_LOCK(sb); + sbdrop_locked(sb, plen); + sowwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + } + + /* XXX */ + if ((toepcb_flag(toep, TPF_TX_SUSPENDED) && + toep->tx_credits >= MIN_OFLD_TX_CREDITS) || + toep->tx_credits == toep->txsd_total * + howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) { + toepcb_clr_flag(toep, 
TPF_TX_SUSPENDED); + t4_push_frames(sc, toep); + } + INP_WUNLOCK(inp); + + return (0); +} + +void +t4_init_cpl_io_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); + t4_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); + t4_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); + t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); + t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); + t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c new file mode 100644 index 0000000..895e57a --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -0,0 +1,1362 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/module.h> +#include <sys/protosw.h> +#include <sys/refcount.h> +#include <sys/domain.h> +#include <sys/fnv_hash.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#define TCPSTATES +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +/* stid services */ +static int alloc_stid(struct adapter *, void *); +static void *lookup_stid(struct adapter *, int); +static void free_stid(struct adapter *, int); + +/* lctx services */ +static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, + struct port_info *); +static int free_lctx(struct adapter *, struct listen_ctx *); +static void hold_lctx(struct listen_ctx *); +static void listen_hash_add(struct adapter *, struct listen_ctx *); +static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); +static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); +static 
struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); + +static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *); +static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); +static void send_reset_synqe(struct toedev *, struct synq_entry *); + +/* XXX: won't work for IPv6 */ +static int +alloc_stid(struct adapter *sc, void *ctx) +{ + struct tid_info *t = &sc->tids; + int stid = -1; + + mtx_lock(&t->stid_lock); + if (t->sfree) { + union serv_entry *p = t->sfree; + + stid = p - t->stid_tab; + stid += t->stid_base; + t->sfree = p->next; + p->data = ctx; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return (stid); +} + +static void * +lookup_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + + return (t->stid_tab[stid - t->stid_base].data); +} + +static void +free_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + union serv_entry *p = &t->stid_tab[stid - t->stid_base]; + + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} + +static struct listen_ctx * +alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi) +{ + struct listen_ctx *lctx; + + INP_WLOCK_ASSERT(inp); + + lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); + if (lctx == NULL) + return (NULL); + + lctx->stid = alloc_stid(sc, lctx); + if (lctx->stid < 0) { + free(lctx, M_CXGBE); + return (NULL); + } + + lctx->ctrlq = &sc->sge.ctrlq[pi->port_id]; + lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq]; + refcount_init(&lctx->refcount, 1); + TAILQ_INIT(&lctx->synq); + + lctx->inp = inp; + in_pcbref(inp); + + return (lctx); +} + +/* Don't call this directly, use release_lctx instead */ +static int +free_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + + INP_WLOCK_ASSERT(inp); + KASSERT(lctx->refcount == 0, + ("%s: refcount %d", __func__, lctx->refcount)); + 
KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", + __func__, lctx->stid, lctx, lctx->inp); + + free_stid(sc, lctx->stid); + free(lctx, M_CXGBE); + + return (in_pcbrele_wlocked(inp)); +} + +static void +hold_lctx(struct listen_ctx *lctx) +{ + + refcount_acquire(&lctx->refcount); +} + +static inline uint32_t +listen_hashfn(void *key, u_long mask) +{ + + return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); +} + +/* + * Add a listen_ctx entry to the listen hash table. + */ +static void +listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(lctx->inp, td->listen_mask); + + mtx_lock(&td->lctx_hash_lock); + LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); + td->lctx_count++; + mtx_unlock(&td->lctx_hash_lock); +} + +/* + * Look for the listening socket's context entry in the hash and return it. + */ +static struct listen_ctx * +listen_hash_find(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { + if (lctx->inp == inp) + break; + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Removes the listen_ctx structure for inp from the hash and returns it. 
+ */ +static struct listen_ctx * +listen_hash_del(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx, *l; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { + if (lctx->inp == inp) { + LIST_REMOVE(lctx, link); + td->lctx_count--; + break; + } + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Releases a hold on the lctx. Must be called with the listening socket's inp + * locked. The inp may be freed by this function and it returns NULL to + * indicate this. + */ +static struct inpcb * +release_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + int inp_freed = 0; + + INP_WLOCK_ASSERT(inp); + if (refcount_release(&lctx->refcount)) + inp_freed = free_lctx(sc, lctx); + + return (inp_freed ? NULL : inp); +} + +static void +send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) +{ + struct adapter *sc = tod->tod_softc; + struct mbuf *m = synqe->syn; + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct port_info *pi = ifp->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + struct wrqe *wr; + struct fw_flowc_wr *flowc; + struct cpl_abort_req *req; + int txqid, rxqid, flowclen; + struct sge_wrq *ofld_txq; + struct sge_ofld_rxq *ofld_rxq; + const int nparams = 4; + unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN; + + INP_WLOCK_ASSERT(synqe->lctx->inp); + + CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s", + __func__, synqe, synqe->tid, + synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ? 
+ " (abort already in progress)" : ""); + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) + return; /* abort already in progress */ + synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN); + + get_qids_from_mbuf(m, &txqid, &rxqid); + ofld_txq = &sc->sge.ofld_txq[txqid]; + ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + /* The wrqe will have two WRs - a flowc followed by an abort_req */ + flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); + + wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + flowc = wrtod(wr); + req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE)); + + /* First the flowc ... */ + memset(flowc, 0, wr->wr_len); + flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | + V_FW_FLOWC_WR_NPARAMS(nparams)); + flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | + V_FW_WR_FLOWID(synqe->tid)); + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htobe32(pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htobe32(pi->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htobe32(pi->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); + synqe_set_flag(synqe, TPF_FLOWC_WR_SENT); + + /* ... 
then ABORT request */ + INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); + req->rsvd0 = 0; /* don't have a snd_nxt */ + req->rsvd1 = 1; /* no data sent yet */ + req->cmd = CPL_ABORT_SEND_RST; + + t4_l2t_send(sc, wr, e); +} + +static int +create_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_pass_open_req *req; + struct in_conninfo *inc = &lctx->inp->inp_inc; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + log(LOG_ERR, "%s: allocation failure", __func__); + return (ENOMEM); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); + req->local_port = inc->inc_lport; + req->peer_port = 0; + req->local_ip = inc->inc_laddr.s_addr; + req->peer_ip = 0; + req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); + req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | + F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); + + t4_wrq_tx(sc, wr); + return (0); +} + +static int +destroy_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_close_listsvr_req *req; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, + lctx->stid)); + req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); + req->rsvd = htobe16(0); + + t4_wrq_tx(sc, wr); + return (0); +} + +/* + * Start a listening server by sending a passive open request to HW. + * + * Can't take adapter lock here and access to sc->flags, sc->open_device_map, + * sc->offload_map, if_capenable are all race prone. 
+ */ +int +t4_listen_start(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; + struct port_info *pi; + struct inpcb *inp = tp->t_inpcb; + struct listen_ctx *lctx; + int i; + + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_vflag & INP_IPV4) == 0) + return (0); + +#if 0 + ADAPTER_LOCK(sc); + if (IS_BUSY(sc)) { + log(LOG_ERR, "%s: listen request ignored, %s is busy", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM not initialized", __func__)); +#endif + + if ((sc->open_device_map & sc->offload_map) == 0) + goto done; /* no port that's UP with IFCAP_TOE enabled */ + + /* + * Find a running port with IFCAP_TOE4. We'll use the first such port's + * queues to send the passive open and receive the reply to it. + * + * XXX: need a way to mark a port in use by offload. if_cxgbe should + * then reject any attempt to bring down such a port (and maybe reject + * attempts to disable IFCAP_TOE on that port too?). + */ + for_each_port(sc, i) { + if (isset(&sc->open_device_map, i) && + sc->port[i]->ifp->if_capenable & IFCAP_TOE4) + break; + } + KASSERT(i < sc->params.nports, + ("%s: no running port with TOE capability enabled.", __func__)); + pi = sc->port[i]; + + if (listen_hash_find(sc, inp) != NULL) + goto done; /* already setup */ + + lctx = alloc_lctx(sc, inp, pi); + if (lctx == NULL) { + log(LOG_ERR, + "%s: listen request ignored, %s couldn't allocate lctx\n", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + listen_hash_add(sc, lctx); + + CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__, + lctx->stid, tcpstates[tp->t_state], lctx, inp); + + if (create_server(sc, lctx) != 0) { + log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__, + device_get_nameunit(sc->dev)); + (void) listen_hash_del(sc, inp); + inp = release_lctx(sc, lctx); + /* can't be freed, host stack has a reference */ + KASSERT(inp != NULL, ("%s: inp freed", __func__)); + goto done; + } + 
lctx->flags |= LCTX_RPL_PENDING; +done: +#if 0 + ADAPTER_UNLOCK(sc); +#endif + return (0); +} + +int +t4_listen_stop(struct toedev *tod, struct tcpcb *tp) +{ + struct listen_ctx *lctx; + struct adapter *sc = tod->tod_softc; + struct inpcb *inp = tp->t_inpcb; + struct synq_entry *synqe; + + INP_WLOCK_ASSERT(inp); + + lctx = listen_hash_del(sc, inp); + if (lctx == NULL) + return (ENOENT); /* no hardware listener for this inp */ + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, + lctx, lctx->flags); + + /* + * If the reply to the PASS_OPEN is still pending we'll wait for it to + * arrive and clean up when it does. + */ + if (lctx->flags & LCTX_RPL_PENDING) { + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + return (EINPROGRESS); + } + + /* + * The host stack will abort all the connections on the listening + * socket's so_comp. It doesn't know about the connections on the synq + * so we need to take care of those. + */ + TAILQ_FOREACH(synqe, &lctx->synq, link) + send_reset_synqe(tod, synqe); + + destroy_server(sc, lctx); + return (0); +} + +static inline void +hold_synqe(struct synq_entry *synqe) +{ + + refcount_acquire(&synqe->refcnt); +} + +static inline void +release_synqe(struct synq_entry *synqe) +{ + + if (refcount_release(&synqe->refcnt)) { + int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE); + + m_freem(synqe->syn); + if (needfree) + free(synqe, M_CXGBE); + } +} + +void +t4_syncache_added(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + hold_synqe(synqe); +} + +void +t4_syncache_removed(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + release_synqe(synqe); +} + +/* XXX */ +extern void tcp_dooptions(struct tcpopt *, u_char *, int, int); + +int +t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; + struct wrqe *wr; + struct l2t_entry *e; + struct tcpopt to; 
+ struct ip *ip = mtod(m, struct ip *); + struct tcphdr *th = (void *)(ip + 1); + + wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); + if (wr == NULL) + return (EALREADY); + + bzero(&to, sizeof(to)); + tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), + TO_SYN); + + /* save these for later */ + synqe->iss = be32toh(th->th_seq); + synqe->ts = to.to_tsval; + + e = &sc->l2t->l2tab[synqe->l2e_idx]; + t4_l2t_send(sc, wr, e); + + m_freem(m); /* don't need this any more */ + return (0); +} + +static int +do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PASS_OPEN_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + INP_WLOCK(inp); + + CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", + __func__, stid, status, lctx->flags); + + lctx->flags &= ~LCTX_RPL_PENDING; + + if (status != CPL_ERR_NONE) + log(LOG_ERR, "listener with stid %u failed: %d", stid, status); + +#ifdef INVARIANTS + /* + * If the inp has been dropped (listening socket closed) then + * listen_stop must have run and taken the inp out of the hash. 
+ */ + if (inp->inp_flags & INP_DROPPED) { + KASSERT(listen_hash_del(sc, inp) == NULL, + ("%s: inp %p still in listen hash", __func__, inp)); + } +#endif + + if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* + * Listening socket stopped listening earlier and now the chip tells us + * it has started the hardware listener. Stop it; the lctx will be + * released in do_close_server_rpl. + */ + if (inp->inp_flags & INP_DROPPED) { + destroy_server(sc, lctx); + INP_WUNLOCK(inp); + return (status); + } + + /* + * Failed to start hardware listener. Take inp out of the hash and + * release our reference on it. An error message has been logged + * already. + */ + if (status != CPL_ERR_NONE) { + listen_hash_del(sc, inp); + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* hardware listener open for business */ + + INP_WUNLOCK(inp); + return (status); +} + +static int +do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); + + if (status != CPL_ERR_NONE) { + log(LOG_ERR, "%s: failed (%u) to close listener for stid %u", + __func__, status, stid); + return (status); + } + + INP_WLOCK(inp); + inp = release_lctx(sc, lctx); + if (inp != NULL) + INP_WUNLOCK(inp); + + return 
(status); +} + +static void +done_with_synqe(struct adapter *sc, struct synq_entry *synqe) +{ + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + + INP_WLOCK_ASSERT(inp); + + TAILQ_REMOVE(&lctx->synq, synqe, link); + inp = release_lctx(sc, lctx); + if (inp) + INP_WUNLOCK(inp); + remove_tid(sc, synqe->tid); + release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]); + t4_l2t_release(e); + release_synqe(synqe); /* removed from synq list */ +} + +int +do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + int txqid; + struct sge_wrq *ofld_txq; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_REQ_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + if (cpl->status == CPL_ERR_RTX_NEG_ADVICE || + cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) + return (0); /* Ignore negative advice */ + + INP_WLOCK(inp); + + get_qids_from_mbuf(synqe->syn, &txqid, NULL); + ofld_txq = &sc->sge.ofld_txq[txqid]; + + /* + * If we'd initiated an abort earlier the reply to it is responsible for + * cleaning up resources. Otherwise we tear everything down right here + * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
+ */ + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) { + INP_WUNLOCK(inp); + goto done; + } + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ +done: + send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); + return (0); +} + +int +do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_RPL_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + INP_WLOCK(inp); + KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: wasn't expecting abort reply for synqe %p (0x%x)", + __func__, synqe, synqe->flags)); + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ + + return (0); +} + +void +t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; +#ifdef INVARIANTS + struct inpcb *inp = sotoinpcb(so); +#endif + struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); + struct toepcb *toep = *(struct toepcb **)(cpl + 1); + + INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_WLOCK_ASSERT(inp); + KASSERT(synqe_flag(synqe, TPF_SYNQE), + ("%s: %p not a synq_entry?", __func__, arg)); + + offload_socket(so, toep); + make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); + toepcb_set_flag(toep, TPF_CPL_PENDING); + update_tid(sc, 
synqe->tid, toep); +} + +static inline void +save_qids_in_mbuf(struct mbuf *m, struct port_info *pi) +{ + uint32_t txqid, rxqid; + + txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq; + rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq; + + m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); +} + +static inline void +get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid) +{ + + if (txqid) + *txqid = m->m_pkthdr.flowid >> 16; + if (rxqid) + *rxqid = m->m_pkthdr.flowid & 0xffff; +} + +/* + * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to + * store some state temporarily. + */ +static struct synq_entry * +mbuf_to_synqe(struct mbuf *m) +{ + int len = roundup(sizeof (struct synq_entry), 8); + int tspace = M_TRAILINGSPACE(m); + struct synq_entry *synqe = NULL; + + if (tspace < len) { + synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT); + if (synqe == NULL) + return (NULL); + } else + synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe)); + + synqe->flags = 0; + synqe_set_flag(synqe, TPF_SYNQE); + if (tspace < len) + synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE); + + return (synqe); +} + +static void +t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) +{ + bzero(to, sizeof(*to)); + + if (t4opt->mss) { + to->to_flags |= TOF_MSS; + to->to_mss = be16toh(t4opt->mss); + } + + if (t4opt->wsf) { + to->to_flags |= TOF_SCALE; + to->to_wscale = t4opt->wsf; + } + + if (t4opt->tstamp) + to->to_flags |= TOF_TS; + + if (t4opt->sack) + to->to_flags |= TOF_SACKPERM; +} + +/* + * Options2 for passive open. 
+ */ +static uint32_t +calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, + const struct tcp_options *tcpopt, struct tcphdr *th) +{ + uint32_t opt2 = 0; + struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + if (V_tcp_do_rfc1323) { + if (tcpopt->tstamp) + opt2 |= F_TSTAMPS_EN; + if (tcpopt->sack) + opt2 |= F_SACK_EN; + if (tcpopt->wsf > 0) + opt2 |= F_WND_SCALE_EN; + } + + if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); + + return htobe32(opt2); +} + +/* XXX: duplication. */ +static inline void +tcp_fields_to_host(struct tcphdr *th) +{ + + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); +} + +static void +pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc, + struct tcphdr *th) +{ + const struct cpl_pass_accept_req *cpl = mtod(m, const void *); + const struct ether_header *eh; + unsigned int hlen = be32toh(cpl->hdr_len); + const struct ip *ip; + const struct tcphdr *tcp; + + eh = (const void *)(cpl + 1); + ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); + tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen)); + + if (inc) { + bzero(inc, sizeof(*inc)); + inc->inc_faddr = ip->ip_src; + inc->inc_laddr = ip->ip_dst; + inc->inc_fport = tcp->th_sport; + inc->inc_lport = tcp->th_dport; + if (ip->ip_v == 6) + inc->inc_flags |= INC_ISIPV6; + } + + if (th) { + bcopy(tcp, th, sizeof(*th)); + tcp_fields_to_host(th); /* just like tcp_input */ + } +} + +#define REJECT_PASS_ACCEPT() do { \ + reject_reason = __LINE__; \ + goto reject; \ +} while (0) + +/* + * The context associated with a tid entry via insert_tid could be a synq_entry + * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 
 * The flags word must therefore be at the same offset in both structures.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));

/*
 * Incoming SYN on a listening socket.  Decide whether to offload it; if yes,
 * create a synq entry, add it to the kernel syncache, and send a
 * CPL_PASS_ACCEPT_RPL (the SYN|ACK).  If not, hand the SYN to the kernel's
 * normal input path via ifp->if_input.
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct toedev *tod;
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	struct cpl_pass_accept_rpl *rpl;
	struct wrqe *wr;
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
	unsigned int tid = GET_TID(cpl);
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp;
	struct socket *so;
	struct in_conninfo inc;
	struct tcphdr th;
	struct tcpopt to;
	struct port_info *pi;
	struct ifnet *ifp, *ifp_vlan = NULL;
	struct l2t_entry *e = NULL;
	struct rtentry *rt;
	struct sockaddr_in nam;
	int rscale, mtu_idx, rx_credits, rxqid;
	struct synq_entry *synqe = NULL;
	int reject_reason;
	uint16_t vid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
	    lctx);

	pass_accept_req_to_protohdrs(m, &inc, &th);
	t4opt_to_tcpopt(&cpl->tcpopt, &to);

	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
	ifp = pi->ifp;
	m->m_pkthdr.rcvif = ifp;
	tod = TOEDEV(ifp);

	/*
	 * Don't offload if the interface that received the SYN doesn't have
	 * IFCAP_TOE enabled.
	 */
	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
		REJECT_PASS_ACCEPT();

	/* Don't offload IPv6 connections.  XXX: add IPv6 support */
	if (inc.inc_flags & INC_ISIPV6)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the SYN had a VLAN tag and the vid doesn't match
	 * anything on this interface.
	 */
	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
	if (vid != 0xfff) {
		ifp_vlan = VLAN_DEVAT(ifp, vid);
		if (ifp_vlan == NULL)
			REJECT_PASS_ACCEPT();
	}

	/*
	 * Don't offload if the peer requested a TCP option that's not known to
	 * the silicon.
	 */
	if (cpl->tcpopt.unknown)
		REJECT_PASS_ACCEPT();

	/*
	 * Don't offload if the outgoing interface for the route back to the
	 * peer is not the same as the interface that received the SYN.
	 * XXX: too restrictive.
	 */
	nam.sin_len = sizeof(nam);
	nam.sin_family = AF_INET;
	nam.sin_addr = inc.inc_faddr;
	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
	if (rt == NULL)
		REJECT_PASS_ACCEPT();
	else {
		struct sockaddr *nexthop;

		RT_UNLOCK(rt);
		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
		    (struct sockaddr *)&nam;
		if (rt->rt_ifp == ifp ||
		    (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan))
			e = t4_l2t_get(pi, rt->rt_ifp, nexthop);
		RTFREE(rt);
		if (e == NULL)
			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
	}

	synqe = mbuf_to_synqe(m);
	if (synqe == NULL)
		REJECT_PASS_ACCEPT();

	/* Pre-allocate the reply WR so the send later on cannot fail. */
	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
	if (wr == NULL)
		REJECT_PASS_ACCEPT();
	rpl = wrtod(wr);

	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */

	/* Don't offload if the 4-tuple is already in use */
	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}

	inp = lctx->inp;	/* listening socket, not owned by TOE */
	INP_WLOCK(inp);

	/* Don't offload if the listening socket has closed */
	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The reply from the TOE to
		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
		 * resources tied to this listen context.
		 */
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	so = inp->inp_socket;

	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
	SOCKBUF_LOCK(&so->so_rcv);
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
	SOCKBUF_UNLOCK(&so->so_rcv);

	save_qids_in_mbuf(m, pi);
	get_qids_from_mbuf(m, NULL, &rxqid);

	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
	    ULP_MODE_NONE);
	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);

	/* The synqe now owns the SYN mbuf (don't touch m after this). */
	synqe->tid = tid;
	synqe->lctx = lctx;
	synqe->syn = m;
	m = NULL;
	refcount_init(&synqe->refcnt, 1);	/* 1 so that it is held for the
						   duration of this function */
	synqe->l2e_idx = e->idx;
	synqe->rcv_bufsize = rx_credits;
	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);

	insert_tid(sc, tid, synqe);
	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
	hold_synqe(synqe);	/* hold for the duration it's in the synq */
	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */

	/*
	 * If all goes well t4_syncache_respond will get called during
	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
	 * pcb locks.
	 */
	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);

	/*
	 * If we replied during syncache_add (synqe->wr has been consumed),
	 * good.  Otherwise, set it to 0 so that further syncache_respond
	 * attempts by the kernel will be ignored.
	 *
	 * The extra hold on the synqe makes sure that it is still around, even
	 * if the listener has been dropped and the synqe was aborted and the
	 * reply to the abort has removed and released the synqe from the synq
	 * list.
	 */
	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {

		INP_WLOCK(inp);
		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
			/* listener closed.  synqe must have been aborted. */
			KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
			    ("%s: listener %p closed but synqe %p not aborted",
			    __func__, inp, synqe));

			CTR5(KTR_CXGBE,
			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
			    __func__, stid, tid, lctx, synqe);
			INP_WUNLOCK(inp);
			free(wr, M_CXGBE);
			release_synqe(synqe);	/* about to exit function */
			return (__LINE__);
		}

		/*
		 * synqe aborted before TOM replied to PASS_ACCEPT_REQ.  But
		 * that can only happen if the listener was closed and we just
		 * checked for that.
		 */
		KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
		    ("%s: synqe %p aborted, but listener %p not dropped.",
		    __func__, synqe, inp));

		/* Yank the synqe out of the lctx synq. */
		TAILQ_REMOVE(&lctx->synq, synqe, link);
		release_synqe(synqe);	/* removed from synq list */
		inp = release_lctx(sc, lctx);
		if (inp)
			INP_WUNLOCK(inp);

		/*
		 * syncache may or may not have a hold on the synqe, which may
		 * or may not be stashed in the original SYN mbuf passed to us.
		 * Just copy it over instead of dealing with all possibilities.
		 */
		m = m_dup(synqe->syn, M_DONTWAIT);
		if (m)
			m->m_pkthdr.rcvif = ifp;

		release_synqe(synqe);	/* about to exit function */
		free(wr, M_CXGBE);
		REJECT_PASS_ACCEPT();
	}
	release_synqe(synqe);	/* about to exit function */
	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
	    __func__, stid, tid, lctx, synqe);
	return (0);
reject:
	/* Not offloading: release hw resources and feed the SYN (with good
	 * checksums pre-marked -- the hw already verified them) to the normal
	 * kernel input path. */
	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
	    reject_reason);

	if (e)
		t4_l2t_release(e);
	release_tid(sc, tid, lctx->ctrlq);

	if (__predict_true(m != NULL)) {
		m_adj(m, sizeof(*cpl));
		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
		m->m_pkthdr.csum_data = 0xffff;
		ifp->if_input(ifp, m);
	}

	return (reject_reason);
}

/*
 * Reconstruct the protocol headers for the peer's final ACK of the 3-way
 * handshake from the saved SYN plus the CPL_PASS_ESTABLISH, so that the
 * result can be fed to syncache_expand.
 */
static void
synqe_to_protohdrs(struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
	uint16_t tcp_opt = be16toh(cpl->tcp_opt);

	/* start off with the original SYN */
	pass_accept_req_to_protohdrs(synqe->syn, inc, th);

	/* modify parts to make it look like the ACK to our SYN|ACK */
	th->th_flags = TH_ACK;
	th->th_ack = synqe->iss + 1;
	th->th_seq = be32toh(cpl->rcv_isn);
	bzero(to, sizeof(*to));
	if (G_TCPOPT_TSTAMP(tcp_opt)) {
		to->to_flags |= TOF_TS;
		to->to_tsecr = synqe->ts;
	}
}

/*
 * 3-way handshake for an offloaded connection completed: convert the synq
 * entry into a full toepcb and hand the connection to the kernel via
 * syncache_expand.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	struct port_info *pi;
	struct ifnet *ifp;
	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct socket *so;
	struct tcphdr th;
	struct tcpopt to;
	struct in_conninfo inc;
	struct toepcb
	    *toep;
	u_int txqid, rxqid;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	KASSERT(opcode == CPL_PASS_ESTABLISH,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
	KASSERT(synqe_flag(synqe, TPF_SYNQE),
	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
	INP_WLOCK(inp);

	CTR6(KTR_CXGBE,
	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
		/*
		 * The listening socket has closed.  The TOM must have aborted
		 * all the embryonic connections (including this one) that were
		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
		 * for cleaning up.
		 */
		KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
		    ("%s: listen socket dropped but tid %u not aborted.",
		    __func__, tid));

		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}

	ifp = synqe->syn->m_pkthdr.rcvif;
	pi = ifp->if_softc;
	KASSERT(pi->adapter == sc,
	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));

	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
	if (toep == NULL) {
reset:
		/* The reply to this abort will perform final cleanup */
		send_reset_synqe(TOEDEV(ifp), synqe);
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		return (0);
	}
	toep->tid = tid;
	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
	toep->ulp_mode = ULP_MODE_NONE;
	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
	toep->rx_credits = synqe->rcv_bufsize;

	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: socket is NULL", __func__));

	/* Come up with something that syncache_expand should be ok with. */
	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);

	/*
	 * No more need for anything in the mbuf that carried the
	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
	 * there.  XXX: bad form but I don't want to increase the size of synqe.
	 */
	m = synqe->syn;
	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;

	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
		free_toepcb(toep);
		goto reset;
	}

	/* Done with the synqe */
	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	if (inp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	release_synqe(synqe);

	return (0);
}

/* Register the CPL handlers for the passive-open (listen) side. */
void
t4_init_listen_cpl_handlers(struct adapter *sc)
{

	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
}
#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c new file mode 100644 index
0000000..c6e9a1f --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar <np@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);

/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);

static struct uld_info tom_uld_info = {
	.uld_id = ULD_TOM,
	.activate = t4_tom_activate,
	.deactivate = t4_tom_deactivate,
};

static void queue_tid_release(struct adapter *, int);
static void release_offload_resources(struct toepcb *);
static int alloc_tid_tabs(struct tid_info *);
static void free_tid_tabs(struct tid_info *);
static void free_tom_data(struct adapter *, struct tom_data *);

/*
 * Allocate and initialize a toepcb (the TOE's per-connection state).  A
 * negative txqid/rxqid means "pick a random offload queue on this port".
 * Returns NULL on allocation failure.  flags is passed to malloc (M_NOWAIT or
 * M_WAITOK).
 */
struct toepcb *
alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags)
{
	struct adapter *sc = pi->adapter;
	struct toepcb *toep;
	int tx_credits, txsd_total, len;

	/*
	 * The firmware counts tx work request credits in units of 16 bytes
	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
	 * about tx credits if it wants to abort a connection.
	 */
	tx_credits = sc->params.ofldq_wr_cred;
	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);

	/*
	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
	 * immediate payload, and firmware counts tx work request credits in
	 * units of 16 byte.  Calculate the maximum work requests possible.
	 */
	txsd_total = tx_credits /
	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);

	if (txqid < 0)
		txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
	KASSERT(txqid >= pi->first_ofld_txq &&
	    txqid < pi->first_ofld_txq + pi->nofldtxq,
	    ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi,
	    pi->first_ofld_txq, pi->nofldtxq));

	if (rxqid < 0)
		rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
	KASSERT(rxqid >= pi->first_ofld_rxq &&
	    rxqid < pi->first_ofld_rxq + pi->nofldrxq,
	    ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi,
	    pi->first_ofld_rxq, pi->nofldrxq));

	/* toepcb has a variable-length txsd array at its tail. */
	len = offsetof(struct toepcb, txsd) +
	    txsd_total * sizeof(struct ofld_tx_sdesc);

	toep = malloc(len, M_CXGBE, M_ZERO | flags);
	if (toep == NULL)
		return (NULL);

	toep->td = sc->tom_softc;
	toep->port = pi;
	toep->tx_credits = tx_credits;
	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
	toep->txsd_total = txsd_total;
	toep->txsd_avail = txsd_total;
	toep->txsd_pidx = 0;
	toep->txsd_cidx = 0;

	return (toep);
}

/* Free a toepcb; it must be fully detached (no inpcb, no pending CPLs). */
void
free_toepcb(struct toepcb *toep)
{

	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
	    ("%s: attached to an inpcb", __func__));
	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
	    ("%s: CPL pending", __func__));

	free(toep, M_CXGBE);
}

/*
 * Set up the socket for TCP offload.
 * Links the toepcb to the socket's tcpcb, takes a hold on the inpcb, and puts
 * the toepcb on the TOM's active list.  Called with the inpcb write-locked.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	/* Update socket */
	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	/* Update TCP PCB */
	tp->tod = &td->tod;
	tp->t_toe = toep;
	tp->t_flags |= TF_TOE;

	/* Install an extra hold on inp */
	toep->inp = inp;
	toepcb_set_flag(toep, TPF_ATTACHED);
	in_pcbref(inp);

	/* Add the TOE PCB to the active list */
	mtx_lock(&td->toep_list_lock);
	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* This is _not_ the normal way to "unoffload" a socket.  Reverses everything
 * offload_socket did; used only to back out of a failed active open. */
void
undo_offload_socket(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct tom_data *td = toep->td;
	struct sockbuf *sb;

	INP_WLOCK_ASSERT(inp);

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);
	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);
	sb->sb_flags &= ~SB_NOCOALESCE;
	SOCKBUF_UNLOCK(sb);

	tp->tod = NULL;
	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;

	toep->inp = NULL;
	toepcb_clr_flag(toep, TPF_ATTACHED);
	if (in_pcbrele_wlocked(inp))
		panic("%s: inp freed.", __func__);

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);
}

/* Return the L2T entry, hw tid, and the toepcb itself to the system. */
static void
release_offload_resources(struct toepcb *toep)
{
	struct tom_data *td = toep->td;
	struct adapter *sc = td_adapter(td);
	int tid = toep->tid;

	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
	    ("%s: %p has CPL pending.", __func__, toep));
	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
	    ("%s: %p is still attached.", __func__, toep));

	CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)",
	    __func__, toep, tid, toep->l2te);

	if (toep->l2te)
		t4_l2t_release(toep->l2te);

	if (tid >= 0) {
		remove_tid(sc, tid);
		release_tid(sc, tid, toep->ctrlq);
	}

	mtx_lock(&td->toep_list_lock);
	TAILQ_REMOVE(&td->toep_list, toep, link);
	mtx_unlock(&td->toep_list_lock);

	free_toepcb(toep);
}

/*
 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
 * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
 * pending CPL) then it is time to release all resources tied to the toepcb.
 *
 * Also gets called when an offloaded active open fails and the TOM wants the
 * kernel to take the TCP PCB back.
 */
static void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
	struct inpcb *inp = tp->t_inpcb;
#endif
	struct toepcb *toep = tp->t_toe;

	INP_WLOCK_ASSERT(inp);

	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
	KASSERT(toepcb_flag(toep, TPF_ATTACHED),
	    ("%s: not attached", __func__));

#ifdef KTR
	if (tp->t_state == TCPS_SYN_SENT) {
		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
		    __func__, toep->tid, toep, toep->flags, inp,
		    inp->inp_flags);
	} else {
		CTR6(KTR_CXGBE,
		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
		    inp->inp_flags);
	}
#endif

	tp->t_toe = NULL;
	tp->t_flags &= ~TF_TOE;
	toepcb_clr_flag(toep, TPF_ATTACHED);

	if (toepcb_flag(toep, TPF_CPL_PENDING) == 0)
		release_offload_resources(toep);
}

/*
 * The TOE driver will not receive any more CPLs for the tid associated with the
 * toepcb; release the hold on the inpcb.
 */
void
final_cpl_received(struct toepcb *toep)
{
	struct inpcb *inp = toep->inp;

	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
	INP_WLOCK_ASSERT(inp);
	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING),
	    ("%s: CPL not pending already?", __func__));

	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

	toep->inp = NULL;
	toepcb_clr_flag(toep, TPF_CPL_PENDING);

	/* If the kernel has already detached too, everything can go. */
	if (toepcb_flag(toep, TPF_ATTACHED) == 0)
		release_offload_resources(toep);

	if (!in_pcbrele_wlocked(inp))
		INP_WUNLOCK(inp);
}

/* Associate ctx (a synqe or toepcb) with a hardware tid. */
void
insert_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
	atomic_add_int(&t->tids_in_use, 1);
}

/* Return the context associated with a hardware tid (may be NULL). */
void *
lookup_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	return (t->tid_tab[tid]);
}

/* Replace the context for a tid without changing the in-use count. */
void
update_tid(struct adapter *sc, int tid, void *ctx)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = ctx;
}

/* Disassociate a tid from its context (does not notify the hardware). */
void
remove_tid(struct adapter *sc, int tid)
{
	struct tid_info *t = &sc->tids;

	t->tid_tab[tid] = NULL;
	atomic_subtract_int(&t->tids_in_use, 1);
}

/*
 * Tell the hardware (via CPL_TID_RELEASE on the given control queue) that the
 * tid is free for reuse.
 */
void
release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq)
{
	struct wrqe *wr;
	struct cpl_tid_release *req;

	wr = alloc_wrqe(sizeof(*req), ctrlq);
	if (wr == NULL) {
		queue_tid_release(sc, tid);	/* defer */
		return;
	}
	req = wrtod(wr);

	INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid);

	t4_wrq_tx(sc, wr);
}

static void
queue_tid_release(struct adapter *sc, int tid)
{

	CXGBE_UNIMPLEMENTED("deferred tid release");
}

/*
 * What mtu_idx to use, given a 4-tuple and/or an MSS cap
 */
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
{
	unsigned short *mtus = &sc->params.mtus[0];
	int i = 0, mss;

	KASSERT(inc != NULL || pmss > 0,
	    ("%s: at least one of inc/pmss must be specified", __func__));

	mss = inc ? tcp_mssopt(inc) : pmss;
	if (pmss > 0 && mss > pmss)
		mss = pmss;

	/* +40 accounts for the fixed IP + TCP headers on top of the MSS. */
	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
		++i;

	return (i);
}

/*
 * Determine the receive window size for a socket.
 */
u_long
select_rcv_wnd(struct socket *so)
{
	unsigned long wnd;

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	wnd = sbspace(&so->so_rcv);
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	return min(wnd, MAX_RCV_WND);
}

/* Smallest window scale that lets sb_max (capped at MAX_RCV_WND) fit. */
int
select_rcv_wscale(void)
{
	int wscale = 0;
	unsigned long space = sb_max;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
		wscale++;

	return (wscale);
}

extern int always_keepalive;
#define VIID_SMACIDX(v)	(((unsigned int)(v) & 0x7f) << 1)

/*
 * socket so could be a listening socket too.  Builds the big-endian opt0 word
 * of a connection CPL: window scale, MTU index, ULP mode, initial rx credits,
 * plus Nagle/keepalive from the socket, L2T index, and port's tx channel /
 * source MAC selection when those are supplied.
 */
uint64_t
calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e,
    int mtu_idx, int rscale, int rx_credits, int ulp_mode)
{
	uint64_t opt0;

	KASSERT(rx_credits <= M_RCV_BUFSIZ,
	    ("%s: rcv_bufsiz too high", __func__));

	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);

	if (so != NULL) {
		struct inpcb *inp = sotoinpcb(so);
		struct tcpcb *tp = intotcpcb(inp);
		int keepalive = always_keepalive ||
		    so_options_get(so) & SO_KEEPALIVE;

		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
		opt0 |= V_KEEP_ALIVE(keepalive != 0);
	}

	if (e != NULL)
		opt0 |= V_L2T_IDX(e->idx);

	if (pi != NULL) {
		opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid));
		opt0 |= V_TX_CHAN(pi->tx_chan);
	}

	return htobe64(opt0);
}

/* Field layout constants for the hardware filter tuple. */
#define FILTER_SEL_WIDTH_P_FC (3 + 1)
#define FILTER_SEL_WIDTH_VIN_P_FC (6 + 7 + FILTER_SEL_WIDTH_P_FC)
#define FILTER_SEL_WIDTH_TAG_P_FC (3 + FILTER_SEL_WIDTH_VIN_P_FC)
#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC)
#define VLAN_NONE 0xfff
#define FILTER_SEL_VLAN_NONE 0xffff

uint32_t
select_ntuple(struct port_info *pi,
    struct l2t_entry *e, uint32_t filter_mode)
{
	uint16_t viid = pi->viid;
	uint32_t ntuple = 0;

	/* Encode the filter tuple per the configured hw filter mode. */
	if (filter_mode == HW_TPL_FR_MT_PR_IV_P_FC) {
		if (e->vlan == VLAN_NONE)
			ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC;
		else {
			ntuple |= e->vlan << FILTER_SEL_WIDTH_P_FC;
			ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
		}
		ntuple |= e->lport << S_PORT;
		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
	} else if (filter_mode == HW_TPL_FR_MT_PR_OV_P_FC) {
		ntuple |= G_FW_VIID_VIN(viid) << FILTER_SEL_WIDTH_P_FC;
		ntuple |= G_FW_VIID_PFN(viid) << FILTER_SEL_WIDTH_VIN_P_FC;
		ntuple |= G_FW_VIID_VIVLD(viid) << FILTER_SEL_WIDTH_TAG_P_FC;
		ntuple |= e->lport << S_PORT;
		ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC;
	}

	return (htobe32(ntuple));
}

/*
 * Allocate the tid, atid, and stid tables as one contiguous M_ZERO'd chunk
 * (atid and stid tables carved out after the tid table) and thread the atid
 * and stid free lists.  Returns 0 or ENOMEM.
 */
static int
alloc_tid_tabs(struct tid_info *t)
{
	size_t size;
	unsigned int i;

	size = t->ntids * sizeof(*t->tid_tab) +
	    t->natids * sizeof(*t->atid_tab) +
	    t->nstids * sizeof(*t->stid_tab);

	t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT);
	if (t->tid_tab == NULL)
		return (ENOMEM);

	mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF);
	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
	t->afree = t->atid_tab;
	t->atids_in_use = 0;
	for (i = 1; i < t->natids; i++)
		t->atid_tab[i - 1].next = &t->atid_tab[i];
	t->atid_tab[t->natids - 1].next = NULL;

	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
	t->stid_tab = (union serv_entry *)&t->atid_tab[t->natids];
	t->sfree = t->stid_tab;
	t->stids_in_use = 0;
	for (i = 1; i < t->nstids; i++)
		t->stid_tab[i - 1].next = &t->stid_tab[i];
	t->stid_tab[t->nstids - 1].next = NULL;

	atomic_store_rel_int(&t->tids_in_use, 0);

	return (0);
}

/* Tear down everything alloc_tid_tabs set up; all tids must be free. */
static void
free_tid_tabs(struct tid_info *t)
{
	KASSERT(t->tids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
	KASSERT(t->atids_in_use == 0,
	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
	/* NOTE(review): message below says "tids" but checks stids (typo). */
	KASSERT(t->stids_in_use == 0,
	    ("%s: %d tids still in use.", __func__, t->stids_in_use));

	/* One allocation covered all three tables; see alloc_tid_tabs. */
	free(t->tid_tab, M_CXGBE);
	t->tid_tab = NULL;

	if (mtx_initialized(&t->atid_lock))
		mtx_destroy(&t->atid_lock);
	if (mtx_initialized(&t->stid_lock))
		mtx_destroy(&t->stid_lock);
}

/* Release everything t4_tom_activate allocated for this adapter. */
static void
free_tom_data(struct adapter *sc, struct tom_data *td)
{
	KASSERT(TAILQ_EMPTY(&td->toep_list),
	    ("%s: TOE PCB list is not empty.", __func__));
	KASSERT(td->lctx_count == 0,
	    ("%s: lctx hash table is not empty.", __func__));

	t4_uninit_l2t_cpl_handlers(sc);

	if (td->listen_mask != 0)
		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);

	if (mtx_initialized(&td->lctx_hash_lock))
		mtx_destroy(&td->lctx_hash_lock);
	if (mtx_initialized(&td->toep_list_lock))
		mtx_destroy(&td->toep_list_lock);

	free_tid_tabs(&sc->tids);
	free(td, M_CXGBE);
}

/*
 * Ground control to Major TOM
 * Commencing countdown, engines on
 *
 * ULD activate: allocate the per-adapter TOM softc, tid tables, and listen
 * hash, register the CPL handlers, and publish the toedev ops on every port.
 */
static int
t4_tom_activate(struct adapter *sc)
{
	struct tom_data *td;
	struct toedev *tod;
	int i, rc;

	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */

	/* per-adapter softc for TOM */
	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
	if (td == NULL)
		return (ENOMEM);

	/* List of TOE PCBs and associated lock */
	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
	TAILQ_INIT(&td->toep_list);

	/* Listen context */
	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
	    &td->listen_mask, HASH_NOWAIT);

	/* TID tables */
	rc = alloc_tid_tabs(&sc->tids);
	if (rc != 0)
		goto done;

	/* CPL handlers */
	t4_init_connect_cpl_handlers(sc);
	t4_init_l2t_cpl_handlers(sc);
	t4_init_listen_cpl_handlers(sc);
	t4_init_cpl_io_handlers(sc);

	/* toedev ops */
	tod = &td->tod;
	init_toedev(tod);
	tod->tod_softc = sc;
	tod->tod_connect = t4_connect;
	tod->tod_listen_start = t4_listen_start;
	tod->tod_listen_stop =
	    t4_listen_stop;
	tod->tod_rcvd = t4_rcvd;
	tod->tod_output = t4_tod_output;
	tod->tod_send_rst = t4_send_rst;
	tod->tod_send_fin = t4_send_fin;
	tod->tod_pcb_detach = t4_pcb_detach;
	tod->tod_l2_update = t4_l2_update;
	tod->tod_syncache_added = t4_syncache_added;
	tod->tod_syncache_removed = t4_syncache_removed;
	tod->tod_syncache_respond = t4_syncache_respond;
	tod->tod_offload_socket = t4_offload_socket;

	for_each_port(sc, i)
		TOEDEV(sc->port[i]->ifp) = &td->tod;

	sc->tom_softc = td;
	sc->flags |= TOM_INIT_DONE;
	register_toedev(sc->tom_softc);

done:
	if (rc != 0)
		free_tom_data(sc, td);
	return (rc);
}

/*
 * ULD deactivate: tear down the TOM softc.  Fails with EBUSY while any port
 * still has IFCAP_TOE enabled or any connection/listener is still offloaded.
 */
static int
t4_tom_deactivate(struct adapter *sc)
{
	int rc = 0;
	struct tom_data *td = sc->tom_softc;

	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */

	if (td == NULL)
		return (0);	/* XXX. KASSERT? */

	if (sc->offload_map != 0)
		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */

	mtx_lock(&td->toep_list_lock);
	if (!TAILQ_EMPTY(&td->toep_list))
		rc = EBUSY;
	mtx_unlock(&td->toep_list_lock);

	mtx_lock(&td->lctx_hash_lock);
	if (td->lctx_count > 0)
		rc = EBUSY;
	mtx_unlock(&td->lctx_hash_lock);

	if (rc == 0) {
		unregister_toedev(sc->tom_softc);
		free_tom_data(sc, td);
		sc->tom_softc = NULL;
		sc->flags &= ~TOM_INIT_DONE;
	}

	return (rc);
}

static int
t4_tom_mod_load(void)
{
	int rc;

	rc = t4_register_uld(&tom_uld_info);
	if (rc != 0)
		t4_tom_mod_unload();

	return (rc);
}

static void
tom_uninit(struct adapter *sc, void *arg __unused)
{
	/* Try to free resources (works only if no port has IFCAP_TOE) */
	ADAPTER_LOCK(sc);
	if (sc->flags & TOM_INIT_DONE)
		t4_deactivate_uld(sc, ULD_TOM);
	ADAPTER_UNLOCK(sc);
}

static int
t4_tom_mod_unload(void)
{
	/* Deactivate on every adapter first, then drop the ULD registration. */
	t4_iterate(tom_uninit, NULL);

	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
		return (EBUSY);

	return (0);
}
#endif	/* TCP_OFFLOAD */

/* Standard kld module event handler. */
static int
t4_tom_modevent(module_t mod, int cmd, void *arg)
{
	int rc = 0;

#ifdef TCP_OFFLOAD
	switch (cmd) {
	case MOD_LOAD:
		rc = t4_tom_mod_load();
		break;

	case MOD_UNLOAD:
		rc = t4_tom_mod_unload();
		break;

	default:
		rc = EINVAL;
	}
#else
	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
	rc = EOPNOTSUPP;
#endif
	return (rc);
}

static moduledata_t t4_tom_moddata= {
	"t4_tom",
	t4_tom_modevent,
	0
};

MODULE_VERSION(t4_tom, 1);
MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h new file mode 100644 index 0000000..4e171e7 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -0,0 +1,248 @@
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_TOM_H__
#define __T4_TOM_H__

#define KTR_CXGBE	KTR_SPARE3
#define LISTEN_HASH_SIZE 32

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * TOE PCB flags.  These are bit numbers, manipulated with setbit/clrbit/isset
 * via the toepcb_*_flag helpers below (not a bitmask).
 */
enum {
	TPF_ATTACHED,		/* a tcpcb refers to this toepcb */
	TPF_FLOWC_WR_SENT,	/* firmware flow context WR sent */
	TPF_TX_DATA_SENT,	/* some data sent */
	TPF_TX_SUSPENDED,	/* tx suspended for lack of resources */
	TPF_SEND_FIN,		/* send FIN after sending all pending data */
	TPF_FIN_SENT,		/* FIN has been sent */
	TPF_ABORT_SHUTDOWN,	/* connection abort is in progress */
	TPF_CPL_PENDING,	/* haven't received the last CPL */
	TPF_SYNQE,		/* synq_entry, not really a toepcb */
	TPF_SYNQE_NEEDFREE,	/* synq_entry was allocated externally */
};

/* Software descriptor for one outstanding tx work request. */
struct ofld_tx_sdesc {
	uint32_t plen;		/* payload length */
	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
};

/*
 * Offload protocol control block: per-offloaded-connection state, allocated
 * with a variable-length txsd[] array at the tail.
 */
struct toepcb {
	TAILQ_ENTRY(toepcb) link; /* toep_list */
	unsigned int flags;	/* miscellaneous flags */
	struct tom_data *td;
	struct inpcb *inp;	/* backpointer to host stack's PCB */
	struct port_info *port;	/* physical port */
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	struct sge_wrq *ctrlq;
	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
	int tid;		/* Connection identifier */
	unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */
	unsigned int enqueued;	/* # of bytes added to so_rcv (not yet read) */
	int rx_credits;		/* rx credits (in bytes) to be returned to hw */

	unsigned int ulp_mode;	/* ULP mode */

	/* Tx software descriptor */
	uint8_t txsd_total;
	uint8_t txsd_pidx;
	uint8_t txsd_cidx;
	uint8_t txsd_avail;
	struct ofld_tx_sdesc txsd[];
};

/* Parameters carried in the firmware flow-context work request. */
struct flowc_tx_params {
	uint32_t snd_nxt;
	uint32_t rcv_nxt;
	unsigned int snd_space;
	unsigned int mss;
};

/* Test a TPF_* bit in toep->flags. */
static inline int
toepcb_flag(struct toepcb *toep, int flag)
{

	return isset(&toep->flags, flag);
}

/* Set a TPF_* bit in toep->flags. */
static inline void
toepcb_set_flag(struct toepcb *toep, int flag)
{

	setbit(&toep->flags, flag);
}

/* Clear a TPF_* bit in toep->flags. */
static inline void
toepcb_clr_flag(struct toepcb *toep, int flag)
{

	clrbit(&toep->flags, flag);
}

/*
 * Compressed state for embryonic connections for a listener.  Barely fits in
 * 64B, try not to grow it further.
 */
struct synq_entry {
	TAILQ_ENTRY(synq_entry) link;	/* listen_ctx's synq link */
	int flags;			/* same as toepcb's tp_flags */
	int tid;
	struct listen_ctx *lctx;	/* backpointer to listen ctx */
	struct mbuf *syn;
	uint32_t iss;
	uint32_t ts;
	volatile uintptr_t wr;
	volatile u_int refcnt;
	uint16_t l2e_idx;
	uint16_t rcv_bufsize;
};

/* Test a TPF_* bit in synqe->flags (same namespace as toepcb flags). */
static inline int
synqe_flag(struct synq_entry *synqe, int flag)
{

	return isset(&synqe->flags, flag);
}

/* Set a TPF_* bit in synqe->flags. */
static inline void
synqe_set_flag(struct synq_entry *synqe, int flag)
{

	setbit(&synqe->flags, flag);
}

/* Clear a TPF_* bit in synqe->flags. */
static inline void
synqe_clr_flag(struct synq_entry *synqe, int flag)
{

	clrbit(&synqe->flags, flag);
}

/* listen_ctx flags */
#define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */

/* Per-listening-socket offload state, kept in tom_data's listen hash. */
struct listen_ctx {
	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
	volatile int refcount;
	int stid;
	int flags;
	struct inpcb *inp;		/* listening socket's inp */
	struct sge_wrq *ctrlq;
	struct sge_ofld_rxq *ofld_rxq;
	TAILQ_HEAD(, synq_entry) synq;
};

/* Per-adapter TOM softc; tod must stay the first member (see tod_td). */
struct tom_data {
	struct toedev tod;

	/* toepcb's associated with this TOE device */
	struct mtx toep_list_lock;
	TAILQ_HEAD(, toepcb) toep_list;

	LIST_HEAD(, listen_ctx) *listen_hash;
	u_long listen_mask;
	int lctx_count;		/* # of lctx in the hash table */
	struct mtx lctx_hash_lock;
};

/* Recover the enclosing tom_data from its embedded toedev. */
static inline struct tom_data *
tod_td(struct toedev *tod)
{

	return (member2struct(tom_data, tod, tod));
}

/* The adapter that owns this tom_data. */
static inline struct adapter *
td_adapter(struct tom_data *td)
{

	return (td->tod.tod_softc);
}

/* t4_tom.c */
struct toepcb *alloc_toepcb(struct port_info *, int, int, int);
void free_toepcb(struct toepcb *);
void offload_socket(struct socket *, struct toepcb *);
void undo_offload_socket(struct socket *);
void final_cpl_received(struct toepcb *);
void insert_tid(struct adapter *, int, void *);
void *lookup_tid(struct adapter *, int);
void update_tid(struct adapter *, int, void *);
void remove_tid(struct adapter *, int);
void release_tid(struct adapter *, int, struct sge_wrq *);
int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
u_long select_rcv_wnd(struct socket *);
int select_rcv_wscale(void);
uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *,
    int, int, int, int);
uint32_t select_ntuple(struct port_info *, struct l2t_entry *, uint32_t);

/* t4_connect.c */
void t4_init_connect_cpl_handlers(struct adapter *);
int t4_connect(struct toedev *, struct socket *, struct rtentry *,
    struct sockaddr *);

/* t4_listen.c */
void t4_init_listen_cpl_handlers(struct adapter *);
int t4_listen_start(struct toedev *, struct tcpcb *);
int t4_listen_stop(struct toedev *, struct tcpcb *);
void t4_syncache_added(struct toedev *, void *);
void t4_syncache_removed(struct toedev *, void *);
int t4_syncache_respond(struct toedev *, void *, struct mbuf *);
int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
    struct mbuf *);
void t4_offload_socket(struct toedev *, void *, struct socket *);

/* t4_cpl_io.c */
void t4_init_cpl_io_handlers(struct adapter *);
void send_abort_rpl(struct adapter *, struct sge_wrq *, int, int);
void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
void send_reset(struct adapter *, struct toepcb *, uint32_t);
void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t);
void t4_rcvd(struct toedev *, struct tcpcb *);
int t4_tod_output(struct toedev *, struct tcpcb *);
int t4_send_fin(struct toedev *, struct tcpcb *);
int t4_send_rst(struct toedev *, struct tcpcb *);

#endif
diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c
new file mode 100644
index 0000000..ffe64c5
--- /dev/null
+++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c
@@ -0,0 +1,405 @@
/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/sbuf.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/ethernet.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/toecore.h>

#include "common/common.h"
#include "common/jhash.h"
#include "common/t4_msg.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

#define VLAN_NONE	0xfff

/* Convenience casts for IPv4 sockaddr access. */
#define SA(x)		((struct sockaddr *)(x))
#define SIN(x)		((struct sockaddr_in *)(x))
#define SINADDR(x)	(SIN(x)->sin_addr.s_addr)

/*
 * Take a reference on an L2T entry.  A 0 -> 1 transition removes the entry
 * from the free pool (nfree).
 */
static inline void
l2t_hold(struct l2t_data *d, struct l2t_entry *e)
{
	if (atomic_fetchadd_int(&e->refcnt, 1) == 0)  /* 0 -> 1 transition */
		atomic_subtract_int(&d->nfree, 1);
}

/* Hash an (IPv4 address, ifindex) pair into an L2T hash bucket. */
static inline unsigned int
arp_hash(const uint32_t key, int ifindex)
{
	return jhash_2words(key, ifindex, 0) & (L2T_SIZE - 1);
}

/*
 * Add a WR to an L2T entry's queue of work requests awaiting resolution.
 * Must be called with the entry's lock held.
 */
static inline void
arpq_enqueue(struct l2t_entry *e, struct wrqe *wr)
{
	mtx_assert(&e->lock, MA_OWNED);

	STAILQ_INSERT_TAIL(&e->wr_list, wr, link);
}

/*
 * Transmit all work requests that were queued on the entry while it was
 * being resolved.  Must be called with the entry's lock held.
 */
static inline void
send_pending(struct adapter *sc, struct l2t_entry *e)
{
	struct wrqe *wr;

	mtx_assert(&e->lock, MA_OWNED);

	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&e->wr_list, link);
		t4_wrq_tx(sc, wr);
	}
}

/*
 * A queued work request can never be sent because L2 resolution failed.
 * NOTE(review): the wr is logged and deliberately leaked (free is commented
 * out) — presumably freeing here is unsafe or unresolved; confirm upstream.
 */
static void
resolution_failed_for_wr(struct wrqe *wr)
{
	log(LOG_ERR, "%s: leaked work request %p, wr_len %d", __func__, wr,
	    wr->wr_len);

	/* free(wr, M_CXGBE); */
}

/*
 * Drain the entry's pending-WR queue after a failed resolution.  Must be
 * called with the entry's lock held.
 */
static void
resolution_failed(struct l2t_entry *e)
{
	struct wrqe *wr;

	mtx_assert(&e->lock, MA_OWNED);

	while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) {
		STAILQ_REMOVE_HEAD(&e->wr_list, link);
		resolution_failed_for_wr(wr);
	}
}

/*
 * Apply the outcome of an L2 resolution attempt to the entry: lladdr == NULL
 * means resolution failed (or the neighbor was deleted/expired); otherwise
 * the dmac/vlan are programmed into the hardware L2 table if they changed.
 * Must be called with the entry's lock held.
 */
static void
update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr,
    uint16_t vtag)
{

	mtx_assert(&e->lock, MA_OWNED);

	/*
	 * The entry may be in active use (e->refcount > 0) or not.  We update
	 * it even when it's not as this simplifies the case where we decide to
	 * reuse the entry later.
	 */

	if (lladdr == NULL &&
	    (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) {
		/*
		 * Never got a valid L2 address for this one.  Just mark it as
		 * failed instead of removing it from the hash (for which we'd
		 * need to wlock the table).
		 */
		e->state = L2T_STATE_FAILED;
		resolution_failed(e);
		return;

	} else if (lladdr == NULL) {

		/* Valid or already-stale entry was deleted (or expired) */

		KASSERT(e->state == L2T_STATE_VALID ||
		    e->state == L2T_STATE_STALE,
		    ("%s: lladdr NULL, state %d", __func__, e->state));

		e->state = L2T_STATE_STALE;

	} else {

		if (e->state == L2T_STATE_RESOLVING ||
		    e->state == L2T_STATE_FAILED ||
		    memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) {

			/* unresolved -> resolved; or dmac changed */

			memcpy(e->dmac, lladdr, ETHER_ADDR_LEN);
			e->vlan = vtag;
			t4_write_l2e(sc, e, 1);
		}
		e->state = L2T_STATE_VALID;
	}
}

/*
 * Ask the TOE core to resolve the entry's IPv4 address.  Returns EWOULDBLOCK
 * if resolution is in progress (an update will arrive via t4_l2_update);
 * otherwise updates the entry with the result before returning.
 */
static int
resolve_entry(struct adapter *sc, struct l2t_entry *e)
{
	struct tom_data *td = sc->tom_softc;
	struct toedev *tod = &td->tod;
	struct sockaddr_in sin = {0};
	uint8_t dmac[ETHER_ADDR_LEN];
	uint16_t vtag = VLAN_NONE;
	int rc;

	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(struct sockaddr_in);
	SINADDR(&sin) = e->addr;

	rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag);
	if (rc == EWOULDBLOCK)
		return (rc);

	mtx_lock(&e->lock);
	update_entry(sc, e, rc == 0 ? dmac : NULL, vtag);
	mtx_unlock(&e->lock);

	return (rc);
}

/*
 * Slow path of t4_l2t_send: the entry is not (yet) valid.  Revalidates a
 * stale entry, queues the WR on a resolving entry, or fails the WR if the
 * destination is unreachable.
 */
int
t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
{

again:
	switch (e->state) {
	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */

		if (resolve_entry(sc, e) != EWOULDBLOCK)
			goto again;	/* entry updated, re-examine state */

		/* Fall through */

	case L2T_STATE_VALID:	/* fast-path, send the packet on */

		t4_wrq_tx(sc, wr);
		return (0);

	case L2T_STATE_RESOLVING:
	case L2T_STATE_SYNC_WRITE:

		mtx_lock(&e->lock);
		if (e->state != L2T_STATE_SYNC_WRITE &&
		    e->state != L2T_STATE_RESOLVING) {
			/* state changed by the time we got here */
			mtx_unlock(&e->lock);
			goto again;
		}
		arpq_enqueue(e, wr);
		mtx_unlock(&e->lock);

		if (resolve_entry(sc, e) == EWOULDBLOCK)
			break;

		mtx_lock(&e->lock);
		if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list))
			send_pending(sc, e);
		if (e->state == L2T_STATE_FAILED)
			resolution_failed(e);
		mtx_unlock(&e->lock);
		break;

	case L2T_STATE_FAILED:
		resolution_failed_for_wr(wr);
		return (EHOSTUNREACH);
	}

	return (0);
}

/*
 * Called when an L2T entry has no more users.  The entry is left in the hash
 * table since it is likely to be reused but we also bump nfree to indicate
 * that the entry can be reallocated for a different neighbor.  We also drop
 * the existing neighbor reference in case the neighbor is going away and is
 * waiting on our reference.
 *
 * Because entries can be reallocated to other neighbors once their ref count
 * drops to 0 we need to take the entry's lock to avoid races with a new
 * incarnation.
+ */ + +static int +do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + int rc; + + rc = do_l2t_write_rpl(iq, rss, m); + if (rc != 0) + return (rc); + + if (tid & F_SYNC_WR) { + struct l2t_entry *e = &sc->l2t->l2tab[idx]; + + mtx_lock(&e->lock); + if (e->state != L2T_STATE_SWITCHING) { + send_pending(sc, e); + e->state = L2T_STATE_VALID; + } + mtx_unlock(&e->lock); + } + + return (0); +} + +void +t4_init_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl2); +} + +void +t4_uninit_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl); +} + +/* + * The TOE wants an L2 table entry that it can use to reach the next hop over + * the specified port. Produce such an entry - create one if needed. + * + * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on + * top of the real cxgbe interface. 
+ */ +struct l2t_entry * +t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) +{ + struct l2t_entry *e; + struct l2t_data *d = pi->adapter->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + unsigned int smt_idx = pi->port_id; + + if (sa->sa_family != AF_INET) + return (NULL); /* XXX: no IPv6 support right now */ + +#ifndef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + return (NULL); +#endif + + rw_wlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) { + l2t_hold(d, e); + goto done; + } + } + + /* Need to allocate a new entry */ + e = t4_alloc_l2e(d); + if (e) { + mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifp = ifp; + e->smt_idx = smt_idx; + e->hash = hash; + e->lport = pi->lport; + atomic_store_rel_int(&e->refcnt, 1); +#ifdef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + VLAN_TAG(ifp, &e->vlan); + else + e->vlan = VLAN_NONE; +#endif + mtx_unlock(&e->lock); + } +done: + rw_wunlock(&d->lock); + return e; +} + +/* + * Called when the host's ARP layer makes a change to some entry that is loaded + * into the HW L2 table. + */ +void +t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t vtag) +{ + struct adapter *sc = tod->tod_softc; + struct l2t_entry *e; + struct l2t_data *d = sc->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + + KASSERT(d != NULL, ("%s: no L2 table", __func__)); + + rw_rlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp) { + mtx_lock(&e->lock); + if (atomic_load_acq_int(&e->refcnt)) + goto found; + e->state = L2T_STATE_STALE; + mtx_unlock(&e->lock); + break; + } + } + rw_runlock(&d->lock); + + /* + * This is of no interest to us. 
We've never had an offloaded + * connection to this destination, and we aren't attempting one right + * now. + */ + return; + +found: + rw_runlock(&d->lock); + + KASSERT(e->state != L2T_STATE_UNUSED, + ("%s: unused entry in the hash.", __func__)); + + update_entry(sc, e, lladdr, vtag); + mtx_unlock(&e->lock); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.h b/sys/dev/cxgbe/tom/t4_tom_l2t.h new file mode 100644 index 0000000..3d76735 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * $FreeBSD$
 *
 */

#ifndef __T4_TOM_L2T_H
#define __T4_TOM_L2T_H

#include "t4_l2t.h"

int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *);
struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *,
    struct sockaddr *);
void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *,
    uint8_t *, uint16_t);
void t4_init_l2t_cpl_handlers(struct adapter *);
void t4_uninit_l2t_cpl_handlers(struct adapter *);

/*
 * Send a work request using the given L2T entry.  Fast path when the entry
 * is already valid; everything else is handled in t4_l2t_send_slow.
 */
static inline int
t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e)
{
	if (__predict_true(e->state == L2T_STATE_VALID)) {
		t4_wrq_tx(sc, wr);
		return (0);
	} else
		return (t4_l2t_send_slow(sc, wr, e));
}

#endif /* __T4_TOM_L2T_H */