From 67d5f1a727273d8e141e96c429114dff9fb06ec3 Mon Sep 17 00:00:00 2001
From: np
Date: Tue, 19 Jun 2012 07:34:13 +0000
Subject: - Updated TOE support in the kernel.

- Stateful TCP offload drivers for Terminator 3 and 4 (T3 and T4) ASICs. These are available as t3_tom and t4_tom modules that augment cxgb(4) and cxgbe(4) respectively. The cxgb/cxgbe drivers continue to work as usual with or without these extra features.

- iWARP driver for Terminator 3 ASIC (kernel verbs). T4 iWARP is in the works and will follow soon.

Build-tested with make universe.

30s overview
============
What interfaces support TCP offload? Look for TOE4 and/or TOE6 in the capabilities of an interface:
# ifconfig -m | grep TOE

Enable/disable TCP offload on an interface (just like any other ifnet capability):
# ifconfig cxgbe0 toe
# ifconfig cxgbe0 -toe

Which connections are offloaded? Look for toe4 and/or toe6 in the output of netstat and sockstat:
# netstat -np tcp | grep toe
# sockstat -46c | grep toe

Reviewed by: bz, gnn
Sponsored by: Chelsio Communications.
MFC after: ~3 months (after 9.1, and after ensuring MFC is feasible)
---
 sbin/ifconfig/ifconfig.c | 4 +- sys/amd64/conf/GENERIC | 1 + sys/conf/NOTES | 2 + sys/conf/files | 4 +- sys/conf/options | 2 +- sys/contrib/rdma/krping/krping.c | 48 +- sys/contrib/rdma/krping/krping.h | 4 +- sys/contrib/rdma/krping/krping_dev.c | 6 +- sys/contrib/rdma/rdma_addr.c | 5 +- sys/contrib/rdma/rdma_cache.c | 2 +- sys/dev/cxgb/common/cxgb_ctl_defs.h | 10 +- sys/dev/cxgb/cxgb_adapter.h | 38 +- sys/dev/cxgb/cxgb_main.c | 452 ++- sys/dev/cxgb/cxgb_offload.c | 465 --- sys/dev/cxgb/cxgb_offload.h | 249 +- sys/dev/cxgb/cxgb_osdep.h | 29 - sys/dev/cxgb/cxgb_sge.c | 418 +-- sys/dev/cxgb/sys/mvec.h | 21 - sys/dev/cxgb/t3cdev.h | 62 - sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c | 310 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h | 43 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c | 401 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h | 17 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c | 14 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c | 150 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c | 55 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c | 519 ++- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h | 96 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h | 22 + sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c | 94 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c | 409 +-- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h | 12 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c | 417 ++- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c | 13 +- sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h | 8 + sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h | 75 +- sys/dev/cxgb/ulp/toecore/cxgb_toedev.h | 49 - sys/dev/cxgb/ulp/toecore/toedev.c | 420 --- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 5219 +++++++-------------------- sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c | 1034 ------ sys/dev/cxgb/ulp/tom/cxgb_ddp.c | 738 ---- sys/dev/cxgb/ulp/tom/cxgb_defs.h | 91 - sys/dev/cxgb/ulp/tom/cxgb_l2t.c | 590 ++- sys/dev/cxgb/ulp/tom/cxgb_l2t.h | 121 +- sys/dev/cxgb/ulp/tom/cxgb_listen.c | 1293 +++++-- sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h | 181 - sys/dev/cxgb/ulp/tom/cxgb_tcp.h | 47 - sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c | 97 - sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h | 14 - sys/dev/cxgb/ulp/tom/cxgb_toepcb.h | 69 +- sys/dev/cxgb/ulp/tom/cxgb_tom.c | 1649 ++------- sys/dev/cxgb/ulp/tom/cxgb_tom.h | 305 +- sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c | 140 - sys/dev/cxgbe/adapter.h | 103 +- sys/dev/cxgbe/common/t4_hw.c | 2 + sys/dev/cxgbe/offload.h | 19 +- sys/dev/cxgbe/t4_l2t.c | 563 +-- sys/dev/cxgbe/t4_l2t.h | 55 +- sys/dev/cxgbe/t4_main.c | 213 +-
sys/dev/cxgbe/t4_sge.c | 128 +- sys/dev/cxgbe/tom/t4_connect.c | 377 ++ sys/dev/cxgbe/tom/t4_cpl_io.c | 1276 +++++++ sys/dev/cxgbe/tom/t4_listen.c | 1362 +++++++ sys/dev/cxgbe/tom/t4_tom.c | 755 ++++ sys/dev/cxgbe/tom/t4_tom.h | 248 ++ sys/dev/cxgbe/tom/t4_tom_l2t.c | 405 +++ sys/dev/cxgbe/tom/t4_tom_l2t.h | 53 + sys/i386/conf/GENERIC | 1 + sys/i386/conf/XEN | 2 +- sys/modules/Makefile | 2 + sys/modules/cxgb/Makefile | 35 +- sys/modules/cxgb/cxgb/Makefile | 3 +- sys/modules/cxgb/iw_cxgb/Makefile | 15 +- sys/modules/cxgb/toecore/Makefile | 11 - sys/modules/cxgb/tom/Makefile | 16 +- sys/modules/cxgbe/Makefile | 5 + sys/modules/cxgbe/tom/Makefile | 15 + sys/modules/rdma/krping/Makefile | 2 + sys/modules/toecore/Makefile | 9 + sys/net/if_var.h | 2 + sys/net/if_vlan.c | 20 +- sys/netinet/if_ether.c | 13 +- sys/netinet/if_ether.h | 10 +- sys/netinet/in.c | 2 +- sys/netinet/tcp_input.c | 13 +- sys/netinet/tcp_offload.c | 209 +- sys/netinet/tcp_offload.h | 364 +- sys/netinet/tcp_output.c | 8 + sys/netinet/tcp_subr.c | 19 +- sys/netinet/tcp_syncache.c | 135 +- sys/netinet/tcp_syncache.h | 19 +- sys/netinet/tcp_timer.c | 5 + sys/netinet/tcp_usrreq.c | 75 +- sys/netinet/tcp_var.h | 2 +- sys/netinet/toecore.c | 575 +++ sys/netinet/toecore.h | 130 + sys/netinet/toedev.h | 162 - sys/ofed/drivers/infiniband/core/cma.c | 21 +- sys/ofed/drivers/infiniband/core/iwcm.c | 3 + sys/ofed/include/linux/net.h | 8 +- sys/ofed/include/net/netevent.h | 7 +- sys/ofed/include/rdma/iw_cm.h | 4 +- usr.bin/netstat/inet.c | 5 +- usr.bin/sockstat/sockstat.c | 1 + 104 files changed, 11024 insertions(+), 12997 deletions(-) delete mode 100644 sys/dev/cxgb/cxgb_offload.c delete mode 100644 sys/dev/cxgb/t3cdev.h create mode 100644 sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h delete mode 100644 sys/dev/cxgb/ulp/toecore/cxgb_toedev.h delete mode 100644 sys/dev/cxgb/ulp/toecore/toedev.c delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_ddp.c delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_defs.h delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp.h delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h delete mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c create mode 100644 sys/dev/cxgbe/tom/t4_connect.c create mode 100644 sys/dev/cxgbe/tom/t4_cpl_io.c create mode 100644 sys/dev/cxgbe/tom/t4_listen.c create mode 100644 sys/dev/cxgbe/tom/t4_tom.c create mode 100644 sys/dev/cxgbe/tom/t4_tom.h create mode 100644 sys/dev/cxgbe/tom/t4_tom_l2t.c create mode 100644 sys/dev/cxgbe/tom/t4_tom_l2t.h delete mode 100644 sys/modules/cxgb/toecore/Makefile create mode 100644 sys/modules/cxgbe/tom/Makefile create mode 100644 sys/modules/toecore/Makefile create mode 100644 sys/netinet/toecore.c create mode 100644 sys/netinet/toecore.h delete mode 100644 sys/netinet/toedev.h diff --git a/sbin/ifconfig/ifconfig.c b/sbin/ifconfig/ifconfig.c index 082e15d..870acdd 100644 --- a/sbin/ifconfig/ifconfig.c +++ b/sbin/ifconfig/ifconfig.c @@ -916,7 +916,7 @@ unsetifdescr(const char *val, int value, int s, const struct afswtch *afp) #define IFCAPBITS \ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ -"\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ +"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ "\26RXCSUM_IPV6\27TXCSUM_IPV6" /* @@ -1212,6 +1212,8 @@ static struct 
cmd basic_cmds[] = { DEF_CMD("-tso4", -IFCAP_TSO4, setifcap), DEF_CMD("tso", IFCAP_TSO, setifcap), DEF_CMD("-tso", -IFCAP_TSO, setifcap), + DEF_CMD("toe", IFCAP_TOE, setifcap), + DEF_CMD("-toe", -IFCAP_TOE, setifcap), DEF_CMD("lro", IFCAP_LRO, setifcap), DEF_CMD("-lro", -IFCAP_LRO, setifcap), DEF_CMD("wol", IFCAP_WOL, setifcap), diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index a36a00e..74dd11e 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -28,6 +28,7 @@ options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols +options TCP_OFFLOAD # TCP offload options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 1d4b29f..895a301 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -545,6 +545,8 @@ options INET6 #IPv6 communications protocols options ROUTETABLES=2 # max 16. 1 is back compatible. +options TCP_OFFLOAD # TCP offload support. + # In order to enable IPSEC you MUST also add device crypto to # your kernel configuration options IPSEC #IP security (requires device crypto) diff --git a/sys/conf/files b/sys/conf/files index ad460ba..c0b9286 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1038,8 +1038,6 @@ dev/cs/if_cs_isa.c optional cs isa dev/cs/if_cs_pccard.c optional cs pccard dev/cxgb/cxgb_main.c optional cxgb pci \ compile-with "${NORMAL_C} -I$S/dev/cxgb" -dev/cxgb/cxgb_offload.c optional cxgb pci \ - compile-with "${NORMAL_C} -I$S/dev/cxgb" dev/cxgb/cxgb_sge.c optional cxgb pci \ compile-with "${NORMAL_C} -I$S/dev/cxgb" dev/cxgb/common/cxgb_mc5.c optional cxgb pci \ @@ -3037,7 +3035,7 @@ netinet/tcp_hostcache.c optional inet | inet6 netinet/tcp_input.c optional inet | inet6 netinet/tcp_lro.c optional inet | inet6 netinet/tcp_output.c optional inet | inet6 -netinet/tcp_offload.c optional inet | inet6 +netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6 netinet/tcp_reass.c optional inet | inet6 netinet/tcp_sack.c optional inet | inet6 netinet/tcp_subr.c optional inet | inet6 diff --git a/sys/conf/options b/sys/conf/options index ba678e1..18e6732 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -434,7 +434,7 @@ RADIX_MPATH opt_mpath.h ROUTETABLES opt_route.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG -TCP_OFFLOAD_DISABLE opt_inet.h #Disable code to dispatch tcp offloading +TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h XBONEHACK diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c index d787965..99d1924 100644 --- a/sys/contrib/rdma/krping/krping.c +++ b/sys/contrib/rdma/krping/krping.c @@ -41,7 +41,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -53,11 +52,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include -#include +#include +#include #include "getopt.h" #include "krping.h" @@ -83,6 +84,7 @@ static const struct krping_option krping_opts[] = { {"bw", OPT_NOPARAM, 'B'}, {"tx-depth", OPT_INT, 't'}, {"poll", OPT_NOPARAM, 'P'}, + {"memlimit", OPT_INT, 'm'}, {NULL, 0, 0} }; @@ -254,10 +256,14 @@ static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { - if (wc.status != IB_WC_WR_FLUSH_ERR) - log(LOG_ERR, "cq 
completion failed status %d\n", + if (wc.status == IB_WC_WR_FLUSH_ERR) { + DEBUG_LOG("cq flushed\n"); + continue; + } else { + log(LOG_CRIT, "cq completion failed status %d\n", wc.status); - goto error; + goto error; + } } switch (wc.opcode) { @@ -432,8 +438,17 @@ static int krping_setup_buffers(struct krping_cb *cb) } } - cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, - PAGE_SIZE, 0); + /* RNIC adapters have a limit up to which they can register physical memory. + * If DMA-MR memory mode is set, the driver normally registers the maximum + * supported memory. If contigmalloc then allocates memory beyond the + * specified RNIC limit, krping may not work. + */ + if (cb->use_dmamr && cb->memlimit) + cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit, + PAGE_SIZE, 0); + else + cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, + PAGE_SIZE, 0); if (!cb->rdma_buf) { log(LOG_ERR, "rdma_buf malloc failed\n"); @@ -458,8 +473,12 @@ static int krping_setup_buffers(struct krping_cb *cb) } if (!cb->server || cb->wlat || cb->rlat || cb->bw) { - cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, - 0, -1UL, PAGE_SIZE, 0); + if (cb->use_dmamr && cb->memlimit) + cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, + 0, cb->memlimit, PAGE_SIZE, 0); + else + cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, + 0, -1UL, PAGE_SIZE, 0); if (!cb->start_buf) { log(LOG_ERR, "start_buf malloc failed\n"); ret = ENOMEM; @@ -1636,6 +1655,8 @@ int krping_doit(char *cmd) cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; + cb->use_dmamr = 1; + cb->memlimit = 0; mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, @@ -1713,6 +1734,15 @@ int krping_doit(char *cmd) case 'd': debug++; break; + case 'm': + cb->memlimit = optint; + if (cb->memlimit < 1) { + log(LOG_ERR, "Invalid memory limit %ju\n", + cb->memlimit); + ret = EINVAL; + } else + DEBUG_LOG(PFX "memory limit %d\n", (int)optint); + break; default: log(LOG_ERR, "unknown opt %s\n", optarg); ret = EINVAL; diff --git a/sys/contrib/rdma/krping/krping.h b/sys/contrib/rdma/krping/krping.h index 8578e7e..d234825 100644 --- a/sys/contrib/rdma/krping/krping.h +++ b/sys/contrib/rdma/krping/krping.h @@ -1,7 +1,7 @@ /* * $FreeBSD$ */ -#include +#include #include /* @@ -92,6 +92,8 @@ struct krping_cb { int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ + uint64_t memlimit; /* limit of the physical memory that can be registered with dma_mr mode */ /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ diff --git a/sys/contrib/rdma/krping/krping_dev.c b/sys/contrib/rdma/krping/krping_dev.c index 448f197..92d954d 100644 --- a/sys/contrib/rdma/krping/krping_dev.c +++ b/sys/contrib/rdma/krping/krping_dev.c @@ -14,7 +14,6 @@ __FBSDID("$FreeBSD$"); #include -#include #include /* uprintf */ #include #include /* defines used in kernel.h */ @@ -51,6 +50,9 @@ typedef struct s_krping { /* vars */ static struct cdev *krping_dev; +#undef MODULE_VERSION +#include + static int krping_loader(struct module *m, int what, void *arg) { @@ -175,6 +177,4 @@ krping_write(struct cdev *dev, struct uio *uio, int ioflag) return(err); } -MODULE_DEPEND(krping, rdma_core, 1, 1, 1); -MODULE_DEPEND(krping, rdma_cma, 1, 1, 1); DEV_MODULE(krping,krping_loader,NULL); diff --git a/sys/contrib/rdma/rdma_addr.c b/sys/contrib/rdma/rdma_addr.c index b3f7349..33ec5ac 100644 ---
a/sys/contrib/rdma/rdma_addr.c +++ b/sys/contrib/rdma/rdma_addr.c @@ -117,7 +117,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { dev_addr->dev_type = RDMA_NODE_RNIC; - memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), MAX_ADDR_LEN); + memset(dev_addr->src_dev_addr, 0, MAX_ADDR_LEN); + memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); memcpy(dev_addr->broadcast, dev->if_broadcastaddr, MAX_ADDR_LEN); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); @@ -207,7 +208,7 @@ static int addr_resolve_remote(struct sockaddr_in *src_in, goto put; } ret = arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL, - rt_key(iproute.ro_rt), dmac, &lle); + (struct sockaddr *)dst_in, dmac, &lle); if (ret) { goto put; } diff --git a/sys/contrib/rdma/rdma_cache.c b/sys/contrib/rdma/rdma_cache.c index dced8eb..1e516a1 100644 --- a/sys/contrib/rdma/rdma_cache.c +++ b/sys/contrib/rdma/rdma_cache.c @@ -132,7 +132,7 @@ int ib_find_cached_gid(struct ib_device *device, for (p = 0; p <= end_port(device) - start_port(device); ++p) { cache = device->cache.gid_cache[p]; for (i = 0; i < cache->table_len; ++i) { - if (!memcmp(gid, &cache->table[i], 6)) { /* XXX */ + if (!memcmp(gid, &cache->table[i], sizeof *gid)) { *port_num = p + start_port(device); if (index) *index = i; diff --git a/sys/dev/cxgb/common/cxgb_ctl_defs.h b/sys/dev/cxgb/common/cxgb_ctl_defs.h index b228a25..fe8a4f7 100644 --- a/sys/dev/cxgb/common/cxgb_ctl_defs.h +++ b/sys/dev/cxgb/common/cxgb_ctl_defs.h @@ -60,14 +60,12 @@ struct mtutab { const unsigned short *mtus; /* the MTU table values */ }; -struct net_device; - /* - * Structure used to request the adapter net_device owning a given MAC address. + * Structure used to request the ifnet that owns a given MAC address. 
*/ struct iff_mac { - struct net_device *dev; /* the net_device */ - const unsigned char *mac_addr; /* MAC address to lookup */ + struct ifnet *dev; + const unsigned char *mac_addr; u16 vlan_tag; }; @@ -85,7 +83,7 @@ struct ddp_params { struct adap_ports { unsigned int nports; /* number of ports on this adapter */ - struct net_device *lldevs[MAX_NPORTS]; + struct ifnet *lldevs[MAX_NPORTS]; }; /* diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h index 4354b1c..928f8fe 100644 --- a/sys/dev/cxgb/cxgb_adapter.h +++ b/sys/dev/cxgb/cxgb_adapter.h @@ -57,7 +57,6 @@ $FreeBSD$ #include #include -#include #include struct adapter; @@ -130,6 +129,7 @@ enum { CXGB_OFLD_INIT = (1 << 7), TP_PARITY_INIT = (1 << 8), CXGB_BUSY = (1 << 9), + TOM_INIT_DONE = (1 << 10), /* port flags */ DOOMED = (1 << 0), @@ -179,7 +179,6 @@ struct sge_rspq { uint32_t async_notif; uint32_t cntxt_id; uint32_t offload_pkts; - uint32_t offload_bundles; uint32_t pure_rsps; uint32_t unhandled_irqs; uint32_t starved; @@ -291,6 +290,7 @@ struct sge_qset { uint32_t txq_stopped; /* which Tx queues are stopped */ uint64_t port_stats[SGE_PSTAT_MAX]; struct port_info *port; + struct adapter *adap; int idx; /* qset # */ int qs_flags; int coalescing; @@ -307,10 +307,13 @@ struct sge { struct filter_info; +typedef int (*cpl_handler_t)(struct sge_qset *, struct rsp_desc *, + struct mbuf *); + struct adapter { + SLIST_ENTRY(adapter) link; device_t dev; int flags; - TAILQ_ENTRY(adapter) adapter_entry; /* PCI register resources */ int regs_rid; @@ -376,11 +379,16 @@ struct adapter { struct port_info port[MAX_NPORTS]; device_t portdev[MAX_NPORTS]; - struct t3cdev tdev; +#ifdef TCP_OFFLOAD + void *tom_softc; + void *iwarp_softc; +#endif char fw_version[64]; char port_types[MAX_NPORTS + 1]; uint32_t open_device_map; - uint32_t registered_device_map; +#ifdef TCP_OFFLOAD + int offload_map; +#endif struct mtx lock; driver_intr_t *cxgb_intr; int msi_count; @@ -392,6 +400,11 @@ struct adapter { char elmerlockbuf[ADAPTER_LOCK_NAME_LEN]; int timestamp; + +#ifdef TCP_OFFLOAD +#define NUM_CPL_HANDLERS 0xa7 + cpl_handler_t cpl_handler[NUM_CPL_HANDLERS] __aligned(CACHE_LINE_SIZE); +#endif }; struct t3_rx_mode { @@ -502,10 +515,12 @@ void t3_os_link_changed(adapter_t *adapter, int port_id, int link_status, int speed, int duplex, int fc, int mac_was_reset); void t3_os_phymod_changed(struct adapter *adap, int port_id); void t3_sge_err_intr_handler(adapter_t *adapter); -int t3_offload_tx(struct t3cdev *, struct mbuf *); +#ifdef TCP_OFFLOAD +int t3_offload_tx(struct adapter *, struct mbuf *); +#endif void t3_os_set_hw_addr(adapter_t *adapter, int port_idx, u8 hw_addr[]); int t3_mgmt_tx(adapter_t *adap, struct mbuf *m); - +int t3_register_cpl_handler(struct adapter *, int, cpl_handler_t); int t3_sge_alloc(struct adapter *); int t3_sge_free(struct adapter *); @@ -556,15 +571,9 @@ txq_to_qset(struct sge_txq *q, int qidx) return container_of(q, struct sge_qset, txq[qidx]); } -static __inline struct adapter * -tdev2adap(struct t3cdev *d) -{ - return container_of(d, struct adapter, tdev); -} - #undef container_of -#define OFFLOAD_DEVMAP_BIT 15 +#define OFFLOAD_DEVMAP_BIT (1 << MAX_NPORTS) static inline int offload_running(adapter_t *adapter) { return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT); @@ -573,4 +582,5 @@ static inline int offload_running(adapter_t *adapter) void cxgb_tx_watchdog(void *arg); int cxgb_transmit(struct ifnet *ifp, struct mbuf *m); void cxgb_qflush(struct ifnet *ifp); +void t3_iterate(void (*)(struct adapter *, void 
*), void *); #endif diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index fb42004..aad7abf 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include #include #include @@ -107,6 +109,9 @@ static inline void mk_set_tcb_field(struct cpl_set_tcb_field *, unsigned int, unsigned int, u64, u64); static inline void set_tcb_field_ulp(struct cpl_set_tcb_field *, unsigned int, unsigned int, u64, u64); +#ifdef TCP_OFFLOAD +static int cpl_not_handled(struct sge_qset *, struct rsp_desc *, struct mbuf *); +#endif /* Attachment glue for the PCI controller end of the device. Each port of * the device is attached separately, as defined later. @@ -119,10 +124,11 @@ static __inline void reg_block_dump(struct adapter *ap, uint8_t *buf, unsigned i unsigned int end); static void cxgb_get_regs(adapter_t *sc, struct ch_ifconf_regs *regs, uint8_t *buf); static int cxgb_get_regs_len(void); -static int offload_open(struct port_info *pi); static void touch_bars(device_t dev); -static int offload_close(struct t3cdev *tdev); static void cxgb_update_mac_settings(struct port_info *p); +#ifdef TCP_OFFLOAD +static int toe_capability(struct port_info *, int); +#endif static device_method_t cxgb_controller_methods[] = { DEVMETHOD(device_probe, cxgb_controller_probe), @@ -138,8 +144,11 @@ static driver_t cxgb_controller_driver = { sizeof(struct adapter) }; +static int cxgbc_mod_event(module_t, int, void *); static devclass_t cxgb_controller_devclass; -DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass, 0, 0); +DRIVER_MODULE(cxgbc, pci, cxgb_controller_driver, cxgb_controller_devclass, + cxgbc_mod_event, 0); +MODULE_VERSION(cxgbc, 1); /* * Attachment glue for the ports. Attachment is done directly to the @@ -177,6 +186,14 @@ static struct cdevsw cxgb_cdevsw = { static devclass_t cxgb_port_devclass; DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, cxgb_port_devclass, 0, 0); +MODULE_VERSION(cxgb, 1); + +static struct mtx t3_list_lock; +static SLIST_HEAD(, adapter) t3_list; +#ifdef TCP_OFFLOAD +static struct mtx t3_uld_list_lock; +static SLIST_HEAD(, uld_info) t3_uld_list; +#endif /* * The driver uses the best interrupt scheme available on a platform in the @@ -195,15 +212,6 @@ SYSCTL_INT(_hw_cxgb, OID_AUTO, msi_allowed, CTLFLAG_RDTUN, &msi_allowed, 0, "MSI-X, MSI, INTx selector"); /* - * The driver enables offload as a default. - * To disable it, use ofld_disable = 1. - */ -static int ofld_disable = 0; -TUNABLE_INT("hw.cxgb.ofld_disable", &ofld_disable); -SYSCTL_INT(_hw_cxgb, OID_AUTO, ofld_disable, CTLFLAG_RDTUN, &ofld_disable, 0, - "disable ULP offload"); - -/* * The driver uses an auto-queue algorithm by default. 
* To disable it and force a single queue-set per port, use multiq = 0 */ @@ -445,6 +453,25 @@ cxgb_controller_attach(device_t dev) sc->msi_count = 0; ai = cxgb_get_adapter_info(dev); + snprintf(sc->lockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb controller lock %d", + device_get_unit(dev)); + ADAPTER_LOCK_INIT(sc, sc->lockbuf); + + snprintf(sc->reglockbuf, ADAPTER_LOCK_NAME_LEN, "SGE reg lock %d", + device_get_unit(dev)); + snprintf(sc->mdiolockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb mdio lock %d", + device_get_unit(dev)); + snprintf(sc->elmerlockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb elmer lock %d", + device_get_unit(dev)); + + MTX_INIT(&sc->sge.reg_lock, sc->reglockbuf, NULL, MTX_SPIN); + MTX_INIT(&sc->mdio_lock, sc->mdiolockbuf, NULL, MTX_DEF); + MTX_INIT(&sc->elmer_lock, sc->elmerlockbuf, NULL, MTX_DEF); + + mtx_lock(&t3_list_lock); + SLIST_INSERT_HEAD(&t3_list, sc, link); + mtx_unlock(&t3_list_lock); + /* find the PCIe link width and set max read request to 4KB*/ if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) { uint16_t lnk; @@ -471,24 +498,10 @@ cxgb_controller_attach(device_t dev) if ((sc->regs_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE)) == NULL) { device_printf(dev, "Cannot allocate BAR region 0\n"); - return (ENXIO); + error = ENXIO; + goto out; } - snprintf(sc->lockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb controller lock %d", - device_get_unit(dev)); - ADAPTER_LOCK_INIT(sc, sc->lockbuf); - - snprintf(sc->reglockbuf, ADAPTER_LOCK_NAME_LEN, "SGE reg lock %d", - device_get_unit(dev)); - snprintf(sc->mdiolockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb mdio lock %d", - device_get_unit(dev)); - snprintf(sc->elmerlockbuf, ADAPTER_LOCK_NAME_LEN, "cxgb elmer lock %d", - device_get_unit(dev)); - - MTX_INIT(&sc->sge.reg_lock, sc->reglockbuf, NULL, MTX_SPIN); - MTX_INIT(&sc->mdio_lock, sc->mdiolockbuf, NULL, MTX_DEF); - MTX_INIT(&sc->elmer_lock, sc->elmerlockbuf, NULL, MTX_DEF); - sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); @@ -604,7 +617,7 @@ cxgb_controller_attach(device_t dev) } else { sc->flags |= TPS_UPTODATE; } - + /* * Create a child device for each MAC. The ethernet attachment * will be done in these children. 
@@ -636,12 +649,7 @@ cxgb_controller_attach(device_t dev) t3_sge_init_adapter(sc); t3_led_ready(sc); - - cxgb_offload_init(); - if (is_offload(sc)) { - setbit(&sc->registered_device_map, OFFLOAD_DEVMAP_BIT); - cxgb_adapter_ofld(sc); - } + error = t3_get_fw_version(sc, &vers); if (error) goto out; @@ -662,6 +670,11 @@ cxgb_controller_attach(device_t dev) device_printf(sc->dev, "Firmware Version %s\n", &sc->fw_version[0]); callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc); t3_add_attach_sysctls(sc); + +#ifdef TCP_OFFLOAD + for (i = 0; i < NUM_CPL_HANDLERS; i++) + sc->cpl_handler[i] = cpl_not_handled; +#endif out: if (error) cxgb_free(sc); @@ -775,20 +788,9 @@ cxgb_free(struct adapter *sc) sc->tq = NULL; } - if (is_offload(sc)) { - clrbit(&sc->registered_device_map, OFFLOAD_DEVMAP_BIT); - cxgb_adapter_unofld(sc); - } - -#ifdef notyet - if (sc->flags & CXGB_OFLD_INIT) - cxgb_offload_deactivate(sc); -#endif free(sc->filters, M_DEVBUF); t3_sge_free(sc); - cxgb_offload_exit(); - if (sc->udbs_res != NULL) bus_release_resource(sc->dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); @@ -800,6 +802,9 @@ cxgb_free(struct adapter *sc) MTX_DESTROY(&sc->mdio_lock); MTX_DESTROY(&sc->sge.reg_lock); MTX_DESTROY(&sc->elmer_lock); + mtx_lock(&t3_list_lock); + SLIST_REMOVE(&t3_list, sc, adapter, link); + mtx_unlock(&t3_list_lock); ADAPTER_LOCK_DEINIT(sc); } @@ -1017,6 +1022,10 @@ cxgb_port_attach(device_t dev) ifp->if_qflush = cxgb_qflush; ifp->if_capabilities = CXGB_CAP; +#ifdef TCP_OFFLOAD + if (is_offload(sc)) + ifp->if_capabilities |= IFCAP_TOE4; +#endif ifp->if_capenable = CXGB_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO; @@ -1420,65 +1429,6 @@ setup_rss(adapter_t *adap) cpus, rspq_map); } - -/* - * Sends an mbuf to an offload queue driver - * after dealing with any active network taps. - */ -static inline int -offload_tx(struct t3cdev *tdev, struct mbuf *m) -{ - int ret; - - ret = t3_offload_tx(tdev, m); - return (ret); -} - -static int -write_smt_entry(struct adapter *adapter, int idx) -{ - struct port_info *pi = &adapter->port[idx]; - struct cpl_smt_write_req *req; - struct mbuf *m; - - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return (ENOMEM); - - req = mtod(m, struct cpl_smt_write_req *); - m->m_pkthdr.len = m->m_len = sizeof(struct cpl_smt_write_req); - - req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx)); - req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */ - req->iff = idx; - memset(req->src_mac1, 0, sizeof(req->src_mac1)); - memcpy(req->src_mac0, pi->hw_addr, ETHER_ADDR_LEN); - - m_set_priority(m, 1); - - offload_tx(&adapter->tdev, m); - - return (0); -} - -static int -init_smt(struct adapter *adapter) -{ - int i; - - for_each_port(adapter, i) - write_smt_entry(adapter, i); - return 0; -} - -static void -init_port_mtus(adapter_t *adapter) -{ - unsigned int mtus = ETHERMTU | (ETHERMTU << 16); - - t3_write_reg(adapter, A_TP_MTU_PORT_TABLE, mtus); -} - static void send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo, int hi, int port) @@ -1705,45 +1655,6 @@ cxgb_down(struct adapter *sc) t3_intr_disable(sc); } -static int -offload_open(struct port_info *pi) -{ - struct adapter *sc = pi->adapter; - struct t3cdev *tdev = &sc->tdev; - - setbit(&sc->open_device_map, OFFLOAD_DEVMAP_BIT); - - t3_tp_set_offload_mode(sc, 1); - tdev->lldev = pi->ifp; - init_port_mtus(sc); - t3_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd, - sc->params.rev == 0 ? 
sc->port[0].ifp->if_mtu : 0xffff); - init_smt(sc); - cxgb_add_clients(tdev); - - return (0); -} - -static int -offload_close(struct t3cdev *tdev) -{ - struct adapter *adapter = tdev2adap(tdev); - - if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) - return (0); - - /* Call back all registered clients */ - cxgb_remove_clients(tdev); - - tdev->lldev = NULL; - cxgb_set_dummy_ops(tdev); - t3_tp_set_offload_mode(adapter, 0); - - clrbit(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT); - - return (0); -} - /* * if_init for cxgb ports. */ @@ -1793,15 +1704,9 @@ cxgb_init_locked(struct port_info *p) ADAPTER_UNLOCK(sc); } - if (sc->open_device_map == 0) { - if ((rc = cxgb_up(sc)) != 0) + if (sc->open_device_map == 0 && ((rc = cxgb_up(sc)) != 0)) goto done; - if (is_offload(sc) && !ofld_disable && offload_open(p)) - log(LOG_WARNING, - "Could not initialize offload capabilities\n"); - } - PORT_LOCK(p); if (isset(&sc->open_device_map, p->port_id) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { @@ -1929,7 +1834,6 @@ cxgb_uninit_synchronized(struct port_info *pi) DELAY(100 * 1000); t3_mac_disable(&pi->mac, MAC_DIRECTION_RX); - pi->phy.ops->power_down(&pi->phy, 1); PORT_UNLOCK(pi); @@ -1937,9 +1841,6 @@ cxgb_uninit_synchronized(struct port_info *pi) pi->link_config.link_ok = 0; t3_os_link_changed(sc, pi->port_id, 0, 0, 0, 0, 0); - if ((sc->open_device_map & PORT_MASK) == 0) - offload_close(&sc->tdev); - if (sc->open_device_map == 0) cxgb_down(pi->adapter); @@ -2081,6 +1982,15 @@ fail: /* Safe to do this even if cxgb_up not called yet */ cxgb_set_lro(p, ifp->if_capenable & IFCAP_LRO); } +#ifdef TCP_OFFLOAD + if (mask & IFCAP_TOE4) { + int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE4; + + error = toe_capability(p, enable); + if (error == 0) + ifp->if_capenable ^= mask; + } +#endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { @@ -3362,3 +3272,235 @@ set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); mk_set_tcb_field(req, tid, word, mask, val); } + +void +t3_iterate(void (*func)(struct adapter *, void *), void *arg) +{ + struct adapter *sc; + + mtx_lock(&t3_list_lock); + SLIST_FOREACH(sc, &t3_list, link) { + /* + * func should not make any assumptions about what state sc is + * in - the only guarantee is that sc->sc_lock is a valid lock. + */ + func(sc, arg); + } + mtx_unlock(&t3_list_lock); +} + +#ifdef TCP_OFFLOAD +static int +toe_capability(struct port_info *pi, int enable) +{ + int rc; + struct adapter *sc = pi->adapter; + + ADAPTER_LOCK_ASSERT_OWNED(sc); + + if (!is_offload(sc)) + return (ENODEV); + + if (enable) { + if (!(sc->flags & FULL_INIT_DONE)) { + log(LOG_WARNING, + "You must enable a cxgb interface first\n"); + return (EAGAIN); + } + + if (isset(&sc->offload_map, pi->port_id)) + return (0); + + if (!(sc->flags & TOM_INIT_DONE)) { + rc = t3_activate_uld(sc, ULD_TOM); + if (rc == EAGAIN) { + log(LOG_WARNING, + "You must kldload t3_tom.ko before trying " + "to enable TOE on a cxgb interface.\n"); + } + if (rc != 0) + return (rc); + KASSERT(sc->tom_softc != NULL, + ("%s: TOM activated but softc NULL", __func__)); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM activated but flag not set", __func__)); + } + + setbit(&sc->offload_map, pi->port_id); + + /* + * XXX: Temporary code to allow iWARP to be enabled when TOE is + * enabled on any port. Need to figure out how to enable, + * disable, load, and unload iWARP cleanly. 
+ */ + if (!isset(&sc->offload_map, MAX_NPORTS) && + t3_activate_uld(sc, ULD_IWARP) == 0) + setbit(&sc->offload_map, MAX_NPORTS); + } else { + if (!isset(&sc->offload_map, pi->port_id)) + return (0); + + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM never initialized?", __func__)); + clrbit(&sc->offload_map, pi->port_id); + } + + return (0); +} + +/* + * Add an upper layer driver to the global list. + */ +int +t3_register_uld(struct uld_info *ui) +{ + int rc = 0; + struct uld_info *u; + + mtx_lock(&t3_uld_list_lock); + SLIST_FOREACH(u, &t3_uld_list, link) { + if (u->uld_id == ui->uld_id) { + rc = EEXIST; + goto done; + } + } + + SLIST_INSERT_HEAD(&t3_uld_list, ui, link); + ui->refcount = 0; +done: + mtx_unlock(&t3_uld_list_lock); + return (rc); +} + +int +t3_unregister_uld(struct uld_info *ui) +{ + int rc = EINVAL; + struct uld_info *u; + + mtx_lock(&t3_uld_list_lock); + + SLIST_FOREACH(u, &t3_uld_list, link) { + if (u == ui) { + if (ui->refcount > 0) { + rc = EBUSY; + goto done; + } + + SLIST_REMOVE(&t3_uld_list, ui, uld_info, link); + rc = 0; + goto done; + } + } +done: + mtx_unlock(&t3_uld_list_lock); + return (rc); +} + +int +t3_activate_uld(struct adapter *sc, int id) +{ + int rc = EAGAIN; + struct uld_info *ui; + + mtx_lock(&t3_uld_list_lock); + + SLIST_FOREACH(ui, &t3_uld_list, link) { + if (ui->uld_id == id) { + rc = ui->activate(sc); + if (rc == 0) + ui->refcount++; + goto done; + } + } +done: + mtx_unlock(&t3_uld_list_lock); + + return (rc); +} + +int +t3_deactivate_uld(struct adapter *sc, int id) +{ + int rc = EINVAL; + struct uld_info *ui; + + mtx_lock(&t3_uld_list_lock); + + SLIST_FOREACH(ui, &t3_uld_list, link) { + if (ui->uld_id == id) { + rc = ui->deactivate(sc); + if (rc == 0) + ui->refcount--; + goto done; + } + } +done: + mtx_unlock(&t3_uld_list_lock); + + return (rc); +} + +static int +cpl_not_handled(struct sge_qset *qs __unused, struct rsp_desc *r __unused, + struct mbuf *m) +{ + m_freem(m); + return (EDOOFUS); +} + +int +t3_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) +{ + uintptr_t *loc, new; + + if (opcode >= NUM_CPL_HANDLERS) + return (EINVAL); + + new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled; + loc = (uintptr_t *) &sc->cpl_handler[opcode]; + atomic_store_rel_ptr(loc, new); + + return (0); +} +#endif + +static int +cxgbc_mod_event(module_t mod, int cmd, void *arg) +{ + int rc = 0; + + switch (cmd) { + case MOD_LOAD: + mtx_init(&t3_list_lock, "T3 adapters", 0, MTX_DEF); + SLIST_INIT(&t3_list); +#ifdef TCP_OFFLOAD + mtx_init(&t3_uld_list_lock, "T3 ULDs", 0, MTX_DEF); + SLIST_INIT(&t3_uld_list); +#endif + break; + + case MOD_UNLOAD: +#ifdef TCP_OFFLOAD + mtx_lock(&t3_uld_list_lock); + if (!SLIST_EMPTY(&t3_uld_list)) { + rc = EBUSY; + mtx_unlock(&t3_uld_list_lock); + break; + } + mtx_unlock(&t3_uld_list_lock); + mtx_destroy(&t3_uld_list_lock); +#endif + mtx_lock(&t3_list_lock); + if (!SLIST_EMPTY(&t3_list)) { + rc = EBUSY; + mtx_unlock(&t3_list_lock); + break; + } + mtx_unlock(&t3_list_lock); + mtx_destroy(&t3_list_lock); + break; + } + + return (rc); +} diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c deleted file mode 100644 index 2ae83bd..0000000 --- a/sys/dev/cxgb/cxgb_offload.c +++ /dev/null @@ -1,465 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. 
Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -***************************************************************************/ - - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#define VALIDATE_TID 0 -MALLOC_DEFINE(M_CXGB, "cxgb", "Chelsio 10 Gigabit Ethernet and services"); - -TAILQ_HEAD(, cxgb_client) client_list; -TAILQ_HEAD(, t3cdev) ofld_dev_list; - - -static struct mtx cxgb_db_lock; - - -static int inited = 0; - -static inline int -offload_activated(struct t3cdev *tdev) -{ - struct adapter *adapter = tdev2adap(tdev); - - return (isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)); -} - -static inline void -register_tdev(struct t3cdev *tdev) -{ - static int unit; - - mtx_lock(&cxgb_db_lock); - snprintf(tdev->name, sizeof(tdev->name), "ofld_dev%d", unit++); - TAILQ_INSERT_TAIL(&ofld_dev_list, tdev, entry); - mtx_unlock(&cxgb_db_lock); -} - -static inline void -unregister_tdev(struct t3cdev *tdev) -{ - if (!inited) - return; - - mtx_lock(&cxgb_db_lock); - TAILQ_REMOVE(&ofld_dev_list, tdev, entry); - mtx_unlock(&cxgb_db_lock); -} - -#ifndef TCP_OFFLOAD_DISABLE -/** - * cxgb_register_client - register an offload client - * @client: the client - * - * Add the client to the client list, - * and call backs the client for each activated offload device - */ -void -cxgb_register_client(struct cxgb_client *client) -{ - struct t3cdev *tdev; - - mtx_lock(&cxgb_db_lock); - TAILQ_INSERT_TAIL(&client_list, client, client_entry); - - if (client->add) { - TAILQ_FOREACH(tdev, &ofld_dev_list, entry) { - if (offload_activated(tdev)) { - client->add(tdev); - } else - CTR1(KTR_CXGB, - "cxgb_register_client: %p not activated", tdev); - - } - } - mtx_unlock(&cxgb_db_lock); -} - -/** - * cxgb_unregister_client - unregister an offload client - * @client: the client - * - * Remove the client to the client list, - * and call backs the client for each activated offload device. 
- */ -void -cxgb_unregister_client(struct cxgb_client *client) -{ - struct t3cdev *tdev; - - mtx_lock(&cxgb_db_lock); - TAILQ_REMOVE(&client_list, client, client_entry); - - if (client->remove) { - TAILQ_FOREACH(tdev, &ofld_dev_list, entry) { - if (offload_activated(tdev)) - client->remove(tdev); - } - } - mtx_unlock(&cxgb_db_lock); -} - -/** - * cxgb_add_clients - activate register clients for an offload device - * @tdev: the offload device - * - * Call backs all registered clients once a offload device is activated - */ -void -cxgb_add_clients(struct t3cdev *tdev) -{ - struct cxgb_client *client; - - mtx_lock(&cxgb_db_lock); - TAILQ_FOREACH(client, &client_list, client_entry) { - if (client->add) - client->add(tdev); - } - mtx_unlock(&cxgb_db_lock); -} - -/** - * cxgb_remove_clients - activate register clients for an offload device - * @tdev: the offload device - * - * Call backs all registered clients once a offload device is deactivated - */ -void -cxgb_remove_clients(struct t3cdev *tdev) -{ - struct cxgb_client *client; - - mtx_lock(&cxgb_db_lock); - TAILQ_FOREACH(client, &client_list, client_entry) { - if (client->remove) - client->remove(tdev); - } - mtx_unlock(&cxgb_db_lock); -} -#endif - -/** - * cxgb_ofld_recv - process n received offload packets - * @dev: the offload device - * @m: an array of offload packets - * @n: the number of offload packets - * - * Process an array of ingress offload packets. Each packet is forwarded - * to any active network taps and then passed to the offload device's receive - * method. We optimize passing packets to the receive method by passing - * it the whole array at once except when there are active taps. - */ -int -cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n) -{ - - return dev->recv(dev, m, n); -} - -/* - * Dummy handler for Rx offload packets in case we get an offload packet before - * proper processing is setup. This complains and drops the packet as it isn't - * normal to get offload packets at this stage. 
- */ -static int -rx_offload_blackhole(struct t3cdev *dev, struct mbuf **m, int n) -{ - while (n--) - m_freem(m[n]); - return 0; -} - -static void -dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, - struct sockaddr *sa) -{ -} - -void -cxgb_set_dummy_ops(struct t3cdev *dev) -{ - dev->recv = rx_offload_blackhole; - dev->arp_update = dummy_neigh_update; -} - -static int -do_smt_write_rpl(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_smt_write_rpl *rpl = cplhdr(m); - - if (rpl->status != CPL_ERR_NONE) - log(LOG_ERR, - "Unexpected SMT_WRITE_RPL status %u for entry %u\n", - rpl->status, GET_TID(rpl)); - - return CPL_RET_BUF_DONE; -} - -static int -do_l2t_write_rpl(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_l2t_write_rpl *rpl = cplhdr(m); - - if (rpl->status != CPL_ERR_NONE) - log(LOG_ERR, - "Unexpected L2T_WRITE_RPL status %u for entry %u\n", - rpl->status, GET_TID(rpl)); - - return CPL_RET_BUF_DONE; -} - -static int -do_rte_write_rpl(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_rte_write_rpl *rpl = cplhdr(m); - - if (rpl->status != CPL_ERR_NONE) - log(LOG_ERR, - "Unexpected L2T_WRITE_RPL status %u for entry %u\n", - rpl->status, GET_TID(rpl)); - - return CPL_RET_BUF_DONE; -} - -static int -do_set_tcb_rpl(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_set_tcb_rpl *rpl = cplhdr(m); - - if (rpl->status != CPL_ERR_NONE) - log(LOG_ERR, - "Unexpected SET_TCB_RPL status %u for tid %u\n", - rpl->status, GET_TID(rpl)); - return CPL_RET_BUF_DONE; -} - -static int -do_trace(struct t3cdev *dev, struct mbuf *m) -{ -#if 0 - struct cpl_trace_pkt *p = cplhdr(m); - - - skb->protocol = 0xffff; - skb->dev = dev->lldev; - skb_pull(skb, sizeof(*p)); - skb->mac.raw = mtod(m, (char *)); - netif_receive_skb(skb); -#endif - return 0; -} - -/* - * Process a received packet with an unknown/unexpected CPL opcode. - */ -static int -do_bad_cpl(struct t3cdev *dev, struct mbuf *m) -{ - log(LOG_ERR, "%s: received bad CPL command 0x%x\n", dev->name, - 0xFF & *mtod(m, uint32_t *)); - return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); -} - -/* - * Handlers for each CPL opcode - */ -static cpl_handler_func cpl_handlers[256]; - -/* - * T3CDEV's receive method. - */ -int -process_rx(struct t3cdev *dev, struct mbuf **m, int n) -{ - while (n--) { - struct mbuf *m0 = *m++; - unsigned int opcode = G_OPCODE(ntohl(m0->m_pkthdr.csum_data)); - int ret; - - DPRINTF("processing op=0x%x m=%p data=%p\n", opcode, m0, m0->m_data); - - ret = cpl_handlers[opcode] (dev, m0); - -#if VALIDATE_TID - if (ret & CPL_RET_UNKNOWN_TID) { - union opcode_tid *p = cplhdr(m0); - - log(LOG_ERR, "%s: CPL message (opcode %u) had " - "unknown TID %u\n", dev->name, opcode, - G_TID(ntohl(p->opcode_tid))); - } -#endif - if (ret & CPL_RET_BUF_DONE) - m_freem(m0); - } - return 0; -} - -/* - * Add a new handler to the CPL dispatch table. A NULL handler may be supplied - * to unregister an existing handler. - */ -void -t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h) -{ - if (opcode < NUM_CPL_CMDS) - cpl_handlers[opcode] = h ? h : do_bad_cpl; - else - log(LOG_ERR, "T3C: handler registration for " - "opcode %x failed\n", opcode); -} - -/* - * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc. - * The allocated memory is cleared. - */ -void * -cxgb_alloc_mem(unsigned long size) -{ - - return malloc(size, M_CXGB, M_ZERO|M_NOWAIT); -} - -/* - * Free memory allocated through t3_alloc_mem(). 
- */ -void -cxgb_free_mem(void *addr) -{ - free(addr, M_CXGB); -} - -static __inline int -adap2type(struct adapter *adapter) -{ - int type = 0; - - switch (adapter->params.rev) { - case T3_REV_A: - type = T3A; - break; - case T3_REV_B: - case T3_REV_B2: - type = T3B; - break; - case T3_REV_C: - type = T3C; - break; - } - return type; -} - -void -cxgb_adapter_ofld(struct adapter *adapter) -{ - struct t3cdev *tdev = &adapter->tdev; - - cxgb_set_dummy_ops(tdev); - tdev->type = adap2type(adapter); - tdev->adapter = adapter; - register_tdev(tdev); - -} - -void -cxgb_adapter_unofld(struct adapter *adapter) -{ - struct t3cdev *tdev = &adapter->tdev; - - tdev->recv = NULL; - tdev->arp_update = NULL; - unregister_tdev(tdev); -} - -void -cxgb_offload_init(void) -{ - int i; - - if (inited++) - return; - - mtx_init(&cxgb_db_lock, "ofld db", NULL, MTX_DEF); - - TAILQ_INIT(&client_list); - TAILQ_INIT(&ofld_dev_list); - - for (i = 0; i < 0x100; ++i) - cpl_handlers[i] = do_bad_cpl; - - t3_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); - t3_register_cpl_handler(CPL_RTE_WRITE_RPL, do_rte_write_rpl); - t3_register_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl); - - t3_register_cpl_handler(CPL_SET_TCB_RPL, do_set_tcb_rpl); - t3_register_cpl_handler(CPL_TRACE_PKT, do_trace); - -} - -void -cxgb_offload_exit(void) -{ - - if (--inited) - return; - - mtx_destroy(&cxgb_db_lock); -} - -MODULE_VERSION(if_cxgb, 1); diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h index a8b858e..364ab43 100644 --- a/sys/dev/cxgb/cxgb_offload.h +++ b/sys/dev/cxgb/cxgb_offload.h @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007-2008, Chelsio Inc. @@ -33,221 +32,93 @@ $FreeBSD$ #ifndef _CXGB_OFFLOAD_H #define _CXGB_OFFLOAD_H -#include -#include - -MALLOC_DECLARE(M_CXGB); +#ifdef TCP_OFFLOAD +enum { + ULD_TOM = 1, + ULD_IWARP = 2, +}; struct adapter; -struct cxgb_client; - -void cxgb_offload_init(void); -void cxgb_offload_exit(void); - -void cxgb_adapter_ofld(struct adapter *adapter); -void cxgb_adapter_unofld(struct adapter *adapter); -int cxgb_offload_activate(struct adapter *adapter); -void cxgb_offload_deactivate(struct adapter *adapter); -int cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n); - -void cxgb_set_dummy_ops(struct t3cdev *dev); - - -/* - * Client registration. Users of T3 driver must register themselves. - * The T3 driver will call the add function of every client for each T3 - * adapter activated, passing up the t3cdev ptr. Each client fills out an - * array of callback functions to process CPL messages. - */ - -void cxgb_register_client(struct cxgb_client *client); -void cxgb_unregister_client(struct cxgb_client *client); -void cxgb_add_clients(struct t3cdev *tdev); -void cxgb_remove_clients(struct t3cdev *tdev); - -typedef int (*cxgb_cpl_handler_func)(struct t3cdev *dev, - struct mbuf *m, void *ctx); - -struct l2t_entry; -struct cxgb_client { - char *name; - void (*add) (struct t3cdev *); - void (*remove) (struct t3cdev *); - cxgb_cpl_handler_func *handlers; - int (*redirect)(void *ctx, struct rtentry *old, - struct rtentry *new, - struct l2t_entry *l2t); - TAILQ_ENTRY(cxgb_client) client_entry; +struct uld_info { + SLIST_ENTRY(uld_info) link; + int refcount; + int uld_id; + int (*activate)(struct adapter *); + int (*deactivate)(struct adapter *); }; -/* - * TID allocation services. 
- */ -int cxgb_alloc_atid(struct t3cdev *dev, struct cxgb_client *client, - void *ctx); -int cxgb_alloc_stid(struct t3cdev *dev, struct cxgb_client *client, - void *ctx); -void *cxgb_free_atid(struct t3cdev *dev, int atid); -void cxgb_free_stid(struct t3cdev *dev, int stid); -void *cxgb_get_lctx(struct t3cdev *tdev, int stid); -void cxgb_insert_tid(struct t3cdev *dev, struct cxgb_client *client, - void *ctx, - unsigned int tid); -void cxgb_queue_tid_release(struct t3cdev *dev, unsigned int tid); -void cxgb_remove_tid(struct t3cdev *dev, void *ctx, unsigned int tid); - -struct toe_tid_entry { - struct cxgb_client *client; - void *ctx; +struct tom_tunables { + int sndbuf; + int ddp; + int indsz; + int ddp_thres; }; /* CPL message priority levels */ enum { CPL_PRIORITY_DATA = 0, /* data messages */ - CPL_PRIORITY_SETUP = 1, /* connection setup messages */ - CPL_PRIORITY_TEARDOWN = 0, /* connection teardown messages */ - CPL_PRIORITY_LISTEN = 1, /* listen start/stop messages */ - CPL_PRIORITY_ACK = 1, /* RX ACK messages */ CPL_PRIORITY_CONTROL = 1 /* offload control messages */ }; -/* Flags for return value of CPL message handlers */ -enum { - CPL_RET_BUF_DONE = 1, // buffer processing done, buffer may be freed - CPL_RET_BAD_MSG = 2, // bad CPL message (e.g., unknown opcode) - CPL_RET_UNKNOWN_TID = 4 // unexpected unknown TID -}; +#define S_HDR_NDESC 0 +#define M_HDR_NDESC 0xf +#define V_HDR_NDESC(x) ((x) << S_HDR_NDESC) +#define G_HDR_NDESC(x) (((x) >> S_HDR_NDESC) & M_HDR_NDESC) -typedef int (*cpl_handler_func)(struct t3cdev *dev, struct mbuf *m); +#define S_HDR_QSET 4 +#define M_HDR_QSET 0xf +#define V_HDR_QSET(x) ((x) << S_HDR_QSET) +#define G_HDR_QSET(x) (((x) >> S_HDR_QSET) & M_HDR_QSET) -/* - * Returns a pointer to the first byte of the CPL header in an sk_buff that - * contains a CPL message. - */ -static inline void *cplhdr(struct mbuf *m) -{ - return mtod(m, uint8_t *); -} - -void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h); - -union listen_entry { - struct toe_tid_entry toe_tid; - union listen_entry *next; -}; +#define S_HDR_CTRL 8 +#define V_HDR_CTRL(x) ((x) << S_HDR_CTRL) +#define F_HDR_CTRL V_HDR_CTRL(1U) -union active_open_entry { - struct toe_tid_entry toe_tid; - union active_open_entry *next; -}; +#define S_HDR_DF 9 +#define V_HDR_DF(x) ((x) << S_HDR_DF) +#define F_HDR_DF V_HDR_DF(1U) -/* - * Holds the size, base address, free list start, etc of the TID, server TID, - * and active-open TID tables for a offload device. - * The tables themselves are allocated dynamically. - */ -struct tid_info { - struct toe_tid_entry *tid_tab; - unsigned int ntids; - volatile unsigned int tids_in_use; - - union listen_entry *stid_tab; - unsigned int nstids; - unsigned int stid_base; - - union active_open_entry *atid_tab; - unsigned int natids; - unsigned int atid_base; - - /* - * The following members are accessed R/W so we put them in their own - * cache lines. - * - * XXX We could combine the atid fields above with the lock here since - * atids are use once (unlike other tids). OTOH the above fields are - * usually in cache due to tid_tab. 
- */ - struct mtx atid_lock /* ____cacheline_aligned_in_smp */; - union active_open_entry *afree; - unsigned int atids_in_use; - - struct mtx stid_lock /*____cacheline_aligned */; - union listen_entry *sfree; - unsigned int stids_in_use; -}; +#define S_HDR_SGL 10 +#define V_HDR_SGL(x) ((x) << S_HDR_SGL) +#define F_HDR_SGL V_HDR_SGL(1U) -struct t3c_data { - struct t3cdev *dev; - unsigned int tx_max_chunk; /* max payload for TX_DATA */ - unsigned int max_wrs; /* max in-flight WRs per connection */ - unsigned int nmtus; - const unsigned short *mtus; - struct tid_info tid_maps; - - struct toe_tid_entry *tid_release_list; - struct mtx tid_release_lock; - struct task tid_release_task; +struct ofld_hdr +{ + void *sgl; /* SGL, if F_HDR_SGL set in flags */ + int plen; /* amount of payload (in bytes) */ + int flags; }; /* - * t3cdev -> toe_data accessor - */ -#define T3C_DATA(dev) (*(struct t3c_data **)&(dev)->l4opt) - -/* - * Map an ATID or STID to their entries in the corresponding TID tables. + * Convenience function for fixed size CPLs that fit in 1 desc. */ -static inline union active_open_entry *atid2entry(const struct tid_info *t, - unsigned int atid) +#define M_GETHDR_OFLD(qset, ctrl, cpl) \ + m_gethdr_ofld(qset, ctrl, sizeof(*cpl), (void **)&cpl) +static inline struct mbuf * +m_gethdr_ofld(int qset, int ctrl, int cpllen, void **cpl) { - return &t->atid_tab[atid - t->atid_base]; -} + struct mbuf *m; + struct ofld_hdr *oh; + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + return (NULL); -static inline union listen_entry *stid2entry(const struct tid_info *t, - unsigned int stid) -{ - return &t->stid_tab[stid - t->stid_base]; -} + oh = mtod(m, struct ofld_hdr *); + oh->flags = V_HDR_NDESC(1) | V_HDR_QSET(qset) | V_HDR_CTRL(ctrl); + *cpl = (void *)(oh + 1); + m->m_pkthdr.len = m->m_len = sizeof(*oh) + cpllen; -/* - * Find the connection corresponding to a TID. - */ -static inline struct toe_tid_entry *lookup_tid(const struct tid_info *t, - unsigned int tid) -{ - return tid < t->ntids ? &(t->tid_tab[tid]) : NULL; + return (m); } -/* - * Find the connection corresponding to a server TID. - */ -static inline struct toe_tid_entry *lookup_stid(const struct tid_info *t, - unsigned int tid) -{ - if (tid < t->stid_base || tid >= t->stid_base + t->nstids) - return NULL; - return &(stid2entry(t, tid)->toe_tid); -} - -/* - * Find the connection corresponding to an active-open TID. 
- */ -static inline struct toe_tid_entry *lookup_atid(const struct tid_info *t, - unsigned int tid) -{ - if (tid < t->atid_base || tid >= t->atid_base + t->natids) - return NULL; - return &(atid2entry(t, tid)->toe_tid); -} +int t3_register_uld(struct uld_info *); +int t3_unregister_uld(struct uld_info *); +int t3_activate_uld(struct adapter *, int); +int t3_deactivate_uld(struct adapter *, int); +#endif /* TCP_OFFLOAD */ -void *cxgb_alloc_mem(unsigned long size); -void cxgb_free_mem(void *addr); -void cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); -void cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa); -int process_rx(struct t3cdev *dev, struct mbuf **m, int n); -int attach_t3cdev(struct t3cdev *dev); -void detach_t3cdev(struct t3cdev *dev); +#define CXGB_UNIMPLEMENTED() \ + panic("IMPLEMENT: %s:%s:%d", __FUNCTION__, __FILE__, __LINE__) -#define CXGB_UNIMPLEMENTED() panic("IMPLEMENT: %s:%s:%d", __FUNCTION__, __FILE__, __LINE__) #endif diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index 5dc256d..37171d9 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -67,27 +67,6 @@ struct t3_mbuf_hdr { } while (0) #endif -#define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif) -#define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri)) -#define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl)) -#define m_get_sgl(m) ((bus_dma_segment_t *)(m)->m_pkthdr.header) -#define m_set_sgllen(m, len) ((m)->m_pkthdr.ether_vtag = len) -#define m_get_sgllen(m) ((m)->m_pkthdr.ether_vtag) - -/* - * XXX FIXME - */ -#define m_set_toep(m, a) ((m)->m_pkthdr.header = (a)) -#define m_get_toep(m) ((m)->m_pkthdr.header) -#define m_set_handler(m, handler) ((m)->m_pkthdr.header = (handler)) - -#define m_set_socket(m, a) ((m)->m_pkthdr.header = (a)) -#define m_get_socket(m) ((m)->m_pkthdr.header) - -#define KTR_CXGB KTR_SPARE2 - -#define MT_DONTFREE 128 - #if __FreeBSD_version < 800054 #if defined (__GNUC__) #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__ @@ -123,13 +102,6 @@ struct t3_mbuf_hdr { #define CXGB_TX_CLEANUP_THRESHOLD 32 - -#ifdef DEBUG_PRINT -#define DPRINTF printf -#else -#define DPRINTF(...) 
-#endif - #define TX_MAX_SIZE (1 << 16) /* 64KB */ #define TX_MAX_SEGS 36 /* maximum supported by card */ @@ -199,7 +171,6 @@ static const int debug_flags = DBG_RX; #define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1< #include #include +#include #include #include @@ -78,6 +79,10 @@ __FBSDID("$FreeBSD$"); int txq_fills = 0; int multiq_tx_enable = 1; +#ifdef TCP_OFFLOAD +CTASSERT(NUM_CPL_HANDLERS >= NUM_CPL_CMDS); +#endif + extern struct sysctl_oid_list sysctl__hw_cxgb_children; int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE; TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size); @@ -471,10 +476,17 @@ static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m) { - m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE; + if (resp->rss_hdr.opcode == CPL_RX_DATA) { + const struct cpl_rx_data *cpl = (const void *)&resp->imm_data[0]; + m->m_len = sizeof(*cpl) + ntohs(cpl->len); + } else if (resp->rss_hdr.opcode == CPL_RX_PKT) { + const struct cpl_rx_pkt *cpl = (const void *)&resp->imm_data[0]; + m->m_len = sizeof(*cpl) + ntohs(cpl->len); + } else + m->m_len = IMMED_PKT_SIZE; m->m_ext.ext_buf = NULL; m->m_ext.ext_type = 0; - memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE); + memcpy(mtod(m, uint8_t *), resp->imm_data, m->m_len); return (0); } @@ -703,7 +715,8 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n) cb_arg.error = 0; while (n--) { /* - * We only allocate a cluster, mbuf allocation happens after rx + * We allocate an uninitialized mbuf + cluster, mbuf is + * initialized after rx. */ if (q->zone == zone_pack) { if ((m = m_getcl(M_NOWAIT, MT_NOINIT, M_PKTHDR)) == NULL) @@ -1170,57 +1183,6 @@ calc_tx_descs(const struct mbuf *m, int nsegs) return flits_to_desc(flits); } -static unsigned int -busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq, - struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs) -{ - struct mbuf *m0; - int err, pktlen, pass = 0; - bus_dma_tag_t tag = txq->entry_tag; - -retry: - err = 0; - m0 = *m; - pktlen = m0->m_pkthdr.len; -#if defined(__i386__) || defined(__amd64__) - if (busdma_map_sg_collapse(tag, txsd->map, m, segs, nsegs) == 0) { - goto done; - } else -#endif - err = bus_dmamap_load_mbuf_sg(tag, txsd->map, m0, segs, nsegs, 0); - - if (err == 0) { - goto done; - } - if (err == EFBIG && pass == 0) { - pass = 1; - /* Too many segments, try to defrag */ - m0 = m_defrag(m0, M_DONTWAIT); - if (m0 == NULL) { - m_freem(*m); - *m = NULL; - return (ENOBUFS); - } - *m = m0; - goto retry; - } else if (err == ENOMEM) { - return (err); - } if (err) { - if (cxgb_debug) - printf("map failure err=%d pktlen=%d\n", err, pktlen); - m_freem(m0); - *m = NULL; - return (err); - } -done: -#if !defined(__i386__) && !defined(__amd64__) - bus_dmamap_sync(tag, txsd->map, BUS_DMASYNC_PREWRITE); -#endif - txsd->flags |= TX_SW_DESC_MAPPED; - - return (0); -} - /** * make_sgl - populate a scatter/gather list for a packet * @sgp: the SGL to populate @@ -1328,10 +1290,10 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs if (__predict_true(ndesc == 1)) { set_wr_hdr(wrp, htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | - V_WR_SGLSFLT(flits)) | wr_hi, - htonl(V_WR_LEN(flits + sgl_flits) | - V_WR_GEN(txqs->gen)) | wr_lo); - /* XXX gen? */ + V_WR_SGLSFLT(flits)) | wr_hi, + htonl(V_WR_LEN(flits + sgl_flits) | V_WR_GEN(txqs->gen)) | + wr_lo); + wr_gen2(txd, txqs->gen); } else { @@ -1813,34 +1775,23 @@ cxgb_qflush(struct ifnet *ifp) * its entirety. 
*/ static __inline void -write_imm(struct tx_desc *d, struct mbuf *m, +write_imm(struct tx_desc *d, caddr_t src, unsigned int len, unsigned int gen) { - struct work_request_hdr *from = mtod(m, struct work_request_hdr *); + struct work_request_hdr *from = (struct work_request_hdr *)src; struct work_request_hdr *to = (struct work_request_hdr *)d; uint32_t wr_hi, wr_lo; - if (len > WR_LEN) - panic("len too big %d\n", len); - if (len < sizeof(*from)) - panic("len too small %d", len); + KASSERT(len <= WR_LEN && len >= sizeof(*from), + ("%s: invalid len %d", __func__, len)); memcpy(&to[1], &from[1], len - sizeof(*from)); wr_hi = from->wrh_hi | htonl(F_WR_SOP | F_WR_EOP | - V_WR_BCNTLFLT(len & 7)); - wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | - V_WR_LEN((len + 7) / 8)); + V_WR_BCNTLFLT(len & 7)); + wr_lo = from->wrh_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); set_wr_hdr(to, wr_hi, wr_lo); wmb(); wr_gen2(d, gen); - - /* - * This check is a hack we should really fix the logic so - * that this can't happen - */ - if (m->m_type != MT_DONTFREE) - m_freem(m); - } /** @@ -1908,12 +1859,6 @@ reclaim_completed_tx_imm(struct sge_txq *q) q->cleaned += reclaim; } -static __inline int -immediate(const struct mbuf *m) -{ - return m->m_len <= WR_LEN && m->m_pkthdr.len <= WR_LEN ; -} - /** * ctrl_xmit - send a packet through an SGE control Tx queue * @adap: the adapter @@ -1931,11 +1876,8 @@ ctrl_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) struct work_request_hdr *wrp = mtod(m, struct work_request_hdr *); struct sge_txq *q = &qs->txq[TXQ_CTRL]; - if (__predict_false(!immediate(m))) { - m_freem(m); - return 0; - } - + KASSERT(m->m_len <= WR_LEN, ("%s: bad tx data", __func__)); + wrp->wrh_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wrh_lo = htonl(V_WR_TID(q->token)); @@ -1950,7 +1892,7 @@ again: reclaim_completed_tx_imm(q); } goto again; } - write_imm(&q->desc[q->pidx], m, m->m_len, q->gen); + write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); q->in_use++; if (++q->pidx >= q->size) { @@ -1960,7 +1902,9 @@ again: reclaim_completed_tx_imm(q); TXQ_UNLOCK(qs); wmb(); t3_write_reg(adap, A_SG_KDOORBELL, - F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); + + m_free(m); return (0); } @@ -1985,7 +1929,8 @@ again: reclaim_completed_tx_imm(q); while (q->in_use < q->size && (m = mbufq_dequeue(&q->sendq)) != NULL) { - write_imm(&q->desc[q->pidx], m, m->m_len, q->gen); + write_imm(&q->desc[q->pidx], m->m_data, m->m_len, q->gen); + m_free(m); if (++q->pidx >= q->size) { q->pidx = 0; @@ -2239,6 +2184,7 @@ is_new_response(const struct rsp_desc *r, /* How long to delay the next interrupt in case of memory shortage, in 0.1us. */ #define NOMEM_INTR_DELAY 2500 +#ifdef TCP_OFFLOAD /** * write_ofld_wr - write an offload work request * @adap: the adapter @@ -2252,71 +2198,66 @@ is_new_response(const struct rsp_desc *r, * data already carry the work request with most fields populated. 
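 * The mbuf begins with a struct ofld_hdr whose flags encode the number
 * of Tx descriptors the WR needs (G_HDR_NDESC), whether an SGL follows
 * the WR (F_HDR_SGL), and whether the mbuf is freed later by the
 * wr_ack path rather than here (F_HDR_DF).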
*/ static void -write_ofld_wr(adapter_t *adap, struct mbuf *m, - struct sge_txq *q, unsigned int pidx, - unsigned int gen, unsigned int ndesc, - bus_dma_segment_t *segs, unsigned int nsegs) +write_ofld_wr(adapter_t *adap, struct mbuf *m, struct sge_txq *q, + unsigned int pidx, unsigned int gen, unsigned int ndesc) { unsigned int sgl_flits, flits; + int i, idx, nsegs, wrlen; struct work_request_hdr *from; - struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1]; + struct sg_ent *sgp, t3sgl[TX_MAX_SEGS / 2 + 1]; struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; - - if (immediate(m) && nsegs == 0) { - write_imm(d, m, m->m_len, gen); + struct sglist_seg *segs; + struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); + struct sglist *sgl; + + from = (void *)(oh + 1); /* Start of WR within mbuf */ + wrlen = m->m_len - sizeof(*oh); + + if (!(oh->flags & F_HDR_SGL)) { + write_imm(d, (caddr_t)from, wrlen, gen); + + /* + * mbuf with "real" immediate tx data will be enqueue_wr'd by + * t3_push_frames and freed in wr_ack. Others, like those sent + * down by close_conn, t3_send_reset, etc. should be freed here. + */ + if (!(oh->flags & F_HDR_DF)) + m_free(m); return; } - /* Only TX_DATA builds SGLs */ - from = mtod(m, struct work_request_hdr *); - memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from)); + memcpy(&d->flit[1], &from[1], wrlen - sizeof(*from)); - flits = m->m_len / 8; - sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl; + sgl = oh->sgl; + flits = wrlen / 8; + sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : t3sgl; - make_sgl(sgp, segs, nsegs); - sgl_flits = sgl_len(nsegs); + nsegs = sgl->sg_nseg; + segs = sgl->sg_segs; + for (idx = 0, i = 0; i < nsegs; i++) { + KASSERT(segs[i].ss_len, ("%s: 0 len in sgl", __func__)); + if (i && idx == 0) + ++sgp; + sgp->len[idx] = htobe32(segs[i].ss_len); + sgp->addr[idx] = htobe64(segs[i].ss_paddr); + idx ^= 1; + } + if (idx) { + sgp->len[idx] = 0; + sgp->addr[idx] = 0; + } + sgl_flits = sgl_len(nsegs); txqs.gen = gen; txqs.pidx = pidx; txqs.compl = 0; - write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits, + write_wr_hdr_sgl(ndesc, d, &txqs, q, t3sgl, flits, sgl_flits, from->wrh_hi, from->wrh_lo); } /** - * calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet - * @m: the packet - * - * Returns the number of Tx descriptors needed for the given offload - * packet. These packets are already fully constructed. 
- */ -static __inline unsigned int -calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs) -{ - unsigned int flits, cnt = 0; - int ndescs; - - if (m->m_len <= WR_LEN && nsegs == 0) - return (1); /* packet fits as immediate data */ - - /* - * This needs to be re-visited for TOE - */ - - cnt = nsegs; - - /* headers */ - flits = m->m_len / 8; - - ndescs = flits_to_desc(flits + sgl_len(cnt)); - - return (ndescs); -} - -/** * ofld_xmit - send a packet through an offload queue * @adap: the adapter * @q: the Tx offload queue @@ -2327,28 +2268,19 @@ calc_tx_descs_ofld(struct mbuf *m, unsigned int nsegs) static int ofld_xmit(adapter_t *adap, struct sge_qset *qs, struct mbuf *m) { - int ret, nsegs; + int ret; unsigned int ndesc; unsigned int pidx, gen; struct sge_txq *q = &qs->txq[TXQ_OFLD]; - bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs; - struct tx_sw_desc *stx; + struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); - nsegs = m_get_sgllen(m); - vsegs = m_get_sgl(m); - ndesc = calc_tx_descs_ofld(m, nsegs); - busdma_map_sgl(vsegs, segs, nsegs); + ndesc = G_HDR_NDESC(oh->flags); - stx = &q->sdesc[q->pidx]; - TXQ_LOCK(qs); again: reclaim_completed_tx(qs, 16, TXQ_OFLD); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { - printf("no ofld desc avail\n"); - - m_set_priority(m, ndesc); /* save for restart */ TXQ_UNLOCK(qs); return (EINTR); } @@ -2363,16 +2295,11 @@ again: reclaim_completed_tx(qs, 16, TXQ_OFLD); q->pidx -= q->size; q->gen ^= 1; } -#ifdef T3_TRACE - T3_TRACE5(adap->tb[q->cntxt_id & 7], - "ofld_xmit: ndesc %u, pidx %u, len %u, main %u, frags %u", - ndesc, pidx, skb->len, skb->len - skb->data_len, - skb_shinfo(skb)->nr_frags); -#endif - TXQ_UNLOCK(qs); - write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs); + write_ofld_wr(adap, m, q, pidx, gen, ndesc); check_ring_tx_db(adap, q, 1); + TXQ_UNLOCK(qs); + return (0); } @@ -2389,16 +2316,15 @@ restart_offloadq(void *data, int npending) struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; - bus_dma_segment_t segs[TX_MAX_SEGS]; - struct tx_sw_desc *stx = &q->sdesc[q->pidx]; - int nsegs, cleaned; + int cleaned; TXQ_LOCK(qs); again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); while ((m = mbufq_peek(&q->sendq)) != NULL) { unsigned int gen, pidx; - unsigned int ndesc = m_get_priority(m); + struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); + unsigned int ndesc = G_HDR_NDESC(oh->flags); if (__predict_false(q->size - q->in_use < ndesc)) { setbit(&qs->txq_stopped, TXQ_OFLD); @@ -2419,9 +2345,8 @@ again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); } (void)mbufq_dequeue(&q->sendq); - busdma_map_mbufs(&m, q, stx, segs, &nsegs); TXQ_UNLOCK(qs); - write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs); + write_ofld_wr(adap, m, q, pidx, gen, ndesc); TXQ_LOCK(qs); } #if USE_GTS @@ -2435,34 +2360,7 @@ again: cleaned = reclaim_completed_tx(qs, 16, TXQ_OFLD); } /** - * queue_set - return the queue set a packet should use - * @m: the packet - * - * Maps a packet to the SGE queue set it should use. The desired queue - * set is carried in bits 1-3 in the packet's priority. - */ -static __inline int -queue_set(const struct mbuf *m) -{ - return m_get_priority(m) >> 1; -} - -/** - * is_ctrl_pkt - return whether an offload packet is a control packet - * @m: the packet - * - * Determines whether an offload packet should use an OFLD or a CTRL - * Tx queue. This is indicated by bit 0 in the packet's priority. 
- */ -static __inline int -is_ctrl_pkt(const struct mbuf *m) -{ - return m_get_priority(m) & 1; -} - -/** * t3_offload_tx - send an offload packet - * @tdev: the offload device to send to * @m: the packet * * Sends an offload packet. We use the packet priority to select the @@ -2470,77 +2368,35 @@ is_ctrl_pkt(const struct mbuf *m) * should be sent as regular or control, bits 1-3 select the queue set. */ int -t3_offload_tx(struct t3cdev *tdev, struct mbuf *m) -{ - adapter_t *adap = tdev2adap(tdev); - struct sge_qset *qs = &adap->sge.qs[queue_set(m)]; - - if (__predict_false(is_ctrl_pkt(m))) - return ctrl_xmit(adap, qs, m); - - return ofld_xmit(adap, qs, m); -} - -/** - * deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts - * @tdev: the offload device that will be receiving the packets - * @q: the SGE response queue that assembled the bundle - * @m: the partial bundle - * @n: the number of packets in the bundle - * - * Delivers a (partial) bundle of Rx offload packets to an offload device. - */ -static __inline void -deliver_partial_bundle(struct t3cdev *tdev, - struct sge_rspq *q, - struct mbuf *mbufs[], int n) +t3_offload_tx(struct adapter *sc, struct mbuf *m) { - if (n) { - q->offload_bundles++; - cxgb_ofld_recv(tdev, mbufs, n); - } -} + struct ofld_hdr *oh = mtod(m, struct ofld_hdr *); + struct sge_qset *qs = &sc->sge.qs[G_HDR_QSET(oh->flags)]; -static __inline int -rx_offload(struct t3cdev *tdev, struct sge_rspq *rq, - struct mbuf *m, struct mbuf *rx_gather[], - unsigned int gather_idx) -{ - - rq->offload_pkts++; - m->m_pkthdr.header = mtod(m, void *); - rx_gather[gather_idx++] = m; - if (gather_idx == RX_BUNDLE_SIZE) { - cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE); - gather_idx = 0; - rq->offload_bundles++; - } - return (gather_idx); + if (oh->flags & F_HDR_CTRL) { + m_adj(m, sizeof (*oh)); /* trim ofld_hdr off */ + return (ctrl_xmit(sc, qs, m)); + } else + return (ofld_xmit(sc, qs, m)); } +#endif static void restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; - - + if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; - DPRINTF("restarting TXQ_OFLD\n"); taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } - DPRINTF("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n", - qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]), - qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned, - qs->txq[TXQ_CTRL].in_use); - + if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; - DPRINTF("restarting TXQ_CTRL\n"); taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } @@ -2569,6 +2425,7 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, MTX_INIT(&q->lock, q->namebuf, NULL, MTX_DEF); q->port = pi; + q->adap = sc; if ((q->txq[TXQ_ETH].txq_mr = buf_ring_alloc(cxgb_txq_buf_ring_size, M_DEVBUF, M_WAITOK, &q->lock)) == NULL) { @@ -2630,8 +2487,10 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, q->txq[i].gen = 1; q->txq[i].size = p->txq_size[i]; } - + +#ifdef TCP_OFFLOAD TASK_INIT(&q->txq[TXQ_OFLD].qresume_task, 0, restart_offloadq, q); +#endif TASK_INIT(&q->txq[TXQ_CTRL].qresume_task, 0, restart_ctrlq, q); TASK_INIT(&q->txq[TXQ_ETH].qreclaim_task, 0, sge_txq_reclaim_handler, q); TASK_INIT(&q->txq[TXQ_OFLD].qreclaim_task, 0, sge_txq_reclaim_handler, q); @@ -2736,8 
+2595,7 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, mtx_unlock_spin(&sc->sge.reg_lock); t3_update_qset_coalesce(q, p); - q->port = pi; - + refill_fl(sc, &q->fl[0], q->fl[0].size); refill_fl(sc, &q->fl[1], q->fl[1].size); refill_rspq(sc, &q->rspq, q->rspq.size - 1); @@ -2768,8 +2626,6 @@ t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad) struct port_info *pi = &adap->port[adap->rxpkt_map[cpl->iff]]; struct ifnet *ifp = pi->ifp; - DPRINTF("rx_eth m=%p m->m_data=%p p->iff=%d\n", m, mtod(m, uint8_t *), cpl->iff); - if ((ifp->if_capenable & IFCAP_RXCSUM) && !cpl->fragment && cpl->csum_valid && cpl->csum == 0xffff) { m->m_pkthdr.csum_flags = (CSUM_IP_CHECKED|CSUM_IP_VALID); @@ -2967,8 +2823,6 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) int skip_lro; struct lro_ctrl *lro_ctrl = &qs->lro.ctrl; #endif - struct mbuf *offload_mbufs[RX_BUNDLE_SIZE]; - int ngathered = 0; struct t3_mbuf_hdr *mh = &rspq->rspq_mh; #ifdef DEBUG static int last_holdoff = 0; @@ -2982,10 +2836,10 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) while (__predict_true(budget_left && is_new_response(r, rspq))) { int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); - uint32_t rss_csum = *(const uint32_t *)r; uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); + uint8_t opcode = r->rss_hdr.opcode; - eth = (r->rss_hdr.opcode == CPL_RX_PKT); + eth = (opcode == CPL_RX_PKT); if (__predict_false(flags & F_RSPD_ASYNC_NOTIF)) { struct mbuf *m; @@ -3005,27 +2859,27 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) memcpy(mtod(m, char *), r, AN_PKT_SIZE); m->m_len = m->m_pkthdr.len = AN_PKT_SIZE; *mtod(m, char *) = CPL_ASYNC_NOTIF; - rss_csum = htonl(CPL_ASYNC_NOTIF << 24); + opcode = CPL_ASYNC_NOTIF; eop = 1; rspq->async_notif++; goto skip; } else if (flags & F_RSPD_IMM_DATA_VALID) { - struct mbuf *m = NULL; + struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA); - DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", - r->rss_hdr.opcode, rspq->cidx); - if (mh->mh_head == NULL) - mh->mh_head = m_gethdr(M_DONTWAIT, MT_DATA); - else - m = m_gethdr(M_DONTWAIT, MT_DATA); - - if (mh->mh_head == NULL && m == NULL) { + if (m == NULL) { no_mem: rspq->next_holdoff = NOMEM_INTR_DELAY; budget_left--; break; } - get_imm_packet(adap, r, mh->mh_head); + if (mh->mh_head == NULL) + mh->mh_head = m; + else + mh->mh_tail->m_next = m; + mh->mh_tail = m; + + get_imm_packet(adap, r, m); + mh->mh_head->m_pkthdr.len += m->m_len; eop = 1; rspq->imm_data++; } else if (r->len_cq) { @@ -3048,30 +2902,14 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) handle_rsp_cntrl_info(qs, flags); } - r++; - if (__predict_false(++rspq->cidx == rspq->size)) { - rspq->cidx = 0; - rspq->gen ^= 1; - r = rspq->desc; - } - - if (++rspq->credits >= 64) { - refill_rspq(adap, rspq, rspq->credits); - rspq->credits = 0; - } if (!eth && eop) { - mh->mh_head->m_pkthdr.csum_data = rss_csum; - /* - * XXX size mismatch - */ - m_set_priority(mh->mh_head, rss_hash); - - - ngathered = rx_offload(&adap->tdev, rspq, - mh->mh_head, offload_mbufs, ngathered); + rspq->offload_pkts++; +#ifdef TCP_OFFLOAD + adap->cpl_handler[opcode](qs, r, mh->mh_head); +#else + m_freem(mh->mh_head); +#endif mh->mh_head = NULL; - DPRINTF("received offload packet\n"); - } else if (eth && eop) { struct mbuf *m = mh->mh_head; @@ -3106,13 +2944,23 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) mh->mh_head = NULL; } + + r++; + if 
(__predict_false(++rspq->cidx == rspq->size)) { + rspq->cidx = 0; + rspq->gen ^= 1; + r = rspq->desc; + } + + if (++rspq->credits >= 64) { + refill_rspq(adap, rspq, rspq->credits); + rspq->credits = 0; + } __refill_fl_lt(adap, &qs->fl[0], 32); __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } - deliver_partial_bundle(&adap->tdev, rspq, offload_mbufs, ngathered); - #if defined(INET6) || defined(INET) /* Flush LRO */ while (!SLIST_EMPTY(&lro_ctrl->lro_active)) { diff --git a/sys/dev/cxgb/sys/mvec.h b/sys/dev/cxgb/sys/mvec.h index 9db27dd..dff6f03 100644 --- a/sys/dev/cxgb/sys/mvec.h +++ b/sys/dev/cxgb/sys/mvec.h @@ -31,15 +31,6 @@ #define _MVEC_H_ #include -#define M_DDP 0x200000 /* direct data placement mbuf */ -#define EXT_PHYS 10 /* physical/bus address */ - -#define m_cur_offset m_ext.ext_size /* override to provide ddp offset */ -#define m_seq m_pkthdr.csum_data /* stored sequence */ -#define m_ddp_gl m_ext.ext_buf /* ddp list */ -#define m_ddp_flags m_pkthdr.csum_flags /* ddp flags */ -#define m_ulp_mode m_pkthdr.tso_segsz /* upper level protocol */ - static __inline void busdma_map_mbuf_fast(bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf *m, bus_dma_segment_t *seg) @@ -58,17 +49,6 @@ int busdma_map_sg_collapse(bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m, bus_dma_segment_t *segs, int *nsegs); void busdma_map_sg_vec(bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf *m, bus_dma_segment_t *segs, int *nsegs); -static __inline int -busdma_map_sgl(bus_dma_segment_t *vsegs, bus_dma_segment_t *segs, int count) -{ - while (count--) { - segs->ds_addr = pmap_kextract((vm_offset_t)vsegs->ds_addr); - segs->ds_len = vsegs->ds_len; - segs++; - vsegs++; - } - return (0); -} static __inline void m_freem_list(struct mbuf *m) @@ -84,5 +64,4 @@ m_freem_list(struct mbuf *m) } } - #endif /* _MVEC_H_ */ diff --git a/sys/dev/cxgb/t3cdev.h b/sys/dev/cxgb/t3cdev.h deleted file mode 100644 index e0004b7..0000000 --- a/sys/dev/cxgb/t3cdev.h +++ /dev/null @@ -1,62 +0,0 @@ -/*- - * Copyright (c) 2007-2008, Chelsio Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD$ - */ -#ifndef _T3CDEV_H_ -#define _T3CDEV_H_ - -#define T3CNAMSIZ 16 - -/* Get the t3cdev associated with an ifnet */ -#define T3CDEV(ifp) (&(((struct port_info *)(ifp)->if_softc))->adapter->tdev) - -struct cxgb3_client; - -enum t3ctype { - T3A = 0, - T3B, - T3C -}; - -struct t3cdev { - char name[T3CNAMSIZ]; /* T3C device name */ - enum t3ctype type; - TAILQ_ENTRY(t3cdev) entry; /* for list linking */ - struct ifnet *lldev; /* LL dev associated with T3C messages */ - struct adapter *adapter; - int (*send)(struct t3cdev *dev, struct mbuf *m); - int (*recv)(struct t3cdev *dev, struct mbuf **m, int n); - int (*ctl)(struct t3cdev *dev, unsigned int req, void *data); - void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, struct sockaddr *sa); - void *priv; /* driver private data */ - void *l2opt; /* optional layer 2 data */ - void *l3opt; /* optional layer 3 data */ - void *l4opt; /* optional layer 4 data */ - void *ulp; /* ulp stuff */ -}; - -#endif /* _T3CDEV_H_ */ diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c index a4f2ff6..c8652a0 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c @@ -29,11 +29,12 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include #include #include #include -#include #include #include #include @@ -54,20 +55,14 @@ __FBSDID("$FreeBSD$"); #include #include -#if __FreeBSD_version < 800044 -#define V_ifnet ifnet -#endif - -#include -#include -#if __FreeBSD_version >= 800056 -#include -#endif - #include +#include -#include +#include +#include +#include +#ifdef TCP_OFFLOAD #include #include #include @@ -75,26 +70,21 @@ __FBSDID("$FreeBSD$"); #include #include -/* - * XXX :-/ - * - */ - -#define idr_init(x) +static int iwch_mod_load(void); +static int iwch_mod_unload(void); +static int iwch_activate(struct adapter *); +static int iwch_deactivate(struct adapter *); -cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; - -static void open_rnic_dev(struct t3cdev *); -static void close_rnic_dev(struct t3cdev *); - -static TAILQ_HEAD( ,iwch_dev) dev_list; -static struct mtx dev_mutex; -static eventhandler_tag event_tag; +static struct uld_info iwch_uld_info = { + .uld_id = ULD_IWARP, + .activate = iwch_activate, + .deactivate = iwch_deactivate, +}; static void rnic_init(struct iwch_dev *rnicp) { - CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, rnicp); + idr_init(&rnicp->cqidr); idr_init(&rnicp->qpidr); idr_init(&rnicp->mmidr); @@ -103,15 +93,16 @@ rnic_init(struct iwch_dev *rnicp) rnicp->attr.vendor_id = 0x168; rnicp->attr.vendor_part_id = 7; rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; - rnicp->attr.max_wrs = (1UL << 24) - 1; + rnicp->attr.max_wrs = T3_MAX_QP_DEPTH; rnicp->attr.max_sge_per_wr = T3_MAX_SGE; rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; - rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1; + rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH; rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; - rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK; + rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; rnicp->attr.can_resize_wq = 0; rnicp->attr.max_rdma_reads_per_qp = 8; rnicp->attr.max_rdma_read_resources = @@ -127,170 +118,183 @@ rnic_init(struct iwch_dev *rnicp) rnicp->attr.zbva_support = 1; rnicp->attr.local_invalidate_fence = 
1; rnicp->attr.cq_overflow_detection = 1; + return; } static void -open_rnic_dev(struct t3cdev *tdev) +rnic_uninit(struct iwch_dev *rnicp) +{ + idr_destroy(&rnicp->cqidr); + idr_destroy(&rnicp->qpidr); + idr_destroy(&rnicp->mmidr); + mtx_destroy(&rnicp->lock); +} + +static int +iwch_activate(struct adapter *sc) { struct iwch_dev *rnicp; - static int vers_printed; + int rc; + + KASSERT(!isset(&sc->offload_map, MAX_NPORTS), + ("%s: iWARP already activated on %s", __func__, + device_get_nameunit(sc->dev))); - CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); - if (!vers_printed++) - printf("Chelsio T3 RDMA Driver - version x.xx\n"); rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); - if (!rnicp) { - printf("Cannot allocate ib device\n"); - return; - } - rnicp->rdev.ulp = rnicp; - rnicp->rdev.t3cdev_p = tdev; + if (rnicp == NULL) + return (ENOMEM); + + sc->iwarp_softc = rnicp; + rnicp->rdev.adap = sc; - mtx_lock(&dev_mutex); + cxio_hal_init(sc); + iwch_cm_init_cpl(sc); - if (cxio_rdev_open(&rnicp->rdev)) { - mtx_unlock(&dev_mutex); + rc = cxio_rdev_open(&rnicp->rdev); + if (rc != 0) { printf("Unable to open CXIO rdev\n"); - ib_dealloc_device(&rnicp->ibdev); - return; + goto err1; } rnic_init(rnicp); - TAILQ_INSERT_TAIL(&dev_list, rnicp, entry); - mtx_unlock(&dev_mutex); - - if (iwch_register_device(rnicp)) { + rc = iwch_register_device(rnicp); + if (rc != 0) { printf("Unable to register device\n"); - close_rnic_dev(tdev); + goto err2; } -#ifdef notyet - printf("Initialized device %s\n", - pci_name(rnicp->rdev.rnic_info.pdev)); -#endif - return; + + return (0); + +err2: + rnic_uninit(rnicp); + cxio_rdev_close(&rnicp->rdev); +err1: + cxio_hal_uninit(sc); + iwch_cm_term_cpl(sc); + sc->iwarp_softc = NULL; + + return (rc); } -static void -close_rnic_dev(struct t3cdev *tdev) +static int +iwch_deactivate(struct adapter *sc) { - struct iwch_dev *dev, *tmp; - CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); - mtx_lock(&dev_mutex); - - TAILQ_FOREACH_SAFE(dev, &dev_list, entry, tmp) { - if (dev->rdev.t3cdev_p == tdev) { -#ifdef notyet - list_del(&dev->entry); - iwch_unregister_device(dev); - cxio_rdev_close(&dev->rdev); - idr_destroy(&dev->cqidr); - idr_destroy(&dev->qpidr); - idr_destroy(&dev->mmidr); - ib_dealloc_device(&dev->ibdev); -#endif - break; - } - } - mtx_unlock(&dev_mutex); + struct iwch_dev *rnicp; + + rnicp = sc->iwarp_softc; + + iwch_unregister_device(rnicp); + rnic_uninit(rnicp); + cxio_rdev_close(&rnicp->rdev); + cxio_hal_uninit(sc); + iwch_cm_term_cpl(sc); + ib_dealloc_device(&rnicp->ibdev); + + sc->iwarp_softc = NULL; + + return (0); } -static ifaddr_event_handler_t -ifaddr_event_handler(void *arg, struct ifnet *ifp) +static void +iwch_activate_all(struct adapter *sc, void *arg __unused) { - printf("%s if name %s \n", __FUNCTION__, ifp->if_xname); - if (ifp->if_capabilities & IFCAP_TOE4) { - KASSERT(T3CDEV(ifp) != NULL, ("null t3cdev ptr!")); - if (cxio_hal_find_rdev_by_t3cdev(T3CDEV(ifp)) == NULL) - open_rnic_dev(T3CDEV(ifp)); - } - return 0; + ADAPTER_LOCK(sc); + if ((sc->open_device_map & sc->offload_map) != 0 && + t3_activate_uld(sc, ULD_IWARP) == 0) + setbit(&sc->offload_map, MAX_NPORTS); + ADAPTER_UNLOCK(sc); } +static void +iwch_deactivate_all(struct adapter *sc, void *arg __unused) +{ + ADAPTER_LOCK(sc); + if (isset(&sc->offload_map, MAX_NPORTS) && + t3_deactivate_uld(sc, ULD_IWARP) == 0) + clrbit(&sc->offload_map, MAX_NPORTS); + ADAPTER_UNLOCK(sc); +} static int -iwch_init_module(void) +iwch_mod_load(void) { - VNET_ITERATOR_DECL(vnet_iter); - int err; - 
struct ifnet *ifp; - - printf("%s enter\n", __FUNCTION__); - TAILQ_INIT(&dev_list); - mtx_init(&dev_mutex, "iwch dev_list lock", NULL, MTX_DEF); - - err = cxio_hal_init(); - if (err) - return err; - err = iwch_cm_init(); - if (err) - return err; - cxio_register_ev_cb(iwch_ev_dispatch); - - /* Register for ifaddr events to dynamically add TOE devs */ - event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_event_handler, - NULL, EVENTHANDLER_PRI_ANY); - - /* Register existing TOE interfaces by walking the ifnet chain */ - IFNET_RLOCK(); - VNET_LIST_RLOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); /* XXX CURVNET_SET_QUIET() ? */ - TAILQ_FOREACH(ifp, &V_ifnet, if_link) - (void)ifaddr_event_handler(NULL, ifp); - CURVNET_RESTORE(); + int rc; + + rc = iwch_cm_init(); + if (rc != 0) + return (rc); + + rc = t3_register_uld(&iwch_uld_info); + if (rc != 0) { + iwch_cm_term(); + return (rc); } - VNET_LIST_RUNLOCK(); - IFNET_RUNLOCK(); - return 0; + + t3_iterate(iwch_activate_all, NULL); + + return (rc); } -static void -iwch_exit_module(void) +static int +iwch_mod_unload(void) { - EVENTHANDLER_DEREGISTER(ifaddr_event, event_tag); - cxio_unregister_ev_cb(iwch_ev_dispatch); + t3_iterate(iwch_deactivate_all, NULL); + iwch_cm_term(); - cxio_hal_exit(); + + if (t3_unregister_uld(&iwch_uld_info) == EBUSY) + return (EBUSY); + + return (0); } +#endif /* TCP_OFFLOAD */ -static int -iwch_load(module_t mod, int cmd, void *arg) +#undef MODULE_VERSION +#include + +static int +iwch_modevent(module_t mod, int cmd, void *arg) { - int err = 0; - - switch (cmd) { - case MOD_LOAD: - printf("Loading iw_cxgb.\n"); - - iwch_init_module(); - break; - case MOD_QUIESCE: - break; - case MOD_UNLOAD: - printf("Unloading iw_cxgb.\n"); - iwch_exit_module(); - break; - case MOD_SHUTDOWN: - break; - default: - err = EOPNOTSUPP; - break; - } - - return (err); + int rc = 0; + +#ifdef TCP_OFFLOAD + switch (cmd) { + case MOD_LOAD: + rc = iwch_mod_load(); + if(rc) + printf("iw_cxgb: Chelsio T3 RDMA Driver failed to load\n"); + else + printf("iw_cxgb: Chelsio T3 RDMA Driver loaded\n"); + break; + + case MOD_UNLOAD: + rc = iwch_mod_unload(); + if(rc) + printf("iw_cxgb: Chelsio T3 RDMA Driver failed to unload\n"); + else + printf("iw_cxgb: Chelsio T3 RDMA Driver unloaded\n"); + break; + + default: + rc = EINVAL; + } +#else + printf("iw_cxgb: compiled without TCP_OFFLOAD support.\n"); + rc = EOPNOTSUPP; +#endif + return (rc); } -static moduledata_t mod_data = { +static moduledata_t iwch_mod_data = { "iw_cxgb", - iwch_load, + iwch_modevent, 0 }; MODULE_VERSION(iw_cxgb, 1); -DECLARE_MODULE(iw_cxgb, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); -MODULE_DEPEND(iw_cxgb, rdma_core, 1, 1, 1); -MODULE_DEPEND(iw_cxgb, if_cxgb, 1, 1, 1); +DECLARE_MODULE(iw_cxgb, iwch_mod_data, SI_SUB_EXEC, SI_ORDER_ANY); +MODULE_DEPEND(t3_tom, cxgbc, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, toecore, 1, 1, 1); MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1); - diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h index f4b2856..9fd3e0c 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h @@ -37,6 +37,13 @@ struct iwch_cq; struct iwch_qp; struct iwch_mr; +enum t3ctype { + T3A = 0, + T3B, + T3C +}; + +#define PAGE_MASK_IWARP (~(PAGE_SIZE-1)) struct iwch_rnic_attributes { u32 vendor_id; @@ -57,6 +64,7 @@ struct iwch_rnic_attributes { * size (4k)^i. Phys block list mode unsupported. 
*/ u32 mem_pgsizes_bitmask; + u64 max_mr_size; u8 can_resize_wq; /* @@ -97,9 +105,9 @@ struct iwch_dev { struct cxio_rdev rdev; u32 device_cap_flags; struct iwch_rnic_attributes attr; - struct kvl cqidr; - struct kvl qpidr; - struct kvl mmidr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; struct mtx lock; TAILQ_ENTRY(iwch_dev) entry; }; @@ -113,40 +121,43 @@ static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) return container_of(ibdev, struct iwch_dev, ibdev); } -static inline int t3b_device(const struct iwch_dev *rhp) +static inline int t3b_device(const struct iwch_dev *rhp __unused) { - return rhp->rdev.t3cdev_p->type == T3B; + return (0); } -static inline int t3a_device(const struct iwch_dev *rhp) +static inline int t3a_device(const struct iwch_dev *rhp __unused) { - return rhp->rdev.t3cdev_p->type == T3A; + return (0); } static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) { - return kvl_lookup(&rhp->cqidr, cqid); + return idr_find(&rhp->cqidr, cqid); } static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) { - return kvl_lookup(&rhp->qpidr, qpid); + return idr_find(&rhp->qpidr, qpid); } static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) { - return kvl_lookup(&rhp->mmidr, mmid); + return idr_find(&rhp->mmidr, mmid); } -static inline int insert_handle(struct iwch_dev *rhp, struct kvl *kvlp, +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, void *handle, u32 id) { int ret; u32 newid; do { + if (!idr_pre_get(idr, GFP_KERNEL)) { + return -ENOMEM; + } mtx_lock(&rhp->lock); - ret = kvl_alloc_above(kvlp, handle, id, &newid); + ret = idr_get_new_above(idr, handle, id, &newid); WARN_ON(ret != 0); WARN_ON(!ret && newid != id); mtx_unlock(&rhp->lock); @@ -155,14 +166,12 @@ static inline int insert_handle(struct iwch_dev *rhp, struct kvl *kvlp, return ret; } -static inline void remove_handle(struct iwch_dev *rhp, struct kvl *kvlp, u32 id) +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) { mtx_lock(&rhp->lock); - kvl_delete(kvlp, id); + idr_remove(idr, id); mtx_unlock(&rhp->lock); } -extern struct cxgb_client t3c_client; -extern cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; -extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m); +void iwch_ev_dispatch(struct iwch_dev *, struct mbuf *); #endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c index d95e48d..9afad62 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c @@ -29,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. 
#include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -66,13 +68,17 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include +#include +#include #include #include -#include -#include #include +#include +#include +#include + #include #include #include @@ -97,46 +103,46 @@ static char *states[] = { }; #endif -SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); +SYSCTL_NODE(_hw, OID_AUTO, iw_cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); -static int ep_timeout_secs = 10; +static int ep_timeout_secs = 60; TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs); -SYSCTL_INT(_hw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0, - "CM Endpoint operation timeout in seconds (default=10)"); +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RW, &ep_timeout_secs, 0, + "CM Endpoint operation timeout in seconds (default=60)"); static int mpa_rev = 1; TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev); -SYSCTL_INT(_hw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0, +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RW, &mpa_rev, 0, "MPA Revision, 0 supports amso1100, 1 is spec compliant. (default=1)"); static int markers_enabled = 0; TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled); -SYSCTL_INT(_hw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0, +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RW, &markers_enabled, 0, "Enable MPA MARKERS (default(0)=disabled)"); static int crc_enabled = 1; TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled); -SYSCTL_INT(_hw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0, +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RW, &crc_enabled, 0, "Enable MPA CRC (default(1)=enabled)"); static int rcv_win = 256 * 1024; TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win); -SYSCTL_INT(_hw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0, +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RW, &rcv_win, 0, "TCP receive window in bytes (default=256KB)"); static int snd_win = 32 * 1024; TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win); -SYSCTL_INT(_hw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0, +SYSCTL_INT(_hw_iw_cxgb, OID_AUTO, snd_win, CTLFLAG_RW, &snd_win, 0, "TCP send window in bytes (default=32KB)"); static unsigned int nocong = 0; TUNABLE_INT("hw.iw_cxgb.nocong", &nocong); -SYSCTL_UINT(_hw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0, +SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, nocong, CTLFLAG_RW, &nocong, 0, "Turn off congestion control (default=0)"); static unsigned int cong_flavor = 1; TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor); -SYSCTL_UINT(_hw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0, +SYSCTL_UINT(_hw_iw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RW, &cong_flavor, 0, "TCP Congestion control flavor (default=1)"); static void ep_timeout(void *arg); @@ -174,42 +180,44 @@ static void stop_ep_timer(struct iwch_ep *ep) { CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + if (!callout_pending(&ep->timer)) { + CTR3(KTR_IW_CXGB, "%s timer stopped when its not running! 
ep %p state %u\n", + __func__, ep, ep->com.state); + return; + } callout_drain(&ep->timer); put_ep(&ep->com); } -static int set_tcpinfo(struct iwch_ep *ep) +static int +set_tcpinfo(struct iwch_ep *ep) { - struct tcp_info ti; - struct sockopt sopt; - int err; + struct socket *so = ep->com.so; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct toepcb *toep; + int rc = 0; - sopt.sopt_dir = SOPT_GET; - sopt.sopt_level = IPPROTO_TCP; - sopt.sopt_name = TCP_INFO; - sopt.sopt_val = (caddr_t)&ti; - sopt.sopt_valsize = sizeof ti; - sopt.sopt_td = NULL; - - err = sogetopt(ep->com.so, &sopt); - if (err) { - printf("%s can't get tcpinfo\n", __FUNCTION__); - return -err; - } - if (!(ti.tcpi_options & TCPI_OPT_TOE)) { - printf("%s connection NOT OFFLOADED!\n", __FUNCTION__); - return -EINVAL; + INP_WLOCK(inp); + tp = intotcpcb(inp); + + if ((tp->t_flags & TF_TOE) == 0) { + rc = EINVAL; + printf("%s: connection NOT OFFLOADED!\n", __func__); + goto done; } + toep = tp->t_toe; - ep->snd_seq = ti.tcpi_snd_nxt; - ep->rcv_seq = ti.tcpi_rcv_nxt; - ep->emss = ti.tcpi_snd_mss - sizeof(struct tcpiphdr); - ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */ - if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS) - ep->emss -= 12; + ep->hwtid = toep->tp_tid; + ep->snd_seq = tp->snd_nxt; + ep->rcv_seq = tp->rcv_nxt; + ep->emss = tp->t_maxseg; if (ep->emss < 128) ep->emss = 128; - return 0; +done: + INP_WUNLOCK(inp); + return (rc); + } static enum iwch_ep_state @@ -264,56 +272,6 @@ void __free_ep(struct iwch_ep_common *epc) free(epc, M_DEVBUF); } -int -iwch_quiesce_tid(struct iwch_ep *ep) -{ -#ifdef notyet - struct cpl_set_tcb_field *req; - struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); - - if (m == NULL) - return (-ENOMEM); - req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); - req->reply = 0; - req->cpu_idx = 0; - req->word = htons(W_TCB_RX_QUIESCE); - req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); - req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); - - m_set_priority(m, CPL_PRIORITY_DATA); - cxgb_ofld_send(ep->com.tdev, m); -#endif - return 0; -} - -int -iwch_resume_tid(struct iwch_ep *ep) -{ -#ifdef notyet - struct cpl_set_tcb_field *req; - struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); - - if (m == NULL) - return (-ENOMEM); - req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); - req->reply = 0; - req->cpu_idx = 0; - req->word = htons(W_TCB_RX_QUIESCE); - req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); - req->val = 0; - - m_set_priority(m, CPL_PRIORITY_DATA); - cxgb_ofld_send(ep->com.tdev, m); -#endif - return 0; -} - static struct rtentry * find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, __be16 peer_port, u8 tos) @@ -331,13 +289,16 @@ find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, } static void -close_socket(struct iwch_ep_common *epc) +close_socket(struct iwch_ep_common *epc, int close) { CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); SOCK_LOCK(epc->so); soupcall_clear(epc->so, SO_RCV); SOCK_UNLOCK(epc->so); - soshutdown(epc->so, SHUT_WR|SHUT_RD); + if (close) + soclose(epc->so); + else + soshutdown(epc->so, SHUT_WR|SHUT_RD); epc->so = NULL; } @@ 
-500,7 +461,7 @@ abort_connection(struct iwch_ep *ep) CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); state_set(&ep->com, ABORTING); abort_socket(ep); - close_socket(&ep->com); + close_socket(&ep->com, 0); close_complete_upcall(ep); state_set(&ep->com, DEAD); put_ep(&ep->com); @@ -582,12 +543,13 @@ connect_request_upcall(struct iwch_ep *ep) event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); event.provider_data = ep; event.so = ep->com.so; - if (state_read(&ep->parent_ep->com) != DEAD) + if (state_read(&ep->parent_ep->com) != DEAD) { + get_ep(&ep->com); ep->parent_ep->com.cm_id->event_handler( ep->parent_ep->com.cm_id, &event); + } put_ep(&ep->parent_ep->com); - ep->parent_ep = NULL; } static void @@ -729,6 +691,7 @@ process_mpa_reply(struct iwch_ep *ep) */ CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__); state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.initiator = 1; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -885,6 +848,7 @@ process_mpa_request(struct iwch_ep *ep) * If we get here we have accumulated the entire mpa * start reply message including private data. */ + ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -934,7 +898,6 @@ process_peer_close(struct iwch_ep *ep) * rejects the CR. */ __state_set(&ep->com, CLOSING); - get_ep(&ep->com); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); @@ -961,7 +924,7 @@ process_peer_close(struct iwch_ep *ep) iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_socket(&ep->com); + close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; @@ -986,11 +949,10 @@ process_conn_error(struct iwch_ep *ep) { struct iwch_qp_attributes attrs; int ret; - int state; - state = state_read(&ep->com); - CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]); - switch (state) { + mtx_lock(&ep->com.lock); + CTR3(KTR_IW_CXGB, "%s ep %p state %u", __func__, ep, ep->com.state); + switch (ep->com.state) { case MPA_REQ_WAIT: stop_ep_timer(ep); break; @@ -1009,7 +971,6 @@ process_conn_error(struct iwch_ep *ep) * the reference on it until the ULP accepts or * rejects the CR. 
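 * (That reference is now taken in connect_request_upcall itself,
 * so no extra hold is needed at this point.)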
*/ - get_ep(&ep->com); break; case MORIBUND: case CLOSING: @@ -1031,6 +992,7 @@ process_conn_error(struct iwch_ep *ep) case ABORTING: break; case DEAD: + mtx_unlock(&ep->com.lock); CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, ep->com.so->so_error); return; @@ -1039,11 +1001,12 @@ process_conn_error(struct iwch_ep *ep) break; } - if (state != ABORTING) { - close_socket(&ep->com); - state_set(&ep->com, DEAD); + if (ep->com.state != ABORTING) { + close_socket(&ep->com, 0); + __state_set(&ep->com, DEAD); put_ep(&ep->com); } + mtx_unlock(&ep->com.lock); return; } @@ -1071,7 +1034,10 @@ process_close_complete(struct iwch_ep *ep) IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } - close_socket(&ep->com); + if (ep->parent_ep) + close_socket(&ep->com, 1); + else + close_socket(&ep->com, 0); close_complete_upcall(ep); __state_set(&ep->com, DEAD); release = 1; @@ -1102,77 +1068,59 @@ process_close_complete(struct iwch_ep *ep) * terminate() handles case (1)... */ static int -terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx) +terminate(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct toepcb *toep = (struct toepcb *)ctx; - struct socket *so = toeptoso(toep); + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + uint32_t hash = *((uint32_t *)r + 1); + unsigned int tid = ntohl(hash) >> 8 & 0xfffff; + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct socket *so = toep->tp_inp->inp_socket; struct iwch_ep *ep = so->so_rcv.sb_upcallarg; - CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + if (state_read(&ep->com) != FPDU_MODE) + goto done; + m_adj(m, sizeof(struct cpl_rdma_terminate)); - CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len); + + CTR4(KTR_IW_CXGB, "%s: tid %u, ep %p, saved %d bytes", + __func__, tid, ep, m->m_len); + m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer); ep->com.qp->attr.terminate_msg_len = m->m_len; ep->com.qp->attr.is_terminate_local = 0; - return CPL_RET_BUF_DONE; + +done: + m_freem(m); + return (0); } static int -ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx) +ec_status(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct toepcb *toep = (struct toepcb *)ctx; - struct socket *so = toeptoso(toep); - struct cpl_rdma_ec_status *rep = cplhdr(m); - struct iwch_ep *ep; - struct iwch_qp_attributes attrs; - int release = 0; + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_rdma_ec_status *rep = mtod(m, void *); + unsigned int tid = GET_TID(rep); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct socket *so = toep->tp_inp->inp_socket; + struct iwch_ep *ep = so->so_rcv.sb_upcallarg; - ep = so->so_rcv.sb_upcallarg; - CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status); - if (!so || !ep) { - panic("bogosity ep %p state %d, so %p state %x\n", ep, ep ? ep->com.state : -1, so, so ? 
so->so_state : -1); - } - mtx_lock(&ep->com.lock); - switch (ep->com.state) { - case CLOSING: - if (!rep->status) - __state_set(&ep->com, MORIBUND); - else - __state_set(&ep->com, ABORTING); - break; - case MORIBUND: - stop_ep_timer(ep); - if (!rep->status) { - if ((ep->com.cm_id) && (ep->com.qp)) { - attrs.next_state = IWCH_QP_STATE_IDLE; - iwch_modify_qp(ep->com.qp->rhp, - ep->com.qp, - IWCH_QP_ATTR_NEXT_STATE, - &attrs, 1); - } - close_socket(&ep->com); - close_complete_upcall(ep); - __state_set(&ep->com, DEAD); - release = 1; - } - break; - case DEAD: - break; - default: - panic("unknown state: %d\n", ep->com.state); - } - mtx_unlock(&ep->com.lock); if (rep->status) { - log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n", - __FUNCTION__, ep->hwtid); + struct iwch_qp_attributes attrs; + + CTR1(KTR_IW_CXGB, "%s BAD CLOSE - Aborting", __FUNCTION__); + stop_ep_timer(ep); attrs.next_state = IWCH_QP_STATE_ERROR; iwch_modify_qp(ep->com.qp->rhp, - ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, - &attrs, 1); + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep); } - if (release) - put_ep(&ep->com); - return CPL_RET_BUF_DONE; + + m_freem(m); + return (0); } static void @@ -1181,24 +1129,29 @@ ep_timeout(void *arg) struct iwch_ep *ep = (struct iwch_ep *)arg; struct iwch_qp_attributes attrs; int err = 0; + int abort = 1; mtx_lock(&ep->com.lock); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); switch (ep->com.state) { case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); break; case CLOSING: case MORIBUND: if (ep->com.cm_id && ep->com.qp) err = 1; + __state_set(&ep->com, ABORTING); break; default: - panic("unknown state: %d\n", ep->com.state); + CTR3(KTR_IW_CXGB, "%s unexpected state ep %p state %u\n", + __func__, ep, ep->com.state); + abort = 0; } - __state_set(&ep->com, ABORTING); mtx_unlock(&ep->com.lock); if (err){ attrs.next_state = IWCH_QP_STATE_ERROR; @@ -1206,7 +1159,8 @@ ep_timeout(void *arg) ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } - abort_connection(ep); + if (abort) + abort_connection(ep); put_ep(&ep->com); } @@ -1228,6 +1182,7 @@ iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) err = send_mpa_reject(ep, pdata, pdata_len); err = soshutdown(ep->com.so, 3); } + put_ep(&ep->com); return 0; } @@ -1242,8 +1197,10 @@ iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct iwch_qp *qp = get_qhp(h, conn_param->qpn); CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); - if (state_read(&ep->com) == DEAD) - return (-ECONNRESET); + if (state_read(&ep->com) == DEAD) { + err = -ECONNRESET; + goto err; + } PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); PANIC_IF(!qp); @@ -1251,7 +1208,8 @@ iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { abort_connection(ep); - return (-EINVAL); + err = -EINVAL; + goto err; } cm_id->add_ref(cm_id); @@ -1263,11 +1221,10 @@ iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ird = conn_param->ird; ep->ord = conn_param->ord; CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord); - get_ep(&ep->com); /* bind QP to EP and move to RTS */ attrs.mpa_attr = ep->mpa_attr; - attrs.max_ird = ep->ord; + attrs.max_ird = ep->ird; 
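	/*
	 * max_ird bounds the RDMA Reads the peer may have outstanding at
	 * this QP; max_ord bounds the Reads we may have outstanding at
	 * the peer.
	 */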
attrs.max_ord = ep->ord; attrs.llp_stream_handle = ep; attrs.next_state = IWCH_QP_STATE_RTS; @@ -1283,20 +1240,21 @@ iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.qp, mask, &attrs, 1); if (err) - goto err; + goto err1; err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) - goto err; + goto err1; state_set(&ep->com, FPDU_MODE); established_upcall(ep); put_ep(&ep->com); return 0; -err: +err1: ep->com.cm_id = NULL; ep->com.qp = NULL; cm_id->rem_ref(cm_id); +err: put_ep(&ep->com); return err; } @@ -1312,15 +1270,6 @@ static int init_sock(struct iwch_ep_common *epc) epc->so->so_state |= SS_NBIO; SOCK_UNLOCK(epc->so); sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_NO_DDP; - sopt.sopt_val = (caddr_t)&on; - sopt.sopt_valsize = sizeof on; - sopt.sopt_td = NULL; - err = sosetopt(epc->so, &sopt); - if (err) - printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err); - sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = (caddr_t)&on; @@ -1400,16 +1349,14 @@ iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (!(rt->rt_ifp->if_flags & IFCAP_TOE)) { printf("%s - interface not TOE capable.\n", __FUNCTION__); - goto fail3; + RTFREE(rt); + goto fail2; } tdev = TOEDEV(rt->rt_ifp); if (tdev == NULL) { printf("%s - No toedev for interface.\n", __FUNCTION__); - goto fail3; - } - if (!tdev->tod_can_offload(tdev, ep->com.so)) { - printf("%s - interface cannot offload!.\n", __FUNCTION__); - goto fail3; + RTFREE(rt); + goto fail2; } RTFREE(rt); @@ -1420,8 +1367,6 @@ iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.thread); if (!err) goto out; -fail3: - RTFREE(ep->dst); fail2: put_ep(&ep->com); out: @@ -1458,7 +1403,7 @@ iwch_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } - close_socket(&ep->com); + close_socket(&ep->com, 0); fail: cm_id->rem_ref(cm_id); put_ep(&ep->com); @@ -1474,7 +1419,7 @@ iwch_destroy_listen(struct iw_cm_id *cm_id) CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); state_set(&ep->com, DEAD); - close_socket(&ep->com); + close_socket(&ep->com, 0); cm_id->rem_ref(cm_id); put_ep(&ep->com); return 0; @@ -1493,47 +1438,48 @@ iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags) CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], abrupt); - if (ep->com.state == DEAD) { - CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep); - goto out; - } - - if (abrupt) { - if (ep->com.state != ABORTING) { - ep->com.state = ABORTING; - close = 1; - } - goto out; - } - switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: - start_ep_timer(ep); - ep->com.state = CLOSING; close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } break; case CLOSING: - ep->com.state = MORIBUND; close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; break; case MORIBUND: case ABORTING: + case DEAD: + CTR3(KTR_IW_CXGB, "%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); break; default: panic("unknown state: %d\n", ep->com.state); break; } -out: + mtx_unlock(&ep->com.lock); if (close) { if (abrupt) abort_connection(ep); - else + else { + if (!ep->parent_ep) + __state_set(&ep->com, MORIBUND); 
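	/*
	 * Non-abortive close: shut the socket down and let the normal
	 * FIN exchange finish the teardown via process_close_complete().
	 */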
shutdown_socket(&ep->com); + } } return 0; } @@ -1587,7 +1533,7 @@ process_connected(struct iwch_ep *ep) send_mpa_req(ep); } else { connect_reply_upcall(ep, -ep->com.so->so_error); - close_socket(&ep->com); + close_socket(&ep->com, 0); state_set(&ep->com, DEAD); put_ep(&ep->com); } @@ -1643,10 +1589,20 @@ process_newconn(struct iwch_ep *parent_ep) } CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__, inet_ntoa(remote->sin_addr), ntohs(remote->sin_port)); + child_ep->com.tdev = parent_ep->com.tdev; + child_ep->com.local_addr.sin_family = parent_ep->com.local_addr.sin_family; + child_ep->com.local_addr.sin_port = parent_ep->com.local_addr.sin_port; + child_ep->com.local_addr.sin_addr.s_addr = parent_ep->com.local_addr.sin_addr.s_addr; + child_ep->com.local_addr.sin_len = parent_ep->com.local_addr.sin_len; + child_ep->com.remote_addr.sin_family = remote->sin_family; + child_ep->com.remote_addr.sin_port = remote->sin_port; + child_ep->com.remote_addr.sin_addr.s_addr = remote->sin_addr.s_addr; + child_ep->com.remote_addr.sin_len = remote->sin_len; child_ep->com.so = child_so; child_ep->com.cm_id = NULL; child_ep->com.thread = parent_ep->com.thread; child_ep->parent_ep = parent_ep; + free(remote, M_SONAME); get_ep(&parent_ep->com); child_ep->parent_ep = parent_ep; @@ -1747,17 +1703,30 @@ iwch_cm_init(void) } taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq"); TASK_INIT(&iw_cxgb_task, 0, process_req, NULL); - t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate); - t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status); - return 0; + return (0); } void iwch_cm_term(void) { - t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL); - t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL); + taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task); taskqueue_free(iw_cxgb_taskq); } +void +iwch_cm_init_cpl(struct adapter *sc) +{ + + t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, terminate); + t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, ec_status); +} + +void +iwch_cm_term_cpl(struct adapter *sc) +{ + + t3_register_cpl_handler(sc, CPL_RDMA_TERMINATE, NULL); + t3_register_cpl_handler(sc, CPL_RDMA_EC_STATUS, NULL); +} +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h index 4250be3..ef76729 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h @@ -31,8 +31,8 @@ $FreeBSD$ #ifndef _IWCH_CM_H_ #define _IWCH_CM_H_ -#include -#include +#include +#include #include #include #include @@ -42,21 +42,21 @@ $FreeBSD$ #define MPA_KEY_REP "MPA ID Rep Frame" #define MPA_MAX_PRIVATE_DATA 256 #define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! 
*/ #define MPA_REJECT 0x20 #define MPA_CRC 0x40 #define MPA_MARKERS 0x80 #define MPA_FLAGS_MASK 0xE0 #define put_ep(ep) { \ - CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__, \ + CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d", __FUNCTION__, __LINE__, \ ep, atomic_load_acq_int(&((ep)->refcount))); \ if (refcount_release(&((ep)->refcount))) \ __free_ep(ep); \ } #define get_ep(ep) { \ - CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \ + CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d", __FUNCTION__, __LINE__, \ ep, atomic_load_acq_int(&((ep)->refcount))); \ refcount_acquire(&((ep)->refcount)); \ } @@ -148,7 +148,7 @@ struct iwch_ep_common { TAILQ_ENTRY(iwch_ep_common) entry; struct iw_cm_id *cm_id; struct iwch_qp *qp; - struct t3cdev *tdev; + struct toedev *tdev; enum iwch_ep_state state; u_int refcount; struct cv waitq; @@ -176,7 +176,6 @@ struct iwch_ep { u32 snd_seq; u32 rcv_seq; struct l2t_entry *l2t; - struct rtentry *dst; struct mbuf *mpa_mbuf; struct iwch_mpa_attributes mpa_attr; unsigned int mpa_pkt_len; @@ -237,13 +236,13 @@ int iwch_destroy_listen(struct iw_cm_id *cm_id); int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags); -int iwch_quiesce_tid(struct iwch_ep *ep); -int iwch_resume_tid(struct iwch_ep *ep); void __free_ep(struct iwch_ep_common *ep); void iwch_rearp(struct iwch_ep *ep); int iwch_ep_redirect(void *ctx, struct rtentry *old, struct rtentry *new, struct l2t_entry *l2t); int iwch_cm_init(void); void iwch_cm_term(void); +void iwch_cm_init_cpl(struct adapter *); +void iwch_cm_term_cpl(struct adapter *); #endif /* _IWCH_CM_H_ */ diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c index bb564bb..9ad6ed9 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c @@ -30,11 +30,13 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -59,9 +61,11 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -261,4 +265,4 @@ int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return npolled; } } - +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c index ac9c742..f5f59a3 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c @@ -30,11 +30,12 @@ POSSIBILITY OF SUCH DAMAGE. 
#include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include #include #include #include -#include #include #include #include @@ -60,11 +61,13 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include +#include +#include +#include +#include +#include -#ifdef DEBUG +#if defined(INVARIANTS) && defined(TCP_OFFLOAD) #include #include #include @@ -74,75 +77,100 @@ __FBSDID("$FreeBSD$"); #include #include +static int +cxio_rdma_get_mem(struct cxio_rdev *rdev, struct ch_mem_range *m) +{ + struct adapter *sc = rdev->adap; + struct mc7 *mem; + + if ((m->addr & 7) || (m->len & 7)) + return (EINVAL); + if (m->mem_id == MEM_CM) + mem = &sc->cm; + else if (m->mem_id == MEM_PMRX) + mem = &sc->pmrx; + else if (m->mem_id == MEM_PMTX) + mem = &sc->pmtx; + else + return (EINVAL); + + return (t3_mc7_bd_read(mem, m->addr/8, m->len/8, (u64 *)m->buf)); +} + void cxio_dump_tpt(struct cxio_rdev *rdev, uint32_t stag) { - struct ch_mem_range *m; + struct ch_mem_range m; u64 *data; + u32 addr; int rc; int size = 32; - m = kmalloc(sizeof(*m) + size, M_NOWAIT); - if (!m) { + m.buf = malloc(size, M_DEVBUF, M_NOWAIT); + if (m.buf == NULL) { CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); return; } - m->mem_id = MEM_PMRX; - m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; - m->len = size; - CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + m.mem_id = MEM_PMRX; + m.addr = (stag >> 8) * 32 + rdev->rnic_info.tpt_base; + m.len = size; + CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m.addr, m.len); + + rc = cxio_rdma_get_mem(rdev, &m); if (rc) { CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); return; } - data = (u64 *)m->buf; + data = (u64 *)m.buf; + addr = m.addr; while (size > 0) { - CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", m->addr, (unsigned long long) *data); + CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", addr, (unsigned long long) *data); size -= 8; data++; - m->addr += 8; + addr += 8; } - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); } void cxio_dump_pbl(struct cxio_rdev *rdev, uint32_t pbl_addr, uint32_t len, u8 shift) { - struct ch_mem_range *m; + struct ch_mem_range m; u64 *data; + u32 addr; int rc; int size, npages; shift += 12; npages = (len + (1ULL << shift) - 1) >> shift; size = npages * sizeof(u64); - - m = kmalloc(sizeof(*m) + size, M_NOWAIT); - if (!m) { + m.buf = malloc(size, M_DEVBUF, M_NOWAIT); + if (m.buf == NULL) { CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); return; } - m->mem_id = MEM_PMRX; - m->addr = pbl_addr; - m->len = size; + m.mem_id = MEM_PMRX; + m.addr = pbl_addr; + m.len = size; CTR4(KTR_IW_CXGB, "%s PBL addr 0x%x len %d depth %d", - __FUNCTION__, m->addr, m->len, npages); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + __FUNCTION__, m.addr, m.len, npages); + + rc = cxio_rdma_get_mem(rdev, &m); if (rc) { CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); return; } - data = (u64 *)m->buf; + data = (u64 *)m.buf; + addr = m.addr; while (size > 0) { - CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", m->addr, (unsigned long long) *data); + CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", addr, (unsigned long long) *data); size -= 8; data++; - m->addr += 8; + addr += 8; } - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); } void cxio_dump_wqe(union t3_wr *wqe) @@ -175,70 +203,76 @@ void cxio_dump_wce(struct t3_cqe *wce) void 
cxio_dump_rqt(struct cxio_rdev *rdev, uint32_t hwtid, int nents) { - struct ch_mem_range *m; + struct ch_mem_range m; int size = nents * 64; u64 *data; + u32 addr; int rc; - m = kmalloc(sizeof(*m) + size, M_NOWAIT); - if (!m) { + m.buf = malloc(size, M_DEVBUF, M_NOWAIT); + if (m.buf == NULL) { CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); return; } - m->mem_id = MEM_PMRX; - m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; - m->len = size; - CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + m.mem_id = MEM_PMRX; + m.addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m.len = size; + CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m.addr, m.len); + + rc = cxio_rdma_get_mem(rdev, &m); if (rc) { CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); return; } - data = (u64 *)m->buf; + data = (u64 *)m.buf; + addr = m.addr; while (size > 0) { - CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", m->addr, (unsigned long long) *data); + CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", addr, (unsigned long long) *data); size -= 8; data++; - m->addr += 8; + addr += 8; } - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); } void cxio_dump_tcb(struct cxio_rdev *rdev, uint32_t hwtid) { - struct ch_mem_range *m; + struct ch_mem_range m; int size = TCB_SIZE; uint32_t *data; + uint32_t addr; int rc; - m = kmalloc(sizeof(*m) + size, M_NOWAIT); - if (!m) { + m.buf = malloc(size, M_DEVBUF, M_NOWAIT); + if (m.buf == NULL) { CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); return; } - m->mem_id = MEM_CM; - m->addr = hwtid * size; - m->len = size; - CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m->addr, m->len); - rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + m.mem_id = MEM_CM; + m.addr = hwtid * size; + m.len = size; + CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m.addr, m.len); + + rc = cxio_rdma_get_mem(rdev, &m); if (rc) { CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); return; } - data = (uint32_t *)m->buf; + data = (uint32_t *)m.buf; + addr = m.addr; while (size > 0) { printf("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", - m->addr, + addr, *(data+2), *(data+3), *(data),*(data+1), *(data+6), *(data+7), *(data+4), *(data+5)); size -= 32; data += 8; - m->addr += 32; + addr += 32; } - free(m, M_DEVBUF); + free(m.buf, M_DEVBUF); } #endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c index cf0891d..65b7a2d 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c @@ -29,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. 
#include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -59,9 +61,11 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -81,11 +85,22 @@ post_qp_event(struct iwch_dev *rnicp, struct iwch_qp *qhp, struct iwch_cq *chp, struct ib_event event; struct iwch_qp_attributes attrs; + mtx_lock(&rnicp->lock); + + if (!qhp) { + CTR3(KTR_IW_CXGB, "%s unaffiliated error 0x%x qpid 0x%x\n", + __func__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + mtx_unlock(&rnicp->lock); + return; + } + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { CTR4(KTR_IW_CXGB, "%s AE received after RTS - " "qp state %d qpid 0x%x status 0x%x", __FUNCTION__, qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + mtx_unlock(&rnicp->lock); return; } @@ -95,6 +110,15 @@ post_qp_event(struct iwch_dev *rnicp, struct iwch_qp *qhp, struct iwch_cq *chp, CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + mtx_unlock(&rnicp->lock); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } event.event = ib_event; event.device = chp->ibcq.device; @@ -106,25 +130,17 @@ post_qp_event(struct iwch_dev *rnicp, struct iwch_qp *qhp, struct iwch_cq *chp, if (qhp->ibqp.event_handler) (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); - if (qhp->attr.state == IWCH_QP_STATE_RTS) { - attrs.next_state = IWCH_QP_STATE_TERMINATE; - iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, - &attrs, 1); - if (send_term) - iwch_post_terminate(qhp, rsp_msg); - } + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); } void -iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m) +iwch_ev_dispatch(struct iwch_dev *rnicp, struct mbuf *m) { - struct iwch_dev *rnicp; struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; struct iwch_cq *chp; struct iwch_qp *qhp; u32 cqid = RSPQ_CQID(rsp_msg); - rnicp = (struct iwch_dev *) rdev_p->ulp; mtx_lock(&rnicp->lock); chp = get_chp(rnicp, cqid); qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); @@ -136,7 +152,7 @@ iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m) CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); mtx_unlock(&rnicp->lock); - goto out; + return; } iwch_qp_add_ref(&qhp->ibqp); mtx_lock(&chp->lock); @@ -200,12 +216,6 @@ iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m) case TPT_ERR_BOUND: case TPT_ERR_INVALIDATE_SHARED_MR: case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: - log(LOG_ERR, "%s - CQE Err qpid 0x%x opcode %d status 0x%x " - "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, - CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), - CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), - CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); - (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); break; @@ -248,6 +258,5 @@ done: wakeup(chp); mtx_unlock(&chp->lock); iwch_qp_rem_ref(&qhp->ibqp); -out: - m_free(m); } +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c index cf42e38..2427263 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c +++ 
b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. @@ -30,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include @@ -59,12 +62,25 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include - -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #include #include +#include +#include #include #include #include @@ -72,29 +88,21 @@ __FBSDID("$FreeBSD$"); #include #include -static TAILQ_HEAD( ,cxio_rdev) rdev_list; -static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; - -static struct cxio_rdev * -cxio_hal_find_rdev_by_name(char *dev_name) +/* Response queue used for RDMA events. */ +#define ASYNC_NOTIF_RSPQ 0 +static inline int +cxio_rdma_cq_setup(struct cxio_rdev *rdev_p, unsigned id, uint64_t base_addr, + unsigned size, unsigned ovfl_mode, unsigned credits, unsigned credit_thres) { - struct cxio_rdev *rdev; + struct adapter *sc = rdev_p->adap; + int rc; - TAILQ_FOREACH(rdev, &rdev_list, entry) - if (!strcmp(rdev->dev_name, dev_name)) - return rdev; - return NULL; -} - -struct cxio_rdev * -cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev) -{ - struct cxio_rdev *rdev; + mtx_lock_spin(&sc->sge.reg_lock); + rc = -t3_sge_init_cqcntxt(sc, id, base_addr, size, ASYNC_NOTIF_RSPQ, + ovfl_mode, credits, credit_thres); + mtx_unlock_spin(&sc->sge.reg_lock); - TAILQ_FOREACH(rdev, &rdev_list, entry) - if (rdev->t3cdev_p == tdev) - return rdev; - return NULL; + return (rc); } int @@ -104,12 +112,14 @@ cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, int ret; struct t3_cqe *cqe; u32 rptr; + struct adapter *sc = rdev_p->adap; + + if (op != CQ_CREDIT_UPDATE) + credit = 0; - struct rdma_cq_op setup; - setup.id = cq->cqid; - setup.credits = (op == CQ_CREDIT_UPDATE) ? 
credit : 0; - setup.op = op; - ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + mtx_lock_spin(&sc->sge.reg_lock); + ret = t3_sge_cqcntxt_op(sc, cq->cqid, op, credit); + mtx_unlock_spin(&sc->sge.reg_lock); if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) return (ret); @@ -140,30 +150,26 @@ cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { DELAY(1); if (i++ > 1000000) { - PANIC_IF(1); + struct adapter *sc = rdev_p->adap; + log(LOG_ERR, "%s: stalled rnic\n", - rdev_p->dev_name); + device_get_nameunit(sc->dev)); + PANIC_IF(1); return (-EIO); } } - return 1; + return (1); } - return 0; + return (0); } static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) { - struct rdma_cq_setup setup; - setup.id = cqid; - setup.base_addr = 0; /* NULL address */ - setup.size = 0; /* disaable the CQ */ - setup.credits = 0; - setup.credit_thres = 0; - setup.ovfl_mode = 0; - return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); + + return (cxio_rdma_cq_setup(rdev_p, cqid, 0, 0, 0, 0, 0)); } static int @@ -171,43 +177,38 @@ cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) { u64 sge_cmd; struct t3_modify_qp_wr *wqe; - struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + struct mbuf *m; + + m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, wqe); if (m == NULL) { CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); return (-ENOMEM); } wqe = mtod(m, struct t3_modify_qp_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*wqe); memset(wqe, 0, sizeof(*wqe)); build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7); wqe->flags = htobe32(MODQP_WRITE_EC); sge_cmd = qpid << 8 | 3; wqe->sge_cmd = htobe64(sge_cmd); - m_set_priority(m, CPL_PRIORITY_CONTROL); - m_set_sgl(m, NULL); - m_set_sgllen(m, 0); - return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); + return t3_offload_tx(rdev_p->adap, m); } int -cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel) { - struct rdma_cq_setup setup; int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + size += 1; /* one extra page for storing cq-in-err state */ cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); if (!cq->cqid) return (-ENOMEM); - cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO); - if (!cq->sw_queue) - return (-ENOMEM); -#if 0 - cq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, - (1UL << (cq->size_log2)) * - sizeof(struct t3_cqe), - &(cq->dma_addr), M_NOWAIT); -#else - cq->queue = contigmalloc((1UL << (cq->size_log2))*sizeof(struct t3_cqe), + if (kernel) { + cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!cq->sw_queue) + return (-ENOMEM); + } + + cq->queue = contigmalloc(size, M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); if (cq->queue) cq->dma_addr = vtophys(cq->queue); @@ -215,35 +216,10 @@ cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) free(cq->sw_queue, M_DEVBUF); return (-ENOMEM); } -#endif - -#ifdef notyet - pci_unmap_addr_set(cq, mapping, cq->dma_addr); -#endif memset(cq->queue, 0, size); - setup.id = cq->cqid; - setup.base_addr = (u64) (cq->dma_addr); - setup.size = 1UL << cq->size_log2; - setup.credits = 65535; - setup.credit_thres = 1; - if (rdev_p->t3cdev_p->type != T3A) - setup.ovfl_mode = 0; - else - setup.ovfl_mode = 1; - return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); -} -int -cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) -{ - struct rdma_cq_setup setup; - setup.id = cq->cqid; - setup.base_addr = (u64) 
(cq->dma_addr); - setup.size = 1UL << cq->size_log2; - setup.credits = setup.size; - setup.credit_thres = setup.size; /* TBD: overflow recovery */ - setup.ovfl_mode = 1; - return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); + return (cxio_rdma_cq_setup(rdev_p, cq->cqid, cq->dma_addr, + 1UL << cq->size_log2, 0, 65535, 1)); } static u32 @@ -325,7 +301,7 @@ cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, if (!wq->qpid) return (-ENOMEM); - wq->rq = malloc(depth * sizeof(u64), M_DEVBUF, M_NOWAIT|M_ZERO); + wq->rq = malloc(depth * sizeof(struct t3_swrq), M_DEVBUF, M_NOWAIT|M_ZERO); if (!wq->rq) goto err1; @@ -336,28 +312,19 @@ cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, wq->sq = malloc(depth * sizeof(struct t3_swsq), M_DEVBUF, M_NOWAIT|M_ZERO); if (!wq->sq) goto err3; -#if 0 - wq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, - depth * sizeof(union t3_wr), - &(wq->dma_addr), M_NOWAIT); -#else wq->queue = contigmalloc(depth *sizeof(union t3_wr), M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); if (wq->queue) wq->dma_addr = vtophys(wq->queue); - -#endif - if (!wq->queue) + else goto err4; memset(wq->queue, 0, depth * sizeof(union t3_wr)); -#ifdef notyet - pci_unmap_addr_set(wq, mapping, wq->dma_addr); -#endif wq->doorbell = rdev_p->rnic_info.kdb_addr; if (!kernel_domain) wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + (wq->qpid << rdev_p->qpshift); + wq->rdev = rdev_p; CTR4(KTR_IW_CXGB, "%s qpid 0x%x doorbell 0x%p udb 0x%llx", __FUNCTION__, wq->qpid, wq->doorbell, (unsigned long long) wq->udb); return 0; @@ -431,10 +398,11 @@ insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) cq->sw_wptr++; } -void +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) { u32 ptr; + int flushed = 0; CTR3(KTR_IW_CXGB, "%s wq %p cq %p", __FUNCTION__, wq, cq); @@ -442,8 +410,11 @@ cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) CTR4(KTR_IW_CXGB, "%s rq_rptr %u rq_wptr %u skip count %u", __FUNCTION__, wq->rq_rptr, wq->rq_wptr, count); ptr = wq->rq_rptr + count; - while (ptr++ != wq->rq_wptr) + while (ptr++ != wq->rq_wptr) { insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; } static void @@ -468,19 +439,22 @@ insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, cq->sw_wptr++; } -void +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) { __u32 ptr; + int flushed = 0; struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); ptr = wq->sq_rptr + count; - sqp += count; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); while (ptr != wq->sq_wptr) { insert_sq_cqe(wq, cq, sqp); - sqp++; ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + flushed++; } + return flushed; } /* @@ -516,7 +490,7 @@ static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) return 0; - if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) && + if (CQE_OPCODE(*cqe) && RQ_TYPE(*cqe) && Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) return 0; @@ -563,16 +537,8 @@ cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) { - struct rdma_cq_setup setup; - setup.id = 0; - setup.base_addr = 0; /* NULL address */ - setup.size = 1; /* enable the CQ */ - setup.credits = 0; - - /* force SGE to redirect to RspQ and interrupt */ - setup.credit_thres = 0; - setup.ovfl_mode = 1; - return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); + + return (cxio_rdma_cq_setup(rdev_p, 0, 0, 1, 1, 0, 0)); } static int @@ -584,41 
+550,28 @@ cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) struct t3_modify_qp_wr *wqe; struct mbuf *m; - m = m_gethdr(MT_DATA, M_NOWAIT); + m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, wqe); if (m == NULL) { CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); - return (-ENOMEM); + return (ENOMEM); } err = cxio_hal_init_ctrl_cq(rdev_p); if (err) { CTR2(KTR_IW_CXGB, "%s err %d initializing ctrl_cq", __FUNCTION__, err); goto err; } -#if 0 - rdev_p->ctrl_qp.workq = dma_alloc_coherent( - rdev_p->rnic_info.pdev, - (1 << T3_CTRL_QP_SIZE_LOG2) * - sizeof(union t3_wr), - &(rdev_p->ctrl_qp.dma_addr), - M_NOWAIT); -#else + rdev_p->ctrl_qp.workq = contigmalloc((1 << T3_CTRL_QP_SIZE_LOG2) *sizeof(union t3_wr), M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); if (rdev_p->ctrl_qp.workq) rdev_p->ctrl_qp.dma_addr = vtophys(rdev_p->ctrl_qp.workq); - -#endif - - if (!rdev_p->ctrl_qp.workq) { + else { CTR1(KTR_IW_CXGB, "%s dma_alloc_coherent failed", __FUNCTION__); - err = -ENOMEM; + err = ENOMEM; goto err; } -#if 0 - pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping, - rdev_p->ctrl_qp.dma_addr); -#endif - rdev_p->ctrl_qp.doorbell = (void /*__iomem */ *)rdev_p->rnic_info.kdb_addr; + + rdev_p->ctrl_qp.doorbell = rdev_p->rnic_info.kdb_addr; memset(rdev_p->ctrl_qp.workq, 0, (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); @@ -637,10 +590,8 @@ cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | V_EC_TYPE(0) | V_EC_GEN(1) | V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; - wqe = mtod(m, struct t3_modify_qp_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*wqe); memset(wqe, 0, sizeof(*wqe)); - build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0, T3_CTL_QP_TID, 7); wqe->flags = htobe32(MODQP_WRITE_EC); sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; @@ -650,12 +601,9 @@ cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) CTR3(KTR_IW_CXGB, "CtrlQP dma_addr 0x%llx workq %p size %d", (unsigned long long) rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); - m_set_priority(m, CPL_PRIORITY_CONTROL); - m_set_sgl(m, NULL); - m_set_sgllen(m, 0); - return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); + return t3_offload_tx(rdev_p->adap, m); err: - m_free(m); + m_freem(m); return err; } @@ -681,7 +629,7 @@ cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) */ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, - u32 len, void *data, int completion) + u32 len, void *data) { u32 i, nr_wqe, copy_len; u8 *copy_data; @@ -718,7 +666,7 @@ cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, flag = 0; if (i == (nr_wqe - 1)) { /* last WQE */ - flag = completion ? 
T3_COMPLETION_FLAG : 0; + flag = T3_COMPLETION_FLAG; if (len % 32) utx_len = len / 32 + 1; else @@ -786,14 +734,13 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, u32 *stag, u8 stag_state, u32 pdid, enum tpt_mem_type type, enum tpt_mem_perm perm, - u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, - u32 *pbl_size, u32 *pbl_addr) + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) { int err; struct tpt_entry tpt; u32 stag_idx; u32 wptr; - int rereg = (*stag != T3_STAG_UNSET); stag_state = stag_state > 0; stag_idx = (*stag) >> 8; @@ -807,30 +754,8 @@ __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, CTR5(KTR_IW_CXGB, "%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x", __FUNCTION__, stag_state, type, pdid, stag_idx); - if (reset_tpt_entry) - cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); - else if (!rereg) { - *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); - if (!*pbl_addr) { - return (-ENOMEM); - } - } - mtx_lock(&rdev_p->ctrl_qp.lock); - /* write PBL first if any - update pbl only if pbl list exist */ - if (pbl) { - - CTR4(KTR_IW_CXGB, "%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d", - __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base, - *pbl_size); - err = cxio_hal_ctrl_qp_write_mem(rdev_p, - (*pbl_addr >> 5), - (*pbl_size << 3), pbl, 0); - if (err) - goto ret; - } - /* write TPT entry */ if (reset_tpt_entry) memset(&tpt, 0, sizeof(tpt)); @@ -845,23 +770,23 @@ __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = htobe32(len); tpt.va_hi = htobe32((u32) (to >> 32)); tpt.va_low_or_fbo = htobe32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; tpt.rsvd_pbl_size = reset_tpt_entry ? 
0 : - htobe32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + htobe32(V_TPT_PBL_SIZE((pbl_size) >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + (rdev_p->rnic_info.tpt_base >> 5), - sizeof(tpt), &tpt, 1); + sizeof(tpt), &tpt); /* release the stag index to free pool */ if (reset_tpt_entry) cxio_hal_put_stag(rdev_p->rscp, stag_idx); -ret: + wptr = rdev_p->ctrl_qp.wptr; mtx_unlock(&rdev_p->ctrl_qp.lock); if (!err) @@ -872,61 +797,90 @@ ret: return err; } +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + CTR4(KTR_IW_CXGB, "%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mtx_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mtx_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (cxio_wait(&rdev_p->ctrl_qp, + &rdev_p->ctrl_qp.lock, + SEQ32_GE(rdev_p->ctrl_qp.rptr, wptr))) + return ERESTART; + + return 0; +} + int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, u32 pbl_addr) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - &pbl_size, &pbl_addr); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); } int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) { - u32 pbl_size = 0; *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, - NULL, &pbl_size, NULL); + 0, 0); } int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - NULL, NULL); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); } int -cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr, + struct socket *so) { struct t3_rdma_init_wr *wqe; - struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + struct mbuf *m; + struct ofld_hdr *oh; + int rc; + struct tcpcb *tp; + struct inpcb *inp; + struct toepcb *toep; + + m = M_GETHDR_OFLD(0, CPL_PRIORITY_DATA, wqe); if (m == NULL) return (-ENOMEM); CTR2(KTR_IW_CXGB, "%s rdev_p %p", __FUNCTION__, rdev_p); - wqe = mtod(m, struct t3_rdma_init_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*wqe); wqe->wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_INIT)); wqe->wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(attr->tid) | V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); @@ -940,36 +894,41 @@ cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) wqe->mpaattrs = attr->mpaattrs; wqe->qpcaps = attr->qpcaps; 
wqe->ulpdu_size = htobe16(attr->tcp_emss); - wqe->flags = htobe32(attr->flags); + wqe->rqe_count = htobe16(attr->rqe_count); + wqe->flags_rtr_type = htobe16(attr->flags | + V_RTR_TYPE(attr->rtr_type) | + V_CHAN(attr->chan)); wqe->ord = htobe32(attr->ord); wqe->ird = htobe32(attr->ird); wqe->qp_dma_addr = htobe64(attr->qp_dma_addr); wqe->qp_dma_size = htobe32(attr->qp_dma_size); wqe->irs = htobe32(attr->irs); - m_set_priority(m, 0); /* 0=>ToeQ; 1=>CtrlQ */ - m_set_sgl(m, NULL); - m_set_sgllen(m, 0); - return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); -} -void -cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) -{ - cxio_ev_cb = ev_cb; -} - -void -cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) -{ - cxio_ev_cb = NULL; + /* XXX: bad form, fix later */ + inp = sotoinpcb(so); + INP_WLOCK(inp); + tp = intotcpcb(inp); + toep = tp->t_toe; + oh = mtod(m, struct ofld_hdr *); + oh->plen = 0; + oh->flags |= F_HDR_DF; + enqueue_wr(toep, m); + toep->tp_wr_avail--; + toep->tp_wr_unacked++; + rc = t3_offload_tx(rdev_p->adap, m); + INP_WUNLOCK(inp); + + return (rc); } static int -cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct mbuf *m) +cxio_hal_ev_handler(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - static int cnt; - struct cxio_rdev *rdev_p = NULL; + struct adapter *sc = qs->adap; + struct iwch_dev *rnicp = sc->iwarp_softc; + struct cxio_rdev *rdev_p = &rnicp->rdev; struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; + int qpid = CQE_QPID(rsp_msg->cqe); CTR6(KTR_IW_CXGB, "%s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x", __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), @@ -978,80 +937,50 @@ cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct mbuf *m) RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), RSPQ_CREDIT_THRESH(rsp_msg)); CTR4(KTR_IW_CXGB, "CQE: QPID 0x%0x type 0x%0x status 0x%0x opcode %d", - CQE_QPID(rsp_msg->cqe), - CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), - CQE_OPCODE(rsp_msg->cqe)); + qpid, CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe)); CTR3(KTR_IW_CXGB, "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x", CQE_LEN(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); - rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; - if (!rdev_p) { - CTR2(KTR_IW_CXGB, "%s called by t3cdev %p with null ulp", __FUNCTION__, - t3cdev_p); - return 0; - } - if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + + switch(qpid) { + case T3_CTRL_QP_ID: mtx_lock(&rdev_p->ctrl_qp.lock); rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; wakeup(&rdev_p->ctrl_qp); mtx_unlock(&rdev_p->ctrl_qp.lock); - m_free(m); - } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) - m_free(m); - else if (cxio_ev_cb) - (*cxio_ev_cb) (rdev_p, m); - else - m_free(m); - cnt++; - return 0; + break; + case 0xfff8: + break; + default: + iwch_ev_dispatch(rnicp, m); + } + + m_freem(m); + return (0); } /* Caller takes care of locking if needed */ int cxio_rdev_open(struct cxio_rdev *rdev_p) { - struct ifnet *ifp; int err = 0; + struct rdma_info *ri = &rdev_p->rnic_info; + struct adapter *sc = rdev_p->adap; - if (strlen(rdev_p->dev_name)) { - if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { - return (-EBUSY); - } - ifp = rdev_p->ifp; - if (ifp == NULL) - return (-EINVAL); - if_free(ifp); - } else if (rdev_p->t3cdev_p) { - if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) - return (-EBUSY); - ifp = rdev_p->t3cdev_p->lldev; - strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, - T3_MAX_DEV_NAME_LEN); - } else { - CTR1(KTR_IW_CXGB, "%s 
t3cdev_p or dev_name must be set", __FUNCTION__); - return (-EINVAL); - } - - TAILQ_INSERT_TAIL(&rdev_list, rdev_p, entry); + KASSERT(rdev_p->adap, ("%s: adap is NULL", __func__)); - CTR2(KTR_IW_CXGB, "%s opening rnic dev %s", __FUNCTION__, rdev_p->dev_name); memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); - if (!rdev_p->t3cdev_p) - rdev_p->t3cdev_p = T3CDEV(ifp); - rdev_p->t3cdev_p->ulp = (void *) rdev_p; - err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, - &(rdev_p->rnic_info)); - if (err) { - log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", - __FUNCTION__, rdev_p->t3cdev_p, err); - goto err1; - } - err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, - &(rdev_p->port_info)); - if (err) { - log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", - __FUNCTION__, rdev_p->t3cdev_p, err); - goto err1; - } + + ri->udbell_physbase = rman_get_start(sc->udbs_res); + ri->udbell_len = rman_get_size(sc->udbs_res); + ri->tpt_base = t3_read_reg(sc, A_ULPTX_TPT_LLIMIT); + ri->tpt_top = t3_read_reg(sc, A_ULPTX_TPT_ULIMIT); + ri->pbl_base = t3_read_reg(sc, A_ULPTX_PBL_LLIMIT); + ri->pbl_top = t3_read_reg(sc, A_ULPTX_PBL_ULIMIT); + ri->rqt_base = t3_read_reg(sc, A_ULPRX_RQ_LLIMIT); + ri->rqt_top = t3_read_reg(sc, A_ULPRX_RQ_ULIMIT); + ri->kdb_addr = (void *)((unsigned long) + rman_get_virtual(sc->regs_res) + A_SG_KDOORBELL); /* * qpshift is the number of bits to shift the qpid left in order @@ -1064,8 +993,8 @@ cxio_rdev_open(struct cxio_rdev *rdev_p) PAGE_SHIFT)); rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; - CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d", - rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %p info: tpt_base 0x%0x tpt_top 0x%0x num stags %d", + rdev_p->adap, rdev_p->rnic_info.tpt_base, rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p)); CTR4(KTR_IW_CXGB, "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x", rdev_p->rnic_info.pbl_base, @@ -1111,43 +1040,34 @@ err3: err2: cxio_hal_destroy_ctrl_qp(rdev_p); err1: - TAILQ_REMOVE(&rdev_list, rdev_p, entry); return err; } void cxio_rdev_close(struct cxio_rdev *rdev_p) { - if (rdev_p) { - cxio_hal_pblpool_destroy(rdev_p); - cxio_hal_rqtpool_destroy(rdev_p); - TAILQ_REMOVE(&rdev_list, rdev_p, entry); - rdev_p->t3cdev_p->ulp = NULL; - cxio_hal_destroy_ctrl_qp(rdev_p); - cxio_hal_destroy_resource(rdev_p->rscp); - } + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); } int -cxio_hal_init(void) +cxio_hal_init(struct adapter *sc) { - TAILQ_INIT(&rdev_list); #ifdef needed if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) - return (-ENOMEM); + return (ENOMEM); #endif - t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); - return 0; + t3_register_cpl_handler(sc, CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + + return (0); } void -cxio_hal_exit(void) +cxio_hal_uninit(struct adapter *sc) { - struct cxio_rdev *rdev, *tmp; - - t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); - TAILQ_FOREACH_SAFE(rdev, &rdev_list, entry, tmp) - cxio_rdev_close(rdev); + t3_register_cpl_handler(sc, CPL_ASYNC_NOTIF, NULL); #ifdef needed cxio_hal_destroy_rhdl_resource(); #endif @@ -1304,11 +1224,12 @@ cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, } /* incoming SEND with no receive posted failures */ - if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) && + if 
(CQE_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) && Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { ret = -1; goto skip_cqe; } + PANIC_IF((*cqe_flushed == 0) && !SW_CQE(*hw_cqe)); goto proc_cqe; } @@ -1323,6 +1244,13 @@ cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, * then we complete this with TPT_ERR_MSN and mark the wq in * error. */ + + if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + wq->error = 1; + ret = -1; + goto skip_cqe; + } + if (__predict_false((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { wq->error = 1; hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); @@ -1367,13 +1295,17 @@ proc_cqe: wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); CTR2(KTR_IW_CXGB, "%s completing sq idx %ld", __FUNCTION__, Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); - *cookie = (wq->sq + - Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id; + *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id; wq->sq_rptr++; } else { CTR2(KTR_IW_CXGB, "%s completing rq idx %ld", __FUNCTION__, Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); - *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id; + if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr) + cxio_hal_pblpool_free(wq->rdev, + wq->rq[Q_PTR2IDX(wq->rq_rptr, + wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE); + PANIC_IF(Q_EMPTY(wq->rq_rptr, wq->rq_wptr)); wq->rq_rptr++; } @@ -1404,5 +1336,4 @@ skip_cqe: } return ret; } - - +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h index 6a401e0..6b5f948 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h @@ -45,7 +45,11 @@ $FreeBSD$ #define T3_MAX_NUM_PD (1<<15) #define T3_MAX_PBL_SIZE 256 #define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1) +#define T3_MAX_CQ_DEPTH 65536 #define T3_MAX_NUM_STAG (1<<15) +#define T3_MAX_MR_SIZE 0x100000000ULL +#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ #define T3_STAG_UNSET 0xffffffff @@ -55,12 +59,9 @@ struct cxio_hal_ctrl_qp { u32 wptr; u32 rptr; struct mtx lock; /* for the wtpr, can sleep */ -#ifdef notyet - DECLARE_PCI_UNMAP_ADDR(mapping) -#endif union t3_wr *workq; /* the work request queue */ bus_addr_t dma_addr; /* pci bus address of the workq */ - void /* __iomem */ *doorbell; + void *doorbell; }; struct cxio_hal_resource { @@ -85,13 +86,10 @@ struct cxio_ucontext { }; struct cxio_rdev { - char dev_name[T3_MAX_DEV_NAME_LEN]; - struct t3cdev *t3cdev_p; + struct adapter *adap; struct rdma_info rnic_info; - struct adap_ports port_info; struct cxio_hal_resource *rscp; struct cxio_hal_ctrl_qp ctrl_qp; - void *ulp; unsigned long qpshift; u32 qpnr; u32 qpmask; @@ -139,9 +137,8 @@ int cxio_rdev_open(struct cxio_rdev *rdev); void cxio_rdev_close(struct cxio_rdev *rdev); int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, enum t3_cq_opcode op, u32 credit); -int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); -int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, @@ -149,27 +146,27 @@ int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, int cxio_destroy_qp(struct 
cxio_rdev *rdev, struct t3_wq *wq, struct cxio_ucontext *uctx); int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, u32 pbl_addr); int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); -int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); -void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); -void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr, + struct socket *so); u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); -int cxio_hal_init(void); +int cxio_hal_init(struct adapter *); +void cxio_hal_uninit(struct adapter *); void cxio_hal_exit(void); -void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); -void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_flush_hw_cq(struct t3_cq *cq); @@ -178,7 +175,7 @@ int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, #define MOD "iw_cxgb: " -#ifdef DEBUG +#ifdef INVARIANTS void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint32_t len, u8 shift); void cxio_dump_wqe(union t3_wr *wqe); @@ -187,60 +184,7 @@ void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); #endif - - static unsigned char hiBitSetTab[] = { - 0, 1, 2, 2, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7 - -}; - - -static __inline -int ilog2(unsigned long val) -{ - unsigned long tmp; - - tmp = val >> 24; - if (tmp) { - return hiBitSetTab[tmp] + 23; - } - tmp = (val >> 16) & 0xff; - if (tmp) { - return hiBitSetTab[tmp] + 15; - } - tmp = (val >> 8) & 0xff; - if (tmp) { - return hiBitSetTab[tmp] + 7; - - } - return hiBitSetTab[val & 0xff] - 1; -} - #define cxfree(a) free((a), M_DEVBUF); -#define kmalloc(a, b) malloc((a), M_DEVBUF, (b)) -#define kzalloc(a, b) malloc((a), M_DEVBUF, (b)|M_ZERO) - -static __inline __attribute__((const)) -unsigned long roundup_pow_of_two(unsigned long n) -{ - return 1UL << flsl(n - 1); -} - -#define PAGE_ALIGN(x) roundup2((x), PAGE_SIZE) #include struct gen_pool { @@ -259,6 +203,7 @@ 
gen_pool_create(daddr_t base, u_int chunk_shift, u_int len) if (gp == NULL) return (NULL); + memset(gp, 0, sizeof(struct gen_pool)); gp->gen_list = blist_create(len >> chunk_shift, M_NOWAIT); if (gp->gen_list == NULL) { free(gp, M_DEVBUF); @@ -323,8 +268,7 @@ gen_pool_destroy(struct gen_pool *gp) mtx_unlock(lockp); \ __ret; \ }) -extern struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev); -#define KTR_IW_CXGB KTR_SPARE4 +#define KTR_IW_CXGB KTR_SPARE3 #endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h new file mode 100644 index 0000000..7c7cd24 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ib_intfc.h @@ -0,0 +1,22 @@ +#ifndef __IB_INTFC_H__ +#define __IB_INTFC_H__ + +/* $FreeBSD$ */ + +#undef prefetch +#undef WARN_ON +#undef max_t +#undef udelay +#undef le32_to_cpu +#undef le16_to_cpu +#undef cpu_to_le32 +#undef swab32 +#undef container_of + +#undef LIST_HEAD +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#endif /* __IB_INTFC_H__ */ diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c index c052aa6..0ccb70f 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c @@ -29,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -59,9 +61,11 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -72,14 +76,24 @@ __FBSDID("$FreeBSD$"); #include #include +static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) +{ + u32 mmid; + + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __func__, mmid, mhp); + return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); +} + int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, - int shift, - __be64 *page_list) + int shift) { u32 stag; - u32 mmid; - + int ret; if (cxio_register_phys_mem(&rhp->rdev, &stag, mhp->attr.pdid, @@ -87,28 +101,24 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return (-ENOMEM); - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); - return 0; + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; } int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages) { u32 stag; - u32 mmid; - + int ret; /* We could support this... 
*/ if (npages > mhp->attr.pbl_size) @@ -121,17 +131,40 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return (-ENOMEM); - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); + + ret = iwch_finish_mem_reg(mhp, stag); + if (ret) + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + return ret; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + return 0; + } + +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); } int build_phys_page_list(struct ib_phys_buf *buffer_list, @@ -204,3 +237,4 @@ int build_phys_page_list(struct ib_phys_buf *buffer_list, return 0; } +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c index bc336a4..4961395 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c @@ -29,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -62,9 +64,12 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include -#include +#include +#include +#include +#include +#include + #include #include @@ -180,6 +185,8 @@ iwch_create_cq(struct ib_device *ibdev, int entries, int vector, struct iwch_create_cq_resp uresp; struct iwch_create_cq_req ureq; struct iwch_ucontext *ucontext = NULL; + static int warned; + size_t resplen; CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries); rhp = to_iwch_dev(ibdev); @@ -214,7 +221,7 @@ iwch_create_cq(struct ib_device *ibdev, int entries, int vector, entries = roundup_pow_of_two(entries); chp->cq.size_log2 = ilog2(entries); - if (cxio_create_cq(&rhp->rdev, &chp->cq)) { + if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) { cxfree(chp); return ERR_PTR(-ENOMEM); } @@ -222,7 +229,11 @@ iwch_create_cq(struct ib_device *ibdev, int entries, int vector, chp->ibcq.cqe = 1 << chp->cq.size_log2; mtx_init(&chp->lock, "cxgb cq", NULL, MTX_DEF|MTX_DUPOK); chp->refcnt = 1; - insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) { + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + cxfree(chp); + return ERR_PTR(-ENOMEM); + } if (ucontext) { struct iwch_mm_entry *mm; @@ -238,15 +249,27 @@ iwch_create_cq(struct ib_device *ibdev, int entries, int vector, uresp.key = ucontext->key; ucontext->key += PAGE_SIZE; mtx_unlock(&ucontext->mmap_lock); - if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + mm->key = uresp.key; + mm->addr = vtophys(chp->cq.queue); + if (udata->outlen < sizeof uresp) { + if (!warned++) + CTR1(KTR_IW_CXGB, "%s Warning - " + "downlevel libcxgb3 (non-fatal).\n", + __func__); + mm->len = PAGE_ALIGN((1UL << 
uresp.size_log2) * + sizeof(struct t3_cqe)); + resplen = sizeof(struct iwch_create_cq_resp_v0); + } else { + mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) * + sizeof(struct t3_cqe)); + uresp.memsize = mm->len; + resplen = sizeof uresp; + } + if (ib_copy_to_udata(udata, &uresp, resplen)) { cxfree(mm); iwch_destroy_cq(&chp->ibcq); return ERR_PTR(-EFAULT); } - mm->key = uresp.key; - mm->addr = vtophys(chp->cq.queue); - mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * - sizeof (struct t3_cqe)); insert_mmap(ucontext, mm); } CTR4(KTR_IW_CXGB, "created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx", @@ -256,72 +279,11 @@ } static int -iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +iwch_resize_cq(struct ib_cq *cq __unused, int cqe __unused, + struct ib_udata *udata __unused) { -#ifdef notyet - struct iwch_cq *chp = to_iwch_cq(cq); - struct t3_cq oldcq, newcq; - int ret; - - CTR3(KTR_IW_CXGB, "%s ib_cq %p cqe %d", __FUNCTION__, cq, cqe); - - /* We don't downsize... */ - if (cqe <= cq->cqe) - return 0; - - /* create new t3_cq with new size */ - cqe = roundup_pow_of_two(cqe+1); - newcq.size_log2 = ilog2(cqe); - - /* Dont allow resize to less than the current wce count */ - if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { - return (-ENOMEM); - } - /* Quiesce all QPs using this CQ */ - ret = iwch_quiesce_qps(chp); - if (ret) { - return (ret); - } - - ret = cxio_create_cq(&chp->rhp->rdev, &newcq); - if (ret) { - return (ret); - } - - /* copy CQEs */ - memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * - sizeof(struct t3_cqe)); - - /* old iwch_qp gets new t3_cq but keeps old cqid */ - oldcq = chp->cq; - chp->cq = newcq; - chp->cq.cqid = oldcq.cqid; - - /* resize new t3_cq to update the HW context */ - ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); - if (ret) { - chp->cq = oldcq; - return ret; - } - chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1; - - /* destroy old t3_cq */ - oldcq.cqid = newcq.cqid; - ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); - if (ret) { - log(LOG_ERR, "%s - cxio_destroy_cq failed %d\n", - __FUNCTION__, ret); - } - - /* add user hooks here */ - - /* resume qps */ - ret = iwch_resume_qps(chp); - return ret; -#else return (-ENOSYS); -#endif } static int @@ -357,67 +319,12 @@ iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) return err; } -#ifdef notyet static int -iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +iwch_mmap(struct ib_ucontext *context __unused, struct vm_area_struct *vma __unused) { -#ifdef notyet - int len = vma->vm_end - vma->vm_start; - u32 key = vma->vm_pgoff << PAGE_SHIFT; - struct cxio_rdev *rdev_p; - int ret = 0; - struct iwch_mm_entry *mm; - struct iwch_ucontext *ucontext; - u64 addr; - - CTR4(KTR_IW_CXGB, "%s pgoff 0x%lx key 0x%x len %d", __FUNCTION__, vma->vm_pgoff, - key, len); - - if (vma->vm_start & (PAGE_SIZE-1)) { - return (-EINVAL); - } - - rdev_p = &(to_iwch_dev(context->device)->rdev); - ucontext = to_iwch_ucontext(context); - - mm = remove_mmap(ucontext, key, len); - if (!mm) - return (-EINVAL); - addr = mm->addr; - cxfree(mm); - - if ((addr >= rdev_p->rnic_info.udbell_physbase) && - (addr < (rdev_p->rnic_info.udbell_physbase + - rdev_p->rnic_info.udbell_len))) { - - /* - * Map T3 DB register.
- */ - if (vma->vm_flags & VM_READ) { - return (-EPERM); - } - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; - vma->vm_flags &= ~VM_MAYREAD; - ret = io_remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } else { - - /* - * Map WQ or CQ contig dma memory... - */ - ret = remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } - - return ret; -#endif - return (0); + return (-ENOSYS); } -#endif static int iwch_deallocate_pd(struct ib_pd *pd) { @@ -470,7 +377,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) CTR2(KTR_IW_CXGB, "%s ib_mr %p", __FUNCTION__, ib_mr); /* There can be no memory windows */ - if (atomic_load_acq_int(&ib_mr->usecnt)) + if (atomic_load_acq_int(&ib_mr->usecnt.counter)) return (-EINVAL); mhp = to_iwch_mr(ib_mr); @@ -478,6 +385,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) mmid = mhp->attr.stag >> 8; cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, mhp->attr.pbl_addr); + iwch_free_pbl(mhp); remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) cxfree((void *) (unsigned long) mhp->kva); @@ -511,6 +419,8 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + /* First check that we have enough alignment */ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { ret = -EINVAL; @@ -528,7 +438,17 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (ret) goto err; - mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + cxfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + cxfree(page_list); + if (ret) + goto err; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; @@ -538,15 +458,18 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; - ret = iwch_register_mem(rhp, php, mhp, shift, page_list); - cxfree(page_list); - if (ret) { - goto err; - } + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + err: cxfree(mhp); - return ERR_PTR(-ret); + return ERR_PTR(ret); } @@ -570,7 +493,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, CTR3(KTR_IW_CXGB, "%s ib_mr %p ib_pd %p", __FUNCTION__, mr, pd); /* There can be no memory windows */ - if (atomic_load_acq_int(&mr->usecnt)) + if (atomic_load_acq_int(&mr->usecnt.counter)) return (-EINVAL); mhp = to_iwch_mr(mr); @@ -596,7 +519,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, return ret; } - ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); cxfree(page_list); if (ret) { return ret; @@ -640,7 +563,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mhp) return ERR_PTR(-ENOMEM); - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc); + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); cxfree(mhp); @@ -650,18 +575,22 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, shift = ffs(mhp->umem->page_size) - 1; n = 0; - TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) + list_for_each_entry(chunk, &mhp->umem->chunk_list, list) n += chunk->nents; - pages = kmalloc(n * sizeof(u64), M_NOWAIT); + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + 
+ pages = (__be64 *) kmalloc(n * sizeof(u64), M_NOWAIT); if (!pages) { err = -ENOMEM; - goto err; + goto err_pbl; } i = n = 0; -#if 0 +#ifdef notyet TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; @@ -669,21 +598,36 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, pages[i++] = htobe64(sg_dma_address( &chunk->page_list[j]) + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } } #endif - mhp->rhp = rhp; + + if (i) + err = iwch_write_pbl(mhp, pages, i, n); +#ifdef notyet +pbl_done: +#endif + cxfree(pages); + if (err) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) length; - mhp->attr.pbl_size = i; - err = iwch_register_mem(rhp, php, mhp, shift, pages); - cxfree(pages); + + err = iwch_register_mem(rhp, php, mhp, shift); if (err) - goto err; + goto err_pbl; if (udata && !t3a_device(rhp)) { uresp.pbl_addr = (mhp->attr.pbl_addr - @@ -700,6 +644,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mhp->ibmr; +err_pbl: + iwch_free_pbl(mhp); + err: ib_umem_release(mhp->umem); cxfree(mhp); @@ -748,7 +695,12 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) mhp->attr.type = TPT_MW; mhp->attr.stag = stag; mmid = (stag) >> 8; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); + mhp->ibmw.rkey = stag; + if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) { + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + cxfree(mhp); + return ERR_PTR(-ENOMEM); + } CTR4(KTR_IW_CXGB, "%s mmid 0x%x mhp %p stag 0x%x", __FUNCTION__, mmid, mhp, stag); return &(mhp->ibmw); } @@ -893,7 +845,13 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, mtx_init(&qhp->lock, "cxgb qp", NULL, MTX_DEF|MTX_DUPOK); qhp->refcnt = 1; - insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) { + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); + cxfree(qhp); + return ERR_PTR(-ENOMEM); + } if (udata) { @@ -1023,12 +981,14 @@ static int iwch_query_gid(struct ib_device *ibdev, u8 port, { struct iwch_dev *dev; struct port_info *pi; + struct adapter *sc; CTR5(KTR_IW_CXGB, "%s ibdev %p, port %d, index %d, gid %p", __FUNCTION__, ibdev, port, index, gid); dev = to_iwch_dev(ibdev); + sc = dev->rdev.adap; PANIC_IF(port == 0 || port > 2); - pi = ((struct port_info *)dev->rdev.port_info.lldevs[port-1]->if_softc); + pi = &sc->port[port - 1]; memset(&(gid->raw[0]), 0, sizeof(gid->raw)); memcpy(&(gid->raw[0]), pi->hw_addr, 6); return 0; @@ -1037,21 +997,20 @@ static int iwch_query_gid(struct ib_device *ibdev, u8 port, static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { - struct iwch_dev *dev; + struct adapter *sc; + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); dev = to_iwch_dev(ibdev); + sc = dev->rdev.adap; memset(props, 0, sizeof *props); -#ifdef notyet - memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->if_addr.ifa_addr, 6); -#endif + memcpy(&props->sys_image_guid, sc->port[0].hw_addr, 6); props->device_cap_flags = dev->device_cap_flags; -#ifdef notyet - props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; - props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; -#endif - props->max_mr_size = ~0ull; + props->page_size_cap = dev->attr.mem_pgsizes_bitmask; + props->vendor_id = pci_get_vendor(sc->dev); + props->vendor_part_id = pci_get_device(sc->dev); + props->max_mr_size = dev->attr.max_mr_size; props->max_qp = dev->attr.max_qps; props->max_qp_wr = dev->attr.max_wrs; props->max_sge = dev->attr.max_sge_per_wr; @@ -1071,13 +1030,10 @@ static int iwch_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + memset(props, 0, sizeof(struct ib_port_attr)); props->max_mtu = IB_MTU_4096; - props->lid = 0; - props->lmc = 0; - props->sm_lid = 0; - props->sm_sl = 0; + props->active_mtu = IB_MTU_2048; props->state = IB_PORT_ACTIVE; - props->phys_state = 0; props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_SNMP_TUNNEL_SUP | @@ -1086,7 +1042,6 @@ static int iwch_query_port(struct ib_device *ibdev, IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; props->gid_tbl_len = 1; props->pkey_tbl_len = 1; - props->qkey_viol_cntr = 0; props->active_width = 2; props->active_speed = 2; props->max_msg_sz = -1; @@ -1094,80 +1049,18 @@ static int iwch_query_port(struct ib_device *ibdev, return 0; } -#ifdef notyet -static ssize_t show_rev(struct class_device *cdev, char *buf) -{ - struct iwch_dev *dev = container_of(cdev, struct iwch_dev, - ibdev.class_dev); - CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); - return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type); -} - -static ssize_t show_fw_ver(struct class_device *cdev, char *buf) -{ - struct iwch_dev *dev = container_of(cdev, struct iwch_dev, - ibdev.class_dev); - struct ethtool_drvinfo info; - struct net_device *lldev = dev->rdev.t3cdev_p->lldev; - - CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); - lldev->ethtool_ops->get_drvinfo(lldev, &info); - return sprintf(buf, "%s\n", info.fw_version); -} - -static ssize_t show_hca(struct class_device *cdev, char *buf) -{ - struct iwch_dev *dev = container_of(cdev, struct iwch_dev, - ibdev.class_dev); - struct ethtool_drvinfo info; - struct net_device *lldev = dev->rdev.t3cdev_p->lldev; - - CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); - 
lldev->ethtool_ops->get_drvinfo(lldev, &info); - return sprintf(buf, "%s\n", info.driver); -} - -static ssize_t show_board(struct class_device *cdev, char *buf) -{ - struct iwch_dev *dev = container_of(cdev, struct iwch_dev, - ibdev.class_dev); - CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, dev); -#ifdef notyet - return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor, - dev->rdev.rnic_info.pdev->device); -#else - return sprintf(buf, "%x.%x\n", 0xdead, 0xbeef); /* XXX */ -#endif -} - -static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); -static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); - -static struct class_device_attribute *iwch_class_attributes[] = { - &class_device_attr_hw_rev, - &class_device_attr_fw_ver, - &class_device_attr_hca_type, - &class_device_attr_board_id -}; -#endif - int iwch_register_device(struct iwch_dev *dev) { int ret; -#ifdef notyet - int i; -#endif + struct adapter *sc = dev->rdev.adap; + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); -#ifdef notyet - memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); -#endif + memcpy(&dev->ibdev.node_guid, sc->port[0].hw_addr, 6); dev->device_cap_flags = - (IB_DEVICE_ZERO_STAG | - IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW); + (IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW); dev->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -1189,9 +1082,9 @@ int iwch_register_device(struct iwch_dev *dev) (1ull << IB_USER_VERBS_CMD_POST_RECV); dev->ibdev.node_type = RDMA_NODE_RNIC; memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); - dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.phys_port_cnt = sc->params.nports; dev->ibdev.num_comp_vectors = 1; - dev->ibdev.dma_device = dev->rdev.rnic_info.pdev; + dev->ibdev.dma_device = dev->rdev.adap->dev; dev->ibdev.query_device = iwch_query_device; dev->ibdev.query_port = iwch_query_port; dev->ibdev.modify_port = iwch_modify_port; @@ -1199,9 +1092,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.query_gid = iwch_query_gid; dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; -#ifdef notyet dev->ibdev.mmap = iwch_mmap; -#endif dev->ibdev.alloc_pd = iwch_allocate_pd; dev->ibdev.dealloc_pd = iwch_deallocate_pd; dev->ibdev.create_ah = iwch_ah_create; @@ -1229,11 +1120,13 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.req_notify_cq = iwch_arm_cq; dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; - + dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.iwcm = - (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs), - M_NOWAIT); + kmalloc(sizeof(struct iw_cm_verbs), M_NOWAIT); + if (!dev->ibdev.iwcm) + return (ENOMEM); + dev->ibdev.iwcm->connect = iwch_connect; dev->ibdev.iwcm->accept = iwch_accept_cr; dev->ibdev.iwcm->reject = iwch_reject_cr; @@ -1246,35 +1139,19 @@ int iwch_register_device(struct iwch_dev *dev) ret = ib_register_device(&dev->ibdev); if (ret) goto bail1; -#ifdef notyet - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { - ret = class_device_create_file(&dev->ibdev.class_dev, - iwch_class_attributes[i]); - if (ret) { - goto bail2; - } - } -#endif - return 0; -#ifdef notyet -bail2: -#endif - 
ib_unregister_device(&dev->ibdev); + + return (0); + bail1: - return ret; + cxfree(dev->ibdev.iwcm); + return (ret); } void iwch_unregister_device(struct iwch_dev *dev) { -#ifdef notyet - int i; - CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); - - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) - class_device_remove_file(&dev->ibdev.class_dev, - iwch_class_attributes[i]); -#endif ib_unregister_device(&dev->ibdev); + cxfree(dev->ibdev.iwcm); return; } +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h index c857ce8..2e012fd 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h @@ -31,7 +31,7 @@ $FreeBSD$ #ifndef __IWCH_PROVIDER_H__ #define __IWCH_PROVIDER_H__ -#include +#include struct iwch_pd { struct ib_pd ibpd; @@ -116,6 +116,7 @@ enum IWCH_QP_FLAGS { }; struct iwch_mpa_attributes { + u8 initiator; u8 recv_marker_enabled; u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ u8 crc_enabled; @@ -336,18 +337,17 @@ int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); int iwch_register_device(struct iwch_dev *dev); void iwch_unregister_device(struct iwch_dev *dev); -int iwch_quiesce_qps(struct iwch_cq *chp); -int iwch_resume_qps(struct iwch_cq *chp); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, - int shift, - __be64 *page_list); + int shift); int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c index fd51498..7734fd0 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. @@ -30,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. 
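/*
 * A minimal sketch of the compile-guard pattern this hunk (and every
 * other iw_cxgb source file touched by the patch) applies: pull in the
 * kernel option header, then wrap the whole translation unit so the
 * file compiles to an empty object unless the kernel config carries
 * `options TCP_OFFLOAD'.  The function name below is hypothetical, for
 * illustration only.
 */
#include "opt_inet.h"		/* defines TCP_OFFLOAD when configured */

#ifdef TCP_OFFLOAD
static int
example_offload_init(void)
{
	/* all of the file's offload code lives inside the guard */
	return (0);
}
#endif	/* TCP_OFFLOAD */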
#include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -48,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -57,14 +59,26 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include - -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include #include #include +#include #include #include #include @@ -75,7 +89,7 @@ __FBSDID("$FreeBSD$"); #define NO_SUPPORT -1 -static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, +static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, u8 * flit_cnt) { int i; @@ -83,59 +97,46 @@ static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_SEND: - case IB_WR_SEND_WITH_IMM: if (wr->send_flags & IB_SEND_SOLICITED) wqe->send.rdmaop = T3_SEND_WITH_SE; else wqe->send.rdmaop = T3_SEND; wqe->send.rem_stag = 0; break; -#if 0 /* Not currently supported */ - case TYPE_SEND_INVALIDATE: - case TYPE_SEND_INVALIDATE_IMMEDIATE: - wqe->send.rdmaop = T3_SEND_WITH_INV; - wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); - break; - case TYPE_SEND_SE_INVALIDATE: - wqe->send.rdmaop = T3_SEND_WITH_SE_INV; - wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); + case IB_WR_SEND_WITH_IMM: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + else + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = 0; break; -#endif default: - break; + return -EINVAL; } if (wr->num_sge > T3_MAX_SGE) return (-EINVAL); wqe->send.reserved[0] = 0; wqe->send.reserved[1] = 0; wqe->send.reserved[2] = 0; - if (wr->opcode == IB_WR_SEND_WITH_IMM) { - plen = 4; - wqe->send.sgl[0].stag = wr->imm_data; - wqe->send.sgl[0].len = 0; - wqe->send.num_sgle = 0; - *flit_cnt = 5; - } else { - plen = 0; - for (i = 0; i < wr->num_sge; i++) { - if ((plen + wr->sg_list[i].length) < plen) { - return (-EMSGSIZE); - } - plen += wr->sg_list[i].length; - wqe->send.sgl[i].stag = - htobe32(wr->sg_list[i].lkey); - wqe->send.sgl[i].len = - htobe32(wr->sg_list[i].length); - wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr); + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return (-EMSGSIZE); } - wqe->send.num_sgle = htobe32(wr->num_sge); - *flit_cnt = 4 + ((wr->num_sge) << 1); + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = + htobe32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = + htobe32(wr->sg_list[i].length); + wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr); } + wqe->send.num_sgle = htobe32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); wqe->send.plen = htobe32(plen); return 0; } -static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, +static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, u8 *flit_cnt) { int i; @@ -152,7 +153,7 @@ static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { plen = 4; - wqe->write.sgl[0].stag = wr->imm_data; + wqe->write.sgl[0].stag = wr->ex.imm_data; wqe->write.sgl[0].len = 0; wqe->write.num_sgle = 0; *flit_cnt = 6; @@ -177,7 +178,7 @@ static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, return 0; } -static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, +static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, u8 
*flit_cnt) { if (wr->num_sge > 1) @@ -195,15 +196,12 @@ static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, return 0; } -/* - * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now. - */ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, u32 num_sgle, u32 * pbl_addr, u8 * page_size) { int i; struct iwch_mr *mhp; - u32 offset; + u64 offset; for (i = 0; i < num_sgle; i++) { mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); @@ -235,8 +233,8 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, return (-EINVAL); } offset = sg_list[i].addr - mhp->attr.va_fbo; - offset += ((u32) mhp->attr.va_fbo) % - (1UL << (12 + mhp->attr.page_size)); + offset += mhp->attr.va_fbo & + ((1UL << (12 + mhp->attr.page_size)) - 1); pbl_addr[i] = ((mhp->attr.pbl_addr - rhp->rdev.rnic_info.pbl_base) >> 3) + (offset >> (12 + mhp->attr.page_size)); @@ -245,26 +243,113 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, return 0; } -static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe, +static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, struct ib_recv_wr *wr) { - int i; - if (wr->num_sge > T3_MAX_SGE) + int i, err = 0; + u32 pbl_addr[T3_MAX_SGE]; + u8 page_size[T3_MAX_SGE]; + + if (wr->num_sge > T3_MAX_SGE) return (-EINVAL); + + + err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; wqe->recv.num_sgle = htobe32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { wqe->recv.sgl[i].stag = htobe32(wr->sg_list[i].lkey); wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length); - wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr); + wqe->recv.sgl[i].to = htobe64(((u32)wr->sg_list[i].addr) & + ((1UL << (12 + page_size[i])) - 1)); + /* pbl_addr is the adapters address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); } for (; i < T3_MAX_SGE; i++) { wqe->recv.sgl[i].stag = 0; wqe->recv.sgl[i].len = 0; wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; } + + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = 0; + return 0; } +static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + u32 pbl_addr; + u32 pbl_offset; + + + /* + * The T3 HW requires the PBL in the HW recv descriptor to reference + * a PBL entry. So we allocate the max needed PBL memory here and pass + * it to the uP in the recv WR. The uP will build the PBL and setup + * the HW recv descriptor. + */ + pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE); + if (!pbl_addr) + return -ENOMEM; + + /* + * Compute the 8B aligned offset. + */ + pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3; + + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + + for (i = 0; i < wr->num_sge; i++) { + + /* + * Use a 128MB page size. This and an imposed 128MB + * sge length limit allows us to require only a 2-entry HW + * PBL for each SGE. This restriction is acceptable since + * since it is not possible to allocate 128MB of contiguous + * DMA coherent memory! + */ + if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN) + return -EINVAL; + wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT; + + /* + * T3 restricts a recv to all zero-stag or all non-zero-stag. 
+ */ + if (wr->sg_list[i].lkey != 0) + return -EINVAL; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr); + wqe->recv.pbl_addr[i] = htobe32(pbl_offset); + pbl_offset += 2; + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.pagesz[i] = 0; + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].wr_id = wr->wr_id; + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, + qhp->wq.rq_size_log2)].pbl_addr = pbl_addr; + return 0; +} + int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -282,18 +367,19 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, mtx_lock(&qhp->lock); if (qhp->attr.state > IWCH_QP_STATE_RTS) { mtx_unlock(&qhp->lock); - return (-EINVAL); + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, qhp->wq.sq_size_log2); - if (num_wrs <= 0) { + if (num_wrs == 0) { mtx_unlock(&qhp->lock); - return (-ENOMEM); + err = -EINVAL; + goto out; } while (wr) { if (num_wrs == 0) { err = -ENOMEM; - *bad_wr = wr; break; } idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); @@ -311,17 +397,17 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: t3_wr_opcode = T3_WR_SEND; - err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt); break; case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: t3_wr_opcode = T3_WR_WRITE; - err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt); break; case IB_WR_RDMA_READ: t3_wr_opcode = T3_WR_READ; t3_wr_flags = 0; /* T3 reads are always signaled */ - err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt); if (err) break; sqp->read_len = wqe->read.local_len; @@ -333,10 +419,9 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wr->opcode); err = -EINVAL; } - if (err) { - *bad_wr = wr; + if (err) break; - } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; sqp->wr_id = wr->wr_id; sqp->opcode = wr2opcode(t3_wr_opcode); @@ -358,6 +443,9 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } mtx_unlock(&qhp->lock); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); +out: + if (err) + *bad_wr = wr; return err; } @@ -374,27 +462,35 @@ int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, mtx_lock(&qhp->lock); if (qhp->attr.state > IWCH_QP_STATE_RTS) { mtx_unlock(&qhp->lock); - return (-EINVAL); + err = -EINVAL; + goto out; } num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, qhp->wq.rq_size_log2) - 1; if (!wr) { mtx_unlock(&qhp->lock); - return (-EINVAL); + err = -EINVAL; + goto out; } + while (wr) { + if (wr->num_sge > T3_MAX_SGE) { + err = -EINVAL; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); wqe = (union t3_wr *) (qhp->wq.queue + idx); - if (num_wrs) - err = iwch_build_rdma_recv(qhp->rhp, wqe, wr); - else + if (num_wrs) { + if (wr->sg_list[0].lkey) + err = build_rdma_recv(qhp, wqe, wr); + else + err = build_zero_stag_recv(qhp, wqe, wr); + } else err = -ENOMEM; - if (err) { - *bad_wr = wr; + if (err) break; - } - qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] = - wr->wr_id; + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, sizeof(struct t3_receive_wr) >> 3); @@ -408,6 +504,9 @@ 
int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, } mtx_unlock(&qhp->lock); ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); +out: + if (err) + *bad_wr = wr; return err; } @@ -439,7 +538,7 @@ int iwch_bind_mw(struct ib_qp *qp, } num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, qhp->wq.sq_size_log2); - if ((num_wrs) <= 0) { + if ((num_wrs) == 0) { mtx_unlock(&qhp->lock); return (-ENOMEM); } @@ -491,7 +590,7 @@ int iwch_bind_mw(struct ib_qp *qp, return err; } -static inline void build_term_codes(struct respQ_msg_t *rsp_msg, +static void build_term_codes(struct respQ_msg_t *rsp_msg, u8 *layer_type, u8 *ecode) { int status = TPT_ERR_INTERNAL_ERR; @@ -631,15 +730,18 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) union t3_wr *wqe; struct terminate_message *term; struct mbuf *m; + struct ofld_hdr *oh; - CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); - m = m_gethdr(MT_DATA, M_NOWAIT); - if (!m) { + CTR3(KTR_IW_CXGB, "%s: tid %u, %p", __func__, qhp->ep->hwtid, rsp_msg); + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { log(LOG_ERR, "%s cannot send TERMINATE!\n", __FUNCTION__); return (-ENOMEM); } - wqe = mtod(m, union t3_wr *); - m->m_len = m->m_pkthdr.len = 40; + oh = mtod(m, struct ofld_hdr *); + m->m_pkthdr.len = m->m_len = sizeof(*oh) + 40; + oh->flags = V_HDR_NDESC(1) | V_HDR_CTRL(CPL_PRIORITY_DATA) | V_HDR_QSET(0); + wqe = (void *)(oh + 1); memset(wqe, 0, 40); wqe->send.rdmaop = T3_TERMINATE; @@ -653,22 +755,17 @@ int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); wqe->send.wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(qhp->ep->hwtid)); - m_set_priority(m, CPL_PRIORITY_DATA); - m_set_sgl(m, NULL); - m_set_sgllen(m, 0); - return cxgb_ofld_send(qhp->rhp->rdev.t3cdev_p, m); + return t3_offload_tx(qhp->rhp->rdev.adap, m); } /* * Assumes qhp lock is held. */ -static void __flush_qp(struct iwch_qp *qhp) +static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp, + struct iwch_cq *schp) { - struct iwch_cq *rchp, *schp; int count; - - rchp = get_chp(qhp->rhp, qhp->attr.rcq); - schp = get_chp(qhp->rhp, qhp->attr.scq); + int flushed; CTR4(KTR_IW_CXGB, "%s qhp %p rchp %p schp %p", __FUNCTION__, qhp, rchp, schp); /* take a ref on the qhp since we must release the lock */ @@ -680,20 +777,22 @@ static void __flush_qp(struct iwch_qp *qhp) mtx_lock(&qhp->lock); cxio_flush_hw_cq(&rchp->cq); cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); - cxio_flush_rq(&qhp->wq, &rchp->cq, count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); mtx_unlock(&qhp->lock); mtx_unlock(&rchp->lock); - (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (flushed) + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); /* locking hierarchy: cq lock first, then qp lock. 
*/ mtx_lock(&schp->lock); mtx_lock(&qhp->lock); cxio_flush_hw_cq(&schp->cq); cxio_count_scqes(&schp->cq, &qhp->wq, &count); - cxio_flush_sq(&qhp->wq, &schp->cq, count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); mtx_unlock(&qhp->lock); mtx_unlock(&schp->lock); - (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + if (flushed) + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); /* deref */ mtx_lock(&qhp->lock); @@ -703,10 +802,23 @@ static void __flush_qp(struct iwch_qp *qhp) static void flush_qp(struct iwch_qp *qhp) { - if (qhp->ibqp.uobject) + struct iwch_cq *rchp, *schp; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + if (qhp->ibqp.uobject) { cxio_set_wq_in_error(&qhp->wq); - else - __flush_qp(qhp); + cxio_set_cq_in_error(&rchp->cq); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (schp != rchp) { + cxio_set_cq_in_error(&schp->cq); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + } + return; + } + __flush_qp(qhp, rchp, schp); } @@ -715,7 +827,13 @@ static void flush_qp(struct iwch_qp *qhp) */ static int rqes_posted(struct iwch_qp *qhp) { - return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; + union t3_wr *wqe = qhp->wq.queue; + u16 count = 0; + while ((count+1) != 0 && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + count++; + wqe++; + } + return count; } static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, @@ -724,6 +842,10 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, { struct t3_rdma_init_attr init_attr; int ret; + struct socket *so = qhp->ep->com.so; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct toepcb *toep; init_attr.tid = qhp->ep->hwtid; init_attr.qpid = qhp->wq.qpid; @@ -737,32 +859,28 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | (qhp->attr.mpa_attr.crc_enabled << 2); - /* - * XXX - The IWCM doesn't quite handle getting these - * attrs set before going into RTS. For now, just turn - * them on always... - */ -#if 0 - init_attr.qpcaps = qhp->attr.enableRdmaRead | - (qhp->attr.enableRdmaWrite << 1) | - (qhp->attr.enableBind << 2) | - (qhp->attr.enable_stag0_fastreg << 3) | - (qhp->attr.enable_stag0_fastreg << 4); -#else - init_attr.qpcaps = 0x1f; -#endif + init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE | + uP_RI_QP_RDMA_WRITE_ENABLE | + uP_RI_QP_BIND_ENABLE; + if (!qhp->ibqp.uobject) + init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE; init_attr.tcp_emss = qhp->ep->emss; init_attr.ord = qhp->attr.max_ord; init_attr.ird = qhp->attr.max_ird; init_attr.qp_dma_addr = qhp->wq.dma_addr; init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); - init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + init_attr.rqe_count = rqes_posted(qhp); + init_attr.flags = qhp->attr.mpa_attr.initiator ? 
MPA_INITIATOR : 0; + init_attr.rtr_type = 0; + tp = intotcpcb(inp); + toep = tp->t_toe; + init_attr.chan = toep->tp_l2t->smt_idx; init_attr.irs = qhp->ep->rcv_seq; CTR5(KTR_IW_CXGB, "%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " "flags 0x%x qpcaps 0x%x", __FUNCTION__, init_attr.rq_addr, init_attr.rq_size, init_attr.flags, init_attr.qpcaps); - ret = cxio_rdma_init(&rhp->rdev, &init_attr); + ret = cxio_rdma_init(&rhp->rdev, &init_attr, qhp->ep->com.so); CTR2(KTR_IW_CXGB, "%s ret %d", __FUNCTION__, ret); return ret; } @@ -870,8 +988,8 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, abort=0; disconnect = 1; ep = qhp->ep; + get_ep(&ep->com); } - flush_qp(qhp); break; case IWCH_QP_STATE_TERMINATE: qhp->attr.state = IWCH_QP_STATE_TERMINATE; @@ -886,6 +1004,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, abort=1; disconnect = 1; ep = qhp->ep; + get_ep(&ep->com); } goto err; break; @@ -901,6 +1020,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, } switch (attrs->next_state) { case IWCH_QP_STATE_IDLE: + flush_qp(qhp); qhp->attr.state = IWCH_QP_STATE_IDLE; qhp->attr.llp_stream_handle = NULL; put_ep(&qhp->ep->com); @@ -908,7 +1028,6 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, wakeup(qhp); break; case IWCH_QP_STATE_ERROR: - disconnect=1; goto err; default: ret = -EINVAL; @@ -960,81 +1079,29 @@ err: out: mtx_unlock(&qhp->lock); - if (terminate) + if (terminate) iwch_post_terminate(qhp, NULL); + /* * If disconnect is 1, then we need to initiate a disconnect * on the EP. This can be a normal close (RTS->CLOSING) or * an abnormal close (RTS/CLOSING->ERROR). */ - if (disconnect) + if (disconnect) { iwch_ep_disconnect(ep, abort, M_NOWAIT); - + put_ep(&ep->com); + } + /* * If free is 1, then we've disassociated the EP from the QP * and we need to dereference the EP. */ - if (free) + if (free) put_ep(&ep->com); + CTR2(KTR_IW_CXGB, "%s exit state %d", __FUNCTION__, qhp->attr.state); return ret; } - -static int quiesce_qp(struct iwch_qp *qhp) -{ - mtx_lock(&qhp->lock); - iwch_quiesce_tid(qhp->ep); - qhp->flags |= QP_QUIESCED; - mtx_unlock(&qhp->lock); - return 0; -} - -static int resume_qp(struct iwch_qp *qhp) -{ - mtx_lock(&qhp->lock); - iwch_resume_tid(qhp->ep); - qhp->flags &= ~QP_QUIESCED; - mtx_lock(&qhp->lock); - return 0; -} - -int iwch_quiesce_qps(struct iwch_cq *chp) -{ - int i; - struct iwch_qp *qhp; - - for (i=0; i < T3_MAX_NUM_QP; i++) { - qhp = get_qhp(chp->rhp, i); - if (!qhp) - continue; - if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { - quiesce_qp(qhp); - continue; - } - if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) - quiesce_qp(qhp); - } - return 0; -} - -int iwch_resume_qps(struct iwch_cq *chp) -{ - int i; - struct iwch_qp *qhp; - - for (i=0; i < T3_MAX_NUM_QP; i++) { - qhp = get_qhp(chp->rhp, i); - if (!qhp) - continue; - if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { - resume_qp(qhp); - continue; - } - if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) - resume_qp(qhp); - } - return 0; -} - +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c index 62ffef5..a99bf66 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c @@ -29,11 +29,13 @@ POSSIBILITY OF SUCH DAMAGE. 
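/*
 * A hedged sketch of the endpoint reference pattern the iwch_modify_qp
 * changes just above introduce: get_ep() takes a hold on the endpoint
 * while the QP lock is still held, the disconnect runs after the lock
 * is dropped, and put_ep() releases the hold afterwards.  ep_hold() and
 * ep_rele() are hypothetical stand-ins for get_ep()/put_ep() on
 * &ep->com; the refcount(9) calls are the generic FreeBSD primitives,
 * not necessarily what iw_cxgb uses internally.
 */
#include <sys/types.h>
#include <sys/refcount.h>

struct ep_common_sketch {
	volatile u_int refcount;
};

static void
ep_hold(struct ep_common_sketch *epc)
{
	/* safe only while the caller already owns a reference */
	refcount_acquire(&epc->refcount);
}

static void
ep_rele(struct ep_common_sketch *epc)
{
	if (refcount_release(&epc->refcount)) {
		/* last reference dropped; free the endpoint here */
	}
}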
#include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include -#include #include #include #include @@ -59,9 +61,11 @@ __FBSDID("$FreeBSD$"); #include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -369,3 +373,4 @@ void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) { gen_pool_destroy(rdev_p->rqt_pool); } +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h index 3086a63..55a1006 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h @@ -47,10 +47,18 @@ struct iwch_create_cq_req { uint64_t user_rptr_addr; }; +struct iwch_create_cq_resp_v0 { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + struct iwch_create_cq_resp { uint64_t key; uint32_t cqid; uint32_t size_log2; + __u32 memsize; + __u32 reserved; }; struct iwch_create_qp_resp { diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h index bf8f2d6..8718aa0 100644 --- a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h @@ -32,6 +32,9 @@ $FreeBSD$ #define __CXIO_WR_H__ #define T3_MAX_SGE 4 #define T3_MAX_INLINE 64 +#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3) +#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024) +#define T3_STAG0_PAGE_SHIFT 15 #define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) #define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ @@ -272,6 +275,22 @@ enum t3_qp_caps { uP_RI_QP_STAG0_ENABLE = 0x10 } __attribute__ ((packed)); +enum rdma_init_rtr_types { + RTR_READ = 1, + RTR_WRITE = 2, + RTR_SEND = 3, +}; + +#define S_RTR_TYPE 2 +#define M_RTR_TYPE 0x3 +#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) +#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) + +#define S_CHAN 4 +#define M_CHAN 0x3 +#define V_CHAN(x) ((x) << S_CHAN) +#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN) + struct t3_rdma_init_attr { u32 tid; u32 qpid; @@ -287,8 +306,11 @@ struct t3_rdma_init_attr { u32 ird; u64 qp_dma_addr; u32 qp_dma_size; - u32 flags; + enum rdma_init_rtr_types rtr_type; + u16 flags; + u16 rqe_count; u32 irs; + u32 chan; }; struct t3_rdma_init_wr { @@ -303,13 +325,13 @@ struct t3_rdma_init_wr { u8 mpaattrs; /* 5 */ u8 qpcaps; __be16 ulpdu_size; - __be32 flags; /* bits 31-1 - reservered */ - /* bit 0 - set if RECV posted */ + __be16 flags_rtr_type; + __be16 rqe_count; __be32 ord; /* 6 */ __be32 ird; __be64 qp_dma_addr; /* 7 */ __be32 qp_dma_size; /* 8 */ - u32 irs; + __be32 irs; }; struct t3_genbit { @@ -318,7 +340,8 @@ struct t3_genbit { }; enum rdma_init_wr_flags { - RECVS_POSTED = 1, + MPA_INITIATOR = (1<<0), + PRIV_QP = (1<<1), }; union t3_wr { @@ -531,6 +554,12 @@ struct t3_cqe { #define CQE_STATUS(x) (G_CQE_STATUS(be32toh((x).header))) #define CQE_OPCODE(x) (G_CQE_OPCODE(be32toh((x).header))) +#define CQE_SEND_OPCODE(x)( \ + (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \ + (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \ + (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \ + (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV)) + #define CQE_LEN(x) (be32toh((x).len)) /* used for RQ completion processing */ @@ -589,21 +618,23 @@ struct t3_swsq { uint64_t wr_id; struct t3_cqe cqe; uint32_t sq_wptr; - uint32_t read_len; + __be32 read_len; int opcode; int complete; int signaled; }; +struct t3_swrq { + __u64 wr_id; + __u32 pbl_addr; +}; + /* * A T3 WQ implements both the SQ and RQ. 
*/ struct t3_wq { union t3_wr *queue; /* DMA accessable memory */ bus_addr_t dma_addr; /* DMA address for HW */ -#ifdef notyet - DECLARE_PCI_UNMAP_ADDR(mapping) /* unmap kruft */ -#endif u32 error; /* 1 once we go to ERROR */ u32 qpid; u32 wptr; /* idx to next available WR slot */ @@ -613,14 +644,15 @@ struct t3_wq { u32 sq_wptr; /* sq_wptr - sq_rptr == count of */ u32 sq_rptr; /* pending wrs */ u32 sq_size_log2; /* sq size */ - u64 *rq; /* SW RQ (holds consumer wr_ids */ + struct t3_swrq *rq; /* SW RQ (holds consumer wr_ids */ u32 rq_wptr; /* rq_wptr - rq_rptr == count of */ u32 rq_rptr; /* pending wrs */ - u64 *rq_oldest_wr; /* oldest wr on the SW RQ */ + struct t3_swrq *rq_oldest_wr; /* oldest wr on the SW RQ */ u32 rq_size_log2; /* rq size */ u32 rq_addr; /* rq adapter address */ - void /* __iomem */ *doorbell; /* kernel db */ + void *doorbell; /* kernel db */ u64 udb; /* user db if any */ + struct cxio_rdev *rdev; }; struct t3_cq { @@ -629,9 +661,6 @@ struct t3_cq { u32 wptr; u32 size_log2; bus_addr_t dma_addr; -#ifdef notyet - DECLARE_PCI_UNMAP_ADDR(mapping) -#endif struct t3_cqe *queue; struct t3_cqe *sw_queue; u32 sw_rptr; @@ -641,6 +670,22 @@ struct t3_cq { #define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \ CQE_GENBIT(*cqe)) +struct t3_cq_status_page { + u32 cq_err; +}; + +static inline int cxio_cq_in_error(struct t3_cq *cq) +{ + return ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err; +} + +static inline void cxio_set_cq_in_error(struct t3_cq *cq) +{ + ((struct t3_cq_status_page *) + &cq->queue[1 << cq->size_log2])->cq_err = 1; +} + static inline void cxio_set_wq_in_error(struct t3_wq *wq) { wq->queue->flit[13] = 1; diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h deleted file mode 100644 index 398923a..0000000 --- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h +++ /dev/null @@ -1,49 +0,0 @@ - -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- -$FreeBSD$ - -***************************************************************************/ - -#ifndef _CXGB_TOEDEV_H_ -#define _CXGB_TOEDEV_H_ -#include - - -/* offload type ids */ -enum { - TOE_ID_CHELSIO_T1 = 1, - TOE_ID_CHELSIO_T1C, - TOE_ID_CHELSIO_T2, - TOE_ID_CHELSIO_T3, - TOE_ID_CHELSIO_T3B, - TOE_ID_CHELSIO_T3C, -} - ; - -#endif diff --git a/sys/dev/cxgb/ulp/toecore/toedev.c b/sys/dev/cxgb/ulp/toecore/toedev.c deleted file mode 100644 index 01a7d90..0000000 --- a/sys/dev/cxgb/ulp/toecore/toedev.c +++ /dev/null @@ -1,420 +0,0 @@ - -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - - -/* - * XXX - */ -#include -#include - -static struct mtx offload_db_lock; -static TAILQ_HEAD(, toedev) offload_dev_list; -static TAILQ_HEAD(, tom_info) offload_module_list; - -/* - * Returns the entry in the given table with the given offload id, or NULL - * if the id is not found. - */ -static const struct offload_id * -id_find(unsigned int id, const struct offload_id *table) -{ - for ( ; table->id; ++table) - if (table->id == id) - return table; - return NULL; -} - -/* - * Returns true if an offload device is presently attached to an offload module. - */ -static inline int -is_attached(const struct toedev *dev) -{ - return dev->tod_offload_mod != NULL; -} - -/* - * Try to attach a new offload device to an existing TCP offload module that - * can handle the device's offload id. Returns 0 if it succeeds. - * - * Must be called with the offload_db_lock held. - */ -static int -offload_attach(struct toedev *dev) -{ - struct tom_info *t; - - TAILQ_FOREACH(t, &offload_module_list, entry) { - const struct offload_id *entry; - - entry = id_find(dev->tod_ttid, t->ti_id_table); - if (entry && t->ti_attach(dev, entry) == 0) { - dev->tod_offload_mod = t; - return 0; - } - } - return (ENOPROTOOPT); -} - -/** - * register_tom - register a TCP Offload Module (TOM) - * @t: the offload module to register - * - * Register a TCP Offload Module (TOM). 
- */ -int -register_tom(struct tom_info *t) -{ - mtx_lock(&offload_db_lock); - toedev_registration_count++; - TAILQ_INSERT_HEAD(&offload_module_list, t, entry); - mtx_unlock(&offload_db_lock); - return 0; -} - -/** - * unregister_tom - unregister a TCP Offload Module (TOM) - * @t: the offload module to register - * - * Unregister a TCP Offload Module (TOM). Note that this does not affect any - * TOE devices to which the TOM is already attached. - */ -int -unregister_tom(struct tom_info *t) -{ - mtx_lock(&offload_db_lock); - TAILQ_REMOVE(&offload_module_list, t, entry); - mtx_unlock(&offload_db_lock); - return 0; -} - -/* - * Find an offload device by name. Must be called with offload_db_lock held. - */ -static struct toedev * -__find_offload_dev_by_name(const char *name) -{ - struct toedev *dev; - - TAILQ_FOREACH(dev, &offload_dev_list, entry) { - if (!strncmp(dev->tod_name, name, TOENAMSIZ)) - return dev; - } - return NULL; -} - -/* - * Returns true if an offload device is already registered. - * Must be called with the offload_db_lock held. - */ -static int -is_registered(const struct toedev *dev) -{ - struct toedev *d; - - TAILQ_FOREACH(d, &offload_dev_list, entry) { - if (d == dev) - return 1; - } - return 0; -} - -/* - * Finalize the name of an offload device by assigning values to any format - * strings in its name. - */ -static int -assign_name(struct toedev *dev, const char *name, int limit) -{ - int i; - - for (i = 0; i < limit; ++i) { - char s[TOENAMSIZ]; - - if (snprintf(s, sizeof(s), name, i) >= sizeof(s)) - return -1; /* name too long */ - if (!__find_offload_dev_by_name(s)) { - strcpy(dev->tod_name, s); - return 0; - } - } - return -1; -} - -/** - * register_toedev - register a TOE device - * @dev: the device - * @name: a name template for the device - * - * Register a TOE device and try to attach an appropriate TCP offload module - * to it. @name is a template that may contain at most one %d format - * specifier. - */ -int -register_toedev(struct toedev *dev, const char *name) -{ - int ret; - const char *p; - - /* - * Validate the name template. Only one %d allowed and name must be - * a valid filename so it can appear in sysfs. - */ - if (!name || !*name || !strcmp(name, ".") || !strcmp(name, "..") || - strchr(name, '/')) - return EINVAL; - - p = strchr(name, '%'); - if (p && (p[1] != 'd' || strchr(p + 2, '%'))) - return EINVAL; - - mtx_lock(&offload_db_lock); - if (is_registered(dev)) { /* device already registered */ - ret = EEXIST; - goto out; - } - - if ((ret = assign_name(dev, name, 32)) != 0) - goto out; - - dev->tod_offload_mod = NULL; - TAILQ_INSERT_TAIL(&offload_dev_list, dev, entry); -out: - mtx_unlock(&offload_db_lock); - return ret; -} - -/** - * unregister_toedev - unregister a TOE device - * @dev: the device - * - * Unregister a TOE device. The device must not be attached to an offload - * module. - */ -int -unregister_toedev(struct toedev *dev) -{ - int ret = 0; - - mtx_lock(&offload_db_lock); - if (!is_registered(dev)) { - ret = ENODEV; - goto out; - } - if (is_attached(dev)) { - ret = EBUSY; - goto out; - } - TAILQ_REMOVE(&offload_dev_list, dev, entry); -out: - mtx_unlock(&offload_db_lock); - return ret; -} - -/** - * activate_offload - activate an offload device - * @dev: the device - * - * Activate an offload device by locating an appropriate registered offload - * module. If no module is found the operation fails and may be retried at - * a later time. 
- */ -int -activate_offload(struct toedev *dev) -{ - int ret = 0; - - mtx_lock(&offload_db_lock); - if (!is_registered(dev)) - ret = ENODEV; - else if (!is_attached(dev)) - ret = offload_attach(dev); - mtx_unlock(&offload_db_lock); - return ret; -} - -/** - * toe_send - send a packet to a TOE device - * @dev: the device - * @m: the packet - * - * Sends an mbuf to a TOE driver after dealing with any active network taps. - */ -int -toe_send(struct toedev *dev, struct mbuf *m) -{ - int r; - - critical_enter(); /* XXX neccessary? */ - r = dev->tod_send(dev, m); - critical_exit(); - if (r) - BPF_MTAP(dev->tod_lldev, m); - return r; -} - -/** - * toe_receive_mbuf - process n received TOE packets - * @dev: the toe device - * @m: an array of offload packets - * @n: the number of offload packets - * - * Process an array of ingress offload packets. Each packet is forwarded - * to any active network taps and then passed to the toe device's receive - * method. We optimize passing packets to the receive method by passing - * it the whole array at once except when there are active taps. - */ -int -toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n) -{ - if (__predict_true(!bpf_peers_present(dev->tod_lldev->if_bpf))) - return dev->tod_recv(dev, m, n); - - for ( ; n; n--, m++) { - m[0]->m_pkthdr.rcvif = dev->tod_lldev; - BPF_MTAP(dev->tod_lldev, m[0]); - dev->tod_recv(dev, m, 1); - } - return 0; -} - -static inline int -ifnet_is_offload(const struct ifnet *ifp) -{ - return (ifp->if_flags & IFCAP_TOE); -} - -void -toe_arp_update(struct rtentry *rt) -{ - struct ifnet *ifp = rt->rt_ifp; - - if (ifp && ifnet_is_offload(ifp)) { - struct toedev *tdev = TOEDEV(ifp); - - if (tdev && tdev->tod_arp_update) - tdev->tod_arp_update(tdev, rt); - } -} - -/** - * offload_get_phys_egress - find the physical egress device - * @root_dev: the root device anchoring the search - * @so: the socket used to determine egress port in bonding mode - * @context: in bonding mode, indicates a connection set up or failover - * - * Given a root network device it returns the physical egress device that is a - * descendant of the root device. The root device may be either a physical - * device, in which case it is the device returned, or a virtual device, such - * as a VLAN or bonding device. In case of a bonding device the search - * considers the decisions of the bonding device given its mode to locate the - * correct egress device. 
- */ -struct ifnet * -offload_get_phys_egress(struct ifnet *root_dev, struct socket *so, int context) -{ - -#if 0 - while (root_dev && ifnet_is_offload(root_dev)) { - if (root_dev->tod_priv_flags & IFF_802_1Q_VLAN) - root_dev = VLAN_DEV_INFO(root_dev)->real_dev; - else if (root_dev->tod_flags & IFF_MASTER) - root_dev = toe_bond_get_slave(root_dev, sk, context); - else - break; - } -#endif - return root_dev; -} - -static int -toecore_load(module_t mod, int cmd, void *arg) -{ - int err = 0; - - switch (cmd) { - case MOD_LOAD: - mtx_init(&offload_db_lock, "toedev lock", NULL, MTX_DEF); - TAILQ_INIT(&offload_dev_list); - TAILQ_INIT(&offload_module_list); - break; - case MOD_QUIESCE: - break; - case MOD_UNLOAD: - mtx_lock(&offload_db_lock); - if (!TAILQ_EMPTY(&offload_dev_list) || - !TAILQ_EMPTY(&offload_module_list)) { - err = EBUSY; - mtx_unlock(&offload_db_lock); - break; - } - mtx_unlock(&offload_db_lock); - mtx_destroy(&offload_db_lock); - break; - case MOD_SHUTDOWN: - break; - default: - err = EOPNOTSUPP; - break; - } - - return (err); -} - - -static moduledata_t mod_data= { - "toecore", - toecore_load, - 0 -}; - -MODULE_VERSION(toecore, 1); -DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index e019c61..16b5394 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -1,35 +1,35 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include @@ -42,22 +42,17 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include - -#if __FreeBSD_version < 800044 -#define V_tcp_do_autosndbuf tcp_do_autosndbuf -#define V_tcp_autosndbuf_max tcp_autosndbuf_max -#define V_tcp_do_rfc1323 tcp_do_rfc1323 -#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf -#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max -#define V_tcpstat tcpstat -#endif +#include +#include #include +#include #include #include @@ -65,37 +60,33 @@ __FBSDID("$FreeBSD$"); #include #include - -#include -#include - #include #include +#define TCPSTATES #include -#include +#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "cxgb_include.h" +#include "ulp/tom/cxgb_l2t.h" +#include "ulp/tom/cxgb_tom.h" +#include "ulp/tom/cxgb_toepcb.h" + +VNET_DECLARE(int, tcp_do_autosndbuf); +#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) +VNET_DECLARE(int, tcp_autosndbuf_inc); +#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) +VNET_DECLARE(int, tcp_autosndbuf_max); +#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +VNET_DECLARE(int, tcp_do_autorcvbuf); +#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) +VNET_DECLARE(int, tcp_autorcvbuf_inc); +#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) +VNET_DECLARE(int, tcp_autorcvbuf_max); +#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) +extern int always_keepalive; /* * For ULP connections HW may add headers, e.g., for digests, that aren't part @@ -108,29 +99,6 @@ __FBSDID("$FreeBSD$"); */ const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; -#ifdef notyet -/* - * This sk_buff holds a fake header-only TCP segment that we use whenever we - * need to exploit SW TCP functionality that expects TCP headers, such as - * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple - * CPUs without locking. - */ -static struct mbuf *tcphdr_mbuf __read_mostly; -#endif - -/* - * Size of WRs in bytes. Note that we assume all devices we are handling have - * the same WR size. - */ -static unsigned int wrlen __read_mostly; - -/* - * The number of WRs needed for an skb depends on the number of page fragments - * in the skb and whether it has any payload in its main body. This maps the - * length of the gather list represented by an skb into the # of necessary WRs. - */ -static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; - /* * Max receive window supported by HW in bytes. 
Only a small part of it can * be set through option0, the rest needs to be set through RX_DATA_ACK. @@ -144,3597 +112,1507 @@ static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; #define MIN_RCV_WND (24 * 1024U) #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) -#define VALIDATE_SEQ 0 -#define VALIDATE_SOCK(so) -#define DEBUG_WR 0 +static void t3_release_offload_resources(struct toepcb *); +static void send_reset(struct toepcb *toep); + +/* + * Called after the last CPL for the toepcb has been received. + * + * The inp must be wlocked on entry and is unlocked (or maybe destroyed) by the + * time this function exits. + */ +static int +toepcb_release(struct toepcb *toep) +{ + struct inpcb *inp = toep->tp_inp; + struct toedev *tod = toep->tp_tod; + struct tom_data *td = t3_tomdata(tod); + int rc; -#define TCP_TIMEWAIT 1 -#define TCP_CLOSE 2 -#define TCP_DROP 3 + INP_WLOCK_ASSERT(inp); + KASSERT(!(toep->tp_flags & TP_CPL_DONE), + ("%s: double release?", __func__)); -static void t3_send_reset(struct toepcb *toep); -static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); -static inline void free_atid(struct t3cdev *cdev, unsigned int tid); -static void handle_syncache_event(int event, void *arg); + CTR2(KTR_CXGB, "%s: tid %d", __func__, toep->tp_tid); -static inline void -SBAPPEND(struct sockbuf *sb, struct mbuf *n) + toep->tp_flags |= TP_CPL_DONE; + toep->tp_inp = NULL; + + mtx_lock(&td->toep_list_lock); + TAILQ_REMOVE(&td->toep_list, toep, link); + mtx_unlock(&td->toep_list_lock); + + if (!(toep->tp_flags & TP_ATTACHED)) + t3_release_offload_resources(toep); + + rc = in_pcbrele_wlocked(inp); + if (!rc) + INP_WUNLOCK(inp); + return (rc); +} + +/* + * One sided detach. The tcpcb is going away and we need to unhook the toepcb + * hanging off it. If the TOE driver is also done with the toepcb we'll release + * all offload resources. + */ +static void +toepcb_detach(struct inpcb *inp) { - struct mbuf *m; + struct toepcb *toep; + struct tcpcb *tp; - m = sb->sb_mb; - while (m) { - KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || - !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", - !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); - KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", - m->m_next, m->m_nextpkt, m->m_flags)); - m = m->m_next; - } - m = n; - while (m) { - KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || - !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", - !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); - KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", - m->m_next, m->m_nextpkt, m->m_flags)); - m = m->m_next; - } - KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); - sbappendstream_locked(sb, n); - m = sb->sb_mb; - - while (m) { - KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", - m->m_next, m->m_nextpkt, m->m_flags)); - m = m->m_next; - } + KASSERT(inp, ("%s: inp is NULL", __func__)); + INP_WLOCK_ASSERT(inp); + + tp = intotcpcb(inp); + toep = tp->t_toe; + + KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); + KASSERT(toep->tp_flags & TP_ATTACHED, ("%s: not attached", __func__)); + + CTR6(KTR_CXGB, "%s: %s %u, toep %p, inp %p, tp %p", __func__, + tp->t_state == TCPS_SYN_SENT ? 
"atid" : "tid", toep->tp_tid, + toep, inp, tp); + + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + toep->tp_flags &= ~TP_ATTACHED; + + if (toep->tp_flags & TP_CPL_DONE) + t3_release_offload_resources(toep); } -static inline int -is_t3a(const struct toedev *dev) +void +t3_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) +{ + + toepcb_detach(tp->t_inpcb); +} + +static int +alloc_atid(struct tid_info *t, void *ctx) { - return (dev->tod_ttid == TOE_ID_CHELSIO_T3); + int atid = -1; + + mtx_lock(&t->atid_lock); + if (t->afree) { + union active_open_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->ctx = ctx; + t->atids_in_use++; + } + mtx_unlock(&t->atid_lock); + + return (atid); } static void -dump_toepcb(struct toepcb *toep) +free_atid(struct tid_info *t, int atid) { - DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", - toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, - toep->tp_mtu_idx, toep->tp_tid); + union active_open_entry *p = atid2entry(t, atid); - DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", - toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, - toep->tp_mss_clamp, toep->tp_flags); + mtx_lock(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + mtx_unlock(&t->atid_lock); } -#ifndef RTALLOC2_DEFINED -static struct rtentry * -rtalloc2(struct sockaddr *dst, int report, u_long ignflags) +void +insert_tid(struct tom_data *td, void *ctx, unsigned int tid) { - struct rtentry *rt = NULL; - - if ((rt = rtalloc1(dst, report, ignflags)) != NULL) - RT_UNLOCK(rt); + struct tid_info *t = &td->tid_maps; - return (rt); + t->tid_tab[tid] = ctx; + atomic_add_int(&t->tids_in_use, 1); } -#endif -/* - * Determine whether to send a CPL message now or defer it. A message is - * deferred if the connection is in SYN_SENT since we don't know the TID yet. - * For connections in other states the message is sent immediately. - * If through_l2t is set the message is subject to ARP processing, otherwise - * it is sent directly. - */ -static inline void -send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) +void +update_tid(struct tom_data *td, void *ctx, unsigned int tid) { - struct tcpcb *tp = toep->tp_tp; - - if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { - inp_wlock(tp->t_inpcb); - mbufq_tail(&toep->out_of_order_queue, m); // defer - inp_wunlock(tp->t_inpcb); - } else if (through_l2t) - l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T - else - cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly + struct tid_info *t = &td->tid_maps; + + t->tid_tab[tid] = ctx; +} + +void +remove_tid(struct tom_data *td, unsigned int tid) +{ + struct tid_info *t = &td->tid_maps; + + t->tid_tab[tid] = NULL; + atomic_add_int(&t->tids_in_use, -1); } -static inline unsigned int -mkprio(unsigned int cntrl, const struct toepcb *toep) +/* use ctx as a next pointer in the tid release list */ +void +queue_tid_release(struct toedev *tod, unsigned int tid) { - return (cntrl); + struct tom_data *td = t3_tomdata(tod); + void **p = &td->tid_maps.tid_tab[tid]; + struct adapter *sc = tod->tod_softc; + + mtx_lock(&td->tid_release_lock); + *p = td->tid_release_list; + td->tid_release_list = p; + if (!*p) + taskqueue_enqueue(sc->tq, &td->tid_release_task); + mtx_unlock(&td->tid_release_lock); } /* - * Populate a TID_RELEASE WR. The skb must be already propely sized. + * Populate a TID_RELEASE WR. 
 */
 static inline void
-mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
+mk_tid_release(struct cpl_tid_release *cpl, unsigned int tid)
+{
+
+	cpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+void
+release_tid(struct toedev *tod, unsigned int tid, int qset)
+{
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct mbuf *m;
+	struct cpl_tid_release *cpl;
+#ifdef INVARIANTS
+	struct tid_info *t = &td->tid_maps;
+#endif
+
+	KASSERT(tid >= 0 && tid < t->ntids,
+	    ("%s: tid=%d, ntids=%d", __func__, tid, t->ntids));
+
+	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
+	if (m) {
+		mk_tid_release(cpl, tid);
+		t3_offload_tx(sc, m);
+		remove_tid(td, tid);
+	} else
+		queue_tid_release(tod, tid);
+
+}
+
+void
+t3_process_tid_release_list(void *data, int pending)
+{
+	struct mbuf *m;
+	struct tom_data *td = data;
+	struct adapter *sc = td->tod.tod_softc;
+
+	mtx_lock(&td->tid_release_lock);
+	while (td->tid_release_list) {
+		void **p = td->tid_release_list;
+		unsigned int tid = p - td->tid_maps.tid_tab;
+		struct cpl_tid_release *cpl;
+
+		td->tid_release_list = (void **)*p;
+		m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, cpl); /* qs 0 here */
+		if (m == NULL)
+			break;	/* XXX: who reschedules the release task? */
+		mtx_unlock(&td->tid_release_lock);
+		mk_tid_release(cpl, tid);
+		t3_offload_tx(sc, m);
+		remove_tid(td, tid);
+		mtx_lock(&td->tid_release_lock);
+	}
+	mtx_unlock(&td->tid_release_lock);
+}
+
+static void
+close_conn(struct adapter *sc, struct toepcb *toep)
 {
-	struct cpl_tid_release *req;
-
-	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req = mtod(m, struct cpl_tid_release *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+	struct mbuf *m;
+	struct cpl_close_con_req *req;
+
+	if (toep->tp_flags & TP_FIN_SENT)
+		return;
+
+	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req);
+	if (m == NULL)
+		CXGB_UNIMPLEMENTED();
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, toep->tp_tid));
+	req->rsvd = 0;
+
+	toep->tp_flags |= TP_FIN_SENT;
+	t3_offload_tx(sc, m);
 }
 
 static inline void
-make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
+make_tx_data_wr(struct socket *so, struct tx_data_wr *req, int len,
+    struct mbuf *tail)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
-	struct tx_data_wr *req;
 	struct sockbuf *snd;
-
+
 	inp_lock_assert(tp->t_inpcb);
 	snd = so_sockbuf_snd(so);
-
-	req = mtod(m, struct tx_data_wr *);
-	m->m_len = sizeof(*req);
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
-	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr.wrh_lo = htonl(V_WR_TID(toep->tp_tid));
 	/* len includes the length of any HW ULP additions */
 	req->len = htonl(len);
 	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 	/* V_TX_ULP_SUBMODE sets both the mode and submode */
-	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
-	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
-	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
-	    (tail ? 0 : 1))));
+	req->flags = htonl(V_TX_ULP_SUBMODE(toep->tp_ulp_mode) | V_TX_URG(0) |
	    V_TX_SHOVE(!(tp->t_flags & TF_MORETOCOME) && (tail ? 0 : 1)));
 	req->sndseq = htonl(tp->snd_nxt);
 	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
-		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
-		    V_TX_CPU_IDX(toep->tp_qset));
-
-		/* Sendbuffer is in units of 32KB.
-		 */
+		struct adapter *sc = toep->tp_tod->tod_softc;
+		int cpu_idx = sc->rrss_map[toep->tp_qset];
+
+		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+		    V_TX_CPU_IDX(cpu_idx));
+
+		/* Sendbuffer is in units of 32KB. */
 		if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
-			req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
-		else {
+			req->param |= htonl(V_TX_SNDBUF(VNET(tcp_autosndbuf_max) >> 15));
+		else
 			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
-		}
-
+
 		toep->tp_flags |= TP_DATASENT;
 	}
 }
 
-#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
+/*
+ * TOM_XXX_DUPLICATION sgl_len, calc_tx_descs, calc_tx_descs_ofld, mbuf_wrs, etc.
+ * TOM_XXX_MOVE to some common header file.
+ */
+/*
+ * IMM_LEN: # of bytes that can be tx'd as immediate data. There are 16 flits
+ * in a tx desc; subtract 3 for tx_data_wr (including the WR header), and 1 more
+ * for the second gen bit flit. This leaves us with 12 flits.
+ *
+ * descs_to_sgllen: # of SGL entries that can fit into the given # of tx descs.
+ * The first desc has a tx_data_wr (which includes the WR header), the rest have
+ * the WR header only. All descs have the second gen bit flit.
+ *
+ * sgllen_to_descs: # of tx descs used up by an sgl of given length. The first
+ * desc has a tx_data_wr (which includes the WR header), the rest have the WR
+ * header only. All descs have the second gen bit flit.
+ *
+ * flits_to_sgllen: # of SGL entries that can be fit in the given # of flits.
+ *
+ */
+#define IMM_LEN 96
+static int descs_to_sgllen[TX_MAX_DESC + 1] = {0, 8, 17, 26, 35};
+static int sgllen_to_descs[TX_MAX_SEGS] = {
+	0, 1, 1, 1, 1, 1, 1, 1, 1, 2,	/* 0 - 9 */
+	2, 2, 2, 2, 2, 2, 2, 2, 3, 3,	/* 10 - 19 */
+	3, 3, 3, 3, 3, 3, 3, 4, 4, 4,	/* 20 - 29 */
+	4, 4, 4, 4, 4, 4		/* 30 - 35 */
+};
+#if 0
+static int flits_to_sgllen[TX_DESC_FLITS + 1] = {
+	0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10
+};
+#endif
+#if SGE_NUM_GENBITS != 2
+#error "SGE_NUM_GENBITS really must be 2"
+#endif
 
 int
 t3_push_frames(struct socket *so, int req_completion)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
-
-	struct mbuf *tail, *m0, *last;
-	struct t3cdev *cdev;
-	struct tom_data *d;
-	int state, bytes, count, total_bytes;
-	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
+	struct mbuf *m0, *sndptr, *m;
+	struct toedev *tod = toep->tp_tod;
+	struct adapter *sc = tod->tod_softc;
+	int bytes, ndesc, total_bytes = 0, mlen;
 	struct sockbuf *snd;
-
-	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
-		DPRINTF("tcp state=%d\n", tp->t_state);
-		return (0);
-	}
-
-	state = so_state_get(so);
-
-	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
-		DPRINTF("disconnecting\n");
-
-		return (0);
-	}
+	struct sglist *sgl;
+	struct ofld_hdr *oh;
+	caddr_t dst;
+	struct tx_data_wr *wr;
 
 	inp_lock_assert(tp->t_inpcb);
 
 	snd = so_sockbuf_snd(so);
-	sockbuf_lock(snd);
+	SOCKBUF_LOCK(snd);
+
+	/*
+	 * Autosize the send buffer.
+	 */
+	if (snd->sb_flags & SB_AUTOSIZE && VNET(tcp_do_autosndbuf)) {
+		if (snd->sb_cc >= (snd->sb_hiwat / 8 * 7) &&
+		    snd->sb_cc < VNET(tcp_autosndbuf_max)) {
+			if (!sbreserve_locked(snd, min(snd->sb_hiwat +
+			    VNET(tcp_autosndbuf_inc), VNET(tcp_autosndbuf_max)),
+			    so, curthread))
+				snd->sb_flags &= ~SB_AUTOSIZE;
+		}
+	}
 
-	d = TOM_DATA(toep->tp_toedev);
-	cdev = d->cdev;
+	if (toep->tp_m_last && toep->tp_m_last == snd->sb_sndptr)
+		sndptr = toep->tp_m_last->m_next;
+	else
+		sndptr = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 
-	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
+	/* Nothing to send or no WRs available for sending data */
+	if (toep->tp_wr_avail == 0 || sndptr == NULL)
+		goto out;
 
-	total_bytes = 0;
-	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
-	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
+	/* Something to send and at least 1 WR available */
+	while (toep->tp_wr_avail && sndptr != NULL) {
 
-	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
-		KASSERT(tail, ("sbdrop error"));
-		last = tail = tail->m_next;
-	}
+		m0 = m_gethdr(M_NOWAIT, MT_DATA);
+		if (m0 == NULL)
+			break;
+		oh = mtod(m0, struct ofld_hdr *);
+		wr = (void *)(oh + 1);
+		dst = (void *)(wr + 1);
 
-	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
-		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
-		sockbuf_unlock(snd);
+		m0->m_pkthdr.len = m0->m_len = sizeof(*oh) + sizeof(*wr);
+		oh->flags = V_HDR_CTRL(CPL_PRIORITY_DATA) | F_HDR_DF |
+		    V_HDR_QSET(toep->tp_qset);
 
-		return (0);
-	}
-
-	toep->tp_m_last = NULL;
-	while (toep->tp_wr_avail && (tail != NULL)) {
-		count = bytes = 0;
-		segp = segs;
-		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
-			sockbuf_unlock(snd);
-			return (0);
-		}
 		/*
-		 * If the data in tail fits as in-line, then
-		 * make an immediate data wr.
+		 * Try to construct an immediate data WR if possible. Stuff as
+		 * much data into it as possible, one whole mbuf at a time.
 		 */
-		if (tail->m_len <= IMM_LEN) {
-			count = 1;
-			bytes = tail->m_len;
-			last = tail;
-			tail = tail->m_next;
-			m_set_sgl(m0, NULL);
-			m_set_sgllen(m0, 0);
-			make_tx_data_wr(so, m0, bytes, tail);
-			m_append(m0, bytes, mtod(last, caddr_t));
-			KASSERT(!m0->m_next, ("bad append"));
-		} else {
-			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
-			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
-				bytes += tail->m_len;
-				last = tail;
-				count++;
-				/*
-				 * technically an abuse to be using this for a VA
-				 * but less gross than defining my own structure
-				 * or calling pmap_kextract from here :-|
-				 */
-				segp->ds_addr = (bus_addr_t)tail->m_data;
-				segp->ds_len = tail->m_len;
-				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
-				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
-				segp++;
-				tail = tail->m_next;
-			}
-			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
-			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);
-
-			m_set_sgl(m0, segs);
-			m_set_sgllen(m0, count);
-			make_tx_data_wr(so, m0, bytes, tail);
+		mlen = sndptr->m_len;
+		ndesc = bytes = 0;
+		while (mlen <= IMM_LEN - bytes) {
+			bcopy(sndptr->m_data, dst, mlen);
+			bytes += mlen;
+			dst += mlen;
+
+			if (!(sndptr = sndptr->m_next))
+				break;
+			mlen = sndptr->m_len;
 		}
-		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 
-		if (tail) {
-			snd->sb_sndptr = tail;
-			toep->tp_m_last = NULL;
-		} else
-			toep->tp_m_last = snd->sb_sndptr = last;
+		if (bytes) {
+			/* Was able to fit 'bytes' bytes in an immediate WR */
 
-		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
+			ndesc = 1;
+			make_tx_data_wr(so, wr, bytes, sndptr);
 
-		snd->sb_sndptroff += bytes;
-		total_bytes += bytes;
-		toep->tp_write_seq += bytes;
-		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
-		    " tail=%p sndptr=%p sndptroff=%d",
-		    toep->tp_wr_avail, count, mbuf_wrs[count],
-		    tail, snd->sb_sndptr, snd->sb_sndptroff);
-		if (tail)
-			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
-			    " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
-			    total_bytes, toep->tp_m_last, tail->m_data,
-			    tp->snd_una);
-		else
-			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
-			    " tp_m_last=%p snd_una=0x%08x",
-			    total_bytes, toep->tp_m_last, tp->snd_una);
+			m0->m_len += bytes;
+			m0->m_pkthdr.len = m0->m_len;
+		} else {
+			int wr_avail = min(toep->tp_wr_avail, TX_MAX_DESC);
 
-#ifdef KTR
-{
-		int i;
-
-		i = 0;
-		while (i < count && m_get_sgllen(m0)) {
-			if ((count - i) >= 3) {
-				CTR6(KTR_TOM,
-				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
-				    " len=%d pa=0x%zx len=%d",
-				    segs[i].ds_addr, segs[i].ds_len,
-				    segs[i + 1].ds_addr, segs[i + 1].ds_len,
-				    segs[i + 2].ds_addr, segs[i + 2].ds_len);
-				i += 3;
-			} else if ((count - i) == 2) {
-				CTR4(KTR_TOM,
-				    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
-				    " len=%d",
-				    segs[i].ds_addr, segs[i].ds_len,
-				    segs[i + 1].ds_addr, segs[i + 1].ds_len);
-				i += 2;
-			} else {
-				CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
-				    segs[i].ds_addr, segs[i].ds_len);
-				i++;
-			}
-
-		}
-}
-#endif
-		/*
-		 * remember credits used
-		 */
-		m0->m_pkthdr.csum_data = mbuf_wrs[count];
-		m0->m_pkthdr.len = bytes;
-		toep->tp_wr_avail -= mbuf_wrs[count];
-		toep->tp_wr_unacked += mbuf_wrs[count];
-
-		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
-		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
-			struct work_request_hdr *wr = cplhdr(m0);
+			/* Need to make an SGL */
 
-			wr->wr_hi |= htonl(F_WR_COMPL);
-			toep->tp_wr_unacked = 0;
+			sgl = sglist_alloc(descs_to_sgllen[wr_avail], M_NOWAIT);
+			if (sgl == NULL)
+				break;
+
+			for (m = sndptr; m != NULL; m = m->m_next) {
+				if ((mlen = m->m_len) > 0) {
+					if (sglist_append(sgl, m->m_data, mlen))
+						break;
+				}
+				bytes += mlen;
+			}
+			sndptr = m;
+			if (bytes == 0) {
+				sglist_free(sgl);
+				break;
+			}
+			ndesc = sgllen_to_descs[sgl->sg_nseg];
+			oh->flags |= F_HDR_SGL;
+			oh->sgl = sgl;
+			make_tx_data_wr(so, wr, bytes, sndptr);
 		}
-		KASSERT((m0->m_pkthdr.csum_data > 0) &&
-		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
-		    m0->m_pkthdr.csum_data));
-		m0->m_type = MT_DONTFREE;
-		enqueue_wr(toep, m0);
-		DPRINTF("sending offload tx with %d bytes in %d segments\n",
-		    bytes, count);
-		l2t_send(cdev, m0, toep->tp_l2t);
-	}
-	sockbuf_unlock(snd);
-	return (total_bytes);
-}
 
-/*
- * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
- * under any circumstances. We take the easy way out and always queue the
- * message to the write_queue. We can optimize the case where the queue is
- * already empty though the optimization is probably not worth it.
- */
-static void
-close_conn(struct socket *so)
-{
-	struct mbuf *m;
-	struct cpl_close_con_req *req;
-	struct tom_data *d;
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct tcpcb *tp;
-	struct toepcb *toep;
-	unsigned int tid;
+		oh->flags |= V_HDR_NDESC(ndesc);
+		oh->plen = bytes;
 
-	inp_wlock(inp);
-	tp = so_sototcpcb(so);
-	toep = tp->t_toe;
-
-	if (tp->t_state != TCPS_SYN_SENT)
-		t3_push_frames(so, 1);
-
-	if (toep->tp_flags & TP_FIN_SENT) {
-		inp_wunlock(inp);
-		return;
-	}
+		snd->sb_sndptr = sndptr;
+		snd->sb_sndptroff += bytes;
+		if (sndptr == NULL) {
+			snd->sb_sndptr = snd->sb_mbtail;
+			snd->sb_sndptroff -= snd->sb_mbtail->m_len;
+			toep->tp_m_last = snd->sb_mbtail;
+		} else
+			toep->tp_m_last = NULL;
 
-	tid = toep->tp_tid;
-
-	d = TOM_DATA(toep->tp_toedev);
-
-	m = m_gethdr_nofail(sizeof(*req));
-	m_set_priority(m, CPL_PRIORITY_DATA);
-	m_set_sgl(m, NULL);
-	m_set_sgllen(m, 0);
+		total_bytes += bytes;
 
-	toep->tp_flags |= TP_FIN_SENT;
-	req = mtod(m, struct cpl_close_con_req *);
-
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
-	req->wr.wr_lo = htonl(V_WR_TID(tid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
-	req->rsvd = 0;
-	inp_wunlock(inp);
-	/*
-	 * XXX - need to defer shutdown while there is still data in the queue
-	 *
-	 */
-	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
-	cxgb_ofld_send(d->cdev, m);
+		toep->tp_wr_avail -= ndesc;
+		toep->tp_wr_unacked += ndesc;
 
-}
+		if ((req_completion && toep->tp_wr_unacked == ndesc) ||
+		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
+			wr->wr.wrh_hi |= htonl(F_WR_COMPL);
+			toep->tp_wr_unacked = 0;
+		}
 
-/*
- * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
- * and send it along.
- */
-static void
-abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
-{
-	struct cpl_abort_req *req = cplhdr(m);
+		enqueue_wr(toep, m0);
+		l2t_send(sc, m0, toep->tp_l2t);
+	}
+out:
+	SOCKBUF_UNLOCK(snd);
 
-	req->cmd = CPL_ABORT_NO_RST;
-	cxgb_ofld_send(cdev, m);
+	if (sndptr == NULL && (toep->tp_flags & TP_SEND_FIN))
+		close_conn(sc, toep);
+
+	return (total_bytes);
 }
 
-/*
- * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
- * permitted to return without sending the message in case we cannot allocate
- * an sk_buff. Returns the number of credits sent.
- */
-uint32_t
-t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
+static int
+send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct mbuf *m;
 	struct cpl_rx_data_ack *req;
-	struct toepcb *toep = tp->t_toe;
-	struct toedev *tdev = toep->tp_toedev;
-
-	m = m_gethdr_nofail(sizeof(*req));
+	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
-	DPRINTF("returning %u credits to HW\n", credits);
-
-	req = mtod(m, struct cpl_rx_data_ack *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
+	m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_CONTROL, req);
+	if (m == NULL)
+		return (0);
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	req->wr.wrh_lo = 0;
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
-	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
-	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
+	t3_offload_tx(sc, m);
 	return (credits);
 }
 
-/*
- * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
- * This is only used in DDP mode, so we take the opportunity to also set the
- * DACK mode and flush any Rx credits.
- */
 void
-t3_send_rx_modulate(struct toepcb *toep)
+t3_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
-	struct mbuf *m;
-	struct cpl_rx_data_ack *req;
-
-	m = m_gethdr_nofail(sizeof(*req));
-
-	req = mtod(m, struct cpl_rx_data_ack *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
-	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
-	    V_RX_DACK_MODE(1) |
-	    V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-	toep->tp_rcv_wup = toep->tp_copied_seq;
-}
+	struct adapter *sc = tod->tod_softc;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	struct sockbuf *so_rcv = &so->so_rcv;
+	struct toepcb *toep = tp->t_toe;
+	int must_send;
 
-/*
- * Handle receipt of an urgent pointer.
- */
-static void
-handle_urg_ptr(struct socket *so, uint32_t urg_seq)
-{
-#ifdef URGENT_DATA_SUPPORTED
-	struct tcpcb *tp = so_sototcpcb(so);
+	INP_WLOCK_ASSERT(inp);
 
-	urg_seq--;	/* initially points past the urgent data, per BSD */
+	SOCKBUF_LOCK(so_rcv);
+	KASSERT(toep->tp_enqueued >= so_rcv->sb_cc,
+	    ("%s: so_rcv->sb_cc > enqueued", __func__));
+	toep->tp_rx_credits += toep->tp_enqueued - so_rcv->sb_cc;
+	toep->tp_enqueued = so_rcv->sb_cc;
+	SOCKBUF_UNLOCK(so_rcv);
 
-	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
-		return;	/* duplicate pointer */
-	sk_send_sigurg(sk);
-	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
-	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
-		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+	must_send = toep->tp_rx_credits + 16384 >= tp->rcv_wnd;
+	if (must_send || toep->tp_rx_credits >= 15 * 1024) {
+		int credits;
 
-		tp->copied_seq++;
-		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
-			tom_eat_skb(sk, skb, 0);
+		credits = send_rx_credits(sc, toep, toep->tp_rx_credits);
+		toep->tp_rx_credits -= credits;
+		tp->rcv_wnd += credits;
+		tp->rcv_adv += credits;
 	}
-	tp->urg_data = TCP_URG_NOTYET;
-	tp->urg_seq = urg_seq;
-#endif
-}
-
-/*
- * Returns true if a socket cannot accept new Rx data.
- */
-static inline int
-so_no_receive(const struct socket *so)
-{
-	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}
 
-/*
- * Process an urgent data notification.
- */
-static void
-rx_urg_notify(struct toepcb *toep, struct mbuf *m)
+static int
+do_rx_urg_notify(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	struct cpl_rx_urg_notify *hdr = cplhdr(m);
-	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_rx_urg_notify *hdr = mtod(m, void *);
+	unsigned int tid = GET_TID(hdr);
+	struct toepcb *toep = lookup_tid(&td->tid_maps, tid);
 
-	VALIDATE_SOCK(so);
-
-	if (!so_no_receive(so))
-		handle_urg_ptr(so, ntohl(hdr->seq));
+	log(LOG_ERR, "%s: tid %u inp %p", __func__, tid, toep->tp_inp);
 
 	m_freem(m);
+	return (0);
 }
 
-/*
- * Handler for RX_URG_NOTIFY CPL messages.
- */
-static int
-do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-
-	rx_urg_notify(toep, m);
-	return (0);
-}
-
-static __inline int
-is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
-{
-	return (toep->tp_ulp_mode ||
-	    (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
-	    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
-}
-
-/*
- * Set of states for which we should return RX credits.
- */
-#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
-
-/*
- * Called after some received data has been read. It returns RX credits
- * to the HW for the amount of data processed.
- */
-void
-t3_cleanup_rbuf(struct tcpcb *tp, int copied)
+int
+t3_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep = tp->t_toe;
-	struct socket *so;
-	struct toedev *dev;
-	int dack_mode, must_send, read;
-	u32 thres, credits, dack = 0;
-	struct sockbuf *rcv;
-
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);
-
-	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
-	    (tp->t_state == TCPS_FIN_WAIT_2))) {
-		if (copied) {
-			sockbuf_lock(rcv);
-			toep->tp_copied_seq += copied;
-			sockbuf_unlock(rcv);
-		}
-
-		return;
-	}
-
-	inp_lock_assert(tp->t_inpcb);
-
-	sockbuf_lock(rcv);
-	if (copied)
-		toep->tp_copied_seq += copied;
-	else {
-		read = toep->tp_enqueued_bytes - rcv->sb_cc;
-		toep->tp_copied_seq += read;
-	}
-	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
-	toep->tp_enqueued_bytes = rcv->sb_cc;
-	sockbuf_unlock(rcv);
-
-	if (credits > rcv->sb_mbmax) {
-		log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
-		    toep->tp_copied_seq, toep->tp_rcv_wup, credits);
-		credits = rcv->sb_mbmax;
-	}
-
-
-	/*
-	 * XXX this won't accurately reflect credit return - we need
-	 * to look at the difference between the amount that has been
-	 * put in the recv sockbuf and what is there now
-	 */
-
-	if (__predict_false(!credits))
-		return;
-
-	dev = toep->tp_toedev;
-	thres = TOM_TUNABLE(dev, rx_credit_thres);
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp_inpcbtosocket(inp);
+#if defined(KTR)
+	unsigned int tid = toep->tp_tid;
+#endif
 
-	if (__predict_false(thres == 0))
-		return;
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
 
-	if (is_delack_mode_valid(dev, toep)) {
-		dack_mode = TOM_TUNABLE(dev, delack);
-		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
-			u32 r = tp->rcv_nxt - toep->tp_delack_seq;
+	CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep,
+	    toep->tp_flags);
 
-			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
-				dack = F_RX_DACK_CHANGE |
-				    V_RX_DACK_MODE(dack_mode);
-		}
-	} else
-		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
-
-	/*
-	 * For coalescing to work effectively ensure the receive window has
-	 * at least 16KB left.
-	 */
-	must_send = credits + 16384 >= tp->rcv_wnd;
+	toep->tp_flags |= TP_SEND_FIN;
+	t3_push_frames(so, 1);
 
-	if (must_send || credits >= thres)
-		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
+	return (0);
 }
 
-static int
-cxgb_toe_disconnect(struct tcpcb *tp)
+int
+t3_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
-	struct socket *so;
-
-	DPRINTF("cxgb_toe_disconnect\n");
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
 
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	close_conn(so);
+	t3_push_frames(so, 1);
+
 	return (0);
 }
 
-static int
-cxgb_toe_reset(struct tcpcb *tp)
+/* What mtu_idx to use, given a 4-tuple and/or an MSS cap */
+int
+find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
 {
-	struct toepcb *toep = tp->t_toe;
+	unsigned short *mtus = &sc->params.mtus[0];
+	int i = 0, mss;
 
-	t3_send_reset(toep);
+	KASSERT(inc != NULL || pmss > 0,
+	    ("%s: at least one of inc/pmss must be specified", __func__));
 
-	/*
-	 * unhook from socket
-	 */
-	tp->t_flags &= ~TF_TOE;
-	toep->tp_tp = NULL;
-	tp->t_toe = NULL;
-	return (0);
-}
+	mss = inc ? tcp_mssopt(inc) : pmss;
+	if (pmss > 0 && mss > pmss)
+		mss = pmss;
 
-static int
-cxgb_toe_send(struct tcpcb *tp)
-{
-	struct socket *so;
-
-	DPRINTF("cxgb_toe_send\n");
-	dump_toepcb(tp->t_toe);
+	while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40)
+		++i;
 
-	so = inp_inpcbtosocket(tp->t_inpcb);
-	t3_push_frames(so, 1);
-	return (0);
+	return (i);
 }
 
-static int
-cxgb_toe_rcvd(struct tcpcb *tp)
+static inline void
+purge_wr_queue(struct toepcb *toep)
 {
+	struct mbuf *m;
+	struct ofld_hdr *oh;
 
-	inp_lock_assert(tp->t_inpcb);
-
-	t3_cleanup_rbuf(tp, 0);
-
-	return (0);
+	while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) {
+		oh = mtod(m, struct ofld_hdr *);
+		if (oh->flags & F_HDR_SGL)
+			sglist_free(oh->sgl);
+		m_freem(m);
+	}
 }
 
+/*
+ * Release cxgb(4) and T3 resources held by an offload connection (TID, L2T
+ * entry, etc.)
+ */
 static void
-cxgb_toe_detach(struct tcpcb *tp)
+t3_release_offload_resources(struct toepcb *toep)
 {
-	struct toepcb *toep;
-
-	/*
-	 * XXX how do we handle teardown in the SYN_SENT state?
-	 *
-	 */
-	inp_lock_assert(tp->t_inpcb);
-	toep = tp->t_toe;
-	toep->tp_tp = NULL;
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
 
 	/*
-	 * unhook from socket
+	 * The TOM explicitly detaches its toepcb from the system's inp before
+	 * it releases the offload resources.
 	 */
-	tp->t_flags &= ~TF_TOE;
-	tp->t_toe = NULL;
-}
-
+	if (toep->tp_inp) {
+		panic("%s: inp %p still attached to toepcb %p",
+		    __func__, toep->tp_inp, toep);
+	}
 
-static struct toe_usrreqs cxgb_toe_usrreqs = {
-	.tu_disconnect = cxgb_toe_disconnect,
-	.tu_reset = cxgb_toe_reset,
-	.tu_send = cxgb_toe_send,
-	.tu_rcvd = cxgb_toe_rcvd,
-	.tu_detach = cxgb_toe_detach,
-	.tu_detach = cxgb_toe_detach,
-	.tu_syncache_event = handle_syncache_event,
-};
+	if (toep->tp_wr_avail != toep->tp_wr_max)
+		purge_wr_queue(toep);
+
+	if (toep->tp_l2t) {
+		l2t_release(td->l2t, toep->tp_l2t);
+		toep->tp_l2t = NULL;
+	}
+
+	if (toep->tp_tid >= 0)
+		release_tid(tod, toep->tp_tid, toep->tp_qset);
 
-static void
-__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
-    uint64_t mask, uint64_t val, int no_reply)
+	toepcb_free(toep);
+}
+
+/*
+ * Determine the receive window size for a socket.
+ */
+unsigned long
+select_rcv_wnd(struct socket *so)
 {
-	struct cpl_set_tcb_field *req;
+	unsigned long wnd;
 
-	CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
-	    toep->tp_tid, word, mask, val);
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
-	req = mtod(m, struct cpl_set_tcb_field *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
-	req->reply = V_NO_REPLY(no_reply);
-	req->cpu_idx = 0;
-	req->word = htons(word);
-	req->mask = htobe64(mask);
-	req->val = htobe64(val);
+	wnd = sbspace(&so->so_rcv);
+	if (wnd < MIN_RCV_WND)
+		wnd = MIN_RCV_WND;
 
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	send_or_defer(toep, m, 0);
+	return min(wnd, MAX_RCV_WND);
 }
 
-static void
-t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
+int
+select_rcv_wscale(void)
 {
-	struct mbuf *m;
-	struct tcpcb *tp = toep->tp_tp;
-
-	if (toep == NULL)
-		return;
-
-	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
-		printf("not seting field\n");
-		return;
-	}
-
-	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
+	int wscale = 0;
+	unsigned long space = sb_max;
 
-	__set_tcb_field(toep, m, word, mask, val, 1);
-}
+	if (space > MAX_RCV_WND)
+		space = MAX_RCV_WND;
 
-/*
- * Set one of the t_flags bits in the TCB.
- */
-static void
-set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
-{
+	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
+		wscale++;
 
-	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
+	return (wscale);
 }
 
-/*
- * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
- */
-static void
-t3_set_nagle(struct toepcb *toep)
-{
-	struct tcpcb *tp = toep->tp_tp;
-
-	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
-}
 
 /*
- * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
+ * Set up the socket for TCP offload.
 */
 void
-t3_set_keepalive(struct toepcb *toep, int on_off)
+offload_socket(struct socket *so, struct toepcb *toep)
 {
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
 
-	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
-}
+	INP_WLOCK_ASSERT(inp);
 
-void
-t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
-{
-	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
+	/* Update socket */
+	SOCKBUF_LOCK(&so->so_snd);
+	so_sockbuf_snd(so)->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(&so->so_snd);
+	SOCKBUF_LOCK(&so->so_rcv);
+	so_sockbuf_rcv(so)->sb_flags |= SB_NOCOALESCE;
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	/* Update TCP PCB */
+	tp->tod = toep->tp_tod;
+	tp->t_toe = toep;
+	tp->t_flags |= TF_TOE;
+
+	/* Install an extra hold on inp */
+	toep->tp_inp = inp;
+	toep->tp_flags |= TP_ATTACHED;
+	in_pcbref(inp);
+
+	/* Add the TOE PCB to the active list */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
 }
 
+/* This is _not_ the normal way to "unoffload" a socket. */
 void
-t3_set_dack_mss(struct toepcb *toep, int on_off)
+undo_offload_socket(struct socket *so)
 {
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep = tp->t_toe;
+	struct toedev *tod = toep->tp_tod;
+	struct tom_data *td = t3_tomdata(tod);
 
-	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
+	INP_WLOCK_ASSERT(inp);
+
+	so_sockbuf_snd(so)->sb_flags &= ~SB_NOCOALESCE;
+	so_sockbuf_rcv(so)->sb_flags &= ~SB_NOCOALESCE;
+
+	tp->tod = NULL;
+	tp->t_toe = NULL;
+	tp->t_flags &= ~TF_TOE;
+
+	toep->tp_inp = NULL;
+	toep->tp_flags &= ~TP_ATTACHED;
+	if (in_pcbrele_wlocked(inp))
+		panic("%s: inp freed.", __func__);
+
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_REMOVE(&td->toep_list, toep, link);
+	mtx_unlock(&td->toep_list_lock);
 }
 
 /*
- * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
+ * Socket could be a listening socket, and we may not have a toepcb at all at
+ * this time.
  */
-static void
-t3_set_tos(struct toepcb *toep)
+uint32_t
+calc_opt0h(struct socket *so, int mtu_idx, int rscale, struct l2t_entry *e)
 {
-	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
-
-	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
-	    V_TCB_TOS(tos));
-}
+	uint32_t opt0h = F_TCAM_BYPASS | V_WND_SCALE(rscale) |
+	    V_MSS_IDX(mtu_idx);
 
+	if (so != NULL) {
+		struct inpcb *inp = sotoinpcb(so);
+		struct tcpcb *tp = intotcpcb(inp);
+		int keepalive = always_keepalive ||
+		    so_options_get(so) & SO_KEEPALIVE;
 
-/*
- * In DDP mode, TP fails to schedule a timer to push RX data to the host when
- * DDP is disabled (data is delivered to freelist). [Note that, the peer should
- * set the PSH bit in the last segment, which would trigger delivery.]
- * We work around the issue by setting a DDP buffer in a partial placed state,
- * which guarantees that TP will schedule a timer.
- */
-#define TP_DDP_TIMER_WORKAROUND_MASK\
-    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
-     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
-       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
-#define TP_DDP_TIMER_WORKAROUND_VAL\
-    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
-     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
-      32))
+		opt0h |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
+		opt0h |= V_KEEP_ALIVE(keepalive != 0);
+	}
 
-static void
-t3_enable_ddp(struct toepcb *toep, int on)
+	if (e != NULL)
+		opt0h |= V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx);
+
+	return (htobe32(opt0h));
+}
+
+uint32_t
+calc_opt0l(struct socket *so, int rcv_bufsize)
 {
-	if (on) {
-
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
-		    V_TF_DDP_OFF(0));
-	} else
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
-		    V_TF_DDP_OFF(1) |
-		    TP_DDP_TIMER_WORKAROUND_MASK,
-		    V_TF_DDP_OFF(1) |
-		    TP_DDP_TIMER_WORKAROUND_VAL);
+	uint32_t opt0l = V_ULP_MODE(ULP_MODE_NONE) | V_RCV_BUFSIZ(rcv_bufsize);
+
+	KASSERT(rcv_bufsize <= M_RCV_BUFSIZ,
+	    ("%s: rcv_bufsize (%d) is too high", __func__, rcv_bufsize));
+
+	if (so != NULL)		/* optional because no one cares about IP TOS */
+		opt0l |= V_TOS(INP_TOS(sotoinpcb(so)));
+
+	return (htobe32(opt0l));
 }
 
-void
-t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
+ */
+static int
+act_open_rpl_status_to_errno(int status)
 {
-	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
-	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
-	    tag_color);
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return (ECONNREFUSED);
+	case CPL_ERR_ARP_MISS:
+		return (EHOSTUNREACH);
+	case CPL_ERR_CONN_TIMEDOUT:
+		return (ETIMEDOUT);
+	case CPL_ERR_TCAM_FULL:
+		return (ENOMEM);
+	case CPL_ERR_CONN_EXIST:
+		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+		return (EADDRINUSE);
+	default:
+		return (EIO);
+	}
 }
 
-void
-t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
-    unsigned int len)
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int
+act_open_has_tid(int status)
 {
-	if (buf_idx == 0)
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
-		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
-		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
-		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
-		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
-	else
-		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
-		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
-		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
-		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
-		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+	    status != CPL_ERR_ARP_MISS;
 }
 
+/*
+ * Active open failed.
+ */
 static int
-t3_set_cong_control(struct socket *so, const char *name)
-{
-#ifdef CONGESTION_CONTROL_SUPPORTED
-	int cong_algo;
+do_act_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	struct cpl_act_open_rpl *rpl = mtod(m, void *);
+	unsigned int atid = G_TID(ntohl(rpl->atid));
+	struct toepcb *toep = lookup_atid(&td->tid_maps, atid);
+	struct inpcb *inp = toep->tp_inp;
+	struct tcpcb *tp = intotcpcb(inp);
+	int s = rpl->status;
 
-	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
-		if (!strcmp(name, t3_cong_ops[cong_algo].name))
-			break;
+	CTR3(KTR_CXGB, "%s: atid %u, status %u ", __func__, atid, s);
 
-	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
-		return -EINVAL;
-#endif
-	return 0;
+	free_atid(&td->tid_maps, atid);
+	toep->tp_tid = -1;
+
+	if (act_open_has_tid(s))
+		queue_tid_release(tod, GET_TID(rpl));
+
+	if (s == CPL_ERR_TCAM_FULL || s == CPL_ERR_CONN_EXIST) {
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, EAGAIN);
+		toepcb_release(toep);	/* unlocks inp */
+	} else {
+		INP_INFO_WLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(s));
+		toepcb_release(toep);	/* unlocks inp */
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	}
+
+	m_freem(m);
+	return (0);
 }
 
+/*
+ * Send an active open request.
+ *
+ * State of affairs on entry:
+ * soisconnecting (so_state |= SS_ISCONNECTING)
+ * tcbinfo not locked (this has changed - used to be WLOCKed)
+ * inp WLOCKed
+ * tp->t_state = TCPS_SYN_SENT
+ * rtalloc1, RT_UNLOCK on rt.
+ */
 int
-t3_get_tcb(struct toepcb *toep)
+t3_connect(struct toedev *tod, struct socket *so,
+    struct rtentry *rt, struct sockaddr *nam)
 {
-	struct cpl_get_tcb *req;
-	struct tcpcb *tp = toep->tp_tp;
-	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
+	struct mbuf *m = NULL;
+	struct l2t_entry *e = NULL;
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct cpl_act_open_req *cpl;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep;
+	int atid = -1, mtu_idx, rscale, cpu_idx, qset;
+	struct sockaddr *gw;
+	struct ifnet *ifp = rt->rt_ifp;
+	struct port_info *pi = ifp->if_softc;	/* XXX wrong for VLAN etc. */
 
-	if (!m)
-		return (ENOMEM);
-
-	inp_lock_assert(tp->t_inpcb);
-	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-	req = mtod(m, struct cpl_get_tcb *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
-	req->cpuno = htons(toep->tp_qset);
-	req->rsvd = 0;
-	if (tp->t_state == TCPS_SYN_SENT)
-		mbufq_tail(&toep->out_of_order_queue, m);	// defer
-	else
-		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
-	return 0;
-}
+	INP_WLOCK_ASSERT(inp);
 
-static inline void
-so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
-{
+	toep = toepcb_alloc(tod);
+	if (toep == NULL)
+		goto failed;
 
-	toepcb_hold(toep);
+	atid = alloc_atid(&td->tid_maps, toep);
+	if (atid < 0)
+		goto failed;
 
-	cxgb_insert_tid(d->cdev, d->client, toep, tid);
-}
+	qset = pi->first_qset + (arc4random() % pi->nqsets);
 
-/**
- * find_best_mtu - find the entry in the MTU table closest to an MTU
- * @d: TOM state
- * @mtu: the target MTU
- *
- * Returns the index of the value in the MTU table that is closest to but
- * does not exceed the target MTU.
- */
-static unsigned int
-find_best_mtu(const struct t3c_data *d, unsigned short mtu)
-{
-	int i = 0;
+	m = M_GETHDR_OFLD(qset, CPL_PRIORITY_CONTROL, cpl);
+	if (m == NULL)
+		goto failed;
 
-	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
-		++i;
-	return (i);
-}
+	gw = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam;
+	e = t3_l2t_get(pi, ifp, gw);
+	if (e == NULL)
+		goto failed;
 
-static unsigned int
-select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
-{
-	unsigned int idx;
-
-#ifdef notyet
-	struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
-#endif
-	if (tp) {
-		tp->t_maxseg = pmtu - 40;
-		if (tp->t_maxseg < td->mtus[0] - 40)
-			tp->t_maxseg = td->mtus[0] - 40;
-		idx = find_best_mtu(td, tp->t_maxseg + 40);
+	toep->tp_l2t = e;
+	toep->tp_tid = atid;	/* used to double check response */
+	toep->tp_qset = qset;
 
-		tp->t_maxseg = td->mtus[idx] - 40;
-	} else
-		idx = find_best_mtu(td, pmtu);
-
-	return (idx);
-}
+	SOCKBUF_LOCK(&so->so_rcv);
+	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
+	toep->tp_rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	SOCKBUF_UNLOCK(&so->so_rcv);
 
-static inline void
-free_atid(struct t3cdev *cdev, unsigned int tid)
-{
-	struct toepcb *toep = cxgb_free_atid(cdev, tid);
+	offload_socket(so, toep);
 
-	if (toep)
-		toepcb_release(toep);
+	/*
+	 * The kernel sets request_r_scale based on sb_max whereas we need to
+	 * take hardware's MAX_RCV_WND into account too. This is normally a
+	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
+	 */
+	if (tp->t_flags & TF_REQ_SCALE)
+		rscale = tp->request_r_scale = select_rcv_wscale();
+	else
+		rscale = 0;
+	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
+	cpu_idx = sc->rrss_map[qset];
+
+	cpl->wr.wrh_hi = htobe32(V_WR_OP(FW_WROPCODE_FORWARD));
+	cpl->wr.wrh_lo = 0;
+	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
+	inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip,
+	    &cpl->peer_port);
+	cpl->opt0h = calc_opt0h(so, mtu_idx, rscale, e);
+	cpl->opt0l = calc_opt0l(so, toep->tp_rx_credits);
+	cpl->params = 0;
+	cpl->opt2 = calc_opt2(cpu_idx);
+
+	CTR5(KTR_CXGB, "%s: atid %u (%s), toep %p, inp %p", __func__,
+	    toep->tp_tid, tcpstates[tp->t_state], toep, inp);
+
+	if (l2t_send(sc, m, e) == 0)
+		return (0);
+
+	undo_offload_socket(so);
+
+failed:
+	CTR5(KTR_CXGB, "%s: FAILED, atid %d, toep %p, l2te %p, mbuf %p",
+	    __func__, atid, toep, e, m);
+
+	if (atid >= 0)
+		free_atid(&td->tid_maps, atid);
+
+	if (e)
+		l2t_release(td->l2t, e);
+
 	if (toep)
-		toepcb_release(toep);
+		toepcb_free(toep);
+
+	m_freem(m);
+
+	return (ENOMEM);
 }
 
 /*
- * Release resources held by an offload connection (TID, L2T entry, etc.)
+ * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do not
+ * send multiple ABORT_REQs for the same connection and also that we do not try
+ * to send a message after the connection has closed.
 */
 static void
-t3_release_offload_resources(struct toepcb *toep)
+send_reset(struct toepcb *toep)
 {
-	struct tcpcb *tp = toep->tp_tp;
-	struct toedev *tdev = toep->tp_toedev;
-	struct t3cdev *cdev;
-	struct socket *so;
+
+	struct cpl_abort_req *req;
 	unsigned int tid = toep->tp_tid;
-	struct sockbuf *rcv;
-
-	CTR0(KTR_TOM, "t3_release_offload_resources");
+	struct inpcb *inp = toep->tp_inp;
+	struct socket *so = inp->inp_socket;
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toedev *tod = toep->tp_tod;
+	struct adapter *sc = tod->tod_softc;
+	struct mbuf *m;
 
-	if (!tdev)
-		return;
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(inp);
 
-	cdev = TOEP_T3C_DEV(toep);
-	if (!cdev)
+	CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep,
+	    toep->tp_flags);
+
+	if (toep->tp_flags & TP_ABORT_SHUTDOWN)
 		return;
 
-	toep->tp_qset = 0;
-	t3_release_ddp_resources(toep);
+	toep->tp_flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
 
-#ifdef CTRL_SKB_CACHE
-	kfree_skb(CTRL_SKB_CACHE(tp));
-	CTRL_SKB_CACHE(tp) = NULL;
-#endif
+	/* Purge the send queue */
+	sbflush(so_sockbuf_snd(so));
+	purge_wr_queue(toep);
 
-	if (toep->tp_wr_avail != toep->tp_wr_max) {
-		purge_wr_queue(toep);
-		reset_wr_list(toep);
-	}
-
-	if (toep->tp_l2t) {
-		l2t_release(L2DATA(cdev), toep->tp_l2t);
-		toep->tp_l2t = NULL;
-	}
-	toep->tp_tp = NULL;
-	if (tp) {
-		inp_lock_assert(tp->t_inpcb);
-		so = inp_inpcbtosocket(tp->t_inpcb);
-		rcv = so_sockbuf_rcv(so);
-		/*
-		 * cancel any offloaded reads
-		 *
-		 */
-		sockbuf_lock(rcv);
-		tp->t_toe = NULL;
-		tp->t_flags &= ~TF_TOE;
-		if (toep->tp_ddp_state.user_ddp_pending) {
-			t3_cancel_ubuf(toep, rcv);
-			toep->tp_ddp_state.user_ddp_pending = 0;
-		}
-		so_sorwakeup_locked(so);
-
-	}
-
-	if (toep->tp_state == TCPS_SYN_SENT) {
-		free_atid(cdev, tid);
-#ifdef notyet
-		__skb_queue_purge(&tp->out_of_order_queue);
-#endif
-	} else {	// we have TID
-		cxgb_remove_tid(cdev, toep, tid);
-		toepcb_release(toep);
-	}
-#if 0
-	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
-#endif
-}
-
-static void
-install_offload_ops(struct socket *so)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-
-	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
-
-	t3_install_socket_ops(so);
-	tp->t_flags |= TF_TOE;
-	tp->t_tu = &cxgb_toe_usrreqs;
-}
-
-/*
- * Determine the receive window scaling factor given a target max
- * receive window.
- */
-static __inline int
-select_rcv_wscale(int space, struct vnet *vnet)
-{
-	int wscale = 0;
-
-	if (space > MAX_RCV_WND)
-		space = MAX_RCV_WND;
-
-	if (V_tcp_do_rfc1323)
-		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
-
-	return (wscale);
-}
-
-/*
- * Determine the receive window size for a socket.
- */
-static unsigned long
-select_rcv_wnd(struct toedev *dev, struct socket *so)
-{
-	struct tom_data *d = TOM_DATA(dev);
-	unsigned int wnd;
-	unsigned int max_rcv_wnd;
-	struct sockbuf *rcv;
-
-	rcv = so_sockbuf_rcv(so);
-
-	if (V_tcp_do_autorcvbuf)
-		wnd = V_tcp_autorcvbuf_max;
-	else
-		wnd = rcv->sb_hiwat;
-
-
-
-	/* XXX
-	 * For receive coalescing to work effectively we need a receive window
-	 * that can accomodate a coalesced segment.
-	 */
-	if (wnd < MIN_RCV_WND)
-		wnd = MIN_RCV_WND;
-
-	/* PR 5138 */
-	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
-	    (uint32_t)d->rx_page_size * 23 :
-	    MAX_RCV_WND);
-
-	return min(wnd, max_rcv_wnd);
-}
-
-/*
- * Assign offload parameters to some socket fields. This code is used by
- * both active and passive opens.
- */
-static inline void
-init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
-    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
-	struct sockbuf *snd, *rcv;
-
-#ifdef notyet
-	SOCK_LOCK_ASSERT(so);
-#endif
-
-	snd = so_sockbuf_snd(so);
-	rcv = so_sockbuf_rcv(so);
-
-	log(LOG_INFO, "initializing offload socket\n");
-	/*
-	 * We either need to fix push frames to work with sbcompress
-	 * or we need to add this
-	 */
-	snd->sb_flags |= SB_NOCOALESCE;
-	rcv->sb_flags |= SB_NOCOALESCE;
-
-	tp->t_toe = toep;
-	toep->tp_tp = tp;
-	toep->tp_toedev = dev;
-
-	toep->tp_tid = tid;
-	toep->tp_l2t = e;
-	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
-	toep->tp_wr_unacked = 0;
-	toep->tp_delack_mode = 0;
-
-	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
-	/*
-	 * XXX broken
-	 *
-	 */
-	tp->rcv_wnd = select_rcv_wnd(dev, so);
-
-	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
-	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
-	toep->tp_qset_idx = 0;
-
-	reset_wr_list(toep);
-	DPRINTF("initialization done\n");
-}
-
-/*
- * The next two functions calculate the option 0 value for a socket.
- */
-static inline unsigned int
-calc_opt0h(struct socket *so, int mtu_idx)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	int wscale = select_rcv_wscale(tp->rcv_wnd, so->so_vnet);
-
-	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
-	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
-	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
-}
-
-static inline unsigned int
-calc_opt0l(struct socket *so, int ulp_mode)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	unsigned int val;
-
-	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
-	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
-
-	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
-	return (val);
-}
-
-static inline unsigned int
-calc_opt2(const struct socket *so, struct toedev *dev)
-{
-	int flv_valid;
-
-	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
-
-	return (V_FLAVORS_VALID(flv_valid) |
-	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
-}
-
-#if DEBUG_WR > 1
-static int
-count_pending_wrs(const struct toepcb *toep)
-{
-	const struct mbuf *m;
-	int n = 0;
-
-	wr_queue_walk(toep, m)
-		n += m->m_pkthdr.csum_data;
-	return (n);
-}
-#endif
-
-#if 0
-(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
-#endif
-
-static void
-mk_act_open_req(struct socket *so, struct mbuf *m,
-    unsigned int atid, const struct l2t_entry *e)
-{
-	struct cpl_act_open_req *req;
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
-	struct toepcb *toep = tp->t_toe;
-	struct toedev *tdev = toep->tp_toedev;
-
-	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
-
-	req = mtod(m, struct cpl_act_open_req *);
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
-	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
-#if 0
-	req->local_port = inp->inp_lport;
-	req->peer_port = inp->inp_fport;
-	memcpy(&req->local_ip, &inp->inp_laddr, 4);
-	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
-#endif
-	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
-	    V_TX_CHANNEL(e->smt_idx));
-	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
-	req->params = 0;
-	req->opt2 = htonl(calc_opt2(so, tdev));
-}
-
-
-/*
- * Convert an ACT_OPEN_RPL status to an errno.
- */
-static int
-act_open_rpl_status_to_errno(int status)
-{
-	switch (status) {
-	case CPL_ERR_CONN_RESET:
-		return (ECONNREFUSED);
-	case CPL_ERR_ARP_MISS:
-		return (EHOSTUNREACH);
-	case CPL_ERR_CONN_TIMEDOUT:
-		return (ETIMEDOUT);
-	case CPL_ERR_TCAM_FULL:
-		return (ENOMEM);
-	case CPL_ERR_CONN_EXIST:
-		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
-		return (EADDRINUSE);
-	default:
-		return (EIO);
-	}
-}
-
-static void
-fail_act_open(struct toepcb *toep, int errno)
-{
-	struct tcpcb *tp = toep->tp_tp;
-
-	t3_release_offload_resources(toep);
-	if (tp) {
-		inp_wunlock(tp->t_inpcb);
-		tcp_offload_drop(tp, errno);
-	}
-
-#ifdef notyet
-	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
-#endif
-}
-
-/*
- * Handle active open failures.
- */
-static void
-active_open_failed(struct toepcb *toep, struct mbuf *m)
-{
-	struct cpl_act_open_rpl *rpl = cplhdr(m);
-	struct inpcb *inp;
-
-	if (toep->tp_tp == NULL)
-		goto done;
-
-	inp = toep->tp_tp->t_inpcb;
-
-/*
- * Don't handle connection retry for now
- */
-#ifdef notyet
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	if (rpl->status == CPL_ERR_CONN_EXIST &&
-	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
-		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
-		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
-		    jiffies + HZ / 2);
-	} else
-#endif
-	{
-		inp_wlock(inp);
-		/*
-		 * drops the inpcb lock
-		 */
-		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
-	}
-
-	done:
-	m_free(m);
-}
-
-/*
- * Return whether a failed active open has allocated a TID
- */
-static inline int
-act_open_has_tid(int status)
-{
-	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
-	    status != CPL_ERR_ARP_MISS;
-}
-
-/*
- * Process an ACT_OPEN_RPL CPL message.
- */
-static int
-do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
-{
-	struct toepcb *toep = (struct toepcb *)ctx;
-	struct cpl_act_open_rpl *rpl = cplhdr(m);
-
-	if (cdev->type != T3A && act_open_has_tid(rpl->status))
-		cxgb_queue_tid_release(cdev, GET_TID(rpl));
-
-	active_open_failed(toep, m);
-	return (0);
-}
-
-/*
- * Handle an ARP failure for an active open. XXX purge ofo queue
- *
- * XXX badly broken for crossed SYNs as the ATID is no longer valid.
- * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
- * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
- * free the atid. Hmm.
- */
-#ifdef notyet
-static void
-act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
-{
-	struct toepcb *toep = m_get_toep(m);
-	struct tcpcb *tp = toep->tp_tp;
-	struct inpcb *inp = tp->t_inpcb;
-	struct socket *so;
-
-	inp_wlock(inp);
-	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
-		/*
-		 * drops the inpcb lock
-		 */
-		fail_act_open(so, EHOSTUNREACH);
-		printf("freeing %p\n", m);
-
-		m_free(m);
-	} else
-		inp_wunlock(inp);
-}
-#endif
-
-/*
- * Send an active open request.
- */
-int
-t3_connect(struct toedev *tdev, struct socket *so,
-    struct rtentry *rt, struct sockaddr *nam)
-{
-	struct mbuf *m;
-	struct l2t_entry *e;
-	struct tom_data *d = TOM_DATA(tdev);
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct tcpcb *tp = intotcpcb(inp);
-	struct toepcb *toep; /* allocated by init_offload_socket */
-
-	int atid;
-
-	toep = toepcb_alloc();
-	if (toep == NULL)
-		goto out_err;
-
-	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
-		goto out_err;
-
-	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
-	if (!e)
-		goto free_tid;
-
-	inp_lock_assert(inp);
-	m = m_gethdr(MT_DATA, M_WAITOK);
-
-#if 0
-	m->m_toe.mt_toepcb = tp->t_toe;
-	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
-#endif
-	so_lock(so);
-
-	init_offload_socket(so, tdev, atid, e, rt, toep);
-
-	install_offload_ops(so);
-
-	mk_act_open_req(so, m, atid, e);
-	so_unlock(so);
-
-	soisconnecting(so);
-	toep = tp->t_toe;
-	m_set_toep(m, tp->t_toe);
-
-	toep->tp_state = TCPS_SYN_SENT;
-	l2t_send(d->cdev, (struct mbuf *)m, e);
-
-	if (toep->tp_ulp_mode)
-		t3_enable_ddp(toep, 0);
-	return (0);
-
-free_tid:
-	printf("failing connect - free atid\n");
-
-	free_atid(d->cdev, atid);
-out_err:
-	printf("return ENOMEM\n");
-	return (ENOMEM);
-}
-
-/*
- * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
- * not send multiple ABORT_REQs for the same connection and also that we do
- * not try to send a message after the connection has closed. Returns 1 if
- * an ABORT_REQ wasn't generated after all, 0 otherwise.
- */
-static void
-t3_send_reset(struct toepcb *toep)
-{
-
-	struct cpl_abort_req *req;
-	unsigned int tid = toep->tp_tid;
-	int mode = CPL_ABORT_SEND_RST;
-	struct tcpcb *tp = toep->tp_tp;
-	struct toedev *tdev = toep->tp_toedev;
-	struct socket *so = NULL;
-	struct mbuf *m;
-	struct sockbuf *snd;
-
-	if (tp) {
-		inp_lock_assert(tp->t_inpcb);
-		so = inp_inpcbtosocket(tp->t_inpcb);
-	}
-
-	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
-	    tdev == NULL))
-		return;
-	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
-
-	snd = so_sockbuf_snd(so);
-	/* Purge the send queue so we don't send anything after an abort. */
-	if (so)
-		sbflush(snd);
-	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
-		mode |= CPL_ABORT_POST_CLOSE_REQ;
-
-	m = m_gethdr_nofail(sizeof(*req));
-	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
-	set_arp_failure_handler(m, abort_arp_failure);
-
-	req = mtod(m, struct cpl_abort_req *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
-	req->wr.wr_lo = htonl(V_WR_TID(tid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
-	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
-	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
-	req->cmd = mode;
-	if (tp && (tp->t_state == TCPS_SYN_SENT))
-		mbufq_tail(&toep->out_of_order_queue, m);	// defer
-	else
-		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
-}
-
-static int
-t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
-{
-	struct inpcb *inp;
-	int error, optval;
-
-	if (sopt->sopt_name == IP_OPTIONS)
-		return (ENOPROTOOPT);
-
-	if (sopt->sopt_name != IP_TOS)
-		return (EOPNOTSUPP);
-
-	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
-
-	if (error)
-		return (error);
-
-	if (optval > IPTOS_PREC_CRITIC_ECP)
-		return (EINVAL);
-
-	inp = so_sotoinpcb(so);
-	inp_wlock(inp);
-	inp_ip_tos_set(inp, optval);
-#if 0
-	inp->inp_ip_tos = optval;
-#endif
-	t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
-	inp_wunlock(inp);
-
-	return (0);
-}
-
-static int
-t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
-{
-	int err = 0;
-	size_t copied;
-
-	if (sopt->sopt_name != TCP_CONGESTION &&
-	    sopt->sopt_name != TCP_NODELAY)
-		return (EOPNOTSUPP);
-
-	if (sopt->sopt_name == TCP_CONGESTION) {
-		char name[TCP_CA_NAME_MAX];
-		int optlen = sopt->sopt_valsize;
-		struct tcpcb *tp;
-
-		if (sopt->sopt_dir == SOPT_GET) {
-			KASSERT(0, ("unimplemented"));
-			return (EOPNOTSUPP);
-		}
-
-		if (optlen < 1)
-			return (EINVAL);
-
-		err = copyinstr(sopt->sopt_val, name,
-		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
-		if (err)
-			return (err);
-		if (copied < 1)
-			return (EINVAL);
-
-		tp = so_sototcpcb(so);
-		/*
-		 * XXX I need to revisit this
-		 */
-		if ((err = t3_set_cong_control(so, name)) == 0) {
-#ifdef CONGESTION_CONTROL_SUPPORTED
-			tp->t_cong_control = strdup(name, M_CXGB);
-#endif
-		} else
-			return (err);
-	} else {
-		int optval, oldval;
-		struct inpcb *inp;
-		struct tcpcb *tp;
-
-		if (sopt->sopt_dir == SOPT_GET)
-			return (EOPNOTSUPP);
-
-		err = sooptcopyin(sopt, &optval, sizeof optval,
-		    sizeof optval);
-
-		if (err)
-			return (err);
-
-		inp = so_sotoinpcb(so);
-		inp_wlock(inp);
-		tp = inp_inpcbtotcpcb(inp);
-
-		oldval = tp->t_flags;
-		if (optval)
-			tp->t_flags |= TF_NODELAY;
-		else
-			tp->t_flags &= ~TF_NODELAY;
-		inp_wunlock(inp);
-
-
-		if (oldval != tp->t_flags && (tp->t_toe != NULL))
-			t3_set_nagle(tp->t_toe);
-
-	}
-
-	return (0);
-}
-
-int
-t3_ctloutput(struct socket *so, struct sockopt *sopt)
-{
-	int err;
-
-	if (sopt->sopt_level != IPPROTO_TCP)
-		err = t3_ip_ctloutput(so, sopt);
-	else
-		err = t3_tcp_ctloutput(so, sopt);
-
-	if (err != EOPNOTSUPP)
-		return (err);
-
-	return (tcp_ctloutput(so, sopt));
-}
-
-/*
- * Returns true if we need to explicitly request RST when we receive new data
- * on an RX-closed connection.
- */
-static inline int
-need_rst_on_excess_rx(const struct toepcb *toep)
-{
-	return (1);
-}
-
-/*
- * Handles Rx data that arrives in a state where the socket isn't accepting
- * new data.
- */
-static void
-handle_excess_rx(struct toepcb *toep, struct mbuf *m)
-{
-
-	if (need_rst_on_excess_rx(toep) &&
-	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
-		t3_send_reset(toep);
-	m_freem(m);
-}
-
-/*
- * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
- * by getting the DDP offset from the TCB.
- */
-static void
-tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
-{
-	struct ddp_state *q = &toep->tp_ddp_state;
-	struct ddp_buf_state *bsp;
-	struct cpl_get_tcb_rpl *hdr;
-	unsigned int ddp_offset;
-	struct socket *so;
-	struct tcpcb *tp;
-	struct sockbuf *rcv;
-	int state;
-
-	uint64_t t;
-	__be64 *tcb;
-
-	tp = toep->tp_tp;
-	so = inp_inpcbtosocket(tp->t_inpcb);
-
-	inp_lock_assert(tp->t_inpcb);
-	rcv = so_sockbuf_rcv(so);
-	sockbuf_lock(rcv);
-
-	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
-	 * We really need a cookie in order to dispatch the RPLs.
-	 */
-	q->get_tcb_count--;
-
-	/* It is a possible that a previous CPL already invalidated UBUF DDP
-	 * and moved the cur_buf idx and hence no further processing of this
-	 * skb is required. However, the app might be sleeping on
-	 * !q->get_tcb_count and we need to wake it up.
-	 */
-	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
-		int state = so_state_get(so);
-
-		m_freem(m);
-		if (__predict_true((state & SS_NOFDREF) == 0))
-			so_sorwakeup_locked(so);
-		else
-			sockbuf_unlock(rcv);
-
-		return;
-	}
-
-	bsp = &q->buf_state[q->cur_buf];
-	hdr = cplhdr(m);
-	tcb = (__be64 *)(hdr + 1);
-	if (q->cur_buf == 0) {
-		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
-		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
-	} else {
-		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
-		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
-	}
-	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
-	m->m_cur_offset = bsp->cur_offset;
-	bsp->cur_offset = ddp_offset;
-	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
-
-	CTR5(KTR_TOM,
-	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
-	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
-	KASSERT(ddp_offset >= m->m_cur_offset,
-	    ("ddp_offset=%u less than cur_offset=%u",
-	    ddp_offset, m->m_cur_offset));
-
-#if 0
-{
-	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
-
-	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
-	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
-
-	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
-	rcv_nxt = t >> S_TCB_RCV_NXT;
-	rcv_nxt &= M_TCB_RCV_NXT;
-
-	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
-	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
-	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
-
-	T3_TRACE2(TIDTB(sk),
-	    "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
-	    ddp_flags, rcv_nxt - rx_hdr_offset);
-	T3_TRACE4(TB(q),
-	    "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
-	    tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
-	T3_TRACE3(TB(q),
-	    "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
-	    rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
-	T3_TRACE2(TB(q),
-	    "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
-	    q->buf_state[0].flags, q->buf_state[1].flags);
-
-}
-#endif
-	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
-		handle_excess_rx(toep, m);
-		return;
-	}
-
-#ifdef T3_TRACE
-	if ((int)m->m_pkthdr.len < 0) {
-		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
-	}
-#endif
-	if (bsp->flags & DDP_BF_NOCOPY) {
-#ifdef T3_TRACE
-		T3_TRACE0(TB(q),
UBUF"); - - if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { - printk("!cancel_ubuf"); - t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); - } -#endif - m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; - bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); - q->cur_buf ^= 1; - } else if (bsp->flags & DDP_BF_NOFLIP) { - - m->m_ddp_flags = 1; /* always a kernel buffer */ - - /* now HW buffer carries a user buffer */ - bsp->flags &= ~DDP_BF_NOFLIP; - bsp->flags |= DDP_BF_NOCOPY; - - /* It is possible that the CPL_GET_TCB_RPL doesn't indicate - * any new data in which case we're done. If in addition the - * offset is 0, then there wasn't a completion for the kbuf - * and we need to decrement the posted count. - */ - if (m->m_pkthdr.len == 0) { - if (ddp_offset == 0) { - q->kbuf_posted--; - bsp->flags |= DDP_BF_NODATA; - } - sockbuf_unlock(rcv); - m_free(m); - return; - } - } else { - sockbuf_unlock(rcv); - - /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, - * but it got here way late and nobody cares anymore. - */ - m_free(m); - return; - } - - m->m_ddp_gl = (unsigned char *)bsp->gl; - m->m_flags |= M_DDP; - m->m_seq = tp->rcv_nxt; - tp->rcv_nxt += m->m_pkthdr.len; - tp->t_rcvtime = ticks; - CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", - m->m_seq, q->cur_buf, m->m_pkthdr.len); - if (m->m_pkthdr.len == 0) { - q->user_ddp_pending = 0; - m_free(m); - } else - SBAPPEND(rcv, m); - - state = so_state_get(so); - if (__predict_true((state & SS_NOFDREF) == 0)) - so_sorwakeup_locked(so); - else - sockbuf_unlock(rcv); -} - -/* - * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, - * in that case they are similar to DDP completions. - */ -static int -do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = (struct toepcb *)ctx; - - /* OK if socket doesn't exist */ - if (toep == NULL) { - printf("null toep in do_get_tcb_rpl\n"); - return (CPL_RET_BUF_DONE); - } - - inp_wlock(toep->tp_tp->t_inpcb); - tcb_rpl_as_ddp_complete(toep, m); - inp_wunlock(toep->tp_tp->t_inpcb); - - return (0); -} - -static void -handle_ddp_data(struct toepcb *toep, struct mbuf *m) -{ - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - struct ddp_state *q; - struct ddp_buf_state *bsp; - struct cpl_rx_data *hdr = cplhdr(m); - unsigned int rcv_nxt = ntohl(hdr->seq); - struct sockbuf *rcv; - - if (tp->rcv_nxt == rcv_nxt) - return; - - inp_lock_assert(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); - - q = &toep->tp_ddp_state; - bsp = &q->buf_state[q->cur_buf]; - KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", - rcv_nxt, tp->rcv_nxt)); - m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; - KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); - CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", - rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); - -#ifdef T3_TRACE - if ((int)m->m_pkthdr.len < 0) { - t3_ddp_error(so, "handle_ddp_data: neg len"); - } -#endif - m->m_ddp_gl = (unsigned char *)bsp->gl; - m->m_flags |= M_DDP; - m->m_cur_offset = bsp->cur_offset; - m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; - if (bsp->flags & DDP_BF_NOCOPY) - bsp->flags &= ~DDP_BF_NOCOPY; - - m->m_seq = tp->rcv_nxt; - tp->rcv_nxt = rcv_nxt; - bsp->cur_offset += m->m_pkthdr.len; - if (!(bsp->flags & DDP_BF_NOFLIP)) - q->cur_buf ^= 1; - /* - * For now, don't re-enable DDP after a connection fell out of DDP - * mode. 
- */ - q->ubuf_ddp_ready = 0; - sockbuf_unlock(rcv); -} - -/* - * Process new data received for a connection. - */ -static void -new_rx_data(struct toepcb *toep, struct mbuf *m) -{ - struct cpl_rx_data *hdr = cplhdr(m); - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - struct sockbuf *rcv; - int state; - int len = be16toh(hdr->len); - - inp_wlock(tp->t_inpcb); - - so = inp_inpcbtosocket(tp->t_inpcb); - - if (__predict_false(so_no_receive(so))) { - handle_excess_rx(toep, m); - inp_wunlock(tp->t_inpcb); - TRACE_EXIT; - return; - } - - if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) - handle_ddp_data(toep, m); - - m->m_seq = ntohl(hdr->seq); - m->m_ulp_mode = 0; /* for iSCSI */ - -#if VALIDATE_SEQ - if (__predict_false(m->m_seq != tp->rcv_nxt)) { - log(LOG_ERR, - "%s: TID %u: Bad sequence number %u, expected %u\n", - toep->tp_toedev->name, toep->tp_tid, m->m_seq, - tp->rcv_nxt); - m_freem(m); - inp_wunlock(tp->t_inpcb); - return; - } -#endif - m_adj(m, sizeof(*hdr)); - -#ifdef URGENT_DATA_SUPPORTED - /* - * We don't handle urgent data yet - */ - if (__predict_false(hdr->urg)) - handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); - if (__predict_false(tp->urg_data == TCP_URG_NOTYET && - tp->urg_seq - tp->rcv_nxt < skb->len)) - tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - - tp->rcv_nxt]; -#endif - if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { - toep->tp_delack_mode = hdr->dack_mode; - toep->tp_delack_seq = tp->rcv_nxt; - } - CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", - m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); - - if (len < m->m_pkthdr.len) - m->m_pkthdr.len = m->m_len = len; - - tp->rcv_nxt += m->m_pkthdr.len; - tp->t_rcvtime = ticks; - toep->tp_enqueued_bytes += m->m_pkthdr.len; - CTR2(KTR_TOM, - "new_rx_data: seq 0x%x len %u", - m->m_seq, m->m_pkthdr.len); - inp_wunlock(tp->t_inpcb); - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); -#if 0 - if (sb_notify(rcv)) - DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); -#endif - SBAPPEND(rcv, m); - -#ifdef notyet - /* - * We're giving too many credits to the card - but disable this check so we can keep on moving :-| - * - */ - KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), - - ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", - so, rcv->sb_cc, rcv->sb_mbmax)); -#endif - - - CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", - rcv->sb_cc, rcv->sb_mbcnt); - - state = so_state_get(so); - if (__predict_true((state & SS_NOFDREF) == 0)) - so_sorwakeup_locked(so); - else - sockbuf_unlock(rcv); -} - -/* - * Handler for RX_DATA CPL messages. 
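[Editor's note: the sequence accounting in these Rx paths (the VALIDATE_SEQ check above, the SEQ_GT() assertion in handle_ddp_data) relies on modular 32-bit arithmetic so comparisons stay correct across sequence-space wraparound. A self-contained illustration of the same idiom used by <netinet/tcp_seq.h>; the sequence values are made up:

    #include <stdint.h>
    #include <stdio.h>

    /* Signed difference of unsigned 32-bit sequence numbers wraps
     * cleanly, exactly like SEQ_GT() in <netinet/tcp_seq.h>. */
    #define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)

    int
    main(void)
    {
            uint32_t rcv_nxt = 0xfffffff0U;
            uint32_t hw_nxt = 0x00000010U;  /* wrapped past zero */

            /* Prints 1: 0x10 is "after" 0xfffffff0 in sequence space. */
            printf("%d\n", SEQ_GT(hw_nxt, rcv_nxt));
            return (0);
    }
]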
- */ -static int -do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = (struct toepcb *)ctx; - - DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); - - new_rx_data(toep, m); - - return (0); -} - -static void -new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) -{ - struct tcpcb *tp; - struct ddp_state *q; - struct ddp_buf_state *bsp; - struct cpl_rx_data_ddp *hdr; - struct socket *so; - unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; - int nomoredata = 0; - unsigned int delack_mode; - struct sockbuf *rcv; - - tp = toep->tp_tp; - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); - - if (__predict_false(so_no_receive(so))) { - - handle_excess_rx(toep, m); - inp_wunlock(tp->t_inpcb); - return; - } - - q = &toep->tp_ddp_state; - hdr = cplhdr(m); - ddp_report = ntohl(hdr->u.ddp_report); - buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; - bsp = &q->buf_state[buf_idx]; - - CTR4(KTR_TOM, - "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " - "hdr seq 0x%x len %u", - tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), - ntohs(hdr->len)); - CTR3(KTR_TOM, - "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", - G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); - - ddp_len = ntohs(hdr->len); - rcv_nxt = ntohl(hdr->seq) + ddp_len; - - delack_mode = G_DDP_DACK_MODE(ddp_report); - if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { - toep->tp_delack_mode = delack_mode; - toep->tp_delack_seq = tp->rcv_nxt; - } - - m->m_seq = tp->rcv_nxt; - tp->rcv_nxt = rcv_nxt; - - tp->t_rcvtime = ticks; - /* - * Store the length in m->m_len. We are changing the meaning of - * m->m_len here, we need to be very careful that nothing from now on - * interprets ->len of this packet the usual way. - */ - m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; - inp_wunlock(tp->t_inpcb); - CTR3(KTR_TOM, - "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", - m->m_len, rcv_nxt, m->m_seq); - /* - * Figure out where the new data was placed in the buffer and store it - * in when. Assumes the buffer offset starts at 0, consumer needs to - * account for page pod's pg_offset. - */ - end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; - m->m_cur_offset = end_offset - m->m_pkthdr.len; - - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); - - m->m_ddp_gl = (unsigned char *)bsp->gl; - m->m_flags |= M_DDP; - bsp->cur_offset = end_offset; - toep->tp_enqueued_bytes += m->m_pkthdr.len; - - /* - * Length is only meaningful for kbuf - */ - if (!(bsp->flags & DDP_BF_NOCOPY)) - KASSERT(m->m_len <= bsp->gl->dgl_length, - ("length received exceeds ddp pages: len=%d dgl_length=%d", - m->m_len, bsp->gl->dgl_length)); - - KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); - KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); - /* - * Bit 0 of flags stores whether the DDP buffer is completed. - * Note that other parts of the code depend on this being in bit 0. 
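[Editor's note: the DDP receive handlers here ping-pong between the two posted buffers; when a report marks a buffer complete (and DDP_BF_NOFLIP is not set), the "q->cur_buf ^= 1" flips just below send the hardware's next placement into the other buffer. A toy model of only that bookkeeping; struct two_buf is illustrative, not the driver's ddp_state:

    /* Two posted buffers; cur names the one HW is currently filling. */
    struct two_buf {
            int cur;
            unsigned int off[2];    /* bytes placed so far in each */
    };

    static void
    ddp_buffer_complete(struct two_buf *b)
    {
            b->off[b->cur] = 0;     /* buffer drained, ready to repost */
            b->cur ^= 1;            /* placement flips to the peer buffer */
    }
]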
- */ - if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { - panic("spurious ddp completion"); - } else { - m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); - if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) - q->cur_buf ^= 1; /* flip buffers */ - } - - if (bsp->flags & DDP_BF_NOCOPY) { - m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); - bsp->flags &= ~DDP_BF_NOCOPY; - } - - if (ddp_report & F_DDP_PSH) - m->m_ddp_flags |= DDP_BF_PSH; - if (nomoredata) - m->m_ddp_flags |= DDP_BF_NODATA; - -#ifdef notyet - skb_reset_transport_header(skb); - tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ -#endif - SBAPPEND(rcv, m); - - if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || - (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) - || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) - so_sorwakeup_locked(so); - else - sockbuf_unlock(rcv); -} - -#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ - F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ - F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ - F_DDP_INVALID_PPOD) - -/* - * Handler for RX_DATA_DDP CPL messages. - */ -static int -do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = ctx; - const struct cpl_rx_data_ddp *hdr = cplhdr(m); - - VALIDATE_SOCK(so); - - if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { - log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", - GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); - return (CPL_RET_BUF_DONE); - } -#if 0 - skb->h.th = tcphdr_skb->h.th; -#endif - new_rx_data_ddp(toep, m); - return (0); -} - -static void -process_ddp_complete(struct toepcb *toep, struct mbuf *m) -{ - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - struct ddp_state *q; - struct ddp_buf_state *bsp; - struct cpl_rx_ddp_complete *hdr; - unsigned int ddp_report, buf_idx, when, delack_mode; - int nomoredata = 0; - struct sockbuf *rcv; - - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); - - if (__predict_false(so_no_receive(so))) { - struct inpcb *inp = so_sotoinpcb(so); - - handle_excess_rx(toep, m); - inp_wunlock(inp); - return; - } - q = &toep->tp_ddp_state; - hdr = cplhdr(m); - ddp_report = ntohl(hdr->ddp_report); - buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; - m->m_pkthdr.csum_data = tp->rcv_nxt; - - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); - - bsp = &q->buf_state[buf_idx]; - when = bsp->cur_offset; - m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; - tp->rcv_nxt += m->m_len; - tp->t_rcvtime = ticks; - - delack_mode = G_DDP_DACK_MODE(ddp_report); - if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { - toep->tp_delack_mode = delack_mode; - toep->tp_delack_seq = tp->rcv_nxt; - } -#ifdef notyet - skb_reset_transport_header(skb); - tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ -#endif - inp_wunlock(tp->t_inpcb); - - KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); - CTR5(KTR_TOM, - "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " - "ddp_report 0x%x offset %u, len %u", - tp->rcv_nxt, bsp->cur_offset, ddp_report, - G_DDP_OFFSET(ddp_report), m->m_len); - - m->m_cur_offset = bsp->cur_offset; - bsp->cur_offset += m->m_len; - - if (!(bsp->flags & DDP_BF_NOFLIP)) { - q->cur_buf ^= 1; /* flip buffers */ - if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) - nomoredata=1; - } - - CTR4(KTR_TOM, - "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " - "ddp_report %u 
offset %u", - tp->rcv_nxt, bsp->cur_offset, ddp_report, - G_DDP_OFFSET(ddp_report)); - - m->m_ddp_gl = (unsigned char *)bsp->gl; - m->m_flags |= M_DDP; - m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; - if (bsp->flags & DDP_BF_NOCOPY) - bsp->flags &= ~DDP_BF_NOCOPY; - if (nomoredata) - m->m_ddp_flags |= DDP_BF_NODATA; - - SBAPPEND(rcv, m); - if ((so_state_get(so) & SS_NOFDREF) == 0) - so_sorwakeup_locked(so); - else - sockbuf_unlock(rcv); -} - -/* - * Handler for RX_DDP_COMPLETE CPL messages. - */ -static int -do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = ctx; - - VALIDATE_SOCK(so); -#if 0 - skb->h.th = tcphdr_skb->h.th; -#endif - process_ddp_complete(toep, m); - return (0); -} - -/* - * Move a socket to TIME_WAIT state. We need to make some adjustments to the - * socket state before calling tcp_time_wait to comply with its expectations. - */ -static void -enter_timewait(struct tcpcb *tp) -{ - /* - * Bump rcv_nxt for the peer FIN. We don't do this at the time we - * process peer_close because we don't want to carry the peer FIN in - * the socket's receive queue and if we increment rcv_nxt without - * having the FIN in the receive queue we'll confuse facilities such - * as SIOCINQ. - */ - inp_wlock(tp->t_inpcb); - tp->rcv_nxt++; - - tp->ts_recent_age = 0; /* defeat recycling */ - tp->t_srtt = 0; /* defeat tcp_update_metrics */ - inp_wunlock(tp->t_inpcb); - tcp_offload_twstart(tp); -} - -/* - * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This - * function deals with the data that may be reported along with the FIN. - * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to - * perform normal FIN-related processing. In the latter case 1 indicates that - * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the - * skb can be freed. - */ -static int -handle_peer_close_data(struct socket *so, struct mbuf *m) -{ - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - struct ddp_state *q; - struct ddp_buf_state *bsp; - struct cpl_peer_close *req = cplhdr(m); - unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ - struct sockbuf *rcv; - - if (tp->rcv_nxt == rcv_nxt) /* no data */ - return (0); - - CTR0(KTR_TOM, "handle_peer_close_data"); - if (__predict_false(so_no_receive(so))) { - handle_excess_rx(toep, m); + m = M_GETHDR_OFLD(toep->tp_qset, CPL_PRIORITY_DATA, req); + if (m == NULL) + CXGB_UNIMPLEMENTED(); - /* - * Although we discard the data we want to process the FIN so - * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + - * PEER_CLOSE without data. In particular this PEER_CLOSE - * may be what will close the connection. We return 1 because - * handle_excess_rx() already freed the packet. 
- */ - return (1); - } + req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wrh_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + req->rsvd0 = htonl(tp->snd_nxt); + req->rsvd1 = !(toep->tp_flags & TP_DATASENT); + req->cmd = CPL_ABORT_SEND_RST; - inp_lock_assert(tp->t_inpcb); - q = &toep->tp_ddp_state; - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); - - bsp = &q->buf_state[q->cur_buf]; - m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; - KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); - m->m_ddp_gl = (unsigned char *)bsp->gl; - m->m_flags |= M_DDP; - m->m_cur_offset = bsp->cur_offset; - m->m_ddp_flags = - DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; - m->m_seq = tp->rcv_nxt; - tp->rcv_nxt = rcv_nxt; - bsp->cur_offset += m->m_pkthdr.len; - if (!(bsp->flags & DDP_BF_NOFLIP)) - q->cur_buf ^= 1; -#ifdef notyet - skb_reset_transport_header(skb); - tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ -#endif - tp->t_rcvtime = ticks; - SBAPPEND(rcv, m); - if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) - so_sorwakeup_locked(so); + if (tp->t_state == TCPS_SYN_SENT) + mbufq_tail(&toep->out_of_order_queue, m); /* defer */ else - sockbuf_unlock(rcv); - - return (1); + l2t_send(sc, m, toep->tp_l2t); } -/* - * Handle a peer FIN. - */ -static void -do_peer_fin(struct toepcb *toep, struct mbuf *m) +int +t3_send_rst(struct toedev *tod __unused, struct tcpcb *tp) { - struct socket *so; - struct tcpcb *tp = toep->tp_tp; - int keep, action; - - action = keep = 0; - CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); - if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { - printf("abort_pending set\n"); - - goto out; - } - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); - if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { - keep = handle_peer_close_data(so, m); - if (keep < 0) { - inp_wunlock(tp->t_inpcb); - return; - } - } - if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { - CTR1(KTR_TOM, - "waking up waiters for cantrcvmore on %p ", so); - socantrcvmore(so); - - /* - * If connection is half-synchronized - * (ie NEEDSYN flag on) then delay ACK, - * so it may be piggybacked when SYN is sent. - * Otherwise, since we received a FIN then no - * more input can be expected, send ACK now. - */ - if (tp->t_flags & TF_NEEDSYN) - tp->t_flags |= TF_DELACK; - else - tp->t_flags |= TF_ACKNOW; - tp->rcv_nxt++; - } - - switch (tp->t_state) { - case TCPS_SYN_RECEIVED: - tp->t_starttime = ticks; - /* FALLTHROUGH */ - case TCPS_ESTABLISHED: - tp->t_state = TCPS_CLOSE_WAIT; - break; - case TCPS_FIN_WAIT_1: - tp->t_state = TCPS_CLOSING; - break; - case TCPS_FIN_WAIT_2: - /* - * If we've sent an abort_req we must have sent it too late, - * HW will send us a reply telling us so, and this peer_close - * is really the last message for this connection and needs to - * be treated as an abort_rpl, i.e., transition the connection - * to TCP_CLOSE (note that the host stack does this at the - * time of generating the RST but we must wait for HW). - * Otherwise we enter TIME_WAIT. 
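[Editor's note: the deleted do_peer_fin() switch above is the "peer sent FIN" edge of the TCP state diagram. Condensed into a standalone transition function; the enum and peer_fin() are illustrative, not the kernel's TCPS_* constants:

    enum st { SYN_RECEIVED, ESTABLISHED, CLOSE_WAIT, FIN_WAIT_1,
        FIN_WAIT_2, CLOSING, TIME_WAIT };

    /* Where a connection goes when the peer's FIN arrives. */
    static enum st
    peer_fin(enum st s)
    {
            switch (s) {
            case SYN_RECEIVED:      /* FALLTHROUGH */
            case ESTABLISHED:
                    return (CLOSE_WAIT);
            case FIN_WAIT_1:
                    return (CLOSING);
            case FIN_WAIT_2:
                    return (TIME_WAIT);
            default:
                    return (s);     /* FIN in a bad state; log it */
            }
    }
]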
- */ - t3_release_offload_resources(toep); - if (toep->tp_flags & TP_ABORT_RPL_PENDING) { - action = TCP_CLOSE; - } else { - action = TCP_TIMEWAIT; - } - break; - default: - log(LOG_ERR, - "%s: TID %u received PEER_CLOSE in bad state %d\n", - toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); - } - inp_wunlock(tp->t_inpcb); - - if (action == TCP_TIMEWAIT) { - enter_timewait(tp); - } else if (action == TCP_DROP) { - tcp_offload_drop(tp, 0); - } else if (action == TCP_CLOSE) { - tcp_offload_close(tp); - } - -#ifdef notyet - /* Do not send POLL_HUP for half duplex close. */ - if ((sk->sk_shutdown & SEND_SHUTDOWN) || - sk->sk_state == TCP_CLOSE) - sk_wake_async(so, 1, POLL_HUP); - else - sk_wake_async(so, 1, POLL_IN); -#endif -out: - if (!keep) - m_free(m); + send_reset(tp->t_toe); + return (0); } /* - * Handler for PEER_CLOSE CPL messages. + * Handler for RX_DATA CPL messages. */ static int -do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = (struct toepcb *)ctx; - - VALIDATE_SOCK(so); - - do_peer_fin(toep, m); - return (0); -} - -static void -process_close_con_rpl(struct toepcb *toep, struct mbuf *m) -{ - struct cpl_close_con_rpl *rpl = cplhdr(m); - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - int action = 0; - struct sockbuf *rcv; - - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); - - tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ - - if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { - inp_wunlock(tp->t_inpcb); - goto out; - } - - CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, - tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); +do_rx_data(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) +{ + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_rx_data *hdr = mtod(m, void *); + unsigned int tid = GET_TID(hdr); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct inpcb *inp = toep->tp_inp; + struct tcpcb *tp; + struct socket *so; + struct sockbuf *so_rcv; - switch (tp->t_state) { - case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ - t3_release_offload_resources(toep); - if (toep->tp_flags & TP_ABORT_RPL_PENDING) { - action = TCP_CLOSE; + /* Advance over CPL */ + m_adj(m, sizeof(*hdr)); - } else { - action = TCP_TIMEWAIT; - } - break; - case TCPS_LAST_ACK: - /* - * In this state we don't care about pending abort_rpl. - * If we've sent abort_req it was post-close and was sent too - * late, this close_con_rpl is the actual last message. - */ - t3_release_offload_resources(toep); - action = TCP_CLOSE; - break; - case TCPS_FIN_WAIT_1: + /* XXX: revisit. This comes from the T4 TOM */ + if (__predict_false(inp == NULL)) { /* - * If we can't receive any more - * data, then closing user can proceed. - * Starting the timer is contrary to the - * specification, but if we don't get a FIN - * we'll hang forever. - * - * XXXjl: - * we should release the tp also, and use a - * compressed state. + * do_pass_establish failed and must be attempting to abort the + * connection. Meanwhile, the T4 has sent us data for such a + * connection. */ - if (so) - rcv = so_sockbuf_rcv(so); - else - break; - - if (rcv->sb_state & SBS_CANTRCVMORE) { - int timeout; - - if (so) - soisdisconnected(so); - timeout = (tcp_fast_finwait2_recycle) ? 
- tcp_finwait2_timeout : tcp_maxidle; - tcp_timer_activate(tp, TT_2MSL, timeout); - } - tp->t_state = TCPS_FIN_WAIT_2; - if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && - (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { - action = TCP_DROP; - } - - break; - default: - log(LOG_ERR, - "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", - toep->tp_toedev->tod_name, toep->tp_tid, - tp->t_state); +#ifdef notyet + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: inp NULL and tid isn't being aborted", __func__)); +#endif + m_freem(m); + return (0); } - inp_wunlock(tp->t_inpcb); - - if (action == TCP_TIMEWAIT) { - enter_timewait(tp); - } else if (action == TCP_DROP) { - tcp_offload_drop(tp, 0); - } else if (action == TCP_CLOSE) { - tcp_offload_close(tp); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { + CTR4(KTR_CXGB, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, m->m_pkthdr.len, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); } -out: - m_freem(m); -} -/* - * Handler for CLOSE_CON_RPL CPL messages. - */ -static int -do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, - void *ctx) -{ - struct toepcb *toep = (struct toepcb *)ctx; + if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) + toep->tp_delack_mode = hdr->dack_mode; - process_close_con_rpl(toep, m); - return (0); -} + tp = intotcpcb(inp); -/* - * Process abort replies. We only process these messages if we anticipate - * them as the coordination between SW and HW in this area is somewhat lacking - * and sometimes we get ABORT_RPLs after we are done with the connection that - * originated the ABORT_REQ. - */ -static void -process_abort_rpl(struct toepcb *toep, struct mbuf *m) -{ - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - int needclose = 0; - -#ifdef T3_TRACE - T3_TRACE1(TIDTB(sk), - "process_abort_rpl: GTS rpl pending %d", - sock_flag(sk, ABORT_RPL_PENDING)); -#endif - - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); - - if (toep->tp_flags & TP_ABORT_RPL_PENDING) { - /* - * XXX panic on tcpdrop - */ - if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) - toep->tp_flags |= TP_ABORT_RPL_RCVD; - else { - toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); - if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || - !is_t3a(toep->tp_toedev)) { - if (toep->tp_flags & TP_ABORT_REQ_RCVD) - panic("TP_ABORT_REQ_RCVD set"); - t3_release_offload_resources(toep); - needclose = 1; - } - } +#ifdef INVARIANTS + if (__predict_false(tp->rcv_nxt != be32toh(hdr->seq))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", + __func__, be32toh(hdr->seq), toep->tp_tid, tp->rcv_nxt); } - inp_wunlock(tp->t_inpcb); +#endif + tp->rcv_nxt += m->m_pkthdr.len; + KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, + ("%s: negative window size", __func__)); + tp->rcv_wnd -= m->m_pkthdr.len; + tp->t_rcvtime = ticks; - if (needclose) - tcp_offload_close(tp); + so = inp->inp_socket; + so_rcv = &so->so_rcv; + SOCKBUF_LOCK(so_rcv); - m_free(m); -} + if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_CXGB, "%s: tid %u, excess rx (%d bytes)", + __func__, tid, m->m_pkthdr.len); + SOCKBUF_UNLOCK(so_rcv); + INP_WUNLOCK(inp); -/* - * Handle an ABORT_RPL_RSS CPL message. - */ -static int -do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - struct cpl_abort_rpl_rss *rpl = cplhdr(m); - struct toepcb *toep; - - /* - * Ignore replies to post-close aborts indicating that the abort was - * requested too late. 
These connections are terminated when we get - * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss - * arrives the TID is either no longer used or it has been recycled. - */ - if (rpl->status == CPL_ERR_ABORT_FAILED) { -discard: - m_free(m); + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = tcp_drop(tp, ECONNRESET); + if (tp) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + m_freem(m); return (0); } - toep = (struct toepcb *)ctx; - - /* - * Sometimes we've already closed the socket, e.g., a post-close - * abort races with ABORT_REQ_RSS, the latter frees the socket - * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, - * but FW turns the ABORT_REQ into a regular one and so we get - * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. - */ - if (!toep) - goto discard; + /* receive buffer autosize */ + if (so_rcv->sb_flags & SB_AUTOSIZE && + V_tcp_do_autorcvbuf && + so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && + (m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7) || tp->rcv_wnd < 32768)) { + unsigned int hiwat = so_rcv->sb_hiwat; + unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); - if (toep->tp_tp == NULL) { - log(LOG_NOTICE, "removing tid for abort\n"); - cxgb_remove_tid(cdev, toep, toep->tp_tid); - if (toep->tp_l2t) - l2t_release(L2DATA(cdev), toep->tp_l2t); - - toepcb_release(toep); - goto discard; + if (!sbreserve_locked(so_rcv, newsize, so, NULL)) + so_rcv->sb_flags &= ~SB_AUTOSIZE; + else + toep->tp_rx_credits += newsize - hiwat; } - - log(LOG_NOTICE, "toep=%p\n", toep); - log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); - toepcb_hold(toep); - process_abort_rpl(toep, m); - toepcb_release(toep); + toep->tp_enqueued += m->m_pkthdr.len; + sbappendstream_locked(so_rcv, m); + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(so_rcv); + + INP_WUNLOCK(inp); return (0); } /* - * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also - * indicate whether RST should be sent in response. + * Handler for PEER_CLOSE CPL messages. */ static int -abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) -{ - struct tcpcb *tp = so_sototcpcb(so); - - switch (abort_reason) { - case CPL_ERR_BAD_SYN: -#if 0 - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through -#endif - case CPL_ERR_CONN_RESET: - // XXX need to handle SYN_RECV due to crossed SYNs - return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); - case CPL_ERR_XMIT_TIMEDOUT: - case CPL_ERR_PERSIST_TIMEDOUT: - case CPL_ERR_FINWAIT2_TIMEDOUT: - case CPL_ERR_KEEPALIVE_TIMEDOUT: -#if 0 - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); -#endif - return (ETIMEDOUT); - default: - return (EIO); - } -} - -static inline void -set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) -{ - struct cpl_abort_rpl *rpl = cplhdr(m); - - rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); - rpl->wr.wr_lo = htonl(V_WR_TID(tid)); - m->m_len = m->m_pkthdr.len = sizeof(*rpl); - - OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); - rpl->cmd = cmd; -} - -static void -send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) -{ - struct mbuf *reply_mbuf; - struct cpl_abort_req_rss *req = cplhdr(m); - - reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); - m_set_priority(m, CPL_PRIORITY_DATA); - m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); - set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); - cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); - m_free(m); -} - -/* - * Returns whether an ABORT_REQ_RSS message is a negative advice. - */ -static inline int -is_neg_adv_abort(unsigned int status) -{ - return status == CPL_ERR_RTX_NEG_ADVICE || - status == CPL_ERR_PERSIST_NEG_ADVICE; -} - -static void -send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) -{ - struct mbuf *reply_mbuf; - struct cpl_abort_req_rss *req = cplhdr(m); - - reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); - - if (!reply_mbuf) { - /* Defer the reply. Stick rst_status into req->cmd. */ - req->status = rst_status; - t3_defer_reply(m, tdev, send_deferred_abort_rpl); - return; - } - - m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); - set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); - m_free(m); - - /* - * XXX need to sync with ARP as for SYN_RECV connections we can send - * these messages while ARP is pending. For other connection states - * it's not a problem. - */ - cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); -} +do_peer_close(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) +{ + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + const struct cpl_peer_close *hdr = mtod(m, void *); + unsigned int tid = GET_TID(hdr); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct inpcb *inp = toep->tp_inp; + struct tcpcb *tp; + struct socket *so; -#ifdef notyet -static void -cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) -{ - CXGB_UNIMPLEMENTED(); -#ifdef notyet - struct request_sock *req = child->sk_user_data; - - inet_csk_reqsk_queue_removed(parent, req); - synq_remove(tcp_sk(child)); - __reqsk_free(req); - child->sk_user_data = NULL; -#endif -} + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = intotcpcb(inp); + CTR5(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, + tid, tp ? tcpstates[tp->t_state] : "no tp" , toep->tp_flags, inp); -/* - * Performs the actual work to abort a SYN_RECV connection. - */ -static void -do_abort_syn_rcv(struct socket *child, struct socket *parent) -{ - struct tcpcb *parenttp = so_sototcpcb(parent); - struct tcpcb *childtp = so_sototcpcb(child); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) + goto done; - /* - * If the server is still open we clean up the child connection, - * otherwise the server already did the clean up as it was purging - * its SYN queue and the skb was just sitting in its backlog. 
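[Editor's note: send_abort_rpl() above shows a constraint that recurs through this file: CPL handlers run in a context that must not sleep, so mbuf allocation uses M_NOWAIT and a failure parks the work for later (t3_defer_reply) instead of blocking. The shape of that pattern in isolation; alloc_nowait(), try_reply(), send and defer are all stand-in names:

    #include <stdlib.h>

    /* alloc_nowait() models m_gethdr(M_NOWAIT, MT_DATA); defer()
     * models t3_defer_reply() queueing to a sleepable context. */
    static void *
    alloc_nowait(void)
    {
            return (malloc(64));
    }

    static void
    try_reply(void (*send)(void *), void (*defer)(void))
    {
            void *reply = alloc_nowait();

            if (reply == NULL) {
                    defer();        /* retried where sleeping is allowed */
                    return;
            }
            send(reply);            /* send consumes the buffer */
    }
]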
- */ - if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { - cleanup_syn_rcv_conn(child, parent); - inp_wlock(childtp->t_inpcb); - t3_release_offload_resources(childtp->t_toe); - inp_wunlock(childtp->t_inpcb); - tcp_offload_close(childtp); - } -} -#endif + so = inp_inpcbtosocket(inp); -/* - * Handle abort requests for a SYN_RECV connection. These need extra work - * because the socket is on its parent's SYN queue. - */ -static int -abort_syn_rcv(struct socket *so, struct mbuf *m) -{ - CXGB_UNIMPLEMENTED(); -#ifdef notyet - struct socket *parent; - struct toedev *tdev = toep->tp_toedev; - struct t3cdev *cdev = TOM_DATA(tdev)->cdev; - struct socket *oreq = so->so_incomp; - struct t3c_tid_entry *t3c_stid; - struct tid_info *t; - - if (!oreq) - return -1; /* somehow we are not on the SYN queue */ - - t = &(T3C_DATA(cdev))->tid_maps; - t3c_stid = lookup_stid(t, oreq->ts_recent); - parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; - - so_lock(parent); - do_abort_syn_rcv(so, parent); - send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); - so_unlock(parent); -#endif - return (0); -} + socantrcvmore(so); + tp->rcv_nxt++; -/* - * Process abort requests. If we are waiting for an ABORT_RPL we ignore this - * request except that we need to reply to it. - */ -static void -process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) -{ - int rst_status = CPL_ABORT_NO_RST; - const struct cpl_abort_req_rss *req = cplhdr(m); - struct tcpcb *tp = toep->tp_tp; - struct socket *so; - int needclose = 0; - - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); - if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { - toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); - m_free(m); - goto skip; - } + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + case TCPS_FIN_WAIT_2: + tcp_twstart(tp); + INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); - toep->tp_flags &= ~TP_ABORT_REQ_RCVD; - /* - * Three cases to consider: - * a) We haven't sent an abort_req; close the connection. - * b) We have sent a post-close abort_req that will get to TP too late - * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will - * be ignored and the connection should be closed now. - * c) We have sent a regular abort_req that will get to TP too late. - * That will generate an abort_rpl with status 0, wait for it. - */ - if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || - (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { - int error; - - error = abort_status_to_errno(so, req->status, - &rst_status); - so_error_set(so, error); - - if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) - so_sorwakeup(so); - /* - * SYN_RECV needs special processing. If abort_syn_rcv() - * returns 0 is has taken care of the abort. 
- */ - if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) - goto skip; + INP_WLOCK(inp); + toepcb_release(toep); /* no more CPLs expected */ - t3_release_offload_resources(toep); - needclose = 1; + m_freem(m); + return (0); + default: + log(LOG_ERR, "%s: TID %u received PEER_CLOSE in bad state %d\n", + __func__, toep->tp_tid, tp->t_state); } - inp_wunlock(tp->t_inpcb); - if (needclose) - tcp_offload_close(tp); +done: + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); - send_abort_rpl(m, tdev, rst_status); - return; -skip: - inp_wunlock(tp->t_inpcb); + m_freem(m); + return (0); } /* - * Handle an ABORT_REQ_RSS CPL message. + * Handler for CLOSE_CON_RPL CPL messages. peer ACK to our FIN received. */ static int -do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - const struct cpl_abort_req_rss *req = cplhdr(m); - struct toepcb *toep = (struct toepcb *)ctx; - - if (is_neg_adv_abort(req->status)) { - m_free(m); - return (0); - } +do_close_con_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) +{ + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + const struct cpl_close_con_rpl *rpl = mtod(m, void *); + unsigned int tid = GET_TID(rpl); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct inpcb *inp = toep->tp_inp; + struct tcpcb *tp; + struct socket *so; - log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); - - if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { - cxgb_remove_tid(cdev, toep, toep->tp_tid); - toep->tp_flags |= TP_ABORT_REQ_RCVD; - - send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); - if (toep->tp_l2t) - l2t_release(L2DATA(cdev), toep->tp_l2t); + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = intotcpcb(inp); - /* - * Unhook - */ - toep->tp_tp->t_toe = NULL; - toep->tp_tp->t_flags &= ~TF_TOE; - toep->tp_tp = NULL; - /* - * XXX need to call syncache_chkrst - but we don't - * have a way of doing that yet - */ - toepcb_release(toep); - log(LOG_ERR, "abort for unestablished connection :-(\n"); - return (0); - } - if (toep->tp_tp == NULL) { - log(LOG_NOTICE, "disconnected toepcb\n"); - /* should be freed momentarily */ - return (0); - } + CTR4(KTR_CXGB, "%s: tid %u (%s), toep_flags 0x%x", __func__, tid, + tp ? 
tcpstates[tp->t_state] : "no tp", toep->tp_flags); + if ((toep->tp_flags & TP_ABORT_RPL_PENDING)) + goto done; - toepcb_hold(toep); - process_abort_req(toep, m, toep->tp_toedev); - toepcb_release(toep); - return (0); -} -#ifdef notyet -static void -pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) -{ - struct toedev *tdev = TOE_DEV(parent); + so = inp_inpcbtosocket(inp); + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ - do_abort_syn_rcv(child, parent); - if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { - struct cpl_pass_accept_rpl *rpl = cplhdr(m); + switch (tp->t_state) { + case TCPS_CLOSING: + tcp_twstart(tp); +release: + INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); - rpl->opt0h = htonl(F_TCAM_BYPASS); - rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); - cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); - } else - m_free(m); -} -#endif -static void -handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) -{ - CXGB_UNIMPLEMENTED(); - -#ifdef notyet - struct t3cdev *cdev; - struct socket *parent; - struct socket *oreq; - struct t3c_tid_entry *t3c_stid; - struct tid_info *t; - struct tcpcb *otp, *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; + INP_WLOCK(inp); + toepcb_release(toep); /* no more CPLs expected */ - /* - * If the connection is being aborted due to the parent listening - * socket going away there's nothing to do, the ABORT_REQ will close - * the connection. - */ - if (toep->tp_flags & TP_ABORT_RPL_PENDING) { - m_free(m); - return; - } + m_freem(m); + return (0); + case TCPS_LAST_ACK: + if (tcp_close(tp)) + INP_WUNLOCK(inp); + goto release; - oreq = so->so_incomp; - otp = so_sototcpcb(oreq); - - cdev = T3C_DEV(so); - t = &(T3C_DATA(cdev))->tid_maps; - t3c_stid = lookup_stid(t, otp->ts_recent); - parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; - - so_lock(parent); - pass_open_abort(so, parent, m); - so_unlock(parent); -#endif -} + case TCPS_FIN_WAIT_1: + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + soisdisconnected(so); + tp->t_state = TCPS_FIN_WAIT_2; + break; + default: + log(LOG_ERR, + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + __func__, toep->tp_tid, tp->t_state); + } -/* - * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly - * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV - * connection. - */ -static void -pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) -{ +done: + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); -#ifdef notyet - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); - BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); -#endif - handle_pass_open_arp_failure(m_get_socket(m), m); + m_freem(m); + return (0); } -/* - * Populate a reject CPL_PASS_ACCEPT_RPL WR. 
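[Editor's note: the new do_close_con_rpl() switch above is the close-side half of the state diagram, taken when the peer ACKs our FIN. The same transitions as a standalone function; the enum and fin_acked() are illustrative, not the kernel's TCPS_* constants:

    enum st { FIN_WAIT_1, FIN_WAIT_2, CLOSING, LAST_ACK, TIME_WAIT,
        CLOSED };

    /* Where a connection goes when our FIN is acknowledged. */
    static enum st
    fin_acked(enum st s)
    {
            switch (s) {
            case FIN_WAIT_1:
                    return (FIN_WAIT_2);
            case CLOSING:
                    return (TIME_WAIT);
            case LAST_ACK:
                    return (CLOSED);
            default:
                    return (s);     /* unexpected; the driver logs it */
            }
    }
]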
- */ -static void -mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) +static int +do_smt_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct cpl_pass_accept_req *req = cplhdr(req_mbuf); - struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); - unsigned int tid = GET_TID(req); - - m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); - rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); - rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet - rpl->opt0h = htonl(F_TCAM_BYPASS); - rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); - rpl->opt2 = 0; - rpl->rsvd = rpl->opt2; /* workaround for HW bug */ -} + struct cpl_smt_write_rpl *rpl = mtod(m, void *); -/* - * Send a deferred reject to an accept request. - */ -static void -reject_pass_request(struct toedev *tdev, struct mbuf *m) -{ - struct mbuf *reply_mbuf; + if (rpl->status != CPL_ERR_NONE) { + log(LOG_ERR, + "Unexpected SMT_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + } - reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); - mk_pass_accept_rpl(reply_mbuf, m); - cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); - m_free(m); + m_freem(m); + return (0); } -static void -handle_syncache_event(int event, void *arg) +static int +do_set_tcb_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct toepcb *toep = arg; + struct cpl_set_tcb_rpl *rpl = mtod(m, void *); - switch (event) { - case TOE_SC_ENTRY_PRESENT: - /* - * entry already exists - free toepcb - * and l2t - */ - printf("syncache entry present\n"); - toepcb_release(toep); - break; - case TOE_SC_DROP: - /* - * The syncache has given up on this entry - * either it timed out, or it was evicted - * we need to explicitly release the tid - */ - printf("syncache entry dropped\n"); - toepcb_release(toep); - break; - default: - log(LOG_ERR, "unknown syncache event %d\n", event); - break; + if (rpl->status != CPL_ERR_NONE) { + log(LOG_ERR, "Unexpected SET_TCB_RPL status %u for tid %u\n", + rpl->status, GET_TID(rpl)); } + + m_freem(m); + return (0); } -static void -syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) +/* + * Handle an ABORT_RPL_RSS CPL message. + */ +static int +do_abort_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct in_conninfo inc; - struct toeopt toeo; - struct tcphdr th; + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + const struct cpl_abort_rpl_rss *rpl = mtod(m, void *); + unsigned int tid = GET_TID(rpl); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); struct inpcb *inp; - int mss, wsf, sack, ts; - uint32_t rcv_isn = ntohl(req->rcv_isn); - - bzero(&toeo, sizeof(struct toeopt)); - inp = so_sotoinpcb(lso); - + /* - * Fill out information for entering us into the syncache + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. 
*/ - bzero(&inc, sizeof(inc)); - inc.inc_fport = th.th_sport = req->peer_port; - inc.inc_lport = th.th_dport = req->local_port; - th.th_seq = req->rcv_isn; - th.th_flags = TH_SYN; - - toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; + if (rpl->status == CPL_ERR_ABORT_FAILED) { + m_freem(m); + return (0); + } - inc.inc_len = 0; - inc.inc_faddr.s_addr = req->peer_ip; - inc.inc_laddr.s_addr = req->local_ip; + if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) + return (do_abort_rpl_synqe(qs, r, m)); - DPRINTF("syncache add of %d:%d %d:%d\n", - ntohl(req->local_ip), ntohs(req->local_port), - ntohl(req->peer_ip), ntohs(req->peer_port)); - - mss = req->tcp_options.mss; - wsf = req->tcp_options.wsf; - ts = req->tcp_options.tstamp; - sack = req->tcp_options.sack; - toeo.to_mss = mss; - toeo.to_wscale = wsf; - toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); - tcp_offload_syncache_add(&inc, &toeo, &th, inp, &lso, &cxgb_toe_usrreqs, -toep); -} + CTR4(KTR_CXGB, "%s: tid %d, toep %p, status %d", __func__, tid, toep, + rpl->status); + inp = toep->tp_inp; + INP_WLOCK(inp); -/* - * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket - * lock held. Note that the sock here is a listening socket that is not owned - * by the TOE. - */ -static void -process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, - struct listen_ctx *lctx) -{ - int rt_flags; - struct l2t_entry *e; - struct iff_mac tim; - struct mbuf *reply_mbuf, *ddp_mbuf = NULL; - struct cpl_pass_accept_rpl *rpl; - struct cpl_pass_accept_req *req = cplhdr(m); - unsigned int tid = GET_TID(req); - struct tom_data *d = TOM_DATA(tdev); - struct t3cdev *cdev = d->cdev; - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *newtoep; - struct rtentry *dst; - struct sockaddr_in nam; - struct t3c_data *td = T3C_DATA(cdev); - - reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); - if (__predict_false(reply_mbuf == NULL)) { - if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) - t3_defer_reply(m, tdev, reject_pass_request); - else { - cxgb_queue_tid_release(cdev, tid); - m_free(m); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + if (!(toep->tp_flags & TP_ABORT_RPL_RCVD)) { + toep->tp_flags |= TP_ABORT_RPL_RCVD; + INP_WUNLOCK(inp); + } else { + toep->tp_flags &= ~TP_ABORT_RPL_RCVD; + toep->tp_flags &= TP_ABORT_RPL_PENDING; + toepcb_release(toep); /* no more CPLs expected */ } - DPRINTF("failed to get reply_mbuf\n"); - - goto out; - } - - if (tp->t_state != TCPS_LISTEN) { - DPRINTF("socket not in listen state\n"); - - goto reject; - } - - tim.mac_addr = req->dst_mac; - tim.vlan_tag = ntohs(req->vlan_tag); - if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { - DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); - goto reject; } - -#ifdef notyet - /* - * XXX do route lookup to confirm that we're still listening on this - * address - */ - if (ip_route_input(skb, req->local_ip, req->peer_ip, - G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) - goto reject; - rt_flags = ((struct rtable *)skb->dst)->rt_flags & - (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); - dst_release(skb->dst); // done with the input route, release it - skb->dst = NULL; - - if ((rt_flags & RTF_LOCAL) == 0) - goto reject; -#endif - /* - * XXX - */ - rt_flags = RTF_LOCAL; - if ((rt_flags & RTF_LOCAL) == 0) - goto reject; - - /* - * Calculate values and add to syncache - */ - newtoep = toepcb_alloc(); - if (newtoep == NULL) - goto reject; - - bzero(&nam, sizeof(struct 
sockaddr_in)); - - nam.sin_len = sizeof(struct sockaddr_in); - nam.sin_family = AF_INET; - nam.sin_addr.s_addr =req->peer_ip; - dst = rtalloc2((struct sockaddr *)&nam, 1, 0); - - if (dst == NULL) { - printf("failed to find route\n"); - goto reject; - } - e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, - (struct sockaddr *)&nam); - if (e == NULL) { - DPRINTF("failed to get l2t\n"); - } - /* - * Point to our listen socket until accept - */ - newtoep->tp_tp = tp; - newtoep->tp_flags = TP_SYN_RCVD; - newtoep->tp_tid = tid; - newtoep->tp_toedev = tdev; - tp->rcv_wnd = select_rcv_wnd(tdev, so); - - cxgb_insert_tid(cdev, d->client, newtoep, tid); - so_lock(so); - LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); - so_unlock(so); - - newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && - tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; - - if (newtoep->tp_ulp_mode) { - ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); - - if (ddp_mbuf == NULL) - newtoep->tp_ulp_mode = 0; - } - - CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", - TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); - set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); - /* - * XXX workaround for lack of syncache drop - */ - toepcb_hold(newtoep); - syncache_add_accept_req(req, so, newtoep); - - rpl = cplhdr(reply_mbuf); - reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); - rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - rpl->wr.wr_lo = 0; - OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); - rpl->opt2 = htonl(calc_opt2(so, tdev)); - rpl->rsvd = rpl->opt2; /* workaround for HW bug */ - rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten - - rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | - V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); - rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | - CPL_PASS_OPEN_ACCEPT); - - DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); - - m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); - - l2t_send(cdev, reply_mbuf, e); - m_free(m); - if (newtoep->tp_ulp_mode) { - __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, - V_TF_DDP_OFF(1) | - TP_DDP_TIMER_WORKAROUND_MASK, - V_TF_DDP_OFF(1) | - TP_DDP_TIMER_WORKAROUND_VAL, 1); - } else - DPRINTF("no DDP\n"); - - return; -reject: - if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) - mk_pass_accept_rpl(reply_mbuf, m); - else - mk_tid_release(reply_mbuf, newtoep, tid); - cxgb_ofld_send(cdev, reply_mbuf); - m_free(m); -out: -#if 0 - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); -#else - return; -#endif -} + m_freem(m); + return (0); +} /* - * Handle a CPL_PASS_ACCEPT_REQ message. + * Convert the status code of an ABORT_REQ into a FreeBSD error code. 
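[Editor's note: once do_abort_req() translates the hardware status and sets so_error, the failure surfaces to the application through ordinary socket calls. A small userland illustration; report_abort() is a made-up helper and fd is assumed to be a connected socket whose offloaded connection was aborted:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static void
    report_abort(int fd)
    {
            char buf[128];
            ssize_t n = read(fd, buf, sizeof(buf));

            /* ECONNRESET for CPL_ERR_CONN_RESET (EPIPE if already in
             * CLOSE_WAIT), ETIMEDOUT for the *_TIMEDOUT statuses,
             * EIO otherwise. */
            if (n == -1)
                    printf("connection aborted: %s\n", strerror(errno));
    }
]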
*/ static int -do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +abort_status_to_errno(struct tcpcb *tp, int abort_reason) { - struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; - struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */ - struct tom_data *d = listen_ctx->tom_data; - -#if VALIDATE_TID - struct cpl_pass_accept_req *req = cplhdr(m); - unsigned int tid = GET_TID(req); - struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; - - if (unlikely(!lsk)) { - printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", - cdev->name, - (unsigned long)((union listen_entry *)ctx - - t->stid_tab)); - return CPL_RET_BUF_DONE; - } - if (unlikely(tid >= t->ntids)) { - printk(KERN_ERR "%s: passive open TID %u too large\n", - cdev->name, tid); - return CPL_RET_BUF_DONE; - } - /* - * For T3A the current user of the TID may have closed but its last - * message(s) may have been backlogged so the TID appears to be still - * in use. Just take the TID away, the connection can close at its - * own leisure. For T3B this situation is a bug. - */ - if (!valid_new_tid(t, tid) && - cdev->type != T3A) { - printk(KERN_ERR "%s: passive open uses existing TID %u\n", - cdev->name, tid); - return CPL_RET_BUF_DONE; + switch (abort_reason) { + case CPL_ERR_BAD_SYN: + case CPL_ERR_CONN_RESET: + return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET); + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return (ETIMEDOUT); + default: + return (EIO); } -#endif - - process_pass_accept_req(lso, m, &d->tdev, listen_ctx); - return (0); } /* - * Called when a connection is established to translate the TCP options - * reported by HW to FreeBSD's native format. + * Returns whether an ABORT_REQ_RSS message is a negative advice. */ -static void -assign_rxopt(struct socket *so, unsigned int opt) +static inline int +is_neg_adv_abort(unsigned int status) { - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); - - inp_lock_assert(tp->t_inpcb); - - toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; - tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; - tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; - tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; - if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == - (TF_RCVD_SCALE|TF_REQ_SCALE)) - tp->rcv_scale = tp->request_r_scale; + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; } -/* - * Completes some final bits of initialization for just established connections - * and changes their state to TCP_ESTABLISHED. - * - * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. - */ -static void -make_established(struct socket *so, u32 snd_isn, unsigned int opt) +void +send_abort_rpl(struct toedev *tod, int tid, int qset) { - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - - toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; - assign_rxopt(so, opt); + struct mbuf *reply; + struct cpl_abort_rpl *rpl; + struct adapter *sc = tod->tod_softc; - /* - *XXXXXXXXXXX - * - */ -#ifdef notyet - so->so_proto->pr_ctloutput = t3_ctloutput; -#endif - -#if 0 - inet_sk(sk)->id = tp->write_seq ^ jiffies; -#endif - /* - * XXX not clear what rcv_wup maps to - */ - /* - * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't - * pass through opt0. 
- */ - if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) - toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); + reply = M_GETHDR_OFLD(qset, CPL_PRIORITY_DATA, rpl); + if (!reply) + CXGB_UNIMPLEMENTED(); - dump_toepcb(toep); + rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wrh_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = CPL_ABORT_NO_RST; -#ifdef notyet -/* - * no clean interface for marking ARP up to date - */ - dst_confirm(sk->sk_dst_cache); -#endif - tp->t_starttime = ticks; - tp->t_state = TCPS_ESTABLISHED; - soisconnected(so); + t3_offload_tx(sc, reply); } +/* + * Handle an ABORT_REQ_RSS CPL message. If we're waiting for an ABORT_RPL we + * ignore this request except that we need to reply to it. + */ static int -syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) +do_abort_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct toedev *tod = &td->tod; + const struct cpl_abort_req_rss *req = mtod(m, void *); + unsigned int tid = GET_TID(req); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; + int qset = toep->tp_qset; + + if (is_neg_adv_abort(req->status)) { + CTR4(KTR_CXGB, "%s: negative advice %d for tid %u (%x)", + __func__, req->status, tid, toep->tp_flags); + m_freem(m); + return (0); + } + + if (toep->tp_flags & TP_IS_A_SYNQ_ENTRY) + return (do_abort_req_synqe(qs, r, m)); + + inp = toep->tp_inp; + INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_WLOCK(inp); + + tp = intotcpcb(inp); + so = inp->inp_socket; + + CTR6(KTR_CXGB, "%s: tid %u (%s), toep %p (%x), status %d", + __func__, tid, tcpstates[tp->t_state], toep, toep->tp_flags, + req->status); + + if (!(toep->tp_flags & TP_ABORT_REQ_RCVD)) { + toep->tp_flags |= TP_ABORT_REQ_RCVD; + toep->tp_flags |= TP_ABORT_SHUTDOWN; + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + m_freem(m); + return (0); + } + toep->tp_flags &= ~TP_ABORT_REQ_RCVD; - struct in_conninfo inc; - struct toeopt toeo; - struct tcphdr th; - int mss, wsf, sack, ts; - struct mbuf *m = NULL; - const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); - unsigned int opt; - -#ifdef MAC -#error "no MAC support" -#endif - - opt = ntohs(req->tcp_opt); - - bzero(&toeo, sizeof(struct toeopt)); - /* - * Fill out information for entering us into the syncache + * If we'd sent a reset on this toep, we'll ignore this and clean up in + * the T3's reply to our reset instead. */ - bzero(&inc, sizeof(inc)); - inc.inc_fport = th.th_sport = req->peer_port; - inc.inc_lport = th.th_dport = req->local_port; - th.th_seq = req->rcv_isn; - th.th_flags = TH_ACK; - - inc.inc_len = 0; - inc.inc_faddr.s_addr = req->peer_ip; - inc.inc_laddr.s_addr = req->local_ip; - - mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; - wsf = G_TCPOPT_WSCALE_OK(opt); - ts = G_TCPOPT_TSTAMP(opt); - sack = G_TCPOPT_SACK(opt); - - toeo.to_mss = mss; - toeo.to_wscale = G_TCPOPT_SND_WSCALE(opt); - toeo.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? 
TOF_SACKPERM : 0); - - DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", - ntohl(req->local_ip), ntohs(req->local_port), - ntohl(req->peer_ip), ntohs(req->peer_port), - mss, wsf, ts, sack); - return tcp_offload_syncache_expand(&inc, &toeo, &th, so, m); -} + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + toep->tp_flags |= TP_ABORT_RPL_SENT; + INP_WUNLOCK(inp); + } else { + so_error_set(so, abort_status_to_errno(tp, req->status)); + tp = tcp_close(tp); + if (tp == NULL) + INP_WLOCK(inp); /* re-acquire */ + toepcb_release(toep); /* no more CPLs expected */ + } + INP_INFO_WUNLOCK(&V_tcbinfo); + send_abort_rpl(tod, tid, qset); + m_freem(m); + return (0); +} -/* - * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work - * if we are in TCP_SYN_RECV due to crossed SYNs - */ -static int -do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +static void +assign_rxopt(struct tcpcb *tp, uint16_t tcpopt) { - struct cpl_pass_establish *req = cplhdr(m); - struct toepcb *toep = (struct toepcb *)ctx; - struct tcpcb *tp = toep->tp_tp; - struct socket *so, *lso; - struct t3c_data *td = T3C_DATA(cdev); - struct sockbuf *snd, *rcv; - - // Complete socket initialization now that we have the SND_ISN - - struct toedev *tdev; + struct toepcb *toep = tp->t_toe; + struct adapter *sc = toep->tp_tod->tod_softc; + tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(tcpopt)] - 40; - tdev = toep->tp_toedev; + if (G_TCPOPT_TSTAMP(tcpopt)) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->t_flags |= TF_REQ_TSTMP; /* forcibly set */ + tp->ts_recent = 0; /* XXX */ + tp->ts_recent_age = tcp_ts_getticks(); + tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; + } - inp_wlock(tp->t_inpcb); - - /* - * - * XXX need to add reference while we're manipulating - */ - so = lso = inp_inpcbtosocket(tp->t_inpcb); + if (G_TCPOPT_SACK(tcpopt)) + tp->t_flags |= TF_SACK_PERMIT; + else + tp->t_flags &= ~TF_SACK_PERMIT; - inp_wunlock(tp->t_inpcb); + if (G_TCPOPT_WSCALE_OK(tcpopt)) + tp->t_flags |= TF_RCVD_SCALE; - so_lock(so); - LIST_REMOVE(toep, synq_entry); - so_unlock(so); - - if (!syncache_expand_establish_req(req, &so, toep)) { - /* - * No entry - */ - CXGB_UNIMPLEMENTED(); - } - if (so == NULL) { - /* - * Couldn't create the socket - */ - CXGB_UNIMPLEMENTED(); + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + tp->snd_scale = G_TCPOPT_SND_WSCALE(tcpopt); } - tp = so_sototcpcb(so); - inp_wlock(tp->t_inpcb); +} - snd = so_sockbuf_snd(so); - rcv = so_sockbuf_rcv(so); +/* + * The ISS and IRS are from after the exchange of SYNs and are off by 1. 
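[Editor's note: assign_rxopt() above recovers the negotiated options from the 16-bit summary the chip reports at connection establishment; the MSS arrives as an index into the adapter's MTU table rather than a byte count. A sketch of the arithmetic: the table contents and effective_mss() are illustrative, mss_idx is assumed in range, 40 is the IP+TCP header overhead, and TCPOLEN_TSTAMP_APPA (12, as in <netinet/tcp.h>) is the per-segment timestamp cost:

    #define TCPOLEN_TSTAMP_APPA     12

    static const unsigned short mtus[] = { 1500, 4096, 9000 };

    /* Effective payload per segment for a given MTU-table index. */
    static int
    effective_mss(unsigned int mss_idx, int tstamps_enabled)
    {
            int mss = mtus[mss_idx] - 40;   /* strip IP + TCP headers */

            if (tstamps_enabled)
                    mss -= TCPOLEN_TSTAMP_APPA;
            return (mss);
    }
]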
+ */ +void +make_established(struct socket *so, uint32_t cpl_iss, uint32_t cpl_irs, + uint16_t cpl_tcpopt) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + struct toepcb *toep = tp->t_toe; + long bufsize; + uint32_t iss = be32toh(cpl_iss) - 1; /* true ISS */ + uint32_t irs = be32toh(cpl_irs) - 1; /* true IRS */ + uint16_t tcpopt = be16toh(cpl_tcpopt); - snd->sb_flags |= SB_NOCOALESCE; - rcv->sb_flags |= SB_NOCOALESCE; + INP_WLOCK_ASSERT(inp); - toep->tp_tp = tp; - toep->tp_flags = 0; - tp->t_toe = toep; - reset_wr_list(toep); - tp->rcv_wnd = select_rcv_wnd(tdev, so); - tp->rcv_nxt = toep->tp_copied_seq; - install_offload_ops(so); - - toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); - toep->tp_wr_unacked = 0; - toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); - toep->tp_qset_idx = 0; - toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); - - /* - * XXX Cancel any keep alive timer - */ - - make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + tp->t_state = TCPS_ESTABLISHED; + tp->t_starttime = ticks; + TCPSTAT_INC(tcps_connects); + + CTR4(KTR_CXGB, "%s tid %u, toep %p, inp %p", tcpstates[tp->t_state], + toep->tp_tid, toep, inp); + + tp->irs = irs; + tcp_rcvseqinit(tp); + tp->rcv_wnd = toep->tp_rx_credits << 10; + tp->rcv_adv += tp->rcv_wnd; + tp->last_ack_sent = tp->rcv_nxt; /* - * XXX workaround for lack of syncache drop - */ - toepcb_release(toep); - inp_wunlock(tp->t_inpcb); - - CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); - cxgb_log_tcb(cdev->adapter, toep->tp_tid); -#ifdef notyet - /* - * XXX not sure how these checks map to us - */ - if (unlikely(sk->sk_socket)) { // simultaneous opens only - sk->sk_state_change(sk); - sk_wake_async(so, 0, POLL_OUT); - } - /* - * The state for the new connection is now up to date. - * Next check if we should add the connection to the parent's - * accept queue. When the parent closes it resets connections - * on its SYN queue, so check if we are being reset. If so we - * don't need to do anything more, the coming ABORT_RPL will - * destroy this socket. Otherwise move the connection to the - * accept queue. - * - * Note that we reset the synq before closing the server so if - * we are not being reset the stid is still open. + * If we were unable to send all rx credits via opt0, save the remainder + * in rx_credits so that they can be handed over with the next credit + * update. */ - if (unlikely(!tp->forward_skb_hint)) { // removed from synq - __kfree_skb(skb); - goto unlock; - } -#endif - m_free(m); - - return (0); + SOCKBUF_LOCK(&so->so_rcv); + bufsize = select_rcv_wnd(so); + SOCKBUF_UNLOCK(&so->so_rcv); + toep->tp_rx_credits = bufsize - tp->rcv_wnd; + + tp->iss = iss; + tcp_sendseqinit(tp); + tp->snd_una = iss + 1; + tp->snd_nxt = iss + 1; + tp->snd_max = iss + 1; + + assign_rxopt(tp, tcpopt); + soisconnected(so); } /* @@ -3745,129 +1623,80 @@ static void fixup_and_send_ofo(struct toepcb *toep) { struct mbuf *m; - struct toedev *tdev = toep->tp_toedev; - struct tcpcb *tp = toep->tp_tp; + struct toedev *tod = toep->tp_tod; + struct adapter *sc = tod->tod_softc; + struct inpcb *inp = toep->tp_inp; unsigned int tid = toep->tp_tid; - log(LOG_NOTICE, "fixup_and_send_ofo\n"); - - inp_lock_assert(tp->t_inpcb); + inp_lock_assert(inp); + while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { + struct ofld_hdr *oh = mtod(m, void *); /* * A variety of messages can be waiting but the fields we'll * be touching are common to all so any message type will do. 
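The window arithmetic in make_established() is easy to misread: the CPL carries ISS + 1 and IRS + 1 in network byte order, and the opt0 rx credits are in 1KB units, so any shortfall against the real socket buffer is carried forward in tp_rx_credits. A throwaway userland sketch of the same math (values and names illustrative, not the driver's API):

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	int
	main(void)
	{
		/* The CPL reports ISS + 1 and IRS + 1, in network order. */
		uint32_t cpl_iss = htonl(1001), cpl_irs = htonl(5001);
		uint32_t iss = ntohl(cpl_iss) - 1;	/* true ISS = 1000 */
		uint32_t irs = ntohl(cpl_irs) - 1;	/* true IRS = 5000 */

		/* rx credits sent via opt0 are in 1KB units; whatever the
		 * socket buffer holds beyond that is handed over later. */
		unsigned int opt0_rx_credits = 16;	/* illustrative */
		long sb_size = 64 * 1024;		/* illustrative */
		long rcv_wnd = (long)opt0_rx_credits << 10;
		long carry = sb_size - rcv_wnd;

		printf("iss=%u irs=%u rcv_wnd=%ld carry=%ld\n",
		    (unsigned)iss, (unsigned)irs, rcv_wnd, carry);
		return (0);
	}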
*/ - struct cpl_close_con_req *p = cplhdr(m); + struct cpl_close_con_req *p = (void *)(oh + 1); - p->wr.wr_lo = htonl(V_WR_TID(tid)); + p->wr.wrh_lo = htonl(V_WR_TID(tid)); OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); - cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); - } -} - -/* - * Updates socket state from an active establish CPL message. Runs with the - * socket lock held. - */ -static void -socket_act_establish(struct socket *so, struct mbuf *m) -{ - struct cpl_act_establish *req = cplhdr(m); - u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - - if (__predict_false(tp->t_state != TCPS_SYN_SENT)) - log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n", - toep->tp_tid, tp->t_state); - - tp->ts_recent_age = ticks; - tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; - toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; - - make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); - - /* - * Now that we finally have a TID send any CPL messages that we had to - * defer for lack of a TID. - */ - if (mbufq_len(&toep->out_of_order_queue)) - fixup_and_send_ofo(toep); - - if (__predict_false(so_state_get(so) & SS_NOFDREF)) { - /* - * XXX does this even make sense? - */ - so_sorwakeup(so); + t3_offload_tx(sc, m); } - m_free(m); -#ifdef notyet -/* - * XXX assume no write requests permitted while socket connection is - * incomplete - */ - /* - * Currently the send queue must be empty at this point because the - * socket layer does not send anything before a connection is - * established. To be future proof though we handle the possibility - * that there are pending buffers to send (either TX_DATA or - * CLOSE_CON_REQ). First we need to adjust the sequence number of the - * buffers according to the just learned write_seq, and then we send - * them on their way. - */ - fixup_pending_writeq_buffers(sk); - if (t3_push_frames(so, 1)) - sk->sk_write_space(sk); -#endif - - toep->tp_state = tp->t_state; - KMOD_TCPSTAT_INC(tcps_connects); - } /* * Process a CPL_ACT_ESTABLISH message. 
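fixup_and_send_ofo() exists because an active open can queue CPLs (a CLOSE_CON_REQ, for instance) before the hardware has assigned a tid; once the tid arrives the queued messages are stamped and flushed. The pattern, reduced to a toy queue (field names are placeholders, not the real WR layout):

	#include <stddef.h>

	struct deferred {
		struct deferred *next;
		unsigned int tid;	/* stands in for the WR/CPL tid fields */
	};

	static void
	flush_deferred(struct deferred **q, unsigned int tid)
	{
		struct deferred *m;

		while ((m = *q) != NULL) {
			*q = m->next;
			m->tid = tid;	/* cf. V_WR_TID()/MK_OPCODE_TID() */
			/* hand m to the hardware here (t3_offload_tx) */
		}
	}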
*/ static int -do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +do_act_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct cpl_act_establish *req = cplhdr(m); + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_act_establish *req = mtod(m, void *); unsigned int tid = GET_TID(req); unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); - struct toepcb *toep = (struct toepcb *)ctx; - struct tcpcb *tp = toep->tp_tp; + struct toepcb *toep = lookup_atid(&td->tid_maps, atid); + struct inpcb *inp = toep->tp_inp; + struct tcpcb *tp; struct socket *so; - struct toedev *tdev; - struct tom_data *d; - - if (tp == NULL) { - free_atid(cdev, atid); - return (0); + + CTR3(KTR_CXGB, "%s: atid %u, tid %u", __func__, atid, tid); + + free_atid(&td->tid_maps, atid); + + INP_WLOCK(inp); + tp = intotcpcb(inp); + + KASSERT(toep->tp_qset == qs->idx, + ("%s qset mismatch %d %d", __func__, toep->tp_qset, qs->idx)); + KASSERT(toep->tp_tid == atid, + ("%s atid mismatch %d %d", __func__, toep->tp_tid, atid)); + + toep->tp_tid = tid; + insert_tid(td, toep, tid); + + if (inp->inp_flags & INP_DROPPED) { + /* socket closed by the kernel before hw told us it connected */ + send_reset(toep); + goto done; } - inp_wlock(tp->t_inpcb); - /* - * XXX - */ - so = inp_inpcbtosocket(tp->t_inpcb); - tdev = toep->tp_toedev; /* blow up here if link was down */ - d = TOM_DATA(tdev); + KASSERT(tp->t_state == TCPS_SYN_SENT, + ("TID %u expected TCPS_SYN_SENT, found %d.", tid, tp->t_state)); + + so = inp->inp_socket; + make_established(so, req->snd_isn, req->rcv_isn, req->tcp_opt); /* - * It's OK if the TID is currently in use, the owning socket may have - * backlogged its last CPL message(s). Just take it away. + * Now that we finally have a TID send any CPL messages that we had to + * defer for lack of a TID. 
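do_act_establish() retires the provisional atid and re-files the connection under its permanent tid before doing anything else, so that later CPLs can find it. The handover, modeled with a flat array standing in for the adapter's tid tables (illustrative only):

	#include <stddef.h>

	#define NTIDS	128
	static void *tid_map[NTIDS];	/* toy stand-in for td->tid_maps */

	static void *
	establish(unsigned int atid, unsigned int tid)
	{
		void *conn = tid_map[atid];	/* lookup_atid() */

		tid_map[atid] = NULL;		/* free_atid() */
		tid_map[tid] = conn;		/* insert_tid() */
		return (conn);
	}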
*/ - toep->tp_tid = tid; - toep->tp_tp = tp; - so_insert_tid(d, toep, tid); - free_atid(cdev, atid); - toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); - - socket_act_establish(so, m); - inp_wunlock(tp->t_inpcb); - CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); - cxgb_log_tcb(cdev->adapter, toep->tp_tid); + if (mbufq_len(&toep->out_of_order_queue)) + fixup_and_send_ofo(toep); +done: + INP_WUNLOCK(inp); + m_freem(m); return (0); } @@ -3878,97 +1707,66 @@ do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) static void wr_ack(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = toep->tp_tp; - struct cpl_wr_ack *hdr = cplhdr(m); + struct inpcb *inp = toep->tp_inp; + struct tcpcb *tp; + struct cpl_wr_ack *hdr = mtod(m, void *); struct socket *so; unsigned int credits = ntohs(hdr->credits); u32 snd_una = ntohl(hdr->snd_una); int bytes = 0; struct sockbuf *snd; - - CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); + struct mbuf *p; + struct ofld_hdr *oh; - inp_wlock(tp->t_inpcb); - so = inp_inpcbtosocket(tp->t_inpcb); + inp_wlock(inp); + tp = intotcpcb(inp); + so = inp->inp_socket; toep->tp_wr_avail += credits; if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; while (credits) { - struct mbuf *p = peek_wr(toep); - + p = peek_wr(toep); + if (__predict_false(!p)) { + CTR5(KTR_CXGB, "%s: %u extra WR_ACK credits, " + "tid %u, state %u, wr_avail %u", __func__, credits, + toep->tp_tid, tp->t_state, toep->tp_wr_avail); + log(LOG_ERR, "%u WR_ACK credits for TID %u with " "nothing pending, state %u wr_avail=%u\n", credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); break; } - CTR2(KTR_TOM, - "wr_ack: p->credits=%d p->bytes=%d", - p->m_pkthdr.csum_data, p->m_pkthdr.len); - KASSERT(p->m_pkthdr.csum_data != 0, - ("empty request still on list")); - - if (__predict_false(credits < p->m_pkthdr.csum_data)) { - -#if DEBUG_WR > 1 - struct tx_data_wr *w = cplhdr(p); - log(LOG_ERR, - "TID %u got %u WR credits, need %u, len %u, " - "main body %u, frags %u, seq # %u, ACK una %u," - " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", - toep->tp_tid, credits, p->csum, p->len, - p->len - p->data_len, skb_shinfo(p)->nr_frags, - ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), - toep->tp_wr_avail, count_pending_wrs(tp) - credits); -#endif - p->m_pkthdr.csum_data -= credits; - break; - } else { - dequeue_wr(toep); - credits -= p->m_pkthdr.csum_data; - bytes += p->m_pkthdr.len; - CTR3(KTR_TOM, - "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", - p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); - - m_free(p); - } - } -#if DEBUG_WR - check_wr_invariants(tp); -#endif + oh = mtod(p, struct ofld_hdr *); - if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { -#if VALIDATE_SEQ - struct tom_data *d = TOM_DATA(TOE_DEV(so)); + KASSERT(credits >= G_HDR_NDESC(oh->flags), + ("%s: partial credits? 
%d %d", __func__, credits, + G_HDR_NDESC(oh->flags))); - log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " - "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, - toep->tp_tid, tp->snd_una); -#endif - goto out_free; + dequeue_wr(toep); + credits -= G_HDR_NDESC(oh->flags); + bytes += oh->plen; + + if (oh->flags & F_HDR_SGL) + sglist_free(oh->sgl); + m_freem(p); } + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) + goto out_free; + if (tp->snd_una != snd_una) { tp->snd_una = snd_una; - tp->ts_recent_age = ticks; -#ifdef notyet - /* - * Keep ARP entry "minty fresh" - */ - dst_confirm(sk->sk_dst_cache); -#endif + tp->ts_recent_age = tcp_ts_getticks(); if (tp->snd_una == tp->snd_nxt) toep->tp_flags &= ~TP_TX_WAIT_IDLE; } snd = so_sockbuf_snd(so); if (bytes) { - CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); - snd = so_sockbuf_snd(so); - sockbuf_lock(snd); + SOCKBUF_LOCK(snd); sbdrop_locked(snd, bytes); so_sowwakeup_locked(so); } @@ -3978,142 +1776,25 @@ wr_ack(struct toepcb *toep, struct mbuf *m) out_free: inp_wunlock(tp->t_inpcb); - m_free(m); + m_freem(m); } /* * Handler for TX_DATA_ACK CPL messages. */ static int -do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) -{ - struct toepcb *toep = (struct toepcb *)ctx; - - VALIDATE_SOCK(so); - - wr_ack(toep, m); - return 0; -} - -/* - * Handler for TRACE_PKT CPL messages. Just sink these packets. - */ -static int -do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) -{ - m_freem(m); - return 0; -} - -/* - * Reset a connection that is on a listener's SYN queue or accept queue, - * i.e., one that has not had a struct socket associated with it. - * Must be called from process context. - * - * Modeled after code in inet_csk_listen_stop(). - */ -static void -t3_reset_listen_child(struct socket *child) -{ - struct tcpcb *tp = so_sototcpcb(child); - - t3_send_reset(tp->t_toe); -} - - -static void -t3_child_disconnect(struct socket *so, void *arg) -{ - struct tcpcb *tp = so_sototcpcb(so); - - if (tp->t_flags & TF_TOE) { - inp_wlock(tp->t_inpcb); - t3_reset_listen_child(so); - inp_wunlock(tp->t_inpcb); - } -} - -/* - * Disconnect offloaded established but not yet accepted connections sitting - * on a server's accept_queue. We just send an ABORT_REQ at this point and - * finish off the disconnect later as we may need to wait for the ABORT_RPL. - */ -void -t3_disconnect_acceptq(struct socket *listen_so) -{ - - so_lock(listen_so); - so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); - so_unlock(listen_so); -} - -/* - * Reset offloaded connections sitting on a server's syn queue. As above - * we send ABORT_REQ and finish off when we get ABORT_RPL. 
- */ - -void -t3_reset_synq(struct listen_ctx *lctx) +do_wr_ack(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct toepcb *toep; - - so_lock(lctx->lso); - while (!LIST_EMPTY(&lctx->synq_head)) { - toep = LIST_FIRST(&lctx->synq_head); - LIST_REMOVE(toep, synq_entry); - toep->tp_tp = NULL; - t3_send_reset(toep); - cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); - toepcb_release(toep); - } - so_unlock(lctx->lso); -} - + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_wr_ack *hdr = mtod(m, void *); + unsigned int tid = GET_TID(hdr); + struct toepcb *toep = lookup_tid(&td->tid_maps, tid); -int -t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, - unsigned int nppods, unsigned int tag, unsigned int maxoff, - unsigned int pg_off, unsigned int color) -{ - unsigned int i, j, pidx; - struct pagepod *p; - struct mbuf *m; - struct ulp_mem_io *req; - unsigned int tid = toep->tp_tid; - const struct tom_data *td = TOM_DATA(toep->tp_toedev); - unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + /* XXX bad race */ + if (toep) + wr_ack(toep, m); - CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", - gl, nppods, tag, maxoff, pg_off, color); - - for (i = 0; i < nppods; ++i) { - m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); - req = mtod(m, struct ulp_mem_io *); - m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); - req->wr.wr_lo = 0; - req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | - V_ULPTX_CMD(ULP_MEM_WRITE)); - req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | - V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); - - p = (struct pagepod *)(req + 1); - if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { - p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); - p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | - V_PPOD_COLOR(color)); - p->pp_max_offset = htonl(maxoff); - p->pp_page_offset = htonl(pg_off); - p->pp_rsvd = 0; - for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) - p->pp_addr[j] = pidx < gl->dgl_nelem ? - htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; - } else - p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ - send_or_defer(toep, m, 0); - ppod_addr += PPOD_SIZE; - } return (0); } @@ -4153,10 +1834,7 @@ mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, unsigned int word, uint64_t mask, uint64_t val) { struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; - - CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", - tid, word, mask, val); - + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); @@ -4167,294 +1845,19 @@ mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, req->val = htobe64(val); } -/* - * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
- */ -static void -mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, - unsigned int tid, unsigned int credits) -{ - struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; - - txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); - txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); - OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); - ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | - V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | - V_RX_CREDITS(credits)); -} - -void -t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) -{ - unsigned int wrlen; - struct mbuf *m; - struct work_request_hdr *wr; - struct cpl_barrier *lock; - struct cpl_set_tcb_field *req; - struct cpl_get_tcb *getreq; - struct ddp_state *p = &toep->tp_ddp_state; - -#if 0 - SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); -#endif - wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + - sizeof(*getreq); - m = m_gethdr_nofail(wrlen); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); - wr = mtod(m, struct work_request_hdr *); - bzero(wr, wrlen); - - wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); - m->m_pkthdr.len = m->m_len = wrlen; - - lock = (struct cpl_barrier *)(wr + 1); - mk_cpl_barrier_ulp(lock); - - req = (struct cpl_set_tcb_field *)(lock + 1); - - CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); - - /* Hmmm, not sure if this actually a good thing: reactivating - * the other buffer might be an issue if it has been completed - * already. However, that is unlikely, since the fact that the UBUF - * is not completed indicates that there is no oustanding data. - */ - if (bufidx == 0) - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, - V_TF_DDP_ACTIVE_BUF(1) | - V_TF_DDP_BUF0_VALID(1), - V_TF_DDP_ACTIVE_BUF(1)); - else - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, - V_TF_DDP_ACTIVE_BUF(1) | - V_TF_DDP_BUF1_VALID(1), 0); - - getreq = (struct cpl_get_tcb *)(req + 1); - mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); - - mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); - - /* Keep track of the number of oustanding CPL_GET_TCB requests - */ - p->get_tcb_count++; - -#ifdef T3_TRACE - T3_TRACE1(TIDTB(so), - "t3_cancel_ddpbuf: bufidx %u", bufidx); -#endif - cxgb_ofld_send(TOEP_T3C_DEV(toep), m); -} - -/** - * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one - * @sk: the socket associated with the buffers - * @bufidx: index of HW DDP buffer (0 or 1) - * @tag0: new tag for HW buffer 0 - * @tag1: new tag for HW buffer 1 - * @len: new length for HW buf @bufidx - * - * Sends a compound WR to overlay a new DDP buffer on top of an existing - * buffer by changing the buffer tag and length and setting the valid and - * active flag accordingly. The caller must ensure the new buffer is at - * least as big as the existing one. Since we typically reprogram both HW - * buffers this function sets both tags for convenience. Read the TCB to - * determine how made data was written into the buffer before the overlay - * took place. 
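All of the DDP plumbing being deleted here was built on CPL_SET_TCB_FIELD, which updates a 64-bit TCB word under a mask; mk_set_tcb_field_ulp() above survives for that purpose. The TP-side effect is essentially a masked store; a sketch, assuming val lies within mask:

	#include <stdint.h>

	/* Effect of SET_TCB_FIELD on one TCB word (sketch only). */
	static uint64_t
	set_tcb_field(uint64_t word, uint64_t mask, uint64_t val)
	{
		return ((word & ~mask) | (val & mask));
	}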
- */ -void -t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, - unsigned int tag1, unsigned int len) -{ - unsigned int wrlen; - struct mbuf *m; - struct work_request_hdr *wr; - struct cpl_get_tcb *getreq; - struct cpl_set_tcb_field *req; - struct ddp_state *p = &toep->tp_ddp_state; - - CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", - bufidx, tag0, tag1, len); -#if 0 - SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); -#endif - wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); - m = m_gethdr_nofail(wrlen); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); - wr = mtod(m, struct work_request_hdr *); - m->m_pkthdr.len = m->m_len = wrlen; - bzero(wr, wrlen); - - - /* Set the ATOMIC flag to make sure that TP processes the following - * CPLs in an atomic manner and no wire segments can be interleaved. - */ - wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); - req = (struct cpl_set_tcb_field *)(wr + 1); - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, - V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | - V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, - V_TCB_RX_DDP_BUF0_TAG(tag0) | - V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); - req++; - if (bufidx == 0) { - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, - V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), - V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); - req++; - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, - V_TF_DDP_PUSH_DISABLE_0(1) | - V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), - V_TF_DDP_PUSH_DISABLE_0(0) | - V_TF_DDP_BUF0_VALID(1)); - } else { - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, - V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), - V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); - req++; - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, - V_TF_DDP_PUSH_DISABLE_1(1) | - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), - V_TF_DDP_PUSH_DISABLE_1(0) | - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); - } - - getreq = (struct cpl_get_tcb *)(req + 1); - mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); - - /* Keep track of the number of oustanding CPL_GET_TCB requests - */ - p->get_tcb_count++; - -#ifdef T3_TRACE - T3_TRACE4(TIDTB(sk), - "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " - "len %d", - bufidx, tag0, tag1, len); -#endif - cxgb_ofld_send(TOEP_T3C_DEV(toep), m); -} - -/* - * Sends a compound WR containing all the CPL messages needed to program the - * two HW DDP buffers, namely optionally setting up the length and offset of - * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. - */ -void -t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, - unsigned int len1, unsigned int offset1, - uint64_t ddp_flags, uint64_t flag_mask, int modulate) -{ - unsigned int wrlen; - struct mbuf *m; - struct work_request_hdr *wr; - struct cpl_set_tcb_field *req; - - CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", - len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); - -#if 0 - SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); -#endif - wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + - (len1 ? sizeof(*req) : 0) + - (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); - m = m_gethdr_nofail(wrlen); - m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); - wr = mtod(m, struct work_request_hdr *); - bzero(wr, wrlen); - - wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); - m->m_pkthdr.len = m->m_len = wrlen; - - req = (struct cpl_set_tcb_field *)(wr + 1); - if (len0) { /* program buffer 0 offset and length */ - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, - V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | - V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), - V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | - V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); - req++; - } - if (len1) { /* program buffer 1 offset and length */ - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, - V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | - V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, - V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | - V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); - req++; - } - - mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, - ddp_flags); - - if (modulate) { - mk_rx_data_ack_ulp(toep, - (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, - toep->tp_copied_seq - toep->tp_rcv_wup); - toep->tp_rcv_wup = toep->tp_copied_seq; - } - -#ifdef T3_TRACE - T3_TRACE5(TIDTB(sk), - "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " - "modulate %d", - len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, - modulate); -#endif - - cxgb_ofld_send(TOEP_T3C_DEV(toep), m); -} - void -t3_init_wr_tab(unsigned int wr_len) -{ - int i; - - if (mbuf_wrs[1]) /* already initialized */ - return; - - for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { - int sgl_len = (3 * i) / 2 + (i & 1); - - sgl_len += 3; - mbuf_wrs[i] = sgl_len <= wr_len ? - 1 : 1 + (sgl_len - 2) / (wr_len - 1); - } - - wrlen = wr_len * 8; +t3_init_cpl_io(struct adapter *sc) +{ + t3_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); + t3_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3_register_cpl_handler(sc, CPL_RX_URG_NOTIFY, do_rx_urg_notify); + t3_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); + t3_register_cpl_handler(sc, CPL_TX_DMA_ACK, do_wr_ack); + t3_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); + t3_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); + t3_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); + t3_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); + t3_register_cpl_handler(sc, CPL_SMT_WRITE_RPL, do_smt_write_rpl); + t3_register_cpl_handler(sc, CPL_SET_TCB_RPL, do_set_tcb_rpl); } - -int -t3_init_cpl_io(void) -{ -#ifdef notyet - tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); - if (!tcphdr_skb) { - log(LOG_ERR, - "Chelsio TCP offload: can't allocate sk_buff\n"); - return -1; - } - skb_put(tcphdr_skb, sizeof(struct tcphdr)); - tcphdr_skb->h.raw = tcphdr_skb->data; - memset(tcphdr_skb->data, 0, tcphdr_skb->len); #endif - - t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); - t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); - t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); - t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); - t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); - t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); - t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); - t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); - t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); - 
t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); - t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); - t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); - t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); - t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); - t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); - return (0); -} - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c deleted file mode 100644 index bb0015f..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ /dev/null @@ -1,1034 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - - -static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, - struct uio *uio, struct mbuf *top, struct mbuf *control, - int flags, struct thread *td); - -static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, - struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, - int *flagsp); - -#define TMP_IOV_MAX 16 -#ifndef PG_FRAME -#define PG_FRAME ~PAGE_MASK -#endif -#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 
M_NOWAIT : M_WAITOK) - -void -t3_init_socket_ops(void) -{ - struct protosw *prp; - - prp = pffindtype(AF_INET, SOCK_STREAM); - pru_sosend = prp->pr_usrreqs->pru_sosend; - pru_soreceive = prp->pr_usrreqs->pru_soreceive; -} - -struct cxgb_dma_info { - size_t cdi_mapped; - int cdi_nsegs; - bus_dma_segment_t *cdi_segs; - -}; - -static void -cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs, - bus_size_t mapsize, int error) -{ - struct cxgb_dma_info *cdi = arg; - - cdi->cdi_mapped = mapsize; - cdi->cdi_nsegs = nsegs; - cdi->cdi_segs = segs; -} - -static void -iov_adj(struct iovec **iov, int *iovcnt, size_t count) -{ - struct iovec *iovtmp; - int iovcnttmp; - caddr_t ptmp; - - if (count > 0) { - iovtmp = *iov; - iovcnttmp = *iovcnt; - while (count > 0) { - if (count < iovtmp->iov_len) { - ptmp = iovtmp->iov_base; - ptmp += count; - iovtmp->iov_base = ptmp; - iovtmp->iov_len -= count; - break; - } else - count -= iovtmp->iov_len; - iovtmp++; - iovcnttmp--; - } - *iov = iovtmp; - *iovcnt = iovcnttmp; - } else if (count < 0) { - iovtmp = &(*iov)[*iovcnt - 1]; - iovcnttmp = *iovcnt; - while (count < 0) { - if (-count < iovtmp->iov_len) { - iovtmp->iov_len += count; - break; - } else - count += iovtmp->iov_len; - iovtmp--; - iovcnttmp--; - } - *iovcnt = iovcnttmp; - } -} - -static void -cxgb_zero_copy_free(void *cl, void *arg) -{ - struct mbuf_vec *mv; - struct mbuf *m = (struct mbuf *)cl; - - mv = mtomv(m); - /* - * Physical addresses, don't try to free should be unheld separately from sbdrop - * - */ - mv->mv_count = 0; - m_free_iovec(m, m->m_type); -} - - -static int -cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, vm_prot_t prot) -{ - struct iovec *iov = uio->uio_iov; - int iovcnt = uio->uio_iovcnt; - int err, i, count, totcount, maxcount, totbytes, npages, curbytes; - uint64_t start, end; - vm_page_t *mp; - vm_map_t map; - - map = &uio->uio_td->td_proc->p_vmspace->vm_map; - totbytes = totcount = 0; - maxcount = *held; - - mp = m; - for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { - count = maxcount - totcount; - - start = (uintptr_t)iov->iov_base; - end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); - start &= PG_FRAME; - end += PAGE_MASK; - end &= PG_FRAME; - npages = (end - start) >> PAGE_SHIFT; - - count = min(count, npages); - - /* The following return value is not used. XXX */ - err = vm_fault_quick_hold_pages(map, - (vm_offset_t)iov->iov_base, iov->iov_len, prot, mp, count); - mp += count; - totcount += count; - curbytes = iov->iov_len; - if (count != npages) - curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); - totbytes += curbytes; - } - uio->uio_resid -= totbytes; - - return (0); -} - -/* - * Returns whether a connection should enable DDP. 
This happens when all of - * the following conditions are met: - * - the connection's ULP mode is DDP - * - DDP is not already enabled - * - the last receive was above the DDP threshold - * - receive buffers are in user space - * - receive side isn't shutdown (handled by caller) - * - the connection's receive window is big enough so that sizable buffers - * can be posted without closing the window in the middle of DDP (checked - * when the connection is offloaded) - */ -static int -so_should_ddp(const struct toepcb *toep, int last_recv_len) -{ - - DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", - toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), - toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); - - return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && - last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && - toep->tp_tp->rcv_wnd > - (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); -} - -static inline int -is_ddp(const struct mbuf *m) -{ - return ((m->m_flags & M_DDP) != 0); -} - -static inline int -is_ddp_psh(const struct mbuf *m) -{ - return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0); -} - -static int -m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) -{ - int curlen, startlen, resid_init, err = 0; - caddr_t buf; - - DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n", - m, offset, len); - - startlen = len; - resid_init = uio->uio_resid; - while (m && len) { - buf = mtod(m, caddr_t); - curlen = m->m_len; - if (offset && (offset < curlen)) { - curlen -= offset; - buf += offset; - offset = 0; - } else if (offset) { - offset -= curlen; - m = m->m_next; - continue; - } - err = uiomove(buf, min(len, curlen), uio); - if (err) { - printf("uiomove returned %d\n", err); - return (err); - } - - len -= min(len, curlen); - m = m->m_next; - } - DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", - startlen - len, resid_init, uio->uio_resid); - return (err); -} - -/* - * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the - * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a - * DDP buffer. 
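The deleted so_should_ddp() gated zero-copy receive on a few cheap tests: DDP ULP mode, no kernel buffer already posted, a previous receive larger than the ddp_thres tunable, and enough receive window to post sizable buffers. The same predicate with the tunables as plain parameters (names illustrative):

	static int
	should_ddp(int ulp_mode_ddp, int kbuf_posted, int last_recv_len,
	    long rcv_wnd, int ddp_thres, int copy_limit, int rsvd_win)
	{
		return (ulp_mode_ddp && !kbuf_posted &&
		    last_recv_len > ddp_thres &&
		    rcv_wnd > (long)copy_limit + rsvd_win);
	}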
- */ -static inline int -copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) -{ - struct iovec *to = uio->uio_iov; - int err; - - if (__predict_true(!is_ddp(m))) /* RX_DATA */ - return m_uiomove(m, offset, len, uio); - if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ - to->iov_len -= len; - to->iov_base = ((caddr_t)to->iov_base) + len; - uio->uio_iov = to; - uio->uio_resid -= len; - return (0); - } - err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ - return (err); -} - -static void -cxgb_wait_dma_completion(struct toepcb *toep) -{ - struct rwlock *lock; - - lock = &toep->tp_tp->t_inpcb->inp_lock; - inp_wlock(toep->tp_tp->t_inpcb); - cv_wait_unlock(&toep->tp_cv, lock); -} - -static int -cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) -{ - int i, seg_count, err, type; - struct mbuf *m0; - struct cxgb_dma_info cdi; - struct mbuf_vec *mv; - struct mbuf_iovec *mi; - bus_dma_segment_t *segs; - - err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio, - cxgb_dma_callback, &cdi, 0); - - if (err) - return (err); - seg_count = cdi.cdi_nsegs; - if ((m0 = mcl_alloc(seg_count, &type)) == NULL) { - bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap); - return (ENOMEM); - } - segs = cdi.cdi_segs; - m0->m_type = type; - m0->m_flags = (M_EXT|M_NOFREE); - m0->m_ext.ext_type = EXT_EXTREF; - m0->m_ext.ext_free = cxgb_zero_copy_free; -#if __FreeBSD_version >= 800016 - m0->m_ext.ext_arg1 = NULL; /* XXX: probably wrong /phk */ - m0->m_ext.ext_arg2 = NULL; -#else - m0->m_ext.ext_args = NULL; -#endif - - mv = mtomv(m0); - mv->mv_count = seg_count; - mv->mv_first = 0; - for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++) - mi_collapse_sge(mi, segs); - - *m = m0; - - /* - * This appears to be a no-op at the moment - * as busdma is all or nothing need to make - * sure the tag values are large enough - * - */ - if (cdi.cdi_mapped < uio->uio_resid) { - uio->uio_resid -= cdi.cdi_mapped; - } else - uio->uio_resid = 0; - - return (0); -} - -static int -t3_sosend(struct socket *so, struct uio *uio) -{ - int rv, count, hold_resid, sent, iovcnt; - struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - struct mbuf *m; - struct uio uiotmp; - struct sockbuf *snd; - - /* - * Events requiring iteration: - * - number of pages exceeds max hold pages for process or system - * - number of pages exceeds maximum sg entries for a single WR - * - * We're limited to holding 128 pages at once - and we're limited to - * 34 SG entries per work request, but each SG entry can be any number - * of contiguous pages - * - */ - - uiotmp = *uio; - iovcnt = uio->uio_iovcnt; - iov = uio->uio_iov; - sent = 0; - snd = so_sockbuf_snd(so); -sendmore: - /* - * Make sure we don't exceed the socket buffer - */ - count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE); - rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, VM_PROT_READ); - hold_resid = uiotmp.uio_resid; - if (rv) - return (rv); - - /* - * Bump past sent and shave off the unheld amount - */ - if (hold_resid > 0) { - iovtmpp = iovtmp; - memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); - if (sent) - iov_adj(&iovtmpp, &iovcnt, sent); - iov_adj(&iovtmpp, &iovcnt, -hold_resid); - uiotmp.uio_iov = iovtmpp; - uiotmp.uio_iovcnt = iovcnt; - - } - uiotmp.uio_resid = uio->uio_resid - hold_resid; - - /* - * Push off all held pages - * - */ - while (uiotmp.uio_resid > 0) { - rv = cxgb_vm_page_to_miov(toep, 
&uiotmp, &m); - if (rv) { - vm_page_unhold_pages(toep->tp_pages, count); - return (rv); - } - uio->uio_resid -= m->m_pkthdr.len; - sent += m->m_pkthdr.len; - sbappend(snd, m); - t3_push_frames(so, TRUE); - iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); - } - - /* - * Wait for pending I/O to be DMA'd to the card - * - */ - cxgb_wait_dma_completion(toep); - vm_page_unhold_pages(toep->tp_pages, count); - /* - * If there is more data to send adjust local copy of iov - * to point to teh start - */ - if (hold_resid) { - iovtmpp = iovtmp; - memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); - iov_adj(&iovtmpp, &iovcnt, sent); - uiotmp = *uio; - uiotmp.uio_iov = iovtmpp; - uiotmp.uio_iovcnt = iovcnt; - goto sendmore; - } - - return (0); -} - -static int -cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, - struct mbuf *top, struct mbuf *control, int flags, struct thread *td) -{ - struct tcpcb *tp = so_sototcpcb(so); - struct toedev *tdev; - int zcopy_thres, zcopy_enabled, rv; - - /* - * In order to use DMA direct from userspace the following - * conditions must be met: - * - the connection is currently offloaded - * - ddp is enabled - * - the number of bytes to be transferred exceeds the threshold - * - the number of bytes currently in flight won't exceed the in-flight - * threshold XXX TODO - * - vm_fault_quick_hold_pages succeeds - * - blocking socket XXX for now - * - */ - if (tp && tp->t_flags & TF_TOE) { - struct toepcb *toep = tp->t_toe; - - tdev = toep->tp_toedev; - zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); - zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); - - if (uio && (uio->uio_resid > zcopy_thres) && - (uio->uio_iovcnt < TMP_IOV_MAX) && ((so_state_get(so) & SS_NBIO) == 0) - && zcopy_enabled) { - rv = t3_sosend(so, uio); - if (rv != EAGAIN) - return (rv); - } - } - return pru_sosend(so, addr, uio, top, control, flags, td); -} - -/* - * Following replacement or removal of the first mbuf on the first mbuf chain - * of a socket buffer, push necessary state changes back into the socket - * buffer so that other consumers see the values consistently. 'nextrecord' - * is the callers locally stored value of the original value of - * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. - * NOTE: 'nextrecord' may be NULL. - */ -static __inline void -sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) -{ - sockbuf_lock_assert(sb); - /* - * First, update for the new value of nextrecord. If necessary, make - * it the first record. - */ - if (sb->sb_mb != NULL) - sb->sb_mb->m_nextpkt = nextrecord; - else - sb->sb_mb = nextrecord; - - /* - * Now update any dependent socket buffer fields to reflect the new - * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the - * addition of a second clause that takes care of the case where - * sb_mb has been updated, but remains the last record. 
- */ - if (sb->sb_mb == NULL) { - sb->sb_mbtail = NULL; - sb->sb_lastrecord = NULL; - } else if (sb->sb_mb->m_nextpkt == NULL) - sb->sb_lastrecord = sb->sb_mb; -} - -#define IS_NONBLOCKING(so) (so_state_get(so) & SS_NBIO) - -static int -t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) -{ - struct tcpcb *tp = so_sototcpcb(so); - struct toepcb *toep = tp->t_toe; - struct mbuf *m; - uint32_t offset; - int err, flags, avail, len, copied, copied_unacked; - int target; /* Read at least this many bytes */ - int user_ddp_ok; - struct ddp_state *p; - struct inpcb *inp = so_sotoinpcb(so); - int socket_state, socket_error; - struct sockbuf *rcv; - - avail = offset = copied = copied_unacked = 0; - flags = flagsp ? (*flagsp &~ MSG_EOR) : 0; - rcv = so_sockbuf_rcv(so); - - err = sblock(rcv, SBLOCKWAIT(flags)); - p = &toep->tp_ddp_state; - - if (err) - return (err); - - rcv = so_sockbuf_rcv(so); - sockbuf_lock(rcv); - if ((tp->t_flags & TF_TOE) == 0) { - sockbuf_unlock(rcv); - err = EAGAIN; - goto done_unlocked; - } - - p->user_ddp_pending = 0; -restart: - if ((tp->t_flags & TF_TOE) == 0) { - sockbuf_unlock(rcv); - err = EAGAIN; - goto done_unlocked; - } - - len = uio->uio_resid; - m = rcv->sb_mb; - target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat; - user_ddp_ok = p->ubuf_ddp_ready; - p->cancel_ubuf = 0; - - if (len == 0) - goto done; - if (m) - goto got_mbuf; - - /* empty receive queue */ - if (copied >= target && (rcv->sb_mb == NULL) && - !p->user_ddp_pending) - goto done; - - socket_state = so_state_get(so); - socket_error = so_error_get(so); - rcv = so_sockbuf_rcv(so); - - if (copied) { - if (socket_error || tp->t_state == TCPS_CLOSED || - (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) - goto done; - } else { - if (socket_state & SS_NOFDREF) - goto done; - if (socket_error) { - err = socket_error; - socket_error = 0; - goto done; - } - if (rcv->sb_state & SBS_CANTRCVMORE) - goto done; - if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) - goto done; - if (tp->t_state == TCPS_CLOSED) { - err = ENOTCONN; - goto done; - } - } - if (rcv->sb_mb && !p->user_ddp_pending) { - sockbuf_unlock(rcv); - inp_wlock(inp); - t3_cleanup_rbuf(tp, copied_unacked); - inp_wunlock(inp); - sockbuf_lock(rcv); - copied_unacked = 0; - goto restart; - } - if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && - uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && - p->ubuf_ddp_ready) { - p->user_ddp_pending = - !t3_overlay_ubuf(toep, rcv, uio, - IS_NONBLOCKING(so), flags, 1, 1); - if (p->user_ddp_pending) { - p->kbuf_posted++; - user_ddp_ok = 0; - } - } - if (p->kbuf[0] && (p->kbuf_posted == 0)) { - t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); - p->kbuf_posted++; - } - if (p->user_ddp_pending) { - /* One shot at DDP if we already have enough data */ - if (copied >= target) - user_ddp_ok = 0; - - if (rcv->sb_state & SBS_CANTRCVMORE) - goto done; - CTR0(KTR_TOM, "ddp pending -- waiting"); - if ((err = sbwait(rcv)) != 0) - goto done; -//for timers to work await_ddp_completion(sk, flags, &timeo); - } else if (copied >= target) - goto done; - else { - if (copied_unacked) { - int i = 0; - - sockbuf_unlock(rcv); - inp_wlock(inp); - t3_cleanup_rbuf(tp, copied_unacked); - inp_wunlock(inp); - copied_unacked = 0; - if (mp_ncpus > 1) - while (i++ < 200 && rcv->sb_mb == NULL) - cpu_spinwait(); - sockbuf_lock(rcv); - } - if (rcv->sb_mb) - goto restart; - - if (rcv->sb_state & SBS_CANTRCVMORE) - goto done; - - CTR0(KTR_TOM, "no buffers -- waiting"); - - if ((err = sbwait(rcv)) != 0) - goto done; - } - goto 
restart; -got_mbuf: - /* - * Adjust the mbuf seqno if it has already been partially processed by - * soreceive_generic - */ - if (m->m_pkthdr.len != m->m_len) { - m->m_seq += m->m_pkthdr.len - m->m_len; - m->m_pkthdr.len = m->m_len; - } - - CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u " - "m_seq=0x%08x c_seq=0x%08x c_unack=%u", - (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len, - m->m_seq, toep->tp_copied_seq, copied_unacked); - KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), - ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), - m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); - KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p" - " m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); - if (m->m_pkthdr.len == 0) { - if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) - panic("empty mbuf and NOCOPY not set\n"); - CTR0(KTR_TOM, "ddp done notification"); - p->user_ddp_pending = 0; - sbdroprecord_locked(rcv); - goto done; - } - - KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0, - ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x", - offset, toep->tp_copied_seq, copied_unacked, m->m_seq)); - offset = toep->tp_copied_seq + copied_unacked - m->m_seq; - - if (offset >= m->m_pkthdr.len) - panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x " - "seq 0x%x pktlen %d ddp flags 0x%x", offset, - toep->tp_copied_seq + copied_unacked, m->m_seq, - m->m_pkthdr.len, m->m_ddp_flags); - - avail = m->m_pkthdr.len - offset; - if (len < avail) { - if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) - panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); - avail = len; - rcv->sb_flags |= SB_IN_TOE; - } else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0) - rcv->sb_flags &= ~SB_IN_TOE; - -#ifdef URGENT_DATA_SUPPORTED - /* - * Check if the data we are preparing to copy contains urgent - * data. Either stop short of urgent data or skip it if it's - * first and we are not delivering urgent data inline. - */ - if (__predict_false(toep->tp_urg_data)) { - uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; - - if (urg_offset < avail) { - if (urg_offset) { - /* stop short of the urgent data */ - avail = urg_offset; - } else if ((so_options_get(so) & SO_OOBINLINE) == 0) { - /* First byte is urgent, skip */ - toep->tp_copied_seq++; - offset++; - avail--; - if (!avail) - goto skip_copy; - } - } - } -#endif - if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) { - user_ddp_ok = 0; -#ifdef T3_TRACE - T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); -#endif - } - - if (user_ddp_ok && !p->user_ddp_pending && - uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && - p->ubuf_ddp_ready) { - p->user_ddp_pending = - !t3_overlay_ubuf(toep, rcv, uio, - IS_NONBLOCKING(so), flags, 1, 1); - if (p->user_ddp_pending) { - p->kbuf_posted++; - user_ddp_ok = 0; - } - DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); - } else - DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", - user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, - p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); - - /* - * If MSG_TRUNC is specified the data is discarded. 
- * XXX need to check pr_atomic - */ - KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); - if (__predict_true(!(flags & MSG_TRUNC))) { - int resid = uio->uio_resid; - - sockbuf_unlock(rcv); - if ((err = copy_data(m, offset, avail, uio))) { - if (err) - err = EFAULT; - goto done_unlocked; - } - - sockbuf_lock(rcv); - if (avail != (resid - uio->uio_resid)) - printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", - avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); - - if ((tp->t_flags & TF_TOE) == 0) { - sockbuf_unlock(rcv); - err = EAGAIN; - goto done_unlocked; - } - } - - copied += avail; - copied_unacked += avail; - len -= avail; - -#ifdef URGENT_DATA_SUPPORTED -skip_copy: - if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) - tp->urg_data = 0; -#endif - /* - * If the buffer is fully consumed free it. If it's a DDP - * buffer also handle any events it indicates. - */ - if (avail + offset >= m->m_pkthdr.len) { - unsigned int fl = m->m_ddp_flags; - int exitnow, got_psh = 0, nomoredata = 0; - int count; - struct mbuf *nextrecord; - - if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { - if (is_ddp_psh(m) && p->user_ddp_pending) - got_psh = 1; - - if (fl & DDP_BF_NOCOPY) - p->user_ddp_pending = 0; - else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { - p->kbuf_posted--; - nomoredata = 1; - } else { - p->kbuf_posted--; - p->ubuf_ddp_ready = 1; - } - } - - nextrecord = m->m_nextpkt; - count = m->m_pkthdr.len; - while (count > 0) { - count -= m->m_len; - KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); - CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len); - sbfree(rcv, m); - rcv->sb_mb = m_free(m); - m = rcv->sb_mb; - } - sockbuf_pushsync(rcv, nextrecord); -#if 0 - sbdrop_locked(rcv, m->m_pkthdr.len); -#endif - exitnow = got_psh || nomoredata; - if (copied >= target && (rcv->sb_mb == NULL) && exitnow) - goto done; - if (copied_unacked > (rcv->sb_hiwat >> 2)) { - sockbuf_unlock(rcv); - inp_wlock(inp); - t3_cleanup_rbuf(tp, copied_unacked); - inp_wunlock(inp); - copied_unacked = 0; - sockbuf_lock(rcv); - } - } - if (len > 0) - goto restart; - - done: - if ((tp->t_flags & TF_TOE) == 0) { - sockbuf_unlock(rcv); - err = EAGAIN; - goto done_unlocked; - } - /* - * If we can still receive decide what to do in preparation for the - * next receive. Note that RCV_SHUTDOWN is set if the connection - * transitioned to CLOSE but not if it was in that state to begin with. 
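Note the lazy credit return in the receive path being removed: bytes consumed accumulate in copied_unacked and are only flushed back to the card (t3_cleanup_rbuf) once they exceed a quarter of the socket buffer, amortizing the cost of RX_DATA_ACK messages. Reduced to its decision (illustrative helper, not driver code):

	static unsigned int
	maybe_flush_credits(unsigned int copied_unacked, unsigned int sb_hiwat)
	{
		if (copied_unacked > (sb_hiwat >> 2)) {
			/* t3_cleanup_rbuf(tp, copied_unacked) goes here */
			return (0);		/* all credits returned */
		}
		return (copied_unacked);	/* keep accumulating */
	}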
- */ - if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { - if (p->user_ddp_pending) { - user_ddp_ok = 0; - t3_cancel_ubuf(toep, rcv); - if (rcv->sb_mb) { - if (copied < 0) - copied = 0; - if (len > 0) - goto restart; - } - p->user_ddp_pending = 0; - } - if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { -#ifdef T3_TRACE - T3_TRACE0(TIDTB(so), - "chelsio_recvmsg: about to exit, repost kbuf"); -#endif - - t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); - p->kbuf_posted++; - } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { - CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); - if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev, - ddp_copy_limit), 0, IS_NONBLOCKING(so))) { - rcv->sb_flags |= SB_IN_TOE; - p->kbuf_posted = 1; - } - - } - } -#ifdef T3_TRACE - T3_TRACE5(TIDTB(so), - "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " - "kbuf_posted %d user_ddp_pending %u", - copied, len, buffers_freed, p ? p->kbuf_posted : -1, - p->user_ddp_pending); -#endif - sockbuf_unlock(rcv); -done_unlocked: - if (copied_unacked && (tp->t_flags & TF_TOE)) { - inp_wlock(inp); - t3_cleanup_rbuf(tp, copied_unacked); - inp_wunlock(inp); - } - sbunlock(rcv); - - return (err); -} - -static int -cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, - struct mbuf **mp0, struct mbuf **controlp, int *flagsp) -{ - struct toedev *tdev; - int rv, zcopy_thres, zcopy_enabled, flags; - struct tcpcb *tp = so_sototcpcb(so); - struct sockbuf *rcv = so_sockbuf_rcv(so); - - flags = flagsp ? *flagsp &~ MSG_EOR : 0; - - /* - * In order to use DMA direct from userspace the following - * conditions must be met: - * - the connection is currently offloaded - * - ddp is enabled - * - the number of bytes to be transferred exceeds the threshold - * - the number of bytes currently in flight won't exceed the in-flight - * threshold XXX TODO - * - vm_fault_quick_hold_pages succeeds - * - blocking socket XXX for now - * - iovcnt is 1 - * - */ - if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) - && (uio->uio_iovcnt == 1) && (mp0 == NULL) && - ((rcv->sb_flags & SB_IN_TOE) || (uio->uio_iovcnt == 1))) { - struct toepcb *toep = tp->t_toe; - - tdev = toep->tp_toedev; - zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); - zcopy_enabled = TOM_TUNABLE(tdev, ddp); - if ((rcv->sb_flags & SB_IN_TOE) ||((uio->uio_resid > zcopy_thres) && - (uio->uio_iovcnt == 1) && zcopy_enabled)) { - CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d", - rcv->sb_flags, tp->t_flags, flags, uio->uio_resid); - rv = t3_soreceive(so, flagsp, uio); - if (rv != EAGAIN) - return (rv); - else - printf("returned EAGAIN\n"); - } - } else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) { - struct sockbuf *rcv = so_sockbuf_rcv(so); - - log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", - flags, uio->uio_iovcnt, rcv->sb_state); - } - - return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); -} - -struct protosw cxgb_protosw; -struct pr_usrreqs cxgb_tcp_usrreqs; - -void -t3_install_socket_ops(struct socket *so) -{ - static int copied = 0; - struct pr_usrreqs *pru; - struct protosw *psw; - - if (copied == 0) { - psw = so_protosw_get(so); - pru = psw->pr_usrreqs; - - bcopy(psw, &cxgb_protosw, sizeof(*psw)); - bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru)); - - cxgb_protosw.pr_ctloutput = t3_ctloutput; - cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs; - cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend; - 
cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive; - } - so_protosw_set(so, &cxgb_protosw); - -#if 0 - so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; - so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; -#endif -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c deleted file mode 100644 index fe3b075..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c +++ /dev/null @@ -1,738 +0,0 @@ -/************************************************************************** - -Copyright (c) 2007-2008, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include -#include -#include - - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - - -#define MAX_SCHEDULE_TIMEOUT 300 - -/* - * Return the # of page pods needed to accommodate a # of pages. - */ -static inline unsigned int -pages2ppods(unsigned int pages) -{ - return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; -} - -/** - * t3_pin_pages - pin a user memory range and prepare it for DDP - * @addr - the starting address - * @len - the length of the range - * @newgl - contains the pages and physical addresses of the pinned range - * @gl - an existing gather list, may be %NULL - * - * Pins the pages in the user-space memory range [addr, addr + len) and - * maps them for DMA. Returns a gather list with the pinned pages and - * their physical addresses. If @gl is non NULL the pages it describes - * are compared against the pages for [addr, addr + len), and if the - * existing gather list already covers the range a new list is not - * allocated. Returns 0 on success, or a negative errno. On success if - * a new gather list was allocated it is returned in @newgl. 
- */ -static int -t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t dmamap, vm_offset_t addr, - size_t len, struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl) -{ - int i = 0, err; - size_t pg_off; - unsigned int npages; - struct ddp_gather_list *p; - vm_map_t map; - - pg_off = addr & PAGE_MASK; - npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), - M_DEVBUF, M_NOWAIT|M_ZERO); - if (p == NULL) - return (ENOMEM); - - map = &curthread->td_proc->p_vmspace->vm_map; - if (vm_fault_quick_hold_pages(map, addr, len, VM_PROT_READ | - VM_PROT_WRITE, p->dgl_pages, npages) < 0) { - err = EFAULT; - goto free_gl; - } - - if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && - gl->dgl_length >= len) { - for (i = 0; i < npages; i++) - if (p->dgl_pages[i] != gl->dgl_pages[i]) - goto different_gl; - err = 0; - goto unpin; - } - -different_gl: - p->dgl_length = len; - p->dgl_offset = pg_off; - p->dgl_nelem = npages; -#ifdef NEED_BUSDMA - p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, - PAGE_SIZE - pg_off, - PCI_DMA_FROMDEVICE) - pg_off; - for (i = 1; i < npages; ++i) - p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, - PCI_DMA_FROMDEVICE); -#endif - *newgl = p; - return (0); -unpin: - vm_page_unhold_pages(p->dgl_pages, npages); - -free_gl: - - free(p, M_DEVBUF); - *newgl = NULL; - return (err); -} - -static void -unmap_ddp_gl(const struct ddp_gather_list *gl) -{ -#ifdef NEED_BUSDMA - int i; - - if (!gl->nelem) - return; - - pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, - PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); - for (i = 1; i < gl->nelem; ++i) - pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, - PCI_DMA_FROMDEVICE); - -#endif -} - -static void -ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) -{ - /* - * XXX mark pages as dirty before unholding - */ - vm_page_unhold_pages(gl->dgl_pages, gl->dgl_nelem); -} - -void -t3_free_ddp_gl(struct ddp_gather_list *gl) -{ - unmap_ddp_gl(gl); - ddp_gl_free_pages(gl, 0); - free(gl, M_DEVBUF); -} - -/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ -#define MAX_PPODS 64U - -/* - * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in - * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we - * try to allocate enough page pods to accommodate the whole buffer, subject to - * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page - * pods before failing entirely. - */ -static int -alloc_buf1_ppods(struct toepcb *toep, struct ddp_state *p, - unsigned long addr, unsigned int len) -{ - int err, tag, npages, nppods; - struct tom_data *d = TOM_DATA(toep->tp_toedev); - -#if 0 - SOCKBUF_LOCK_ASSERT(&so->so_rcv); -#endif - npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - nppods = min(pages2ppods(npages), MAX_PPODS); - nppods = roundup2(nppods, PPOD_CLUSTER_SIZE); - err = t3_alloc_ppods(d, nppods, &tag); - if (err && nppods > PPOD_CLUSTER_SIZE) { - nppods = PPOD_CLUSTER_SIZE; - err = t3_alloc_ppods(d, nppods, &tag); - } - if (err) - return (ENOMEM); - - p->ubuf_nppods = nppods; - p->ubuf_tag = tag; -#if NUM_DDP_KBUF == 1 - t3_set_ddp_tag(toep, 1, tag << 6); -#endif - return (0); -} - -/* - * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush - * won't block indefinitely if there's nothing to place (which should be rare). 
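The page accounting in t3_pin_pages and pages2ppods above is easy to get wrong at the edges: a range that starts mid-page occupies one extra page. A standalone model of the arithmetic follows; the formula matches the deleted code, but the values of PPOD_PAGES and the sentinel count are assumptions here, as the real constants live in the deleted DDP headers.

    /*
     * How many pages back an arbitrary (addr, len) range, and how many
     * page pods (ppods) those pages need.  PPOD_PAGES and
     * NUM_SENTINEL_PPODS values are illustrative.
     */
    #include <stdio.h>

    #define PAGE_SIZE          4096u
    #define PAGE_MASK          (PAGE_SIZE - 1)
    #define PPOD_PAGES         4u   /* pages covered by one page pod */
    #define NUM_SENTINEL_PPODS 1u   /* trailing sentinel pod */

    static unsigned int
    range_to_pages(unsigned long addr, unsigned long len)
    {
        unsigned long pg_off = addr & PAGE_MASK;

        /* partial first page + body + partial last page, rounded up */
        return (pg_off + len + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    static unsigned int
    pages_to_ppods(unsigned int pages)
    {
        return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
    }

    int
    main(void)
    {
        /* 10000 bytes starting 100 bytes into a page: 3 pages, 2 ppods */
        unsigned int pages = range_to_pages(0x1064, 10000);

        printf("pages=%u ppods=%u\n", pages, pages_to_ppods(pages));
        return 0;
    }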
- */ -#define UBUF_OFFSET 1 - -static __inline unsigned long -select_ddp_flags(const struct toepcb *toep, int buf_idx, - int nonblock, int rcv_flags) -{ - if (buf_idx == 1) { - if (__predict_false(rcv_flags & MSG_WAITALL)) - return V_TF_DDP_PSH_NO_INVALIDATE0(1) | - V_TF_DDP_PSH_NO_INVALIDATE1(1) | - V_TF_DDP_PUSH_DISABLE_1(1); - if (nonblock) - return V_TF_DDP_BUF1_FLUSH(1); - - return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(toep->tp_toedev, - ddp_push_wait)); - } - - if (__predict_false(rcv_flags & MSG_WAITALL)) - return V_TF_DDP_PSH_NO_INVALIDATE0(1) | - V_TF_DDP_PSH_NO_INVALIDATE1(1) | - V_TF_DDP_PUSH_DISABLE_0(1); - if (nonblock) - return V_TF_DDP_BUF0_FLUSH(1); - - return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(toep->tp_toedev, ddp_push_wait)); -} - -/* - * Reposts the kernel DDP buffer after it has been previously become full and - * invalidated. We just need to reset the offset and adjust the DDP flags. - * Conveniently, we can set the flags and the offset with a single message. - * Note that this function does not set the buffer length. Again conveniently - * our kernel buffer is of fixed size. If the length needs to be changed it - * needs to be done separately. - */ -static void -t3_repost_kbuf(struct toepcb *toep, unsigned int bufidx, int modulate, - int activate, int nonblock) -{ - struct ddp_state *p = &toep->tp_ddp_state; - unsigned long flags; - -#if 0 - SOCKBUF_LOCK_ASSERT(&so->so_rcv); -#endif - p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; - p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; - p->buf_state[bufidx].gl = p->kbuf[bufidx]; - p->cur_buf = bufidx; - p->kbuf_idx = bufidx; - - flags = select_ddp_flags(toep, bufidx, nonblock, 0); - if (!bufidx) - t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | - V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | - V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | - V_TF_DDP_BUF0_VALID(1), - V_TF_DDP_BUF0_FLUSH(1) | - V_TF_DDP_PSH_NO_INVALIDATE0(1) | - V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | - V_TF_DDP_BUF0_VALID(1) | - V_TF_DDP_ACTIVE_BUF(activate), modulate); - else - t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | - V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | - V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | - V_TF_DDP_BUF1_VALID(1) | - V_TF_DDP_ACTIVE_BUF(activate), - V_TF_DDP_BUF1_FLUSH(1) | - V_TF_DDP_PSH_NO_INVALIDATE0(1) | - V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), - modulate); - -} - -/** - * setup_uio_ppods - setup HW page pods for a user iovec - * @sk: the associated socket - * @uio: the uio - * @oft: additional bytes to map before the start of the buffer - * - * Pins a user iovec and sets up HW page pods for DDP into it. We allocate - * page pods for user buffers on the first call per socket. Afterwards we - * limit the buffer length to whatever the existing page pods can accommodate. - * Returns a negative error code or the length of the mapped buffer. - * - * The current implementation handles iovecs with only one entry. 
- */ -static int -setup_uio_ppods(struct toepcb *toep, const struct uio *uio, int oft, int *length) -{ - int err; - unsigned int len; - struct ddp_gather_list *gl = NULL; - struct ddp_state *p = &toep->tp_ddp_state; - struct iovec *iov = uio->uio_iov; - vm_offset_t addr = (vm_offset_t)iov->iov_base - oft; - -#ifdef notyet - SOCKBUF_LOCK_ASSERT(&so->so_rcv); -#endif - if (__predict_false(p->ubuf_nppods == 0)) { - err = alloc_buf1_ppods(toep, p, addr, iov->iov_len + oft); - if (err) - return (err); - } - - len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; - len -= addr & PAGE_MASK; - if (len > M_TCB_RX_DDP_BUF0_LEN) - len = M_TCB_RX_DDP_BUF0_LEN; - len = min(len, toep->tp_tp->rcv_wnd - 32768); - len = min(len, iov->iov_len + oft); - - if (len <= p->kbuf[0]->dgl_length) { - printf("length too short\n"); - return (EINVAL); - } - - err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); - if (err) - return (err); - if (gl) { - if (p->ubuf) - t3_free_ddp_gl(p->ubuf); - p->ubuf = gl; - t3_setup_ppods(toep, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, - gl->dgl_offset, 0); - } - *length = len; - return (0); -} - -/* - * - */ -void -t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv) -{ - struct ddp_state *p = &toep->tp_ddp_state; - int ubuf_pending = t3_ddp_ubuf_pending(toep); - int err = 0, count = 0; - - if (p->ubuf == NULL) - return; - - sockbuf_lock_assert(rcv); - - p->cancel_ubuf = 1; - while (ubuf_pending && !(rcv->sb_state & SBS_CANTRCVMORE)) { - CTR3(KTR_TOM, - "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", - p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), - p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), - p->get_tcb_count); - if (p->get_tcb_count == 0) - t3_cancel_ddpbuf(toep, p->cur_buf); - else - CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p SBS_CANTRCVMORE=%d", - err, p->get_tcb_count, rcv->sb_timeo, rcv, - !!(rcv->sb_state & SBS_CANTRCVMORE)); - - while (p->get_tcb_count && !(rcv->sb_state & SBS_CANTRCVMORE)) { - if (count & 0xfffffff) - CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p count=%d", - err, p->get_tcb_count, rcv->sb_timeo, rcv, count); - count++; - err = sbwait(rcv); - } - ubuf_pending = t3_ddp_ubuf_pending(toep); - } - p->cancel_ubuf = 0; - p->user_ddp_pending = 0; - -} - -#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \ - V_TF_DDP_PSH_NO_INVALIDATE1(1) | \ - V_TF_DDP_BUF1_FLUSH(1) | \ - V_TF_DDP_BUF0_FLUSH(1) | \ - V_TF_DDP_PUSH_DISABLE_1(1) | \ - V_TF_DDP_PUSH_DISABLE_0(1) | \ - V_TF_DDP_INDICATE_OUT(1)) - -/* - * Post a user buffer as an overlay on top of the current kernel buffer. - */ -int -t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, - const struct uio *uio, int nonblock, int rcv_flags, - int modulate, int post_kbuf) -{ - int err, len, ubuf_idx; - unsigned long flags; - struct ddp_state *p = &toep->tp_ddp_state; - - if (p->kbuf[0] == NULL) { - return (EINVAL); - } - sockbuf_unlock(rcv); - err = setup_uio_ppods(toep, uio, 0, &len); - sockbuf_lock(rcv); - if (err) - return (err); - - if ((rcv->sb_state & SBS_CANTRCVMORE) || - (toep->tp_tp->t_flags & TF_TOE) == 0) - return (EINVAL); - - ubuf_idx = p->kbuf_idx; - p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; - /* Use existing offset */ - /* Don't need to update .gl, user buffer isn't copied. 
*/ - p->cur_buf = ubuf_idx; - - flags = select_ddp_flags(toep, ubuf_idx, nonblock, rcv_flags); - - if (post_kbuf) { - struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; - - dbs->cur_offset = 0; - dbs->flags = 0; - dbs->gl = p->kbuf[ubuf_idx ^ 1]; - p->kbuf_idx ^= 1; - flags |= p->kbuf_idx ? - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : - V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); - } - - if (ubuf_idx == 0) { - t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, - len); - t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, - flags, - OVERLAY_MASK | flags, 1); - } else { - t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, - len); - t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, - flags, - OVERLAY_MASK | flags, 1); - } -#ifdef T3_TRACE - T3_TRACE5(TIDTB(so), - "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " - " kbuf_idx %d", - p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); -#endif - CTR3(KTR_TOM, - "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x", - p->ubuf_tag, flags, OVERLAY_MASK); - CTR3(KTR_TOM, - "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d", - ubuf_idx, p->kbuf_idx, post_kbuf); - - return (0); -} - -/* - * Clean up DDP state that needs to survive until socket close time, such as the - * DDP buffers. The buffers are already unmapped at this point as unmapping - * needs the PCI device and a socket may close long after the device is removed. - */ -void -t3_cleanup_ddp(struct toepcb *toep) -{ - struct ddp_state *p = &toep->tp_ddp_state; - int idx; - - for (idx = 0; idx < NUM_DDP_KBUF; idx++) - if (p->kbuf[idx]) { - ddp_gl_free_pages(p->kbuf[idx], 0); - free(p->kbuf[idx], M_DEVBUF); - } - if (p->ubuf) { - ddp_gl_free_pages(p->ubuf, 0); - free(p->ubuf, M_DEVBUF); - p->ubuf = NULL; - } - toep->tp_ulp_mode = 0; -} - -/* - * This is a companion to t3_cleanup_ddp() and releases the HW resources - * associated with a connection's DDP state, such as the page pods. - * It's called when HW is done with a connection. The rest of the state - * remains available until both HW and the app are done with the connection. - */ -void -t3_release_ddp_resources(struct toepcb *toep) -{ - struct ddp_state *p = &toep->tp_ddp_state; - struct tom_data *d = TOM_DATA(toep->tp_toedev); - int idx; - - for (idx = 0; idx < NUM_DDP_KBUF; idx++) { - t3_free_ppods(d, p->kbuf_tag[idx], - p->kbuf_nppods[idx]); - unmap_ddp_gl(p->kbuf[idx]); - } - - if (p->ubuf_nppods) { - t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); - p->ubuf_nppods = 0; - } - if (p->ubuf) - unmap_ddp_gl(p->ubuf); - -} - -void -t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock) -{ - struct ddp_state *p = &toep->tp_ddp_state; - - t3_set_ddp_tag(toep, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); - t3_set_ddp_buf(toep, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); - t3_repost_kbuf(toep, p->cur_buf, modulate, 1, nonblock); -#ifdef T3_TRACE - T3_TRACE1(TIDTB(so), - "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); -#endif - CTR1(KTR_TOM, - "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); -} - -/* - * Prepare a socket for DDP. Must be called when the socket is known to be - * open. 
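t3_overlay_ubuf above juggles the two hardware DDP buffer slots: the user buffer overlays the slot the kernel buffer currently occupies, and the kernel buffer, if reposted, flips to the other slot. A minimal model of that ping-pong indexing, with the hardware programming elided:

    /* Userland sketch of the DDP double-buffer slot flipping. */
    #include <assert.h>

    struct ddp_slots {
        int kbuf_idx;   /* slot the next kernel buffer will use (0 or 1) */
    };

    /* Returns the slot the user buffer takes; repost moves kbuf over. */
    static int
    overlay_user_buffer(struct ddp_slots *s, int post_kbuf)
    {
        int ubuf_idx = s->kbuf_idx;

        if (post_kbuf)
            s->kbuf_idx ^= 1;   /* kernel buffer goes to the other slot */
        return ubuf_idx;
    }

    int
    main(void)
    {
        struct ddp_slots s = { .kbuf_idx = 0 };

        assert(overlay_user_buffer(&s, 1) == 0);  /* ubuf in 0, kbuf -> 1 */
        assert(overlay_user_buffer(&s, 1) == 1);  /* ubuf in 1, kbuf -> 0 */
        assert(overlay_user_buffer(&s, 0) == 0);  /* no repost: idx stays */
        assert(overlay_user_buffer(&s, 0) == 0);
        return 0;
    }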
- */ -int -t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock) -{ - int i, err = ENOMEM; - static vm_pindex_t color; - unsigned int nppods, kbuf_pages, idx = 0; - struct ddp_state *p = &toep->tp_ddp_state; - struct tom_data *d = TOM_DATA(toep->tp_toedev); - - - if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) - return (EINVAL); - -#ifdef notyet - SOCKBUF_LOCK_ASSERT(&so->so_rcv); -#endif - kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - nppods = pages2ppods(kbuf_pages); - - p->kbuf_noinval = !!waitall; - p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; - for (idx = 0; idx < NUM_DDP_KBUF; idx++) { - p->kbuf[idx] = - malloc(sizeof (struct ddp_gather_list) + kbuf_pages * - sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); - if (p->kbuf[idx] == NULL) - goto err; - err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]); - if (err) { - printf("t3_alloc_ppods failed err=%d\n", err); - goto err; - } - - p->kbuf_nppods[idx] = nppods; - p->kbuf[idx]->dgl_length = kbuf_size; - p->kbuf[idx]->dgl_offset = 0; - p->kbuf[idx]->dgl_nelem = kbuf_pages; - - for (i = 0; i < kbuf_pages; ++i) { - p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color, - VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - if (p->kbuf[idx]->dgl_pages[i] == NULL) { - p->kbuf[idx]->dgl_nelem = i; - printf("failed to allocate kbuf pages\n"); - goto err; - } - } -#ifdef NEED_BUSDMA - /* - * XXX we'll need this for VT-d or any platform with an iommu :-/ - * - */ - for (i = 0; i < kbuf_pages; ++i) - p->kbuf[idx]->phys_addr[i] = - pci_map_page(p->pdev, p->kbuf[idx]->pages[i], - 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); -#endif - t3_setup_ppods(toep, p->kbuf[idx], nppods, p->kbuf_tag[idx], - p->kbuf[idx]->dgl_length, 0, 0); - } - cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); - - t3_set_ddp_tag(toep, 0, p->kbuf_tag[0] << 6); - t3_set_ddp_buf(toep, 0, 0, p->kbuf[0]->dgl_length); - t3_repost_kbuf(toep, 0, 0, 1, nonblock); - - t3_set_rcv_coalesce_enable(toep, - TOM_TUNABLE(toep->tp_toedev, ddp_rcvcoalesce)); - t3_set_dack_mss(toep, TOM_TUNABLE(toep->tp_toedev, delack)>>1); - -#ifdef T3_TRACE - T3_TRACE4(TIDTB(so), - "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", - kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); -#endif - CTR4(KTR_TOM, - "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", - kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); - cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); - return (0); - -err: - t3_release_ddp_resources(toep); - t3_cleanup_ddp(toep); - return (err); -} - -int -t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len) -{ - int resid_init, err; - struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl; - - resid_init = uio->uio_resid; - - if (!gl->dgl_pages) - panic("pages not set\n"); - - CTR4(KTR_TOM, "t3_ddp_copy: offset=%d dgl_offset=%d cur_offset=%d len=%d", - offset, gl->dgl_offset, m->m_cur_offset, len); - offset += gl->dgl_offset + m->m_cur_offset; - KASSERT(len <= gl->dgl_length, - ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length)); - - - err = uiomove_fromphys(gl->dgl_pages, offset, len, uio); - return (err); -} - - -/* - * Allocate n page pods. Returns -1 on failure or the page pod tag. 
- */ -int -t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag) -{ - unsigned int i, j; - - if (__predict_false(!td->ppod_map)) { - printf("ppod_map not set\n"); - return (EINVAL); - } - - mtx_lock(&td->ppod_map_lock); - for (i = 0; i < td->nppods; ) { - - for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ - if (td->ppod_map[i + j]) { - i = i + j + 1; - goto next; - } - memset(&td->ppod_map[i], 1, n); /* allocate range */ - mtx_unlock(&td->ppod_map_lock); - CTR2(KTR_TOM, - "t3_alloc_ppods: n=%u tag=%u", n, i); - *ptag = i; - return (0); - next: ; - } - mtx_unlock(&td->ppod_map_lock); - return (0); -} - -void -t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) -{ - /* No need to take ppod_lock here */ - memset(&td->ppod_map[tag], 0, n); -} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h deleted file mode 100644 index 758f024..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_defs.h +++ /dev/null @@ -1,91 +0,0 @@ - -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
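The deleted t3_alloc_ppods above is a first-fit scan over a byte map, one byte per page pod. Note two quirks of the deleted version: its comment ("Returns -1 on failure or the page pod tag") describes an older interface than the code, which returns 0/errno and passes the tag out via *ptag; and when the map is exhausted it falls off the loop and returns 0 without setting *ptag, while the inner probe can also read past the end of the map. The model below bounds the scan and reports failure explicitly; locking is omitted (the driver held ppod_map_lock around the scan).

    /* Standalone first-fit allocator over a byte map. */
    #include <string.h>

    #define NPPODS 128

    static unsigned char ppod_map[NPPODS];  /* non-zero byte = in use */

    /* Returns the starting tag, or -1 if no run of n free pods exists. */
    static int
    alloc_ppods(unsigned int n)
    {
        unsigned int i, j;

        for (i = 0; i + n <= NPPODS;) {
            for (j = 0; j < n; j++) {
                if (ppod_map[i + j]) {
                    i = i + j + 1;      /* restart past the busy pod */
                    goto next;
                }
            }
            memset(&ppod_map[i], 1, n); /* claim the whole run */
            return (int)i;
    next:   ;
        }
        return -1;
    }

    static void
    free_ppods(unsigned int tag, unsigned int n)
    {
        memset(&ppod_map[tag], 0, n);
    }

    int
    main(void)
    {
        int t1 = alloc_ppods(8), t2 = alloc_ppods(8);

        free_ppods(t1, 8);
        return (t1 == 0 && t2 == 8) ? 0 : 1;
    }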
- - -$FreeBSD$ - -***************************************************************************/ -#ifndef CXGB_DEFS_H_ -#define CXGB_DEFS_H_ - -#define VALIDATE_TID 0 - -#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe)) -#define TOE_DEV(so) (TOEPCB((so))->tp_toedev) -#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) -#define sototoep(so) (sototcpcb((so))->t_toe) - -#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__) -#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__) - -#define KTR_TOM KTR_SPARE2 -#define KTR_TCB KTR_SPARE3 - -struct toepcb; -struct listen_ctx; - -void cxgb_log_tcb(struct adapter *sc, unsigned int tid); -typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); - -void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h); -void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev); -void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev); -int t3_push_frames(struct socket *so, int req_completion); -int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt, - struct sockaddr *nam); -void t3_init_listen_cpl_handlers(void); -int t3_init_cpl_io(void); -void t3_init_wr_tab(unsigned int wr_len); -uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); -void t3_send_rx_modulate(struct toepcb *toep); -void t3_cleanup_rbuf(struct tcpcb *tp, int copied); - -void t3_init_socket_ops(void); -void t3_install_socket_ops(struct socket *so); - - -void t3_disconnect_acceptq(struct socket *listen_so); -void t3_reset_synq(struct listen_ctx *ctx); -void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler); - -struct toepcb *toepcb_alloc(void); -void toepcb_hold(struct toepcb *); -void toepcb_release(struct toepcb *); -void toepcb_init(struct toepcb *); - -void t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off); -void t3_set_dack_mss(struct toepcb *toep, int on); -void t3_set_keepalive(struct toepcb *toep, int on_off); -void t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag); -void t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, - unsigned int len); -int t3_get_tcb(struct toepcb *toep); - -int t3_ctloutput(struct socket *so, struct sockopt *sopt); - -#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.c b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c index 2484923..4352f33 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_l2t.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c @@ -1,76 +1,61 @@ -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD #include #include #include #include #include -#include -#include -#if __FreeBSD_version > 700000 -#include -#endif - #include #include #include #include -#include -#include #include -#include +#include -#include -#include +#include "cxgb_include.h" +#include "ulp/tom/cxgb_tom.h" +#include "ulp/tom/cxgb_l2t.h" -#define VLAN_NONE 0xfff -#define SDL(s) ((struct sockaddr_dl *)s) -#define RT_ENADDR(sa) ((u_char *)LLADDR(SDL((sa)))) -#define rt_expire rt_rmx.rmx_expire - -struct llinfo_arp { - struct callout la_timer; - struct rtentry *la_rt; - struct mbuf *la_hold; /* last packet until resolved/timeout */ - u_short la_preempt; /* countdown for pre-expiry arps */ - u_short la_asked; /* # requests sent */ -}; +#define VLAN_NONE 0xfff +#define SA(x) ((struct sockaddr *)(x)) +#define SIN(x) ((struct sockaddr_in *)(x)) +#define SINADDR(x) (SIN(x)->sin_addr.s_addr) /* * Module locking notes: There is a RW lock protecting the L2 table as a - * whole plus a spinlock per L2T entry. Entry lookups and allocations happen + * whole plus a mutex per L2T entry. Entry lookups and allocations happen * under the protection of the table lock, individual entry changes happen - * while holding that entry's spinlock. The table lock nests outside the + * while holding that entry's mutex. The table lock nests outside the * entry locks. Allocations of new entries take the table lock as writers so * no other lookups can happen while allocating new entries. 
Entry updates * take the table lock as readers so multiple entries can be updated in @@ -78,72 +63,60 @@ struct llinfo_arp { * and therefore can happen in parallel with entry allocation but no entry * can change state or increment its ref count during allocation as both of * these perform lookups. + * + * When acquiring multiple locks, the order is llentry -> L2 table -> L2 entry. */ static inline unsigned int -vlan_prio(const struct l2t_entry *e) -{ - return e->vlan >> 13; -} - -static inline unsigned int arp_hash(u32 key, int ifindex, const struct l2t_data *d) { return jhash_2words(key, ifindex, 0) & (d->nentries - 1); } -static inline void -neigh_replace(struct l2t_entry *e, struct llentry *neigh) -{ - LLE_WLOCK(neigh); - LLE_ADDREF(neigh); - LLE_WUNLOCK(neigh); - - if (e->neigh) - LLE_FREE(e->neigh); - e->neigh = neigh; -} - /* - * Set up an L2T entry and send any packets waiting in the arp queue. The - * supplied mbuf is used for the CPL_L2T_WRITE_REQ. Must be called with the - * entry locked. + * Set up an L2T entry and send any packets waiting in the arp queue. Must be + * called with the entry locked. */ static int -setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m, - struct l2t_entry *e) +setup_l2e_send_pending(struct adapter *sc, struct l2t_entry *e) { + struct mbuf *m; struct cpl_l2t_write_req *req; + struct port_info *pi = &sc->port[e->smt_idx]; /* smt_idx is port_id */ + + mtx_assert(&e->lock, MA_OWNED); - if (!m) { - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return (ENOMEM); + m = M_GETHDR_OFLD(pi->first_qset, CPL_PRIORITY_CONTROL, req); + if (m == NULL) { + log(LOG_ERR, "%s: no mbuf, can't setup L2 entry at index %d\n", + __func__, e->idx); + return (ENOMEM); } - /* - * XXX MH_ALIGN - */ - req = mtod(m, struct cpl_l2t_write_req *); - m->m_pkthdr.len = m->m_len = sizeof(*req); - - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + + req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | - V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) | - V_L2T_W_PRIO(vlan_prio(e))); - + V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) | + V_L2T_W_PRIO(EVL_PRIOFTAG(e->vlan))); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - m_set_priority(m, CPL_PRIORITY_CONTROL); - cxgb_ofld_send(dev, m); + + t3_offload_tx(sc, m); + + /* + * XXX: We used pi->first_qset to send the L2T_WRITE_REQ. If any mbuf + * on the arpq is going out via another queue set associated with the + * port then it has a bad race with the L2T_WRITE_REQ. Ideally we + * should wait till the reply to the write before draining the arpq. 
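The arp queue discussed in the comment above (and manipulated by arpq_enqueue and the drain loop just below) is a plain singly linked list with head and tail pointers: packets are appended while the entry resolves and drained in FIFO order once the L2T write has gone out. A self-contained model, with a bare struct pkt standing in for struct mbuf and tx() for t3_offload_tx():

    #include <stddef.h>
    #include <stdio.h>

    struct pkt {
        int         id;
        struct pkt *next;
    };

    struct arpq {
        struct pkt *head;
        struct pkt *tail;
    };

    static void
    arpq_enqueue(struct arpq *q, struct pkt *p)
    {
        p->next = NULL;
        if (q->head != NULL)
            q->tail->next = p;
        else
            q->head = p;
        q->tail = p;
    }

    /* Drain in arrival order; tx models t3_offload_tx(). */
    static void
    arpq_drain(struct arpq *q, void (*tx)(struct pkt *))
    {
        struct pkt *p;

        while (q->head != NULL) {
            p = q->head;
            q->head = p->next;
            p->next = NULL;
            tx(p);
        }
        q->tail = NULL;
    }

    static void
    tx_print(struct pkt *p)
    {
        printf("tx pkt %d\n", p->id);
    }

    int
    main(void)
    {
        struct arpq q = { NULL, NULL };
        struct pkt a = { 1, NULL }, b = { 2, NULL };

        arpq_enqueue(&q, &a);
        arpq_enqueue(&q, &b);
        arpq_drain(&q, tx_print);   /* prints pkt 1, then pkt 2 */
        return 0;
    }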
+ */ while (e->arpq_head) { m = e->arpq_head; e->arpq_head = m->m_next; m->m_next = NULL; - cxgb_ofld_send(dev, m); + t3_offload_tx(sc, m); } e->arpq_tail = NULL; - e->state = L2T_STATE_VALID; - return 0; + return (0); } /* @@ -153,6 +126,8 @@ setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m, static inline void arpq_enqueue(struct l2t_entry *e, struct mbuf *m) { + mtx_assert(&e->lock, MA_OWNED); + m->m_next = NULL; if (e->arpq_head) e->arpq_tail->m_next = m; @@ -161,113 +136,149 @@ arpq_enqueue(struct l2t_entry *e, struct mbuf *m) e->arpq_tail = m; } -int -t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) +static void +resolution_failed_mbuf(struct mbuf *m) { - struct llentry *lle = e->neigh; - struct sockaddr_in sin; + log(LOG_ERR, "%s: leaked mbuf %p, CPL at %p", + __func__, m, mtod(m, void *)); +} - bzero(&sin, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr.s_addr = e->addr; +static void +resolution_failed(struct l2t_entry *e) +{ + struct mbuf *m; - CTR2(KTR_CXGB, "send slow on rt=%p eaddr=0x%08x\n", rt, e->addr); -again: - switch (e->state) { - case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - arpresolve(rt->rt_ifp, rt, NULL, - (struct sockaddr *)&sin, e->dmac, &lle); - mtx_lock(&e->lock); - if (e->state == L2T_STATE_STALE) - e->state = L2T_STATE_VALID; - mtx_unlock(&e->lock); - case L2T_STATE_VALID: /* fast-path, send the packet on */ - return cxgb_ofld_send(dev, m); - case L2T_STATE_RESOLVING: - mtx_lock(&e->lock); - if (e->state != L2T_STATE_RESOLVING) { // ARP already completed - mtx_unlock(&e->lock); - goto again; - } - arpq_enqueue(e, m); - mtx_unlock(&e->lock); + mtx_assert(&e->lock, MA_OWNED); + + while (e->arpq_head) { + m = e->arpq_head; + e->arpq_head = m->m_next; + m->m_next = NULL; + resolution_failed_mbuf(m); + } + e->arpq_tail = NULL; +} + +static void +update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr, + uint16_t vtag) +{ + + mtx_assert(&e->lock, MA_OWNED); + + /* + * The entry may be in active use (e->refcount > 0) or not. We update + * it even when it's not as this simplifies the case where we decide to + * reuse the entry later. + */ + + if (lladdr == NULL && + (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) { /* - * Only the first packet added to the arpq should kick off - * resolution. However, because the m_gethdr below can fail, - * we allow each packet added to the arpq to retry resolution - * as a way of recovering from transient memory exhaustion. - * A better way would be to use a work request to retry L2T - * entries when there's no memory. + * Never got a valid L2 address for this one. Just mark it as + * failed instead of removing it from the hash (for which we'd + * need to wlock the table). 
*/ - if (arpresolve(rt->rt_ifp, rt, NULL, - (struct sockaddr *)&sin, e->dmac, &lle) == 0) { - CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x\n", - e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); - - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return (ENOMEM); + e->state = L2T_STATE_FAILED; + resolution_failed(e); + return; - mtx_lock(&e->lock); - if (e->arpq_head) - setup_l2e_send_pending(dev, m, e); - else - m_freem(m); - mtx_unlock(&e->lock); + } else if (lladdr == NULL) { + + /* Valid or already-stale entry was deleted (or expired) */ + + KASSERT(e->state == L2T_STATE_VALID || + e->state == L2T_STATE_STALE, + ("%s: lladdr NULL, state %d", __func__, e->state)); + + e->state = L2T_STATE_STALE; + + } else { + + if (e->state == L2T_STATE_RESOLVING || + e->state == L2T_STATE_FAILED || + memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) { + + /* unresolved -> resolved; or dmac changed */ + + memcpy(e->dmac, lladdr, ETHER_ADDR_LEN); + e->vlan = vtag; + setup_l2e_send_pending(sc, e); } + e->state = L2T_STATE_VALID; } - return 0; } -void -t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) +static int +resolve_entry(struct adapter *sc, struct l2t_entry *e) { - struct mbuf *m0; - struct sockaddr_in sin; + struct tom_data *td = sc->tom_softc; + struct toedev *tod = &td->tod; + struct sockaddr_in sin = {0}; + uint8_t dmac[ETHER_ADDR_LEN]; + uint16_t vtag = EVL_VLID_MASK; + int rc; + sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr.s_addr = e->addr; - struct llentry *lle; - - if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return; + SINADDR(&sin) = e->addr; + + rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag); + if (rc == EWOULDBLOCK) + return (rc); + + mtx_lock(&e->lock); + update_entry(sc, e, rc == 0 ? dmac : NULL, vtag); + mtx_unlock(&e->lock); + + return (rc); +} + +int +t3_l2t_send_slow(struct adapter *sc, struct mbuf *m, struct l2t_entry *e) +{ - rt = e->neigh; again: switch (e->state) { case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - arpresolve(rt->rt_ifp, rt, NULL, - (struct sockaddr *)&sin, e->dmac, &lle); - mtx_lock(&e->lock); - if (e->state == L2T_STATE_STALE) { - e->state = L2T_STATE_VALID; - } - mtx_unlock(&e->lock); - return; + + if (resolve_entry(sc, e) != EWOULDBLOCK) + goto again; /* entry updated, re-examine state */ + + /* Fall through */ + case L2T_STATE_VALID: /* fast-path, send the packet on */ - return; + + return (t3_offload_tx(sc, m)); + case L2T_STATE_RESOLVING: mtx_lock(&e->lock); - if (e->state != L2T_STATE_RESOLVING) { // ARP already completed + if (e->state != L2T_STATE_RESOLVING) { mtx_unlock(&e->lock); goto again; } + arpq_enqueue(e, m); mtx_unlock(&e->lock); - - /* - * Only the first packet added to the arpq should kick off - * resolution. However, because the alloc_skb below can fail, - * we allow each packet added to the arpq to retry resolution - * as a way of recovering from transient memory exhaustion. - * A better way would be to use a work request to retry L2T - * entries when there's no memory. - */ - arpresolve(rt->rt_ifp, rt, NULL, - (struct sockaddr *)&sin, e->dmac, &lle); + if (resolve_entry(sc, e) == EWOULDBLOCK) + break; + + mtx_lock(&e->lock); + if (e->state == L2T_STATE_VALID && e->arpq_head) + setup_l2e_send_pending(sc, e); + if (e->state == L2T_STATE_FAILED) + resolution_failed(e); + mtx_unlock(&e->lock); + break; + + case L2T_STATE_FAILED: + resolution_failed_mbuf(m); + return (EHOSTUNREACH); } - return; + + return (0); } + /* * Allocate a free L2T entry. 
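update_entry above is the heart of the new L2T state machine: a NULL lladdr demotes the entry (to FAILED if it never resolved, to STALE if it was valid), while a valid lladdr promotes it to VALID and rewrites the hardware entry only when the entry was unresolved or the MAC changed. A compact model of those transitions, with VLAN handling omitted and the hardware write plus arpq side effects reduced to a boolean result:

    #include <stdbool.h>
    #include <string.h>

    enum l2t_state { RESOLVING, VALID, STALE, FAILED };

    struct l2e {
        enum l2t_state state;
        unsigned char  dmac[6];
    };

    /* Returns true when the entry must be (re)written to hardware. */
    static bool
    l2e_update(struct l2e *e, const unsigned char *lladdr)
    {
        if (lladdr == NULL) {
            if (e->state == RESOLVING || e->state == FAILED) {
                e->state = FAILED;   /* never resolved; drop pending */
                return false;
            }
            e->state = STALE;        /* was valid; needs revalidation */
            return false;
        }
        if (e->state == RESOLVING || e->state == FAILED ||
            memcmp(e->dmac, lladdr, 6) != 0) {
            memcpy(e->dmac, lladdr, 6);
            e->state = VALID;
            return true;             /* new or changed dmac */
        }
        e->state = VALID;            /* same dmac: revalidated in place */
        return false;
    }

    int
    main(void)
    {
        struct l2e e = { RESOLVING, { 0 } };
        const unsigned char mac[6] = { 0, 1, 2, 3, 4, 5 };

        return l2e_update(&e, mac) && e.state == VALID ? 0 : 1;
    }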
Must be called with l2t_data.lock held. */ @@ -276,15 +287,19 @@ alloc_l2e(struct l2t_data *d) { struct l2t_entry *end, *e, **p; + rw_assert(&d->lock, RA_WLOCKED); + if (!atomic_load_acq_int(&d->nfree)) - return NULL; + return (NULL); /* there's definitely a free entry */ - for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) + for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) { if (atomic_load_acq_int(&e->refcnt) == 0) goto found; + } - for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ; + for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) + continue; found: d->rover = e + 1; atomic_add_int(&d->nfree, -1); @@ -294,90 +309,37 @@ found: * presently in the hash table. We need to remove it. */ if (e->state != L2T_STATE_UNUSED) { - int hash = arp_hash(e->addr, e->ifindex, d); + int hash = arp_hash(e->addr, e->ifp->if_index, d); - for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) + for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) { if (*p == e) { *p = e->next; break; } + } e->state = L2T_STATE_UNUSED; } - - return e; -} -/* - * Called when an L2T entry has no more users. The entry is left in the hash - * table since it is likely to be reused but we also bump nfree to indicate - * that the entry can be reallocated for a different neighbor. We also drop - * the existing neighbor reference in case the neighbor is going away and is - * waiting on our reference. - * - * Because entries can be reallocated to other neighbors once their ref count - * drops to 0 we need to take the entry's lock to avoid races with a new - * incarnation. - */ -void -t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) -{ - struct llentry *lle; - - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ - lle = e->neigh; - e->neigh = NULL; - } - - mtx_unlock(&e->lock); - atomic_add_int(&d->nfree, 1); - if (lle) - LLE_FREE(lle); -} - - -/* - * Update an L2T entry that was previously used for the same next hop as neigh. - * Must be called with softirqs disabled. 
- */ -static inline void -reuse_entry(struct l2t_entry *e, struct llentry *neigh) -{ - - mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ - if (neigh != e->neigh) - neigh_replace(e, neigh); - - if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), sizeof(e->dmac)) || - (neigh->rt_expire > time_uptime)) - e->state = L2T_STATE_RESOLVING; - else if (la->la_hold == NULL) - e->state = L2T_STATE_VALID; - else - e->state = L2T_STATE_STALE; - mtx_unlock(&e->lock); + return (e); } struct l2t_entry * -t3_l2t_get(struct t3cdev *dev, struct llentry *neigh, struct ifnet *ifp, - struct sockaddr *sa) +t3_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) { + struct tom_data *td = pi->adapter->tom_softc; struct l2t_entry *e; - struct l2t_data *d = L2DATA(dev); - u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr; - int ifidx = ifp->if_index; - int hash = arp_hash(addr, ifidx, d); - unsigned int smt_idx = ((struct port_info *)ifp->if_softc)->port_id; + struct l2t_data *d = td->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index, d); + unsigned int smt_idx = pi->port_id; rw_wlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) - if (e->addr == addr && e->ifindex == ifidx && - e->smt_idx == smt_idx) { + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) { l2t_hold(d, e); - if (atomic_load_acq_int(&e->refcnt) == 1) - reuse_entry(e, neigh); goto done; } + } /* Need to allocate a new entry */ e = alloc_l2e(d); @@ -385,116 +347,59 @@ t3_l2t_get(struct t3cdev *dev, struct llentry *neigh, struct ifnet *ifp, mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ e->next = d->l2tab[hash].first; d->l2tab[hash].first = e; - rw_wunlock(&d->lock); - + e->state = L2T_STATE_RESOLVING; e->addr = addr; - e->ifindex = ifidx; + e->ifp = ifp; e->smt_idx = smt_idx; atomic_store_rel_int(&e->refcnt, 1); - e->neigh = NULL; - - - neigh_replace(e, neigh); -#ifdef notyet - /* - * XXX need to add accessor function for vlan tag - */ - if (neigh->rt_ifp->if_vlantrunk) - e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id; - else -#endif - e->vlan = VLAN_NONE; - mtx_unlock(&e->lock); - return (e); + KASSERT(ifp->if_vlantrunk == NULL, ("TOE+VLAN unimplemented.")); + e->vlan = VLAN_NONE; + + mtx_unlock(&e->lock); } - + done: rw_wunlock(&d->lock); - return e; -} - -/* - * Called when address resolution fails for an L2T entry to handle packets - * on the arpq head. If a packet specifies a failure handler it is invoked, - * otherwise the packets is sent to the TOE. - * - * XXX: maybe we should abandon the latter behavior and just require a failure - * handler. 
- */ -static void -handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq) -{ - - while (arpq) { - struct mbuf *m = arpq; -#ifdef notyet - struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m); -#endif - arpq = m->m_next; - m->m_next = NULL; -#ifdef notyet - if (cb->arp_failure_handler) - cb->arp_failure_handler(dev, m); - else -#endif - cxgb_ofld_send(dev, m); - } + return (e); } void -t3_l2t_update(struct t3cdev *dev, struct llentry *neigh, - uint8_t *enaddr, struct sockaddr *sa) +t3_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t vtag) { + struct tom_data *td = t3_tomdata(tod); + struct adapter *sc = tod->tod_softc; struct l2t_entry *e; - struct mbuf *arpq = NULL; - struct l2t_data *d = L2DATA(dev); - u32 addr = *(u32 *) &((struct sockaddr_in *)sa)->sin_addr; - int hash = arp_hash(addr, ifidx, d); - struct llinfo_arp *la; + struct l2t_data *d = td->l2t; + u32 addr = *(u32 *) &SIN(sa)->sin_addr; + int hash = arp_hash(addr, ifp->if_index, d); rw_rlock(&d->lock); for (e = d->l2tab[hash].first; e; e = e->next) - if (e->addr == addr) { + if (e->addr == addr && e->ifp == ifp) { mtx_lock(&e->lock); goto found; } rw_runlock(&d->lock); - CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x not found", addr); + + /* + * This is of no interest to us. We've never had an offloaded + * connection to this destination, and we aren't attempting one right + * now. + */ return; found: - printf("found 0x%08x\n", addr); - rw_runlock(&d->lock); - memcpy(e->dmac, enaddr, ETHER_ADDR_LEN); - printf("mac=%x:%x:%x:%x:%x:%x\n", - e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); - - if (atomic_load_acq_int(&e->refcnt)) { - if (neigh != e->neigh) - neigh_replace(e, neigh); - - la = (struct llinfo_arp *)neigh->rt_llinfo; - if (e->state == L2T_STATE_RESOLVING) { - - if (la->la_asked >= 5 /* arp_maxtries */) { - arpq = e->arpq_head; - e->arpq_head = e->arpq_tail = NULL; - } else - setup_l2e_send_pending(dev, NULL, e); - } else { - e->state = L2T_STATE_VALID; - if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), 6)) - setup_l2e_send_pending(dev, NULL, e); - } - } - mtx_unlock(&e->lock); - if (arpq) - handle_failed_resolution(dev, arpq); + KASSERT(e->state != L2T_STATE_UNUSED, + ("%s: unused entry in the hash.", __func__)); + + update_entry(sc, e, lladdr, vtag); + mtx_unlock(&e->lock); } struct l2t_data * @@ -503,9 +408,9 @@ t3_init_l2t(unsigned int l2t_capacity) struct l2t_data *d; int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); - d = cxgb_alloc_mem(size); + d = malloc(size, M_CXGB, M_NOWAIT | M_ZERO); if (!d) - return NULL; + return (NULL); d->nentries = l2t_capacity; d->rover = &d->l2tab[1]; /* entry 0 is not used */ @@ -515,10 +420,10 @@ t3_init_l2t(unsigned int l2t_capacity) for (i = 0; i < l2t_capacity; ++i) { d->l2tab[i].idx = i; d->l2tab[i].state = L2T_STATE_UNUSED; - mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF); + mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF); atomic_store_rel_int(&d->l2tab[i].refcnt, 0); } - return d; + return (d); } void @@ -530,5 +435,26 @@ t3_free_l2t(struct l2t_data *d) for (i = 0; i < d->nentries; ++i) mtx_destroy(&d->l2tab[i].lock); - cxgb_free_mem(d); + free(d, M_CXGB); } + +static int +do_l2t_write_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) +{ + struct cpl_l2t_write_rpl *rpl = mtod(m, void *); + + if (rpl->status != CPL_ERR_NONE) + log(LOG_ERR, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, GET_TID(rpl)); + + m_freem(m); + return (0); +} + +void 
+t3_init_l2t_cpl_handlers(struct adapter *sc) +{ + t3_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl); +} +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.h b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h index 308ba66..d3ddf9d 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_l2t.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h @@ -1,6 +1,6 @@ /************************************************************************** -Copyright (c) 2007-2008, Chelsio Inc. +Copyright (c) 2007-2009, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -31,26 +31,19 @@ $FreeBSD$ #ifndef _CHELSIO_L2T_H #define _CHELSIO_L2T_H -#include #include - -#if __FreeBSD_version > 700000 #include -#else -#define rwlock mtx -#define rw_wlock(x) mtx_lock((x)) -#define rw_wunlock(x) mtx_unlock((x)) -#define rw_rlock(x) mtx_lock((x)) -#define rw_runlock(x) mtx_unlock((x)) -#define rw_init(x, str) mtx_init((x), (str), NULL, MTX_DEF) -#define rw_destroy(x) mtx_destroy((x)) -#endif enum { - L2T_STATE_VALID, /* entry is up to date */ - L2T_STATE_STALE, /* entry may be used but needs revalidation */ - L2T_STATE_RESOLVING, /* entry needs address resolution */ - L2T_STATE_UNUSED /* entry not in use */ + L2T_SIZE = 2048 +}; + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_FAILED, /* failed to resolve */ + L2T_STATE_UNUSED /* entry not in use */ }; /* @@ -64,18 +57,17 @@ enum { struct l2t_entry { uint16_t state; /* entry state */ uint16_t idx; /* entry index */ - uint32_t addr; /* dest IP address */ - int ifindex; /* neighbor's net_device's ifindex */ + uint32_t addr; /* nexthop IP address */ + struct ifnet *ifp; /* outgoing interface */ uint16_t smt_idx; /* SMT index */ uint16_t vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ - struct llentry *neigh; /* associated neighbour */ struct l2t_entry *first; /* start of hash chain */ struct l2t_entry *next; /* next l2t_entry on chain */ struct mbuf *arpq_head; /* queue of packets awaiting resolution */ struct mbuf *arpq_tail; struct mtx lock; volatile uint32_t refcnt; /* entry reference count */ - uint8_t dmac[6]; /* neighbour's MAC address */ + uint8_t dmac[ETHER_ADDR_LEN]; /* nexthop's MAC address */ }; struct l2t_data { @@ -86,76 +78,37 @@ struct l2t_data { struct l2t_entry l2tab[0]; }; -typedef void (*arp_failure_handler_func)(struct t3cdev *dev, - struct mbuf *m); - -typedef void (*opaque_arp_failure_handler_func)(void *dev, - struct mbuf *m); - -/* - * Callback stored in an skb to handle address resolution failure. 
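The reworked l2t_send/l2t_hold/l2t_release inlines below keep l2t_data.nfree consistent by touching it only on the boundary reference-count transitions: a 0 -> 1 hold decrements nfree, a 1 -> 0 release increments it, and every other transition leaves it alone. A userland rendering of that scheme, with C11 atomics in place of atomic_fetchadd_int:

    #include <assert.h>
    #include <stdatomic.h>

    struct l2tab {
        atomic_int nfree;     /* number of entries with refcnt == 0 */
    };

    struct l2ent {
        atomic_int refcnt;
    };

    static void
    l2t_hold(struct l2tab *d, struct l2ent *e)
    {
        if (atomic_fetch_add(&e->refcnt, 1) == 0)   /* 0 -> 1 */
            atomic_fetch_sub(&d->nfree, 1);
    }

    static void
    l2t_release(struct l2tab *d, struct l2ent *e)
    {
        if (atomic_fetch_sub(&e->refcnt, 1) == 1)   /* 1 -> 0 */
            atomic_fetch_add(&d->nfree, 1);
    }

    int
    main(void)
    {
        struct l2tab d = { .nfree = 1 };
        struct l2ent e = { .refcnt = 0 };

        l2t_hold(&d, &e);       /* entry goes into use: nfree drops */
        l2t_hold(&d, &e);       /* extra reference: nfree untouched */
        l2t_release(&d, &e);
        l2t_release(&d, &e);    /* back to 0: entry reusable again */
        assert(atomic_load(&d.nfree) == 1);
        return 0;
    }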
- */ -struct l2t_mbuf_cb { - arp_failure_handler_func arp_failure_handler; -}; - -/* - * XXX - */ -#define L2T_MBUF_CB(skb) ((struct l2t_mbuf_cb *)(skb)->cb) - - -static __inline void set_arp_failure_handler(struct mbuf *m, - arp_failure_handler_func hnd) +void t3_l2e_free(struct l2t_data *, struct l2t_entry *e); +void t3_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t vtag); +struct l2t_entry *t3_l2t_get(struct port_info *, struct ifnet *, + struct sockaddr *); +int t3_l2t_send_slow(struct adapter *, struct mbuf *, struct l2t_entry *); +struct l2t_data *t3_init_l2t(unsigned int); +void t3_free_l2t(struct l2t_data *); +void t3_init_l2t_cpl_handlers(struct adapter *); + +static inline int +l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e) { - m->m_pkthdr.header = (opaque_arp_failure_handler_func)hnd; - + if (__predict_true(e->state == L2T_STATE_VALID)) + return t3_offload_tx(sc, m); + else + return t3_l2t_send_slow(sc, m, e); } -/* - * Getting to the L2 data from an offload device. - */ -#define L2DATA(dev) ((dev)->l2opt) - -void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); -void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); -struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, - struct ifnet *ifp, struct sockaddr *sa); -int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, - struct l2t_entry *e); -void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); -struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); -void t3_free_l2t(struct l2t_data *d); - -#ifdef CONFIG_PROC_FS -int t3_l2t_proc_setup(struct proc_dir_entry *dir, struct l2t_data *d); -void t3_l2t_proc_free(struct proc_dir_entry *dir); -#else -#define l2t_proc_setup(dir, d) 0 -#define l2t_proc_free(dir) -#endif - -int cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m); - -static inline int l2t_send(struct t3cdev *dev, struct mbuf *m, - struct l2t_entry *e) -{ - if (__predict_true(e->state == L2T_STATE_VALID)) { - return cxgb_ofld_send(dev, (struct mbuf *)m); - } - return t3_l2t_send_slow(dev, (struct mbuf *)m, e); -} - -static inline void l2t_release(struct l2t_data *d, struct l2t_entry *e) +static inline void +l2t_release(struct l2t_data *d, struct l2t_entry *e) { - if (atomic_fetchadd_int(&e->refcnt, -1) == 1) - t3_l2e_free(d, e); + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) /* 1 -> 0 transition */ + atomic_add_int(&d->nfree, 1); } -static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +static inline void +l2t_hold(struct l2t_data *d, struct l2t_entry *e) { - if (atomic_fetchadd_int(&e->refcnt, 1) == 1) /* 0 -> 1 transition */ - atomic_add_int(&d->nfree, 1); + if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ + atomic_add_int(&d->nfree, -1); } #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index 5dc2d9f..c80abf0 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -1,343 +1,1140 @@ -/************************************************************************** +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ -Copyright (c) 2007, Chelsio Inc. -All rights reserved. +#include +__FBSDID("$FreeBSD$"); -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +#include "opt_inet.h" - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. +#ifdef TCP_OFFLOAD +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define TCPSTATES +#include +#include - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. +#include "cxgb_include.h" +#include "ulp/tom/cxgb_tom.h" +#include "ulp/tom/cxgb_l2t.h" +#include "ulp/tom/cxgb_toepcb.h" -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
+static void t3_send_reset_synqe(struct toedev *, struct synq_entry *); -***************************************************************************/ +static int +alloc_stid(struct tid_info *t, void *ctx) +{ + int stid = -1; -#include -__FBSDID("$FreeBSD$"); + mtx_lock(&t->stid_lock); + if (t->sfree) { + union listen_entry *p = t->sfree; -#include -#include -#include -#include -#include -#include -#include + stid = (p - t->stid_tab) + t->stid_base; + t->sfree = p->next; + p->ctx = ctx; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return (stid); +} -#include -#include -#include +static void +free_stid(struct tid_info *t, int stid) +{ + union listen_entry *p = stid2entry(t, stid); -#include -#include + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} -#include -#include +static struct listen_ctx * +alloc_lctx(struct tom_data *td, struct inpcb *inp, int qset) +{ + struct listen_ctx *lctx; -#include -#include -#include -#include + INP_WLOCK_ASSERT(inp); + lctx = malloc(sizeof(struct listen_ctx), M_CXGB, M_NOWAIT | M_ZERO); + if (lctx == NULL) + return (NULL); -#include -#include + lctx->stid = alloc_stid(&td->tid_maps, lctx); + if (lctx->stid < 0) { + free(lctx, M_CXGB); + return (NULL); + } -#include -#include -#include + lctx->inp = inp; + in_pcbref(inp); -#include -#include + lctx->qset = qset; + refcount_init(&lctx->refcnt, 1); + TAILQ_INIT(&lctx->synq); + + return (lctx); +} + +/* Don't call this directly, use release_lctx instead */ +static int +free_lctx(struct tom_data *td, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include + INP_WLOCK_ASSERT(inp); + KASSERT(lctx->refcnt == 0, + ("%s: refcnt %d", __func__, lctx->refcnt)); + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); + CTR4(KTR_CXGB, "%s: stid %u, lctx %p, inp %p", + __func__, lctx->stid, lctx, lctx->inp); -static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid); -static int listen_hash_del(struct tom_data *d, struct socket *so); + free_stid(&td->tid_maps, lctx->stid); + free(lctx, M_CXGB); + + return in_pcbrele_wlocked(inp); +} + +static void +hold_lctx(struct listen_ctx *lctx) +{ + + refcount_acquire(&lctx->refcnt); +} + +static inline uint32_t +listen_hashfn(void *key, u_long mask) +{ + + return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); +} + +/* + * Add a listen_ctx entry to the listen hash table. + */ +static void +listen_hash_add(struct tom_data *td, struct listen_ctx *lctx) +{ + int bucket = listen_hashfn(lctx->inp, td->listen_mask); + + mtx_lock(&td->lctx_hash_lock); + LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); + td->lctx_count++; + mtx_unlock(&td->lctx_hash_lock); +} + +/* + * Look for the listening socket's context entry in the hash and return it. + */ +static struct listen_ctx * +listen_hash_find(struct tom_data *td, struct inpcb *inp) +{ + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { + if (lctx->inp == inp) + break; + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Removes the listen_ctx structure for inp from the hash and returns it. 
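listen_hashfn above buckets a listening inpcb by hashing the pointer's own bytes with 32-bit FNV-1 and masking into a power-of-two table. The sketch below reimplements it in userland; fnv_32_buf and the two constants normally come from <sys/fnv_hash.h>:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FNV1_32_INIT 0x811c9dc5u
    #define FNV_32_PRIME 0x01000193u

    static uint32_t
    fnv_32_buf(const void *buf, size_t len, uint32_t hval)
    {
        const unsigned char *s = buf;

        while (len-- != 0) {
            hval *= FNV_32_PRIME;   /* FNV-1: multiply, then xor */
            hval ^= *s++;
        }
        return hval;
    }

    /* Hash the pointer's bits, as listen_hashfn does with &key. */
    static unsigned int
    listen_bucket(const void *key, unsigned long mask)
    {
        return fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask;
    }

    int
    main(void)
    {
        int dummy;

        /* mask = nbuckets - 1, for a power-of-two bucket count */
        printf("bucket %u\n", listen_bucket(&dummy, 64 - 1));
        return 0;
    }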
+ */ +static struct listen_ctx * +listen_hash_del(struct tom_data *td, struct inpcb *inp) +{ + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx, *l; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { + if (lctx->inp == inp) { + LIST_REMOVE(lctx, link); + td->lctx_count--; + break; + } + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Releases a hold on the lctx. Must be called with the listening socket's inp + * locked. The inp may be freed by this function and it returns NULL to + * indicate this. + */ +static struct inpcb * +release_lctx(struct tom_data *td, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + int inp_freed = 0; + + INP_WLOCK_ASSERT(inp); + if (refcount_release(&lctx->refcnt)) + inp_freed = free_lctx(td, lctx); + + return (inp_freed ? NULL : inp); +} + +static int +create_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct mbuf *m; + struct cpl_pass_open_req *req; + struct inpcb *inp = lctx->inp; + + m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req); + if (m == NULL) + return (ENOMEM); + + req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); + req->local_port = inp->inp_lport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(16)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + t3_offload_tx(sc, m); + + return (0); +} + +static int +destroy_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct mbuf *m; + struct cpl_close_listserv_req *req; + + m = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, req); + if (m == NULL) + return (ENOMEM); + + req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, + lctx->stid)); + req->cpu_idx = 0; + + t3_offload_tx(sc, m); + + return (0); +} /* * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release * the STID. */ static int -do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +do_close_server_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct cpl_close_listserv_rpl *rpl = cplhdr(m); + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_close_listserv_rpl *rpl = mtod(m, void *); unsigned int stid = GET_TID(rpl); + struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid); + struct inpcb *inp = lctx->inp; - if (rpl->status != CPL_ERR_NONE) - log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for " - "STID %u\n", rpl->status, stid); - else { - struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + CTR3(KTR_CXGB, "%s: stid %u, status %u", __func__, stid, rpl->status); - cxgb_free_stid(cdev, stid); - free(listen_ctx, M_CXGB); + if (rpl->status != CPL_ERR_NONE) { + log(LOG_ERR, "%s: failed (%u) to close listener for stid %u", + __func__, rpl->status, stid); + } else { + INP_WLOCK(inp); + KASSERT(listen_hash_del(td, lctx->inp) == NULL, + ("%s: inp %p still in listen hash", __func__, inp)); + if (release_lctx(td, lctx) != NULL) + INP_WUNLOCK(inp); } - return (CPL_RET_BUF_DONE); + m_freem(m); + return (0); } /* - * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash - * table and free the STID if there was any error, otherwise nothing to do. + * Process a CPL_PASS_OPEN_RPL message. 
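release_lctx above encodes a convention worth spelling out: dropping the last reference on the lctx may also drop the last reference on the inpcb, and the caller must not unlock an inpcb that has been freed, so the function returns NULL to signal exactly that (callers write "if (release_lctx(td, lctx) != NULL) INP_WUNLOCK(inp);"). A simplified model, with plain counters in place of refcount(9) and the in_pcbref/in_pcbrele_wlocked pair:

    #include <stddef.h>
    #include <stdlib.h>

    struct inp {
        int refs;
    };

    struct lctx {
        int         refcnt;
        struct inp *inp;
    };

    /* Returns NULL when the inp was freed along with the lctx. */
    static struct inp *
    release_lctx(struct lctx *l)
    {
        struct inp *inp = l->inp;
        int inp_freed = 0;

        if (--l->refcnt == 0) {
            free(l);                  /* models free_lctx() */
            if (--inp->refs == 0) {   /* models in_pcbrele_wlocked() */
                free(inp);
                inp_freed = 1;
            }
        }
        return inp_freed ? NULL : inp;
    }

    int
    main(void)
    {
        struct inp *inp = calloc(1, sizeof(*inp));
        struct lctx *l = calloc(1, sizeof(*l));

        inp->refs = 2;      /* the socket's own ref + the lctx's ref */
        l->refcnt = 1;
        l->inp = inp;

        /* Last lctx ref dropped, but the socket still holds the inp. */
        return release_lctx(l) == inp ? 0 : 1;
    }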
Remove the lctx from the listen hash + * table and free it if there was any error, otherwise nothing to do. */ static int -do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +do_pass_open_rpl(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - struct cpl_pass_open_rpl *rpl = cplhdr(m); + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct cpl_pass_open_rpl *rpl = mtod(m, void *); + int stid = GET_TID(rpl); + struct listen_ctx *lctx; + struct inpcb *inp; + + /* + * We get these replies also when setting up HW filters. Just throw + * those away. + */ + if (stid >= td->tid_maps.stid_base + td->tid_maps.nstids) + goto done; + + lctx = lookup_stid(&td->tid_maps, stid); + inp = lctx->inp; + + INP_WLOCK(inp); + + CTR4(KTR_CXGB, "%s: stid %u, status %u, flags 0x%x", + __func__, stid, rpl->status, lctx->flags); + + lctx->flags &= ~LCTX_RPL_PENDING; if (rpl->status != CPL_ERR_NONE) { - int stid = GET_TID(rpl); - struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; - struct tom_data *d = listen_ctx->tom_data; - struct socket *lso = listen_ctx->lso; - -#if VALIDATE_TID - if (!lso) - return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE); -#endif - /* - * Note: It is safe to unconditionally call listen_hash_del() - * at this point without risking unhashing a reincarnation of - * an already closed socket (i.e., there is no listen, close, - * listen, free the sock for the second listen while processing - * a message for the first race) because we are still holding - * a reference on the socket. It is possible that the unhash - * will fail because the socket is already closed, but we can't - * unhash the wrong socket because it is impossible for the - * socket to which this message refers to have reincarnated. - */ - listen_hash_del(d, lso); - cxgb_free_stid(cdev, stid); -#ifdef notyet - /* - * XXX need to unreference the inpcb - * but we have no way of knowing that other TOMs aren't referencing it - */ - sock_put(lso); + log(LOG_ERR, "%s: %s: hw listen (stid %d) failed: %d\n", + __func__, device_get_nameunit(sc->dev), stid, rpl->status); + } + +#ifdef INVARIANTS + /* + * If the inp has been dropped (listening socket closed) then + * listen_stop must have run and taken the inp out of the hash. + */ + if (inp->inp_flags & INP_DROPPED) { + KASSERT(listen_hash_del(td, inp) == NULL, + ("%s: inp %p still in listen hash", __func__, inp)); + } #endif - free(listen_ctx, M_CXGB); + + if (inp->inp_flags & INP_DROPPED && rpl->status != CPL_ERR_NONE) { + if (release_lctx(td, lctx) != NULL) + INP_WUNLOCK(inp); + goto done; + } + + /* + * Listening socket stopped listening earlier and now the chip tells us + * it has started the hardware listener. Stop it; the lctx will be + * released in do_close_server_rpl. + */ + if (inp->inp_flags & INP_DROPPED) { + destroy_server(sc, lctx); + INP_WUNLOCK(inp); + goto done; + } + + /* + * Failed to start hardware listener. Take inp out of the hash and + * release our reference on it. An error message has been logged + * already. 
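+ *
+ * Overall, the four cases handled by this function are:
+ *
+ *	INP_DROPPED and error:	release the lctx, nothing else to do.
+ *	INP_DROPPED, no error:	the hw listener is up but the socket is
+ *				gone; destroy_server shuts it down.
+ *	error only (here):	unhash and release the lctx.
+ *	success:		hardware listener open for business.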
+ */ + if (rpl->status != CPL_ERR_NONE) { + listen_hash_del(td, inp); + if (release_lctx(td, lctx) != NULL) + INP_WUNLOCK(inp); + goto done; } - return CPL_RET_BUF_DONE; + + /* hardware listener open for business */ + + INP_WUNLOCK(inp); +done: + m_freem(m); + return (0); } -void -t3_init_listen_cpl_handlers(void) +static void +pass_accept_req_to_protohdrs(const struct cpl_pass_accept_req *cpl, + struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) { - t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); - t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); + const struct tcp_options *t3opt = &cpl->tcp_options; + + bzero(inc, sizeof(*inc)); + inc->inc_faddr.s_addr = cpl->peer_ip; + inc->inc_laddr.s_addr = cpl->local_ip; + inc->inc_fport = cpl->peer_port; + inc->inc_lport = cpl->local_port; + + bzero(th, sizeof(*th)); + th->th_sport = cpl->peer_port; + th->th_dport = cpl->local_port; + th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */ + th->th_flags = TH_SYN; + + bzero(to, sizeof(*to)); + if (t3opt->mss) { + to->to_flags |= TOF_MSS; + to->to_mss = be16toh(t3opt->mss); + } + if (t3opt->wsf) { + to->to_flags |= TOF_SCALE; + to->to_wscale = t3opt->wsf; + } + if (t3opt->tstamp) + to->to_flags |= TOF_TS; + if (t3opt->sack) + to->to_flags |= TOF_SACKPERM; } -static inline int -listen_hashfn(const struct socket *so) +static inline void +hold_synqe(struct synq_entry *synqe) { - return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1); + + refcount_acquire(&synqe->refcnt); +} + +static inline void +release_synqe(struct synq_entry *synqe) +{ + + if (refcount_release(&synqe->refcnt)) + m_freem(synqe->m); } /* - * Create and add a listen_info entry to the listen hash table. This and the - * listen hash table functions below cannot be called from softirqs. + * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to + * store some state temporarily. There will be enough room in the mbuf's + * trailing space as the CPL is not that large. + * + * XXX: bad hack. */ -static struct listen_info * -listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) +static struct synq_entry * +mbuf_to_synq_entry(struct mbuf *m) { - struct listen_info *p; - - p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO); - if (p) { - int bucket = listen_hashfn(so); - - p->so = so; /* just a key, no need to take a reference */ - p->stid = stid; - mtx_lock(&d->listen_lock); - p->next = d->listen_hash_tab[bucket]; - d->listen_hash_tab[bucket] = p; - mtx_unlock(&d->listen_lock); + int len = roundup(sizeof (struct synq_entry), 8); + uint8_t *buf; + int buflen; + + if (__predict_false(M_TRAILINGSPACE(m) < len)) { + panic("%s: no room for synq_entry (%td, %d)\n", __func__, + M_TRAILINGSPACE(m), len); } - return p; + + if (m->m_flags & M_EXT) { + buf = m->m_ext.ext_buf; + buflen = m->m_ext.ext_size; + } else if (m->m_flags & M_PKTHDR) { + buf = &m->m_pktdat[0]; + buflen = MHLEN; + } else { + buf = &m->m_dat[0]; + buflen = MLEN; + } + + return ((void *)(buf + buflen - len)); } +#ifdef KTR +#define REJECT_PASS_ACCEPT() do { \ + reject_reason = __LINE__; \ + goto reject; \ +} while (0) +#else +#define REJECT_PASS_ACCEPT() do { goto reject; } while (0) +#endif + /* - * Given a pointer to a listening socket return its server TID by consulting - * the socket->stid map. Returns -1 if the socket is not in the map. + * The context associated with a tid entry via insert_tid could be a synq_entry + * or a toepcb. 
The only way CPL handlers can tell is via a bit in these flags. + */ +CTASSERT(offsetof(struct toepcb, tp_flags) == offsetof(struct synq_entry, flags)); + +/* + * Handle a CPL_PASS_ACCEPT_REQ message. */ static int -listen_hash_find(struct tom_data *d, struct socket *so) +do_pass_accept_req(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) { - int stid = -1, bucket = listen_hashfn(so); - struct listen_info *p; + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct toedev *tod = &td->tod; + const struct cpl_pass_accept_req *req = mtod(m, void *); + unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + unsigned int tid = GET_TID(req); + struct listen_ctx *lctx = lookup_stid(&td->tid_maps, stid); + struct l2t_entry *e = NULL; + struct sockaddr_in nam; + struct rtentry *rt; + struct inpcb *inp; + struct socket *so; + struct port_info *pi; + struct ifnet *ifp; + struct in_conninfo inc; + struct tcphdr th; + struct tcpopt to; + struct synq_entry *synqe = NULL; + int i; +#ifdef KTR + int reject_reason; +#endif - mtx_lock(&d->listen_lock); - for (p = d->listen_hash_tab[bucket]; p; p = p->next) - if (p->so == so) { - stid = p->stid; - break; + CTR4(KTR_CXGB, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, + lctx); + + pass_accept_req_to_protohdrs(req, &inc, &th, &to); + + /* + * Don't offload if the interface that received the SYN doesn't have + * IFCAP_TOE enabled. + */ + pi = NULL; + for_each_port(sc, i) { + if (memcmp(sc->port[i].hw_addr, req->dst_mac, ETHER_ADDR_LEN)) + continue; + pi = &sc->port[i]; + break; + } + if (pi == NULL) + REJECT_PASS_ACCEPT(); + ifp = pi->ifp; + if ((ifp->if_capenable & IFCAP_TOE4) == 0) + REJECT_PASS_ACCEPT(); + + /* + * Don't offload if the outgoing interface for the route back to the + * peer is not the same as the interface that received the SYN. + */ + bzero(&nam, sizeof(nam)); + nam.sin_len = sizeof(nam); + nam.sin_family = AF_INET; + nam.sin_addr = inc.inc_faddr; + rt = rtalloc1((struct sockaddr *)&nam, 0, 0); + if (rt == NULL) + REJECT_PASS_ACCEPT(); + else { + struct sockaddr *nexthop; + + RT_UNLOCK(rt); + nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : + (struct sockaddr *)&nam; + if (rt->rt_ifp == ifp) + e = t3_l2t_get(pi, rt->rt_ifp, nexthop); + RTFREE(rt); + if (e == NULL) + REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */ + } + + INP_INFO_WLOCK(&V_tcbinfo); + + /* Don't offload if the 4-tuple is already in use */ + if (toe_4tuple_check(&inc, &th, ifp) != 0) { + INP_INFO_WUNLOCK(&V_tcbinfo); + REJECT_PASS_ACCEPT(); + } + + inp = lctx->inp; /* listening socket (not owned by the TOE) */ + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* + * The listening socket has closed. The reply from the TOE to + * our CPL_CLOSE_LISTSRV_REQ will ultimately release all + * resources tied to this listen context. 
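+		 * All that is left for us to do here is decline this
+		 * embryonic connection.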
+ */ + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + REJECT_PASS_ACCEPT(); + } + so = inp->inp_socket; + + /* Reuse the mbuf that delivered the CPL to us */ + synqe = mbuf_to_synq_entry(m); + synqe->flags = TP_IS_A_SYNQ_ENTRY; + synqe->m = m; + synqe->lctx = lctx; + synqe->tid = tid; + synqe->e = e; + synqe->opt0h = calc_opt0h(so, 0, 0, e); + synqe->qset = pi->first_qset + (arc4random() % pi->nqsets); + SOCKBUF_LOCK(&so->so_rcv); + synqe->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + SOCKBUF_UNLOCK(&so->so_rcv); + refcount_init(&synqe->refcnt, 1); + atomic_store_rel_int(&synqe->reply, RPL_OK); + + insert_tid(td, synqe, tid); + TAILQ_INSERT_TAIL(&lctx->synq, synqe, link); + hold_synqe(synqe); + hold_lctx(lctx); + + /* syncache_add releases both pcbinfo and pcb locks */ + toe_syncache_add(&inc, &to, &th, inp, tod, synqe); + INP_UNLOCK_ASSERT(inp); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + + /* + * If we replied during syncache_add (reply is RPL_DONE), good. + * Otherwise (reply is unchanged - RPL_OK) it's no longer ok to reply. + * The mbuf will stick around as long as the entry is in the syncache. + * The kernel is free to retry syncache_respond but we'll ignore it due + * to RPL_DONT. + */ + if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONT)) { + + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* listener closed. synqe must have been aborted. */ + KASSERT(synqe->flags & TP_ABORT_SHUTDOWN, + ("%s: listener %p closed but synqe %p not aborted", + __func__, inp, synqe)); + + CTR5(KTR_CXGB, + "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED", + __func__, stid, tid, lctx, synqe); + INP_WUNLOCK(inp); + release_synqe(synqe); + return (__LINE__); } - mtx_unlock(&d->listen_lock); - return stid; + + KASSERT(!(synqe->flags & TP_ABORT_SHUTDOWN), + ("%s: synqe %p aborted, but listener %p not dropped.", + __func__, synqe, inp)); + + TAILQ_REMOVE(&lctx->synq, synqe, link); + release_synqe(synqe); /* removed from synq list */ + inp = release_lctx(td, lctx); + if (inp) + INP_WUNLOCK(inp); + + release_synqe(synqe); /* about to exit function */ + REJECT_PASS_ACCEPT(); + } + + KASSERT(synqe->reply == RPL_DONE, + ("%s: reply %d", __func__, synqe->reply)); + + CTR3(KTR_CXGB, "%s: stid %u, tid %u, OK", __func__, stid, tid); + release_synqe(synqe); + return (0); + +reject: + CTR4(KTR_CXGB, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, + reject_reason); + + if (synqe == NULL) + m_freem(m); + if (e) + l2t_release(td->l2t, e); + queue_tid_release(tod, tid); + + return (0); +} + +static void +pass_establish_to_protohdrs(const struct cpl_pass_establish *cpl, + struct in_conninfo *inc, struct tcphdr *th, struct tcpopt *to) +{ + uint16_t tcp_opt = be16toh(cpl->tcp_opt); + + bzero(inc, sizeof(*inc)); + inc->inc_faddr.s_addr = cpl->peer_ip; + inc->inc_laddr.s_addr = cpl->local_ip; + inc->inc_fport = cpl->peer_port; + inc->inc_lport = cpl->local_port; + + bzero(th, sizeof(*th)); + th->th_sport = cpl->peer_port; + th->th_dport = cpl->local_port; + th->th_flags = TH_ACK; + th->th_seq = be32toh(cpl->rcv_isn); /* as in tcp_fields_to_host */ + th->th_ack = be32toh(cpl->snd_isn); /* ditto */ + + bzero(to, sizeof(*to)); + if (G_TCPOPT_TSTAMP(tcp_opt)) + to->to_flags |= TOF_TS; } /* - * Delete the listen_info structure for a listening socket. Returns the server - * TID for the socket if it is present in the socket->stid map, or -1. + * Process a CPL_PASS_ESTABLISH message. The T3 has already established a + * connection and we need to do the software side setup. 
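+ *
+ * For reference, the full passive open sequence as implemented in this
+ * file is:
+ *
+ *	SYN arrives:		CPL_PASS_ACCEPT_REQ -> toe_syncache_add
+ *	syncache responds:	t3_syncache_respond -> CPL_PASS_ACCEPT_RPL
+ *	3WHS completes:		CPL_PASS_ESTABLISH -> toe_syncache_expand
+ *				and t3_offload_socket (via the syncache)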
 */
 static int
-listen_hash_del(struct tom_data *d, struct socket *so)
+do_pass_establish(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
 {
-	int bucket, stid = -1;
-	struct listen_info *p, **prev;
-
-	bucket = listen_hashfn(so);
-	prev = &d->listen_hash_tab[bucket];
-
-	mtx_lock(&d->listen_lock);
-	for (p = *prev; p; prev = &p->next, p = p->next)
-		if (p->so == so) {
-			stid = p->stid;
-			*prev = p->next;
-			free(p, M_CXGB);
-			break;
-		}
-	mtx_unlock(&d->listen_lock);
-
-	return (stid);
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct cpl_pass_establish *cpl = mtod(m, void *);
+	struct toedev *tod = &td->tod;
+	unsigned int tid = GET_TID(cpl);
+	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
+	struct toepcb *toep;
+	struct socket *so;
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+	struct tcpopt to;
+	struct tcphdr th;
+	struct in_conninfo inc;
+#ifdef KTR
+	int stid = G_PASS_OPEN_TID(ntohl(cpl->tos_tid));
+#endif
+
+	CTR5(KTR_CXGB, "%s: stid %u, tid %u, lctx %p, inp_flags 0x%x",
+	    __func__, stid, tid, lctx, inp->inp_flags);
+
+	KASSERT(qs->idx == synqe->qset,
+	    ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset));
+
+	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
+	INP_WLOCK(inp);
+
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+		/*
+		 * The listening socket has closed.  The TOM must have aborted
+		 * all the embryonic connections (including this one) that were
+		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
+		 * for cleaning up.
+		 */
+		KASSERT(synqe->flags & TP_ABORT_SHUTDOWN,
+		    ("%s: listen socket dropped but tid %u not aborted.",
+		    __func__, tid));
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		m_freem(m);
+		return (0);
+	}
+
+	pass_establish_to_protohdrs(cpl, &inc, &th, &to);
+
+	/* Lie in order to pass the checks in syncache_expand */
+	to.to_tsecr = synqe->ts;
+	th.th_ack = synqe->iss + 1;
+
+	toep = toepcb_alloc(tod);
+	if (toep == NULL) {
+reset:
+		t3_send_reset_synqe(tod, synqe);
+		INP_WUNLOCK(inp);
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+		m_freem(m);
+		return (0);
+	}
+	toep->tp_qset = qs->idx;
+	toep->tp_l2t = synqe->e;
+	toep->tp_tid = tid;
+	toep->tp_rx_credits = synqe->rx_credits;
+
+	synqe->toep = toep;
+	synqe->cpl = cpl;
+
+	so = inp->inp_socket;
+	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
+		toepcb_free(toep);
+		goto reset;
+	}
+
+	/* Remove the synq entry and release its reference on the lctx */
+	TAILQ_REMOVE(&lctx->synq, synqe, link);
+	inp = release_lctx(td, lctx);
+	if (inp)
+		INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	release_synqe(synqe);
+
+	m_freem(m);
+	return (0);
+}
+
+void
+t3_init_listen_cpl_handlers(struct adapter *sc)
+{
+	t3_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+	t3_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
+	t3_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
+	t3_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);
 }
 
 /*
  * Start a listening server by sending a passive open request to HW.
+ *
+ * Can't take the adapter lock here, so accesses to sc->flags,
+ * sc->open_device_map, sc->offload_map, and if_capenable are all race prone.
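+ * The #ifdef notyet blocks below sketch the synchronization that should
+ * eventually be used here.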
 */
-void
-t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev)
+int
+t3_listen_start(struct toedev *tod, struct tcpcb *tp)
 {
-	int stid;
-	struct mbuf *m;
-	struct cpl_pass_open_req *req;
-	struct tom_data *d = TOM_DATA(dev);
-	struct inpcb *inp = so_sotoinpcb(so);
-	struct listen_ctx *ctx;
+	struct tom_data *td = t3_tomdata(tod);
+	struct adapter *sc = tod->tod_softc;
+	struct port_info *pi;
+	struct inpcb *inp = tp->t_inpcb;
+	struct listen_ctx *lctx;
+	int i;
 
-	if (!TOM_TUNABLE(dev, activated))
-		return;
+	INP_WLOCK_ASSERT(inp);
 
-	if (listen_hash_find(d, so) != -1)
-		return;
-
-	CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport));
-	ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO);
+	if ((inp->inp_vflag & INP_IPV4) == 0)
+		return (0);
 
-	if (!ctx)
-		return;
+#ifdef notyet
+	ADAPTER_LOCK(sc);
+	if (IS_BUSY(sc)) {
+		log(LOG_ERR, "%s: listen request ignored, %s is busy",
+		    __func__, device_get_nameunit(sc->dev));
+		goto done;
+	}
 
-	ctx->tom_data = d;
-	ctx->lso = so;
-	ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) ? ULP_MODE_TCPDDP : 0;
-	LIST_INIT(&ctx->synq_head);
-
-	stid = cxgb_alloc_stid(d->cdev, d->client, ctx);
-	if (stid < 0)
-		goto free_ctx;
+	KASSERT(sc->flags & TOM_INIT_DONE,
+	    ("%s: TOM not initialized", __func__));
+#endif
 
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL)
-		goto free_stid;
-	m->m_pkthdr.len = m->m_len = sizeof(*req);
-
-	if (!listen_hash_add(d, so, stid))
-		goto free_all;
-
-	req = mtod(m, struct cpl_pass_open_req *);
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
-	req->local_port = inp->inp_lport;
-	memcpy(&req->local_ip, &inp->inp_laddr, 4);
-	req->peer_port = 0;
-	req->peer_ip = 0;
-	req->peer_netmask = 0;
-	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
-	req->opt0l = htonl(V_RCV_BUFSIZ(16));
-	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+	if ((sc->open_device_map & sc->offload_map) == 0)
+		goto done;	/* no port that's UP with IFCAP_TOE enabled */
+
+	/*
+	 * Find a running port with IFCAP_TOE4.  We'll use the first such
+	 * port's queues to send the passive open and receive the reply to it.
+	 *
+	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
+	 * then reject any attempt to bring down such a port (and maybe reject
+	 * attempts to disable IFCAP_TOE on that port too?).
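+ *
+ * The open_device_map/offload_map check above guarantees that the loop
+ * below finds such a port, hence the KASSERT rather than an error path.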
+ */ + for_each_port(sc, i) { + if (isset(&sc->open_device_map, i) && + sc->port[i].ifp->if_capenable & IFCAP_TOE4) + break; + } + KASSERT(i < sc->params.nports, + ("%s: no running port with TOE capability enabled.", __func__)); + pi = &sc->port[i]; + + if (listen_hash_find(td, inp) != NULL) + goto done; /* already setup */ + + lctx = alloc_lctx(td, inp, pi->first_qset); + if (lctx == NULL) { + log(LOG_ERR, + "%s: listen request ignored, %s couldn't allocate lctx\n", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + listen_hash_add(td, lctx); - m_set_priority(m, CPL_PRIORITY_LISTEN); - cxgb_ofld_send(cdev, m); - return; - -free_all: - m_free(m); -free_stid: - cxgb_free_stid(cdev, stid); -#if 0 - sock_put(sk); -#endif -free_ctx: - free(ctx, M_CXGB); + CTR5(KTR_CXGB, "%s: stid %u (%s), lctx %p, inp %p", __func__, + lctx->stid, tcpstates[tp->t_state], lctx, inp); + + if (create_server(sc, lctx) != 0) { + log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__, + device_get_nameunit(sc->dev)); + (void) listen_hash_del(td, inp); + inp = release_lctx(td, lctx); + /* can't be freed, host stack has a reference */ + KASSERT(inp != NULL, ("%s: inp freed", __func__)); + goto done; + } + lctx->flags |= LCTX_RPL_PENDING; +done: +#ifdef notyet + ADAPTER_UNLOCK(sc); +#endif + return (0); } /* * Stop a listening server by sending a close_listsvr request to HW. * The server TID is freed when we get the reply. */ -void -t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +int +t3_listen_stop(struct toedev *tod, struct tcpcb *tp) { - struct mbuf *m; - struct cpl_close_listserv_req *req; struct listen_ctx *lctx; - int stid = listen_hash_del(TOM_DATA(dev), so); - - if (stid < 0) - return; + struct adapter *sc = tod->tod_softc; + struct tom_data *td = t3_tomdata(tod); + struct inpcb *inp = tp->t_inpcb; + struct synq_entry *synqe; + + INP_WLOCK_ASSERT(inp); + + lctx = listen_hash_del(td, inp); + if (lctx == NULL) + return (ENOENT); /* no hardware listener for this inp */ + + CTR4(KTR_CXGB, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, + lctx, lctx->flags); - lctx = cxgb_get_lctx(cdev, stid); /* - * Do this early so embryonic connections are marked as being aborted - * while the stid is still open. This ensures pass_establish messages - * that arrive while we are closing the server will be able to locate - * the listening socket. + * If the reply to the PASS_OPEN is still pending we'll wait for it to + * arrive and clean up when it does. */ - t3_reset_synq(lctx); + if (lctx->flags & LCTX_RPL_PENDING) { + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + return (EINPROGRESS); + } - /* Send the close ASAP to stop further passive opens */ - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) { - /* - * XXX allocate from lowmem cache - */ + /* + * The host stack will abort all the connections on the listening + * socket's so_comp. It doesn't know about the connections on the synq + * so we need to take care of those. 
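+	 * We send a reset for each one; the entries are freed later, when
+	 * the hardware acknowledges the aborts (see do_abort_rpl_synqe).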
+ */ + TAILQ_FOREACH(synqe, &lctx->synq, link) { + KASSERT(synqe->lctx == lctx, ("%s: synq corrupt", __func__)); + t3_send_reset_synqe(tod, synqe); } - m->m_pkthdr.len = m->m_len = sizeof(*req); - req = mtod(m, struct cpl_close_listserv_req *); - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); - req->cpu_idx = 0; - m_set_priority(m, CPL_PRIORITY_LISTEN); - cxgb_ofld_send(cdev, m); + destroy_server(sc, lctx); + return (0); +} + +void +t3_syncache_added(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + hold_synqe(synqe); +} + +void +t3_syncache_removed(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + release_synqe(synqe); +} + +/* XXX */ +extern void tcp_dooptions(struct tcpopt *, u_char *, int, int); + +int +t3_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; + struct l2t_entry *e = synqe->e; + struct ip *ip = mtod(m, struct ip *); + struct tcphdr *th = (void *)(ip + 1); + struct cpl_pass_accept_rpl *rpl; + struct mbuf *r; + struct listen_ctx *lctx = synqe->lctx; + struct tcpopt to; + int mtu_idx, cpu_idx; + + /* + * The first time we run it's during the call to syncache_add. That's + * the only one we care about. + */ + if (atomic_cmpset_int(&synqe->reply, RPL_OK, RPL_DONE) == 0) + goto done; /* reply to the CPL only if it's ok to do so */ + + r = M_GETHDR_OFLD(lctx->qset, CPL_PRIORITY_CONTROL, rpl); + if (r == NULL) + goto done; + + /* + * Use only the provided mbuf (with ip and tcp headers) and what's in + * synqe. Avoid looking at the listening socket (lctx->inp) here. + * + * XXX: if the incoming SYN had the TCP timestamp option but the kernel + * decides it doesn't want to use TCP timestamps we have no way of + * relaying this info to the chip on a per-tid basis (all we have is a + * global knob). 
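+	 *
+	 * tcp_dooptions below re-parses the SYN's TCP options so that the
+	 * peer's MSS and window scale can be reflected in the reply.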
+ */ + bzero(&to, sizeof(to)); + tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), + TO_SYN); + + /* stash them for later */ + synqe->iss = be32toh(th->th_seq); + synqe->ts = to.to_tsval; + + mtu_idx = find_best_mtu_idx(sc, NULL, to.to_mss); + cpu_idx = sc->rrss_map[synqe->qset]; + + rpl->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + rpl->wr.wrh_lo = 0; + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, synqe->tid)); + rpl->opt2 = calc_opt2(cpu_idx); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + rpl->peer_ip = ip->ip_dst.s_addr; + rpl->opt0h = synqe->opt0h | + calc_opt0h(NULL, mtu_idx, to.to_wscale, NULL); + rpl->opt0l_status = htobe32(CPL_PASS_OPEN_ACCEPT) | + calc_opt0l(NULL, synqe->rx_credits); + + l2t_send(sc, r, e); +done: + m_freem(m); + return (0); +} - t3_disconnect_acceptq(so); +int +do_abort_req_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m) +{ + struct adapter *sc = qs->adap; + struct tom_data *td = sc->tom_softc; + struct toedev *tod = &td->tod; + const struct cpl_abort_req_rss *req = mtod(m, void *); + unsigned int tid = GET_TID(req); + struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + + KASSERT(synqe->flags & TP_IS_A_SYNQ_ENTRY, + ("%s: !SYNQ_ENTRY", __func__)); + + CTR6(KTR_CXGB, "%s: tid %u, synqe %p (%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, req->status); + + INP_WLOCK(inp); + + if (!(synqe->flags & TP_ABORT_REQ_RCVD)) { + synqe->flags |= TP_ABORT_REQ_RCVD; + synqe->flags |= TP_ABORT_SHUTDOWN; + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + synqe->flags &= ~TP_ABORT_REQ_RCVD; + + /* + * If we'd sent a reset on this synqe, we'll ignore this and clean up in + * the T3's reply to our reset instead. 
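+	 * (Within this file TP_ABORT_RPL_PENDING is set only by
+	 * t3_send_reset_synqe, i.e. it marks a synqe that we have already
+	 * decided to reset ourselves.)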
+ */
+	if (synqe->flags & TP_ABORT_RPL_PENDING) {
+		synqe->flags |= TP_ABORT_RPL_SENT;
+		INP_WUNLOCK(inp);
+	} else {
+		TAILQ_REMOVE(&lctx->synq, synqe, link);
+		inp = release_lctx(td, lctx);
+		if (inp)
+			INP_WUNLOCK(inp);
+		release_tid(tod, tid, qs->idx);
+		l2t_release(td->l2t, synqe->e);
+		release_synqe(synqe);
+	}
+
+	send_abort_rpl(tod, tid, qs->idx);
+	m_freem(m);
+	return (0);
 }
+
+int
+do_abort_rpl_synqe(struct sge_qset *qs, struct rsp_desc *r, struct mbuf *m)
+{
+	struct adapter *sc = qs->adap;
+	struct tom_data *td = sc->tom_softc;
+	struct toedev *tod = &td->tod;
+	const struct cpl_abort_rpl_rss *rpl = mtod(m, void *);
+	unsigned int tid = GET_TID(rpl);
+	struct synq_entry *synqe = lookup_tid(&td->tid_maps, tid);
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+
+	CTR4(KTR_CXGB, "%s: tid %d, synqe %p, status %d", __func__, tid, synqe,
+	    rpl->status);
+
+	INP_WLOCK(inp);
+
+	if (synqe->flags & TP_ABORT_RPL_PENDING) {
+		if (!(synqe->flags & TP_ABORT_RPL_RCVD)) {
+			synqe->flags |= TP_ABORT_RPL_RCVD;
+			INP_WUNLOCK(inp);
+		} else {
+			synqe->flags &= ~TP_ABORT_RPL_RCVD;
+			synqe->flags &= ~TP_ABORT_RPL_PENDING;
+
+			TAILQ_REMOVE(&lctx->synq, synqe, link);
+			inp = release_lctx(td, lctx);
+			if (inp)
+				INP_WUNLOCK(inp);
+			release_tid(tod, tid, qs->idx);
+			l2t_release(td->l2t, synqe->e);
+			release_synqe(synqe);
+		}
+	}
+
+	m_freem(m);
+	return (0);
+}
+
+static void
+t3_send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
+{
+	struct cpl_abort_req *req;
+	unsigned int tid = synqe->tid;
+	struct adapter *sc = tod->tod_softc;
+	struct mbuf *m;
+#ifdef INVARIANTS
+	struct listen_ctx *lctx = synqe->lctx;
+	struct inpcb *inp = lctx->inp;
+#endif
+
+	INP_WLOCK_ASSERT(inp);
+
+	CTR4(KTR_CXGB, "%s: tid %d, synqe %p (%x)", __func__, tid, synqe,
+	    synqe->flags);
+
+	if (synqe->flags & TP_ABORT_SHUTDOWN)
+		return;
+
+	synqe->flags |= (TP_ABORT_RPL_PENDING | TP_ABORT_SHUTDOWN);
+
+	m = M_GETHDR_OFLD(synqe->qset, CPL_PRIORITY_DATA, req);
+	if (m == NULL)
+		CXGB_UNIMPLEMENTED();
+
+	req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wrh_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
+	req->rsvd0 = 0;
+	req->rsvd1 = !(synqe->flags & TP_DATASENT);
+	req->cmd = CPL_ABORT_SEND_RST;
+
+	l2t_send(sc, m, synqe->e);
+}
+
+void
+t3_offload_socket(struct toedev *tod, void *arg, struct socket *so)
+{
+	struct adapter *sc = tod->tod_softc;
+	struct tom_data *td = sc->tom_softc;
+	struct synq_entry *synqe = arg;
+#ifdef INVARIANTS
+	struct inpcb *inp = sotoinpcb(so);
+#endif
+	struct cpl_pass_establish *cpl = synqe->cpl;
+	struct toepcb *toep = synqe->toep;
+
+	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
+	INP_WLOCK_ASSERT(inp);
+
+	offload_socket(so, toep);
+	make_established(so, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+	update_tid(td, toep, synqe->tid);
+}
+#endif
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
deleted file mode 100644
index d6f9804..0000000
--- a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/**************************************************************************
-
-Copyright (c) 2007, Chelsio Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2.
Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - -$FreeBSD$ - -***************************************************************************/ - -#ifndef T3_DDP_H -#define T3_DDP_H - -/* Should be 1 or 2 indicating single or double kernel buffers. */ -#define NUM_DDP_KBUF 2 - -/* min receive window for a connection to be considered for DDP */ -#define MIN_DDP_RCV_WIN (48 << 10) - -/* amount of Rx window not available to DDP to avoid window exhaustion */ -#define DDP_RSVD_WIN (16 << 10) - -/* # of sentinel invalid page pods at the end of a group of valid page pods */ -#define NUM_SENTINEL_PPODS 0 - -/* # of pages a pagepod can hold without needing another pagepod */ -#define PPOD_PAGES 4 - -/* page pods are allocated in groups of this size (must be power of 2) */ -#define PPOD_CLUSTER_SIZE 16 - -/* for each TID we reserve this many page pods up front */ -#define RSVD_PPODS_PER_TID 1 - -struct pagepod { - uint32_t pp_vld_tid; - uint32_t pp_pgsz_tag_color; - uint32_t pp_max_offset; - uint32_t pp_page_offset; - uint64_t pp_rsvd; - uint64_t pp_addr[5]; -}; - -#define PPOD_SIZE sizeof(struct pagepod) - -#define S_PPOD_TID 0 -#define M_PPOD_TID 0xFFFFFF -#define V_PPOD_TID(x) ((x) << S_PPOD_TID) - -#define S_PPOD_VALID 24 -#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID) -#define F_PPOD_VALID V_PPOD_VALID(1U) - -#define S_PPOD_COLOR 0 -#define M_PPOD_COLOR 0x3F -#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR) - -#define S_PPOD_TAG 6 -#define M_PPOD_TAG 0xFFFFFF -#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG) - -#define S_PPOD_PGSZ 30 -#define M_PPOD_PGSZ 0x3 -#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) - -#include -#include -#include - -/* DDP gather lists can specify an offset only for the first page. 
*/ -struct ddp_gather_list { - unsigned int dgl_length; - unsigned int dgl_offset; - unsigned int dgl_nelem; - vm_page_t dgl_pages[0]; -}; - -struct ddp_buf_state { - unsigned int cur_offset; /* offset of latest DDP notification */ - unsigned int flags; - struct ddp_gather_list *gl; -}; - -struct ddp_state { - struct ddp_buf_state buf_state[2]; /* per buffer state */ - int cur_buf; - unsigned short kbuf_noinval; - unsigned short kbuf_idx; /* which HW buffer is used for kbuf */ - struct ddp_gather_list *ubuf; - int user_ddp_pending; - unsigned int ubuf_nppods; /* # of page pods for buffer 1 */ - unsigned int ubuf_tag; - unsigned int ubuf_ddp_ready; - int cancel_ubuf; - int get_tcb_count; - unsigned int kbuf_posted; - unsigned int kbuf_nppods[NUM_DDP_KBUF]; - unsigned int kbuf_tag[NUM_DDP_KBUF]; - struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ -}; - -/* buf_state flags */ -enum { - DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */ - DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */ - DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */ - DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was - completed with a segment having the - PSH flag set */ - DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */ -}; - -#include -struct sockbuf; - -/* - * Returns 1 if a UBUF DMA buffer might be active. - */ -static inline int -t3_ddp_ubuf_pending(struct toepcb *toep) -{ - struct ddp_state *p = &toep->tp_ddp_state; - - /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, - * but DDP_STATE() is only valid if the connection actually enabled - * DDP. - */ - if (p->kbuf[0] == NULL) - return (0); - - return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || - (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); -} - -int t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, - unsigned int nppods, unsigned int tag, unsigned int maxoff, - unsigned int pg_off, unsigned int color); -int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); -void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); -void t3_free_ddp_gl(struct ddp_gather_list *gl); -int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); -//void t3_repost_kbuf(struct socket *so, int modulate, int activate); -void t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock); -int t3_post_ubuf(struct toepcb *toep, const struct uio *uio, int nonblock, - int rcv_flags, int modulate, int post_kbuf); -void t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv); -int t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, - const struct uio *uio, int nonblock, - int rcv_flags, int modulate, int post_kbuf); -int t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock); -void t3_cleanup_ddp(struct toepcb *toep); -void t3_release_ddp_resources(struct toepcb *toep); -void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); -void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, - unsigned int tag1, unsigned int len); -void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0, - unsigned int len1, unsigned int offset1, - uint64_t ddp_flags, uint64_t flag_mask, int modulate); -#endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h deleted file mode 100644 index 3042ef0..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h +++ /dev/null @@ -1,47 +0,0 @@ - -/*- 
- * Copyright (c) 2007, Chelsio Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ -#ifndef CXGB_TCP_H_ -#define CXGB_TCP_H_ -#ifdef TCP_USRREQS_OVERLOAD -struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno); -#else -#define cxgb_tcp_drop tcp_drop -#endif -void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip); -struct tcpcb *cxgb_tcp_close(struct tcpcb *tp); - -extern struct pr_usrreqs cxgb_tcp_usrreqs; -#ifdef INET6 -extern struct pr_usrreqs cxgb_tcp6_usrreqs; -#endif - -#include -SYSCTL_DECL(_net_inet_tcp_cxgb); -#endif /* CXGB_TCP_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c deleted file mode 100644 index 6cb6107..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c +++ /dev/null @@ -1,97 +0,0 @@ -/*- - * Copyright (c) 2007, Chelsio Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -__FBSDID("$FreeBSD$"); - -/* - * grab bag of accessor routines that will either be moved to netinet - * or removed - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * This file contains code as a short-term staging area before it is moved in - * to sys/netinet/tcp_offload.c - */ - -void -sockbuf_lock(struct sockbuf *sb) -{ - - SOCKBUF_LOCK(sb); -} - -void -sockbuf_lock_assert(struct sockbuf *sb) -{ - - SOCKBUF_LOCK_ASSERT(sb); -} - -void -sockbuf_unlock(struct sockbuf *sb) -{ - - SOCKBUF_UNLOCK(sb); -} - -int -sockbuf_sbspace(struct sockbuf *sb) -{ - - return (sbspace(sb)); -} - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h deleted file mode 100644 index 2b516d7..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h +++ /dev/null @@ -1,14 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef CXGB_TCP_OFFLOAD_H_ -#define CXGB_TCP_OFFLOAD_H_ - -struct sockbuf; - -void sockbuf_lock(struct sockbuf *); -void sockbuf_lock_assert(struct sockbuf *); -void sockbuf_unlock(struct sockbuf *); -int sockbuf_sbspace(struct sockbuf *); - - -#endif /* CXGB_TCP_OFFLOAD_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h index 1b28e96..d0046c8 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2007-2008, Chelsio Inc. + * Copyright (c) 2007-2009, Chelsio Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,88 +32,63 @@ #include #include +#define TP_DATASENT (1 << 0) +#define TP_TX_WAIT_IDLE (1 << 1) +#define TP_FIN_SENT (1 << 2) +#define TP_ABORT_RPL_PENDING (1 << 3) +#define TP_ABORT_SHUTDOWN (1 << 4) +#define TP_ABORT_RPL_RCVD (1 << 5) +#define TP_ABORT_REQ_RCVD (1 << 6) +#define TP_ATTACHED (1 << 7) +#define TP_CPL_DONE (1 << 8) +#define TP_IS_A_SYNQ_ENTRY (1 << 9) +#define TP_ABORT_RPL_SENT (1 << 10) +#define TP_SEND_FIN (1 << 11) + struct toepcb { - struct toedev *tp_toedev; + TAILQ_ENTRY(toepcb) link; /* toep_list */ + int tp_flags; + struct toedev *tp_tod; struct l2t_entry *tp_l2t; - unsigned int tp_tid; + int tp_tid; int tp_wr_max; int tp_wr_avail; int tp_wr_unacked; int tp_delack_mode; - int tp_mtu_idx; int tp_ulp_mode; - int tp_qset_idx; - int tp_mss_clamp; int tp_qset; - int tp_flags; - int tp_enqueued_bytes; - int tp_page_count; - int tp_state; - - tcp_seq tp_iss; - tcp_seq tp_delack_seq; - tcp_seq tp_rcv_wup; - tcp_seq tp_copied_seq; - uint64_t tp_write_seq; + int tp_enqueued; + int tp_rx_credits; - volatile int tp_refcount; - vm_page_t *tp_pages; - - struct tcpcb *tp_tp; - struct mbuf *tp_m_last; - bus_dma_tag_t tp_tx_dmat; - bus_dma_tag_t tp_rx_dmat; - bus_dmamap_t tp_dmamap; + struct inpcb *tp_inp; + struct mbuf *tp_m_last; - LIST_ENTRY(toepcb) synq_entry; struct mbuf_head wr_list; struct mbuf_head out_of_order_queue; - struct ddp_state tp_ddp_state; - struct cv tp_cv; - }; static inline void reset_wr_list(struct toepcb *toep) { - mbufq_init(&toep->wr_list); } static inline void -purge_wr_queue(struct toepcb *toep) -{ - struct mbuf *m; - - while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) - m_freem(m); -} - -static inline void enqueue_wr(struct toepcb *toep, struct mbuf *m) { - mbufq_tail(&toep->wr_list, m); } static inline struct mbuf * peek_wr(const struct toepcb *toep) { - return 
(mbufq_peek(&toep->wr_list)); } static inline struct mbuf * dequeue_wr(struct toepcb *toep) { - return (mbufq_dequeue(&toep->wr_list)); } -#define wr_queue_walk(toep, m) \ - for (m = peek_wr(toep); m; m = m->m_nextpkt) - - - #endif - diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c index 1328044..8f0dd25 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -1,261 +1,106 @@ -/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include -#include +#include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include #include -#include -#include -#include -#include #include - -#include -#include - #include -#include -#include -#include - -#include -#include - -#include - -#include #include -#include -#include -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -TAILQ_HEAD(, adapter) adapter_list; -static struct rwlock adapter_list_lock; - -static TAILQ_HEAD(, tom_data) cxgb_list; -static struct mtx cxgb_list_lock; -static const unsigned int MAX_ATIDS = 64 * 1024; -static const unsigned int ATID_BASE = 0x100000; - -static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); -static void cxgb_register_listeners(void); -static void t3c_tom_add(struct t3cdev *cdev); - -/* - * Handlers for each CPL opcode - */ -static cxgb_cpl_handler_func tom_cpl_handlers[256]; - - -static eventhandler_tag listen_tag; - -static struct offload_id t3_toe_id_tab[] = { - { TOE_ID_CHELSIO_T3, 0 }, - { TOE_ID_CHELSIO_T3B, 0 }, - { TOE_ID_CHELSIO_T3C, 0 }, - { 0 } +#include + +#ifdef TCP_OFFLOAD +#include "cxgb_include.h" +#include "ulp/tom/cxgb_tom.h" +#include "ulp/tom/cxgb_l2t.h" +#include "ulp/tom/cxgb_toepcb.h" + +MALLOC_DEFINE(M_CXGB, "cxgb", "Chelsio T3 Offload services"); + +/* Module ops */ +static int t3_tom_mod_load(void); +static int t3_tom_mod_unload(void); +static int t3_tom_modevent(module_t, int, void *); + +/* ULD ops and helpers */ +static int t3_tom_activate(struct adapter *); +static int t3_tom_deactivate(struct adapter *); + +static int alloc_tid_tabs(struct tid_info *, u_int, u_int, u_int, u_int, u_int); +static void free_tid_tabs(struct tid_info *); +static int write_smt_entry(struct adapter *, int); +static void free_tom_data(struct tom_data *); + +static struct uld_info tom_uld_info = { + .uld_id = ULD_TOM, + .activate = t3_tom_activate, + .deactivate = t3_tom_deactivate, }; -static struct tom_info t3_tom_info = { - .ti_attach = t3_toe_attach, - .ti_id_table = t3_toe_id_tab, - .ti_name = "Chelsio-T3" -}; - -struct cxgb_client t3c_tom_client = { - .name = "tom_cxgb3", - .add = t3c_tom_add, - .remove = NULL, - .handlers = tom_cpl_handlers, - .redirect = NULL -}; - -void -cxgb_log_tcb(struct adapter *sc, unsigned int tid) -{ - - char buf[TCB_SIZE]; - uint64_t *tcb = (uint64_t *)buf; - int i, error; - struct mc7 *mem = &sc->cm; - - error = t3_mc7_bd_read(mem, tid*TCB_SIZE/8, TCB_SIZE/8, tcb); - if (error) - printf("cxgb_tcb_log failed\n"); - - - CTR1(KTR_CXGB, "TCB tid=%u", tid); - for (i = 0; i < TCB_SIZE / 32; i++) { - - CTR5(KTR_CXGB, "%1d: %08x %08x %08x %08x", - i, (uint32_t)tcb[1], (uint32_t)(tcb[1] >> 32), - (uint32_t)tcb[0], (uint32_t)(tcb[0] >> 32)); - - tcb += 2; - CTR4(KTR_CXGB, " %08x %08x %08x %08x", - (uint32_t)tcb[1], (uint32_t)(tcb[1] >> 32), - (uint32_t)tcb[0], (uint32_t)(tcb[0] >> 32)); - tcb += 2; - } -} - -/* - * Add an skb to the deferred skb queue for processing from process context. 
- */ -void -t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler) -{ - struct tom_data *td = TOM_DATA(dev); - - m_set_handler(m, handler); - mtx_lock(&td->deferq.lock); - - mbufq_tail(&td->deferq, m); - if (mbufq_len(&td->deferq) == 1) - taskqueue_enqueue(td->tq, &td->deferq_task); - mtx_lock(&td->deferq.lock); -} - struct toepcb * -toepcb_alloc(void) +toepcb_alloc(struct toedev *tod) { struct toepcb *toep; - - toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT|M_ZERO); - + + toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT | M_ZERO); if (toep == NULL) return (NULL); - toepcb_init(toep); - return (toep); -} + toep->tp_tod = tod; + toep->tp_wr_max = toep->tp_wr_avail = 15; + toep->tp_wr_unacked = 0; + toep->tp_delack_mode = 0; -void -toepcb_init(struct toepcb *toep) -{ - toep->tp_refcount = 1; - cv_init(&toep->tp_cv, "toep cv"); -} - -void -toepcb_hold(struct toepcb *toep) -{ - atomic_add_acq_int(&toep->tp_refcount, 1); + return (toep); } void -toepcb_release(struct toepcb *toep) +toepcb_free(struct toepcb *toep) { - if (toep->tp_refcount == 1) { - free(toep, M_CXGB); - return; - } - atomic_add_acq_int(&toep->tp_refcount, -1); -} - - -/* - * Add a T3 offload device to the list of devices we are managing. - */ -static void -t3cdev_add(struct tom_data *t) -{ - mtx_lock(&cxgb_list_lock); - TAILQ_INSERT_TAIL(&cxgb_list, t, entry); - mtx_unlock(&cxgb_list_lock); -} - -static inline int -cdev2type(struct t3cdev *cdev) -{ - int type = 0; - - switch (cdev->type) { - case T3A: - type = TOE_ID_CHELSIO_T3; - break; - case T3B: - type = TOE_ID_CHELSIO_T3B; - break; - case T3C: - type = TOE_ID_CHELSIO_T3C; - break; - } - return (type); + free(toep, M_CXGB); } -/* - * Allocate and initialize the TID tables. Returns 0 on success. - */ static int -init_tid_tabs(struct tid_info *t, unsigned int ntids, - unsigned int natids, unsigned int nstids, - unsigned int atid_base, unsigned int stid_base) +alloc_tid_tabs(struct tid_info *t, u_int ntids, u_int natids, u_int nstids, + u_int atid_base, u_int stid_base) { unsigned long size = ntids * sizeof(*t->tid_tab) + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); - t->tid_tab = cxgb_alloc_mem(size); + t->tid_tab = malloc(size, M_CXGB, M_NOWAIT | M_ZERO); if (!t->tid_tab) return (ENOMEM); @@ -270,8 +115,8 @@ init_tid_tabs(struct tid_info *t, unsigned int ntids, t->afree = NULL; t->stids_in_use = t->atids_in_use = 0; t->tids_in_use = 0; - mtx_init(&t->stid_lock, "stid", NULL, MTX_DUPOK|MTX_DEF); - mtx_init(&t->atid_lock, "atid", NULL, MTX_DUPOK|MTX_DEF); + mtx_init(&t->stid_lock, "stid", NULL, MTX_DEF); + mtx_init(&t->atid_lock, "atid", NULL, MTX_DEF); /* * Setup the free lists for stid_tab and atid_tab. 
@@ -286,1240 +131,266 @@ init_tid_tabs(struct tid_info *t, unsigned int ntids, t->atid_tab[natids - 1].next = &t->atid_tab[natids]; t->afree = t->atid_tab; } - return 0; + return (0); } static void -free_tid_maps(struct tid_info *t) +free_tid_tabs(struct tid_info *t) { - mtx_destroy(&t->stid_lock); - mtx_destroy(&t->atid_lock); - cxgb_free_mem(t->tid_tab); + if (mtx_initialized(&t->stid_lock)) + mtx_destroy(&t->stid_lock); + if (mtx_initialized(&t->atid_lock)) + mtx_destroy(&t->atid_lock); + free(t->tid_tab, M_CXGB); } -static inline void -add_adapter(adapter_t *adap) -{ - rw_wlock(&adapter_list_lock); - TAILQ_INSERT_TAIL(&adapter_list, adap, adapter_entry); - rw_wunlock(&adapter_list_lock); -} - -static inline void -remove_adapter(adapter_t *adap) -{ - rw_wlock(&adapter_list_lock); - TAILQ_REMOVE(&adapter_list, adap, adapter_entry); - rw_wunlock(&adapter_list_lock); -} - -/* - * Populate a TID_RELEASE WR. The mbuf must be already propely sized. - */ -static inline void -mk_tid_release(struct mbuf *m, unsigned int tid) -{ - struct cpl_tid_release *req; - - m_set_priority(m, CPL_PRIORITY_SETUP); - req = mtod(m, struct cpl_tid_release *); - m->m_pkthdr.len = m->m_len = sizeof(*req); - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); -} - -static void -t3_process_tid_release_list(void *data, int pending) +static int +write_smt_entry(struct adapter *sc, int idx) { + struct port_info *pi = &sc->port[idx]; + struct cpl_smt_write_req *req; struct mbuf *m; - struct t3cdev *tdev = data; - struct t3c_data *td = T3C_DATA (tdev); - - mtx_lock(&td->tid_release_lock); - while (td->tid_release_list) { - struct toe_tid_entry *p = td->tid_release_list; - - td->tid_release_list = (struct toe_tid_entry *)p->ctx; - mtx_unlock(&td->tid_release_lock); - m = m_get(M_WAIT, MT_DATA); - mk_tid_release(m, p - td->tid_maps.tid_tab); - cxgb_ofld_send(tdev, m); - p->ctx = NULL; - mtx_lock(&td->tid_release_lock); - } - mtx_unlock(&td->tid_release_lock); -} -int -cxgb_offload_activate(struct adapter *adapter) -{ - struct t3cdev *dev = &adapter->tdev; - int natids, err; - struct t3c_data *t; - struct tid_range stid_range, tid_range; - struct mtutab mtutab; - unsigned int l2t_capacity; - - t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); - if (!t) + m = M_GETHDR_OFLD(0, CPL_PRIORITY_CONTROL, req); + if (m == NULL) { + log(LOG_ERR, "%s: no mbuf, can't write SMT entry for %d\n", + __func__, idx); return (ENOMEM); - dev->adapter = adapter; - - err = (EOPNOTSUPP); - if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 || - dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 || - dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 || - dev->ctl(dev, GET_MTUS, &mtutab) < 0 || - dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 || - dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) { - device_printf(adapter->dev, "%s: dev->ctl check failed\n", __FUNCTION__); - goto out_free; } - - err = (ENOMEM); - L2DATA(dev) = t3_init_l2t(l2t_capacity); - if (!L2DATA(dev)) { - device_printf(adapter->dev, "%s: t3_init_l2t failed\n", __FUNCTION__); - goto out_free; - } - natids = min(tid_range.num / 2, MAX_ATIDS); - err = init_tid_tabs(&t->tid_maps, tid_range.num, natids, - stid_range.num, ATID_BASE, stid_range.base); - if (err) { - device_printf(adapter->dev, "%s: init_tid_tabs failed\n", __FUNCTION__); - goto out_free_l2t; - } - - t->mtus = mtutab.mtus; - t->nmtus = mtutab.size; - - TASK_INIT(&t->tid_release_task, 0 /* XXX? 
*/, t3_process_tid_release_list, dev); - mtx_init(&t->tid_release_lock, "tid release", NULL, MTX_DUPOK|MTX_DEF); - t->dev = dev; - - T3C_DATA (dev) = t; - dev->recv = process_rx; - dev->arp_update = t3_l2t_update; - /* Register netevent handler once */ - if (TAILQ_EMPTY(&adapter_list)) { -#if defined(CONFIG_CHELSIO_T3_MODULE) - if (prepare_arp_with_t3core()) - log(LOG_ERR, "Unable to set offload capabilities\n"); -#endif - } - CTR1(KTR_CXGB, "adding adapter %p", adapter); - add_adapter(adapter); - device_printf(adapter->dev, "offload started\n"); - adapter->flags |= CXGB_OFLD_INIT; - return (0); - -out_free_l2t: - t3_free_l2t(L2DATA(dev)); - L2DATA(dev) = NULL; -out_free: - free(t, M_CXGB); - return (err); -} - -void -cxgb_offload_deactivate(struct adapter *adapter) -{ - struct t3cdev *tdev = &adapter->tdev; - struct t3c_data *t = T3C_DATA(tdev); - - printf("removing adapter %p\n", adapter); - remove_adapter(adapter); - if (TAILQ_EMPTY(&adapter_list)) { -#if defined(CONFIG_CHELSIO_T3_MODULE) - restore_arp_sans_t3core(); -#endif - } - free_tid_maps(&t->tid_maps); - T3C_DATA(tdev) = NULL; - t3_free_l2t(L2DATA(tdev)); - L2DATA(tdev) = NULL; - mtx_destroy(&t->tid_release_lock); - free(t, M_CXGB); -} - -/* - * Sends an sk_buff to a T3C driver after dealing with any active network taps. - */ -int -cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m) -{ - int r; - - r = dev->send(dev, m); - return r; -} - -static struct ifnet * -get_iff_from_mac(adapter_t *adapter, const uint8_t *mac, unsigned int vlan) -{ - int i; - - for_each_port(adapter, i) { -#ifdef notyet - const struct vlan_group *grp; -#endif - const struct port_info *p = &adapter->port[i]; - struct ifnet *ifp = p->ifp; - - if (!memcmp(p->hw_addr, mac, ETHER_ADDR_LEN)) { -#ifdef notyet - - if (vlan && vlan != EVL_VLID_MASK) { - grp = p->vlan_grp; - dev = grp ? grp->vlan_devices[vlan] : NULL; - } else - while (dev->master) - dev = dev->master; -#endif - return (ifp); - } - } - return (NULL); -} -static inline void -failover_fixup(adapter_t *adapter, int port) -{ - if (adapter->params.rev == 0) { - struct ifnet *ifp = adapter->port[port].ifp; - struct cmac *mac = &adapter->port[port].mac; - if (!(ifp->if_flags & IFF_UP)) { - /* Failover triggered by the interface ifdown */ - t3_write_reg(adapter, A_XGM_TX_CTRL + mac->offset, - F_TXEN); - t3_read_reg(adapter, A_XGM_TX_CTRL + mac->offset); - } else { - /* Failover triggered by the interface link down */ - t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); - t3_read_reg(adapter, A_XGM_RX_CTRL + mac->offset); - t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, - F_RXEN); - } - } -} + req->wr.wrh_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx)); + req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */ + req->iff = idx; + memset(req->src_mac1, 0, sizeof(req->src_mac1)); + memcpy(req->src_mac0, pi->hw_addr, ETHER_ADDR_LEN); -static int -cxgb_ulp_iscsi_ctl(adapter_t *adapter, unsigned int req, void *data) -{ - int ret = 0; - struct ulp_iscsi_info *uiip = data; - - switch (req) { - case ULP_ISCSI_GET_PARAMS: - uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT); - uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT); - uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK); - /* - * On tx, the iscsi pdu has to be <= tx page size and has to - * fit into the Tx PM FIFO. 
- */ - uiip->max_txsz = min(adapter->params.tp.tx_pg_size, - t3_read_reg(adapter, A_PM1_TX_CFG) >> 17); - /* on rx, the iscsi pdu has to be < rx page size and the - whole pdu + cpl headers has to fit into one sge buffer */ - /* also check the max rx data length programmed in TP */ - uiip->max_rxsz = min(uiip->max_rxsz, - ((t3_read_reg(adapter, A_TP_PARA_REG2)) - >> S_MAXRXDATA) & M_MAXRXDATA); - break; - case ULP_ISCSI_SET_PARAMS: - t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask); - break; - default: - ret = (EOPNOTSUPP); - } - return ret; -} - -/* Response queue used for RDMA events. */ -#define ASYNC_NOTIF_RSPQ 0 - -static int -cxgb_rdma_ctl(adapter_t *adapter, unsigned int req, void *data) -{ - int ret = 0; - - switch (req) { - case RDMA_GET_PARAMS: { - struct rdma_info *req = data; - - req->udbell_physbase = rman_get_start(adapter->udbs_res); - req->udbell_len = rman_get_size(adapter->udbs_res); - req->tpt_base = t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT); - req->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT); - req->pbl_base = t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT); - req->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT); - req->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT); - req->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT); - req->kdb_addr = (void *)((unsigned long)rman_get_virtual(adapter->regs_res) + A_SG_KDOORBELL); break; - } - case RDMA_CQ_OP: { - struct rdma_cq_op *req = data; - - /* may be called in any context */ - mtx_lock_spin(&adapter->sge.reg_lock); - ret = t3_sge_cqcntxt_op(adapter, req->id, req->op, - req->credits); - mtx_unlock_spin(&adapter->sge.reg_lock); - break; - } - case RDMA_GET_MEM: { - struct ch_mem_range *t = data; - struct mc7 *mem; - - if ((t->addr & 7) || (t->len & 7)) - return (EINVAL); - if (t->mem_id == MEM_CM) - mem = &adapter->cm; - else if (t->mem_id == MEM_PMRX) - mem = &adapter->pmrx; - else if (t->mem_id == MEM_PMTX) - mem = &adapter->pmtx; - else - return (EINVAL); - - ret = t3_mc7_bd_read(mem, t->addr/8, t->len/8, (u64 *)t->buf); - if (ret) - return (ret); - break; - } - case RDMA_CQ_SETUP: { - struct rdma_cq_setup *req = data; - - mtx_lock_spin(&adapter->sge.reg_lock); - ret = t3_sge_init_cqcntxt(adapter, req->id, req->base_addr, - req->size, ASYNC_NOTIF_RSPQ, - req->ovfl_mode, req->credits, - req->credit_thres); - mtx_unlock_spin(&adapter->sge.reg_lock); - break; - } - case RDMA_CQ_DISABLE: - mtx_lock_spin(&adapter->sge.reg_lock); - ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data); - mtx_unlock_spin(&adapter->sge.reg_lock); - break; - case RDMA_CTRL_QP_SETUP: { - struct rdma_ctrlqp_setup *req = data; - - mtx_lock_spin(&adapter->sge.reg_lock); - ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0, - SGE_CNTXT_RDMA, ASYNC_NOTIF_RSPQ, - req->base_addr, req->size, - FW_RI_TID_START, 1, 0); - mtx_unlock_spin(&adapter->sge.reg_lock); - break; - } - default: - ret = EOPNOTSUPP; - } - return (ret); -} + t3_offload_tx(sc, m); -static int -cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data) -{ - struct adapter *adapter = tdev2adap(tdev); - struct tid_range *tid; - struct mtutab *mtup; - struct iff_mac *iffmacp; - struct ddp_params *ddpp; - struct adap_ports *ports; - struct ofld_page_info *rx_page_info; - struct tp_params *tp = &adapter->params.tp; - int port; - - switch (req) { - case GET_MAX_OUTSTANDING_WR: - *(unsigned int *)data = FW_WR_NUM; - break; - case GET_WR_LEN: - *(unsigned int *)data = WR_FLITS; - break; - case GET_TX_MAX_CHUNK: - *(unsigned int *)data = 1 << 20; /* 1MB */ - 
break; - case GET_TID_RANGE: - tid = data; - tid->num = t3_mc5_size(&adapter->mc5) - - adapter->params.mc5.nroutes - - adapter->params.mc5.nfilters - - adapter->params.mc5.nservers; - tid->base = 0; - break; - case GET_STID_RANGE: - tid = data; - tid->num = adapter->params.mc5.nservers; - tid->base = t3_mc5_size(&adapter->mc5) - tid->num - - adapter->params.mc5.nfilters - - adapter->params.mc5.nroutes; - break; - case GET_L2T_CAPACITY: - *(unsigned int *)data = 2048; - break; - case GET_MTUS: - mtup = data; - mtup->size = NMTUS; - mtup->mtus = adapter->params.mtus; - break; - case GET_IFF_FROM_MAC: - iffmacp = data; - iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr, - iffmacp->vlan_tag & EVL_VLID_MASK); - break; - case GET_DDP_PARAMS: - ddpp = data; - ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT); - ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT); - ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK); - break; - case GET_PORTS: - ports = data; - ports->nports = adapter->params.nports; - for_each_port(adapter, port) - ports->lldevs[port] = adapter->port[port].ifp; - break; - case FAILOVER: - port = *(int *)data; - t3_port_failover(adapter, port); - failover_fixup(adapter, port); - break; - case FAILOVER_DONE: - port = *(int *)data; - t3_failover_done(adapter, port); - break; - case FAILOVER_CLEAR: - t3_failover_clear(adapter); - break; - case GET_RX_PAGE_INFO: - rx_page_info = data; - rx_page_info->page_size = tp->rx_pg_size; - rx_page_info->num = tp->rx_num_pgs; - break; - case ULP_ISCSI_GET_PARAMS: - case ULP_ISCSI_SET_PARAMS: - if (!offload_running(adapter)) - return (EAGAIN); - return cxgb_ulp_iscsi_ctl(adapter, req, data); - case RDMA_GET_PARAMS: - case RDMA_CQ_OP: - case RDMA_CQ_SETUP: - case RDMA_CQ_DISABLE: - case RDMA_CTRL_QP_SETUP: - case RDMA_GET_MEM: - if (!offload_running(adapter)) - return (EAGAIN); - return cxgb_rdma_ctl(adapter, req, data); - default: - return (EOPNOTSUPP); - } - return 0; + return (0); } -/* - * Allocate a TOM data structure, - * initialize its cpl_handlers - * and register it as a T3C client - */ static void -t3c_tom_add(struct t3cdev *cdev) +free_tom_data(struct tom_data *td) { - int i; - unsigned int wr_len; - struct tom_data *t; - struct toedev *tdev; - struct adap_ports *port_info; - - t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); - if (t == NULL) - return; - - cdev->send = t3_offload_tx; - cdev->ctl = cxgb_offload_ctl; - - if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0) - goto out_free_tom; - - port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO); - if (!port_info) - goto out_free_tom; - - if (cdev->ctl(cdev, GET_PORTS, port_info) < 0) - goto out_free_all; - - t3_init_wr_tab(wr_len); - t->cdev = cdev; - t->client = &t3c_tom_client; - - /* Register TCP offload device */ - tdev = &t->tdev; - tdev->tod_ttid = cdev2type(cdev); - tdev->tod_lldev = cdev->lldev; - - if (register_toedev(tdev, "toe%d")) { - printf("unable to register offload device"); - goto out_free_all; - } - TOM_DATA(tdev) = t; + KASSERT(TAILQ_EMPTY(&td->toep_list), + ("%s: toep_list not empty", __func__)); - for (i = 0; i < port_info->nports; i++) { - struct ifnet *ifp = port_info->lldevs[i]; - TOEDEV(ifp) = tdev; - - CTR1(KTR_TOM, "enabling toe on %p", ifp); - ifp->if_capabilities |= IFCAP_TOE4; - ifp->if_capenable |= IFCAP_TOE4; - } - t->ports = port_info; + if (td->listen_mask != 0) + hashdestroy(td->listen_hash, M_CXGB, td->listen_mask); - /* Add device to the list of offload devices */ - t3cdev_add(t); - - /* Activate TCP offload 
device */ - cxgb_offload_activate(TOM_DATA(tdev)->cdev->adapter); - - activate_offload(tdev); - cxgb_register_listeners(); - return; - -out_free_all: - printf("out_free_all fail\n"); - free(port_info, M_CXGB); -out_free_tom: - printf("out_free_tom fail\n"); - free(t, M_CXGB); - return; -} - - - -static int -do_act_open_rpl(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_act_open_rpl *rpl = cplhdr(m); - unsigned int atid = G_TID(ntohl(rpl->atid)); - struct toe_tid_entry *toe_tid; - - toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); - if (toe_tid->ctx && toe_tid->client && toe_tid->client->handlers && - toe_tid->client->handlers[CPL_ACT_OPEN_RPL]) { - return toe_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, m, - toe_tid->ctx); - } else { - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, CPL_ACT_OPEN_RPL); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } -} - -static int -do_stid_rpl(struct t3cdev *dev, struct mbuf *m) -{ - union opcode_tid *p = cplhdr(m); - unsigned int stid = G_TID(ntohl(p->opcode_tid)); - struct toe_tid_entry *toe_tid; - - toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); - if (toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[p->opcode]) { - return toe_tid->client->handlers[p->opcode] (dev, m, toe_tid->ctx); - } else { - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, p->opcode); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } -} - -static int -do_hwtid_rpl(struct t3cdev *dev, struct mbuf *m) -{ - union opcode_tid *p = cplhdr(m); - unsigned int hwtid; - struct toe_tid_entry *toe_tid; - - DPRINTF("do_hwtid_rpl opcode=0x%x\n", p->opcode); - hwtid = G_TID(ntohl(p->opcode_tid)); - - toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); - if (toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[p->opcode]) { - return toe_tid->client->handlers[p->opcode] - (dev, m, toe_tid->ctx); - } else { - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, p->opcode); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } -} - -static int -do_cr(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_pass_accept_req *req = cplhdr(m); - unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); - struct toe_tid_entry *toe_tid; - - toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); - if (toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) { - return toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ] - (dev, m, toe_tid->ctx); - } else { - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, CPL_PASS_ACCEPT_REQ); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } -} - -static int -do_abort_req_rss(struct t3cdev *dev, struct mbuf *m) -{ - union opcode_tid *p = cplhdr(m); - unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); - struct toe_tid_entry *toe_tid; - - toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); - if (toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[p->opcode]) { - return toe_tid->client->handlers[p->opcode] - (dev, m, toe_tid->ctx); - } else { - struct cpl_abort_req_rss *req = cplhdr(m); - struct cpl_abort_rpl *rpl; - - struct mbuf *m = m_get(M_NOWAIT, MT_DATA); - if (!m) { - log(LOG_NOTICE, "do_abort_req_rss: couldn't get mbuf!\n"); - goto out; - } - - m_set_priority(m, CPL_PRIORITY_DATA); - rpl = cplhdr(m); - rpl->wr.wr_hi = - htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); - rpl->wr.wr_lo = htonl(V_WR_TID(GET_TID(req))); - OPCODE_TID(rpl) = - 
htonl(MK_OPCODE_TID(CPL_ABORT_RPL, GET_TID(req))); - rpl->cmd = req->status; - cxgb_ofld_send(dev, m); - out: - return (CPL_RET_BUF_DONE); - } -} - -static int -do_act_establish(struct t3cdev *dev, struct mbuf *m) -{ - struct cpl_act_establish *req; - unsigned int atid; - struct toe_tid_entry *toe_tid; - - req = cplhdr(m); - atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); - toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); - if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[CPL_ACT_ESTABLISH]) { - - return toe_tid->client->handlers[CPL_ACT_ESTABLISH] - (dev, m, toe_tid->ctx); - } else { - - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, CPL_ACT_ESTABLISH); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } -} - - -static int -do_term(struct t3cdev *dev, struct mbuf *m) -{ - unsigned int hwtid = ntohl(m_get_priority(m)) >> 8 & 0xfffff; - unsigned int opcode = G_OPCODE(ntohl(m->m_pkthdr.csum_data)); - struct toe_tid_entry *toe_tid; - - toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); - if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && - toe_tid->client->handlers[opcode]) { - return toe_tid->client->handlers[opcode](dev, m, toe_tid->ctx); - } else { - log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", - dev->name, opcode); - return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; - } - return (0); -} - -/* - * Process a received packet with an unknown/unexpected CPL opcode. - */ -static int -do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) -{ - log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, - 0xFF & *mtod(m, unsigned int *)); - return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); -} - -/* - * Add a new handler to the CPL dispatch table. A NULL handler may be supplied - * to unregister an existing handler. - */ -void -t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) -{ - if (opcode < UCHAR_MAX) - tom_cpl_handlers[opcode] = h ? h : do_bad_cpl; - else - log(LOG_ERR, "Chelsio T3 TOM: handler registration for " - "opcode %u failed\n", opcode); + if (mtx_initialized(&td->toep_list_lock)) + mtx_destroy(&td->toep_list_lock); + if (mtx_initialized(&td->lctx_hash_lock)) + mtx_destroy(&td->lctx_hash_lock); + if (mtx_initialized(&td->tid_release_lock)) + mtx_destroy(&td->tid_release_lock); + if (td->l2t) + t3_free_l2t(td->l2t); + free_tid_tabs(&td->tid_maps); + free(td, M_CXGB); } /* - * Make a preliminary determination if a connection can be offloaded. It's OK - * to fail the offload later if we say we can offload here. For now this - * always accepts the offload request unless there are IP options. + * Ground control to Major TOM + * Commencing countdown, engines on */ static int -can_offload(struct toedev *dev, struct socket *so) -{ - struct tom_data *tomd = TOM_DATA(dev); - struct t3cdev *cdev = T3CDEV(dev->tod_lldev); - struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; - - return so_sotoinpcb(so)->inp_depend4.inp4_options == NULL && - tomd->conf.activated && - (tomd->conf.max_conn < 0 || - atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn); -} - -static int -tom_ctl(struct toedev *dev, unsigned int req, void *data) -{ - struct tom_data *t = TOM_DATA(dev); - struct t3cdev *cdev = t->cdev; - - if (cdev->ctl) - return cdev->ctl(cdev, req, data); - - return (EOPNOTSUPP); -} - -/* - * Free an active-open TID. 
- */ -void * -cxgb_free_atid(struct t3cdev *tdev, int atid) -{ - struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; - union active_open_entry *p = atid2entry(t, atid); - void *ctx = p->toe_tid.ctx; - - mtx_lock(&t->atid_lock); - p->next = t->afree; - t->afree = p; - t->atids_in_use--; - mtx_unlock(&t->atid_lock); - - return ctx; -} - -/* - * Free a server TID and return it to the free pool. - */ -void -cxgb_free_stid(struct t3cdev *tdev, int stid) +t3_tom_activate(struct adapter *sc) { - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - union listen_entry *p = stid2entry(t, stid); - - mtx_lock(&t->stid_lock); - p->next = t->sfree; - t->sfree = p; - t->stids_in_use--; - mtx_unlock(&t->stid_lock); -} - -/* - * Free a server TID and return it to the free pool. - */ -void * -cxgb_get_lctx(struct t3cdev *tdev, int stid) -{ - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - union listen_entry *p = stid2entry(t, stid); + struct tom_data *td; + struct toedev *tod; + int i, rc = 0; + struct mc5_params *mc5 = &sc->params.mc5; + u_int ntids, natids, mtus; - return (p->toe_tid.ctx); -} - -void -cxgb_insert_tid(struct t3cdev *tdev, struct cxgb_client *client, - void *ctx, unsigned int tid) -{ - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - - t->tid_tab[tid].client = client; - t->tid_tab[tid].ctx = ctx; - atomic_add_int(&t->tids_in_use, 1); -} - -/* use ctx as a next pointer in the tid release list */ -void -cxgb_queue_tid_release(struct t3cdev *tdev, unsigned int tid) -{ - struct t3c_data *td = T3C_DATA (tdev); - struct toe_tid_entry *p = &td->tid_maps.tid_tab[tid]; - - CTR0(KTR_TOM, "queuing tid release\n"); - - mtx_lock(&td->tid_release_lock); - p->ctx = td->tid_release_list; - td->tid_release_list = p; - - if (!p->ctx) - taskqueue_enqueue(tdev->adapter->tq, &td->tid_release_task); - - mtx_unlock(&td->tid_release_lock); -} - -/* - * Remove a tid from the TID table. A client may defer processing its last - * CPL message if it is locked at the time it arrives, and while the message - * sits in the client's backlog the TID may be reused for another connection. - * To handle this we atomically switch the TID association if it still points - * to the original client context. 
- */ -void -cxgb_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid) -{ - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - - if (tid >= t->ntids) - panic("tid=%d >= t->ntids=%d", tid, t->ntids); - - if (tdev->type == T3A) - atomic_cmpset_ptr((uintptr_t *)&t->tid_tab[tid].ctx, (long)NULL, (long)ctx); - else { - struct mbuf *m; - - m = m_get(M_NOWAIT, MT_DATA); - if (__predict_true(m != NULL)) { - mk_tid_release(m, tid); - CTR1(KTR_CXGB, "releasing tid=%u", tid); - - cxgb_ofld_send(tdev, m); - t->tid_tab[tid].ctx = NULL; - } else - cxgb_queue_tid_release(tdev, tid); - } - atomic_add_int(&t->tids_in_use, -1); -} + ADAPTER_LOCK_ASSERT_OWNED(sc); /* for sc->flags */ -int -cxgb_alloc_atid(struct t3cdev *tdev, struct cxgb_client *client, - void *ctx) -{ - int atid = -1; - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - - mtx_lock(&t->atid_lock); - if (t->afree) { - union active_open_entry *p = t->afree; - - atid = (p - t->atid_tab) + t->atid_base; - t->afree = p->next; - p->toe_tid.ctx = ctx; - p->toe_tid.client = client; - t->atids_in_use++; - } - mtx_unlock(&t->atid_lock); - return atid; -} + /* per-adapter softc for TOM */ + td = malloc(sizeof(*td), M_CXGB, M_ZERO | M_NOWAIT); + if (td == NULL) + return (ENOMEM); -int -cxgb_alloc_stid(struct t3cdev *tdev, struct cxgb_client *client, - void *ctx) -{ - int stid = -1; - struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; - - mtx_lock(&t->stid_lock); - if (t->sfree) { - union listen_entry *p = t->sfree; - - stid = (p - t->stid_tab) + t->stid_base; - t->sfree = p->next; - p->toe_tid.ctx = ctx; - p->toe_tid.client = client; - t->stids_in_use++; - } - mtx_unlock(&t->stid_lock); - return stid; + /* List of TOE PCBs and associated lock */ + mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF); + TAILQ_INIT(&td->toep_list); + + /* Listen context */ + mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF); + td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGB, + &td->listen_mask, HASH_NOWAIT); + + /* TID release task */ + TASK_INIT(&td->tid_release_task, 0 , t3_process_tid_release_list, td); + mtx_init(&td->tid_release_lock, "tid release", NULL, MTX_DEF); + + /* L2 table */ + td->l2t = t3_init_l2t(L2T_SIZE); + if (td->l2t == NULL) { + rc = ENOMEM; + goto done; + } + + /* TID tables */ + ntids = t3_mc5_size(&sc->mc5) - mc5->nroutes - mc5->nfilters - + mc5->nservers; + natids = min(ntids / 2, 64 * 1024); + rc = alloc_tid_tabs(&td->tid_maps, ntids, natids, mc5->nservers, + 0x100000 /* ATID_BASE */, ntids); + if (rc != 0) + goto done; + + /* CPL handlers */ + t3_init_listen_cpl_handlers(sc); + t3_init_l2t_cpl_handlers(sc); + t3_init_cpl_io(sc); + + /* toedev ops */ + tod = &td->tod; + init_toedev(tod); + tod->tod_softc = sc; + tod->tod_connect = t3_connect; + tod->tod_listen_start = t3_listen_start; + tod->tod_listen_stop = t3_listen_stop; + tod->tod_rcvd = t3_rcvd; + tod->tod_output = t3_tod_output; + tod->tod_send_rst = t3_send_rst; + tod->tod_send_fin = t3_send_fin; + tod->tod_pcb_detach = t3_pcb_detach; + tod->tod_l2_update = t3_l2_update; + tod->tod_syncache_added = t3_syncache_added; + tod->tod_syncache_removed = t3_syncache_removed; + tod->tod_syncache_respond = t3_syncache_respond; + tod->tod_offload_socket = t3_offload_socket; + + /* port MTUs */ + mtus = sc->port[0].ifp->if_mtu; + if (sc->params.nports > 1) + mtus |= sc->port[1].ifp->if_mtu << 16; + t3_write_reg(sc, A_TP_MTU_PORT_TABLE, mtus); + t3_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd, + sc->params.rev == 0 ? 
sc->port[0].ifp->if_mtu : 0xffff); + + /* SMT entry for each port */ + for_each_port(sc, i) { + write_smt_entry(sc, i); + TOEDEV(sc->port[i].ifp) = &td->tod; + } + + /* Switch TP to offload mode */ + t3_tp_set_offload_mode(sc, 1); + + sc->tom_softc = td; + sc->flags |= TOM_INIT_DONE; + register_toedev(tod); + +done: + if (rc != 0) + free_tom_data(td); + + return (rc); } - static int -is_offloading(struct ifnet *ifp) -{ - struct adapter *adapter; - int port; - - rw_rlock(&adapter_list_lock); - TAILQ_FOREACH(adapter, &adapter_list, adapter_entry) { - for_each_port(adapter, port) { - if (ifp == adapter->port[port].ifp) { - rw_runlock(&adapter_list_lock); - return 1; - } - } - } - rw_runlock(&adapter_list_lock); - return 0; -} - - -static void -cxgb_arp_update_event(void *unused, struct rtentry *rt0, - uint8_t *enaddr, struct sockaddr *sa) -{ - - if (!is_offloading(rt0->rt_ifp)) - return; - - RT_ADDREF(rt0); - RT_UNLOCK(rt0); - cxgb_neigh_update(rt0, enaddr, sa); - RT_LOCK(rt0); - RT_REMREF(rt0); -} - -static void -cxgb_redirect_event(void *unused, int event, struct rtentry *rt0, - struct rtentry *rt1, struct sockaddr *sa) -{ - /* - * ignore events on non-offloaded interfaces - */ - if (!is_offloading(rt0->rt_ifp)) - return; - - /* - * Cannot redirect to non-offload device. - */ - if (!is_offloading(rt1->rt_ifp)) { - log(LOG_WARNING, "%s: Redirect to non-offload" - "device ignored.\n", __FUNCTION__); - return; - } - - /* - * avoid LORs by dropping the route lock but keeping a reference - * - */ - RT_ADDREF(rt0); - RT_UNLOCK(rt0); - RT_ADDREF(rt1); - RT_UNLOCK(rt1); - - cxgb_redirect(rt0, rt1, sa); - cxgb_neigh_update(rt1, NULL, sa); - - RT_LOCK(rt0); - RT_REMREF(rt0); - RT_LOCK(rt1); - RT_REMREF(rt1); -} - -void -cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa) +t3_tom_deactivate(struct adapter *sc) { + int rc = 0; + struct tom_data *td = sc->tom_softc; - if (rt->rt_ifp && is_offloading(rt->rt_ifp) && (rt->rt_ifp->if_flags & IFCAP_TOE)) { - struct t3cdev *tdev = T3CDEV(rt->rt_ifp); + ADAPTER_LOCK_ASSERT_OWNED(sc); /* for sc->flags */ - PANIC_IF(!tdev); - t3_l2t_update(tdev, rt, enaddr, sa); - } -} - -static void -set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e) -{ - struct mbuf *m; - struct cpl_set_tcb_field *req; + if (td == NULL) + return (0); /* XXX. KASSERT? 
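td is non-NULL whenever TOM_INIT_DONE is set, so this should be unreachable.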
*/ - m = m_gethdr(M_NOWAIT, MT_DATA); - if (!m) { - log(LOG_ERR, "%s: cannot allocate mbuf!\n", __FUNCTION__); - return; - } - - m_set_priority(m, CPL_PRIORITY_CONTROL); - req = mtod(m, struct cpl_set_tcb_field *); - m->m_pkthdr.len = m->m_len = sizeof(*req); - - req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); - OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); - req->reply = 0; - req->cpu_idx = 0; - req->word = htons(W_TCB_L2T_IX); - req->mask = htobe64(V_TCB_L2T_IX(M_TCB_L2T_IX)); - req->val = htobe64(V_TCB_L2T_IX(e->idx)); - tdev->send(tdev, m); -} + if (sc->offload_map != 0) + return (EBUSY); /* at least one port has IFCAP_TOE enabled */ -void -cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa) -{ - struct ifnet *olddev, *newdev; - struct tid_info *ti; - struct t3cdev *tdev; - u32 tid; - int update_tcb; - struct l2t_entry *e; - struct toe_tid_entry *te; - - olddev = old->rt_ifp; - newdev = new->rt_ifp; - if (!is_offloading(olddev)) - return; - if (!is_offloading(newdev)) { - log(LOG_WARNING, "%s: Redirect to non-offload" - "device ignored.\n", __FUNCTION__); - return; - } - tdev = T3CDEV(olddev); - PANIC_IF(!tdev); - if (tdev != T3CDEV(newdev)) { - log(LOG_WARNING, "%s: Redirect to different " - "offload device ignored.\n", __FUNCTION__); - return; - } + mtx_lock(&td->toep_list_lock); + if (!TAILQ_EMPTY(&td->toep_list)) + rc = EBUSY; + mtx_unlock(&td->toep_list_lock); - /* Add new L2T entry */ - e = t3_l2t_get(tdev, new, new->rt_ifp, sa); - if (!e) { - log(LOG_ERR, "%s: couldn't allocate new l2t entry!\n", - __FUNCTION__); - return; - } + mtx_lock(&td->lctx_hash_lock); + if (td->lctx_count > 0) + rc = EBUSY; + mtx_unlock(&td->lctx_hash_lock); - /* Walk tid table and notify clients of dst change. */ - ti = &(T3C_DATA (tdev))->tid_maps; - for (tid=0; tid < ti->ntids; tid++) { - te = lookup_tid(ti, tid); - PANIC_IF(!te); - if (te->ctx && te->client && te->client->redirect) { - update_tcb = te->client->redirect(te->ctx, old, new, - e); - if (update_tcb) { - l2t_hold(L2DATA(tdev), e); - set_l2t_ix(tdev, tid, e); - } - } + if (rc == 0) { + unregister_toedev(&td->tod); + t3_tp_set_offload_mode(sc, 0); + free_tom_data(td); + sc->tom_softc = NULL; + sc->flags &= ~TOM_INIT_DONE; } - l2t_release(L2DATA(tdev), e); -} -/* - * Initialize the CPL dispatch table. 
- */ -static void -init_cpl_handlers(void) -{ - int i; - - for (i = 0; i < 256; ++i) - tom_cpl_handlers[i] = do_bad_cpl; - - t3_init_listen_cpl_handlers(); + return (rc); } static int -t3_toe_attach(struct toedev *dev, const struct offload_id *entry) -{ - struct tom_data *t = TOM_DATA(dev); - struct t3cdev *cdev = t->cdev; - struct ddp_params ddp; - struct ofld_page_info rx_page_info; - int err; - - t3_init_tunables(t); - mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); - CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry); - - dev->tod_can_offload = can_offload; - dev->tod_connect = t3_connect; - dev->tod_ctl = tom_ctl; -#if 0 - dev->tod_failover = t3_failover; -#endif - err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp); - if (err) - return err; - - err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info); - if (err) - return err; - - t->ddp_llimit = ddp.llimit; - t->ddp_ulimit = ddp.ulimit; - t->pdev = ddp.pdev; - t->rx_page_size = rx_page_info.page_size; - /* OK if this fails, we just can't do DDP */ - t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; - t->ppod_map = malloc(t->nppods, M_DEVBUF, M_NOWAIT|M_ZERO); - - mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF); - - - t3_sysctl_register(cdev->adapter, &t->conf); - return (0); -} - -static void -cxgb_toe_listen_start(void *unused, struct tcpcb *tp) -{ - struct socket *so = inp_inpcbtosocket(tp->t_inpcb); - struct tom_data *p; - - mtx_lock(&cxgb_list_lock); - TAILQ_FOREACH(p, &cxgb_list, entry) { - t3_listen_start(&p->tdev, so, p->cdev); - } - mtx_unlock(&cxgb_list_lock); -} - -static void -cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) +t3_tom_mod_load(void) { - struct socket *so = inp_inpcbtosocket(tp->t_inpcb); - struct tom_data *p; - - mtx_lock(&cxgb_list_lock); - TAILQ_FOREACH(p, &cxgb_list, entry) { - if (tp->t_state == TCPS_LISTEN) - t3_listen_stop(&p->tdev, so, p->cdev); - } - mtx_unlock(&cxgb_list_lock); -} + int rc; -static void -cxgb_toe_listen_start_handler(struct inpcb *inp, void *arg) -{ - struct tcpcb *tp = intotcpcb(inp); + rc = t3_register_uld(&tom_uld_info); + if (rc != 0) + t3_tom_mod_unload(); - if (tp->t_state == TCPS_LISTEN) - cxgb_toe_listen_start(NULL, tp); + return (rc); } static void -cxgb_register_listeners(void) +tom_uninit(struct adapter *sc, void *arg __unused) { - - inp_apply_all(cxgb_toe_listen_start_handler, NULL); + /* Try to free resources (works only if no port has IFCAP_TOE) */ + ADAPTER_LOCK(sc); + if (sc->flags & TOM_INIT_DONE) + t3_deactivate_uld(sc, ULD_TOM); + ADAPTER_UNLOCK(sc); } static int -t3_tom_init(void) +t3_tom_mod_unload(void) { - init_cpl_handlers(); - if (t3_init_cpl_io() < 0) { - log(LOG_ERR, - "Unable to initialize cpl io ops\n"); - return -1; - } - t3_init_socket_ops(); + t3_iterate(tom_uninit, NULL); - /* Register with the TOE device layer. 
*/ + if (t3_unregister_uld(&tom_uld_info) == EBUSY) + return (EBUSY); - if (register_tom(&t3_tom_info) != 0) { - log(LOG_ERR, - "Unable to register Chelsio T3 TCP offload module.\n"); - return -1; - } - - rw_init(&adapter_list_lock, "ofld adap list"); - TAILQ_INIT(&adapter_list); - EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event, - NULL, EVENTHANDLER_PRI_ANY); - EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event, - NULL, EVENTHANDLER_PRI_ANY); - - mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); - listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start, - cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY); - listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, - cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY); - TAILQ_INIT(&cxgb_list); - - - - t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl); - t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl); - t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr); - t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl); - t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl); - t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl); - t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl); - t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl); - t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl); - t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl); - t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); - t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl); - t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl); - t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss); - t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); - t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term); - t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl); - t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl); - t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); - t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); - t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl); - t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl); - - /* Register to offloading devices */ - cxgb_register_client(&t3c_tom_client); - return (0); } +#endif /* ifdef TCP_OFFLOAD */ static int -t3_tom_load(module_t mod, int cmd, void *arg) +t3_tom_modevent(module_t mod, int cmd, void *arg) { - int err = 0; + int rc = 0; +#ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: - t3_tom_init(); - break; - case MOD_QUIESCE: + rc = t3_tom_mod_load(); break; + case MOD_UNLOAD: - printf("uhm, ... unloading isn't really supported for toe\n"); - break; - case MOD_SHUTDOWN: + rc = t3_tom_mod_unload(); break; + default: - err = EOPNOTSUPP; - break; + rc = EINVAL; } - - return (err); +#else + rc = EOPNOTSUPP; +#endif + return (rc); } -static moduledata_t mod_data= { +static moduledata_t t3_tom_moddata= { "t3_tom", - t3_tom_load, + t3_tom_modevent, 0 }; + MODULE_VERSION(t3_tom, 1); MODULE_DEPEND(t3_tom, toecore, 1, 1, 1); -MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1); -DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); - +MODULE_DEPEND(t3_tom, cxgbc, 1, 1, 1); +DECLARE_MODULE(t3_tom, t3_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h index 2f3201d..5cc29a8 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.h +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -1,7 +1,6 @@ - /************************************************************************** -Copyright (c) 2007, Chelsio Inc. +Copyright (c) 2007, 2009 Chelsio Inc. 
All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,128 +32,248 @@ $FreeBSD$ #ifndef CXGB_TOM_H_ #define CXGB_TOM_H_ #include -#include +#include -#define LISTEN_INFO_HASH_SIZE 32 +MALLOC_DECLARE(M_CXGB); -struct listen_info { - struct listen_info *next; /* Link to next entry */ - struct socket *so; /* The listening socket */ - unsigned int stid; /* The server TID */ -}; +#define KTR_CXGB KTR_SPARE3 +#define LISTEN_HASH_SIZE 32 /* - * TOM tunable parameters. They can be manipulated through sysctl(2) or /proc. + * Holds the size, base address, free list start, etc. of the TID, server TID, + * and active-open TID tables for an offload device. + * The tables themselves are allocated dynamically. */ -struct tom_tunables { - int max_host_sndbuf; // max host RAM consumed by a sndbuf - int tx_hold_thres; // push/pull threshold for non-full TX sk_buffs - int max_wrs; // max # of outstanding WRs per connection - int rx_credit_thres; // min # of RX credits needed for RX_DATA_ACK - int cong_alg; // Congestion control algorithm - int mss; // max TX_DATA WR payload size - int delack; // delayed ACK control - int max_conn; // maximum number of offloaded connections - int soft_backlog_limit; // whether the listen backlog limit is soft - int ddp; // whether to put new connections in DDP mode - int ddp_thres; // min recvmsg size before activating DDP - int ddp_copy_limit; // capacity of kernel DDP buffer - int ddp_push_wait; // whether blocking DDP waits for PSH flag - int ddp_rcvcoalesce; // whether receive coalescing is enabled - int zcopy_sosend_enabled; // < is never zcopied - int zcopy_sosend_partial_thres; // < is never zcopied - int zcopy_sosend_partial_copy; // bytes copied in partial zcopy - int zcopy_sosend_thres;// >= are mostly zcopied - int zcopy_sosend_copy; // bytes coped in zcopied - int zcopy_sosend_ret_pending_dma;// pot. return while pending DMA - int activated; // TOE engine activation state +struct tid_info { + void **tid_tab; + unsigned int ntids; + volatile unsigned int tids_in_use; + + union listen_entry *stid_tab; + unsigned int nstids; + unsigned int stid_base; + + union active_open_entry *atid_tab; + unsigned int natids; + unsigned int atid_base; + + /* + * The following members are accessed R/W so we put them in their own + * cache lines. TOM_XXX: actually do what is said here. + * + * XXX We could combine the atid fields above with the lock here since + * atids are used once (unlike other tids). OTOH the above fields are + * usually in cache due to tid_tab. + */ + struct mtx atid_lock; + union active_open_entry *afree; + unsigned int atids_in_use; + + struct mtx stid_lock; + union listen_entry *sfree; + unsigned int stids_in_use; }; struct tom_data { - TAILQ_ENTRY(tom_data) entry; - - struct t3cdev *cdev; - struct pci_dev *pdev; - struct toedev tdev; + struct toedev tod; + + /* + * toepcbs associated with this TOE device are either on the + * toep list or in the synq of a listening socket in the lctx hash. + */ + struct mtx toep_list_lock; + TAILQ_HEAD(, toepcb) toep_list; - struct cxgb_client *client; - struct tom_tunables conf; - struct tom_sysctl_table *sysctl; + struct l2t_data *l2t; + struct tid_info tid_maps; /* - * The next three locks listen_lock, deferq.lock, and tid_release_lock - * are used rarely so we let them potentially share a cacheline. + * The next two locks, listen_lock and tid_release_lock, are used rarely + * so we let them potentially share a cacheline.
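+ * (They guard the listen-context hash and the deferred TID-release list, + * respectively.)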
*/ - struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE]; - struct mtx listen_lock; + LIST_HEAD(, listen_ctx) *listen_hash; + u_long listen_mask; + int lctx_count; /* # of lctx in the hash table */ + struct mtx lctx_hash_lock; - struct mbuf_head deferq; - struct task deferq_task; - - struct socket **tid_release_list; + void **tid_release_list; struct mtx tid_release_lock; struct task tid_release_task; +}; - volatile int tx_dma_pending; - - unsigned int ddp_llimit; - unsigned int ddp_ulimit; - - unsigned int rx_page_size; +struct synq_entry { + TAILQ_ENTRY(synq_entry) link; /* listen_ctx's synq link */ + int flags; /* same as toepcb's tp_flags */ + int tid; + struct mbuf *m; /* backpointer to containing mbuf */ + struct listen_ctx *lctx; /* backpointer to listen ctx */ + struct cpl_pass_establish *cpl; + struct toepcb *toep; + struct l2t_entry *e; + uint32_t iss; + uint32_t ts; + uint32_t opt0h; + uint32_t qset; + int rx_credits; + volatile u_int refcnt; - u8 *ppod_map; - unsigned int nppods; - struct mtx ppod_map_lock; - - struct adap_ports *ports; - struct taskqueue *tq; +#define RPL_OK 0 /* ok to reply */ +#define RPL_DONE 1 /* replied already */ +#define RPL_DONT 2 /* don't reply */ + volatile u_int reply; /* see above. */ }; +#define LCTX_RPL_PENDING 1 /* waiting for CPL_PASS_OPEN_RPL */ struct listen_ctx { - struct socket *lso; - struct tom_data *tom_data; - int ulp_mode; - LIST_HEAD(, toepcb) synq_head; - + LIST_ENTRY(listen_ctx) link; /* listen hash linkage */ + volatile int refcnt; + int stid; + int flags; + struct inpcb *inp; /* listening socket's inp */ + int qset; + TAILQ_HEAD(, synq_entry) synq; }; -#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt) -#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev) -#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev) -#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param) +void t3_process_tid_release_list(void *data, int pending); + +static inline struct tom_data * +t3_tomdata(struct toedev *tod) +{ + return (member2struct(tom_data, tod, tod)); +} + +union listen_entry { + void *ctx; + union listen_entry *next; +}; -#define TP_DATASENT (1 << 0) -#define TP_TX_WAIT_IDLE (1 << 1) -#define TP_FIN_SENT (1 << 2) -#define TP_ABORT_RPL_PENDING (1 << 3) -#define TP_ABORT_SHUTDOWN (1 << 4) -#define TP_ABORT_RPL_RCVD (1 << 5) -#define TP_ABORT_REQ_RCVD (1 << 6) -#define TP_CLOSE_CON_REQUESTED (1 << 7) -#define TP_SYN_RCVD (1 << 8) -#define TP_ESTABLISHED (1 << 9) +union active_open_entry { + void *ctx; + union active_open_entry *next; +}; -void t3_init_tunables(struct tom_data *t); +/* + * Map an ATID or STID to their entries in the corresponding TID tables. + */ +static inline union active_open_entry *atid2entry(const struct tid_info *t, + unsigned int atid) +{ + return &t->atid_tab[atid - t->atid_base]; +} -void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p); -static __inline struct mbuf * -m_gethdr_nofail(int len) +static inline union listen_entry *stid2entry(const struct tid_info *t, + unsigned int stid) { - struct mbuf *m; - - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) { - panic("implement lowmem cache\n"); - } - - KASSERT(len < MHLEN, ("requested header size too large for mbuf")); - m->m_pkthdr.len = m->m_len = len; - return (m); + return &t->stid_tab[stid - t->stid_base]; } +/* + * Find the connection corresponding to a TID. 
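+ * A free slot holds free-list linkage pointing back into the TID tables + * themselves, so a stored value inside the tables' address range is not a + * connection context and NULL is returned for it.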
+ */ +static inline void *lookup_tid(const struct tid_info *t, unsigned int tid) +{ + void *p; + + if (tid >= t->ntids) + return (NULL); + + p = t->tid_tab[tid]; + if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids]) + return (p); + + return (NULL); +} + +/* + * Find the connection corresponding to a server TID. + */ +static inline void *lookup_stid(const struct tid_info *t, unsigned int tid) +{ + void *p; + + if (tid < t->stid_base || tid >= t->stid_base + t->nstids) + return (NULL); + + p = stid2entry(t, tid)->ctx; + if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids]) + return (p); + + return (NULL); +} + +/* + * Find the connection corresponding to an active-open TID. + */ +static inline void *lookup_atid(const struct tid_info *t, unsigned int tid) +{ + void *p; + + if (tid < t->atid_base || tid >= t->atid_base + t->natids) + return (NULL); + + p = atid2entry(t, tid)->ctx; + if (p < (void *)t->tid_tab || p >= (void *)&t->atid_tab[t->natids]) + return (p); + + return (NULL); +} + +static inline uint32_t +calc_opt2(int cpu_idx) +{ + uint32_t opt2 = F_CPU_INDEX_VALID | V_CPU_INDEX(cpu_idx); + + /* 3 = highspeed CC algorithm */ + opt2 |= V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(3) | + V_PACING_FLAVOR(1); + + /* coalesce and push bit semantics */ + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(3); + + return (htobe32(opt2)); +} + +/* cxgb_tom.c */ +struct toepcb *toepcb_alloc(struct toedev *); +void toepcb_free(struct toepcb *); + +/* cxgb_cpl_io.c */ +void t3_init_cpl_io(struct adapter *); +int t3_push_frames(struct socket *, int); +int t3_connect(struct toedev *, struct socket *, struct rtentry *, + struct sockaddr *); +int t3_tod_output(struct toedev *, struct tcpcb *); +int t3_send_rst(struct toedev *, struct tcpcb *); +int t3_send_fin(struct toedev *, struct tcpcb *); +void insert_tid(struct tom_data *, void *, unsigned int); +void update_tid(struct tom_data *, void *, unsigned int); +void remove_tid(struct tom_data *, unsigned int); +uint32_t calc_opt0h(struct socket *, int, int, struct l2t_entry *); +uint32_t calc_opt0l(struct socket *, int); +void queue_tid_release(struct toedev *, unsigned int); +void offload_socket(struct socket *, struct toepcb *); +void undo_offload_socket(struct socket *); +int select_rcv_wscale(void); +unsigned long select_rcv_wnd(struct socket *); +int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int); +void make_established(struct socket *, uint32_t, uint32_t, uint16_t); +void t3_rcvd(struct toedev *, struct tcpcb *); +void t3_pcb_detach(struct toedev *, struct tcpcb *); +void send_abort_rpl(struct toedev *, int, int); +void release_tid(struct toedev *, unsigned int, int); +/* cxgb_listen.c */ +void t3_init_listen_cpl_handlers(struct adapter *); +int t3_listen_start(struct toedev *, struct tcpcb *); +int t3_listen_stop(struct toedev *, struct tcpcb *); +void t3_syncache_added(struct toedev *, void *); +void t3_syncache_removed(struct toedev *, void *); +int t3_syncache_respond(struct toedev *, void *, struct mbuf *); +int do_abort_req_synqe(struct sge_qset *, struct rsp_desc *, struct mbuf *); +int do_abort_rpl_synqe(struct sge_qset *, struct rsp_desc *, struct mbuf *); +void t3_offload_socket(struct toedev *, void *, struct socket *); #endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c deleted file mode 100644 index 926b445..0000000 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c +++ /dev/null @@ -1,140 +0,0 @@ 
-/************************************************************************** - -Copyright (c) 2007, Chelsio Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the Chelsio Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid clutter in the hw.* space, keep all toe tunables within hw.cxgb */ -SYSCTL_DECL(_hw_cxgb); -static SYSCTL_NODE(_hw_cxgb, OID_AUTO, toe, CTLFLAG_RD, 0, "TOE parameters"); - -static struct tom_tunables default_tunable_vals = { - .max_host_sndbuf = 32 * 1024, - .tx_hold_thres = 0, - .max_wrs = 15, - .rx_credit_thres = 15 * 1024, - .cong_alg = -1, - .mss = 16384, - .delack = 1, - .max_conn = -1, - .soft_backlog_limit = 0, - .ddp = 1, - .ddp_thres = 14 * 4096, - .ddp_copy_limit = 13 * 4096, - .ddp_push_wait = 1, - .ddp_rcvcoalesce = 0, - .zcopy_sosend_enabled = 0, - .zcopy_sosend_partial_thres = 40960, - .zcopy_sosend_partial_copy = 4096 * 3, - .zcopy_sosend_thres = 128 * 1024, - .zcopy_sosend_copy = 4096 * 2, - .zcopy_sosend_ret_pending_dma = 1, - .activated = 1, -}; - -static int activated = 1; -TUNABLE_INT("hw.cxgb.toe.activated", &activated); -SYSCTL_UINT(_hw_cxgb_toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0, - "enable TOE at init time"); - -static int ddp = 1; -TUNABLE_INT("hw.cxgb.toe.ddp", &ddp); -SYSCTL_UINT(_hw_cxgb_toe, OID_AUTO, ddp, CTLFLAG_RDTUN, &ddp, 0, "enable DDP"); - -void -t3_init_tunables(struct tom_data *t) -{ - t->conf = default_tunable_vals; - - /* Adjust tunables */ - t->conf.activated = activated; - t->conf.ddp = ddp; - - /* Now apply device specific fixups. 
*/ - t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; - t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; -} - -void -t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p) -{ - struct sysctl_ctx_list *ctx; - struct sysctl_oid_list *children; - - ctx = device_get_sysctl_ctx(sc->dev); - children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); - -} - diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 6be75bc..ba5335a 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -157,6 +157,7 @@ enum { INTR_DIRECT = (1 << 2), /* direct interrupts for everything */ MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), + TOM_INIT_DONE = (1 << 5), CXGBE_BUSY = (1 << 9), @@ -199,7 +200,7 @@ struct port_info { int first_txq; /* index of first tx queue */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ @@ -213,6 +214,8 @@ struct port_info { struct link_config link_cfg; struct port_stats stats; + eventhandler_tag vlan_c; + struct callout tick; struct sysctl_ctx_list ctx; /* from ifconfig up to driver detach */ @@ -296,7 +299,7 @@ struct sge_iq { enum { EQ_CTRL = 1, EQ_ETH = 2, -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD EQ_OFLD = 3, #endif @@ -422,14 +425,36 @@ struct sge_rxq { } __aligned(CACHE_LINE_SIZE); -#ifndef TCP_OFFLOAD_DISABLE +static inline struct sge_rxq * +iq_to_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_rxq, iq, iq)); +} + + +#ifdef TCP_OFFLOAD /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ } __aligned(CACHE_LINE_SIZE); + +static inline struct sge_ofld_rxq * +iq_to_ofld_rxq(struct sge_iq *iq) +{ + + return (member2struct(sge_ofld_rxq, iq, iq)); +} #endif +struct wrqe { + STAILQ_ENTRY(wrqe) link; + struct sge_wrq *wrq; + int wr_len; + uint64_t wr[] __aligned(16); +}; + /* * wrq: SGE egress queue that is given prebuilt work requests. Both the control * and offload tx queues are of this type. 
@@ -438,8 +463,9 @@ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; - struct mbuf *head; /* held up due to lack of descriptors */ - struct mbuf *tail; /* valid only if head is valid */ + + /* List of WRs held up due to lack of tx descriptors */ + STAILQ_HEAD(, wrqe) wr_list; /* stats for common events first */ @@ -457,7 +483,7 @@ struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx tx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ #endif @@ -469,7 +495,7 @@ struct sge { struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_wrq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ #endif @@ -483,6 +509,7 @@ struct sge { struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); +typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); struct adapter { SLIST_ENTRY(adapter) link; @@ -519,15 +546,15 @@ struct adapter { uint8_t chan_map[NCHAN]; uint32_t filter_mode; -#ifndef TCP_OFFLOAD_DISABLE - struct uld_softc tom; +#ifdef TCP_OFFLOAD + void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; #endif struct l2t_data *l2t; /* L2 table */ struct tid_info tids; int open_device_map; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int offload_map; #endif int flags; @@ -554,7 +581,8 @@ struct adapter { TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; - cpl_handler_t cpl_handler[256] __aligned(CACHE_LINE_SIZE); + an_handler_t an_handler __aligned(CACHE_LINE_SIZE); + cpl_handler_t cpl_handler[256]; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) @@ -609,82 +637,96 @@ struct adapter { static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { + return bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { + bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { + return t4_bus_space_read_8(sc->bt, sc->bh, reg); } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { + t4_bus_space_write_8(sc->bt, sc->bh, reg, val); } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { + *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { + pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { + *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { + pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { + *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { + pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { + return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[]) { + bcopy(hw_addr, sc->port[idx]->hw_addr, ETHER_ADDR_LEN); } static inline bool is_10G_port(const struct port_info *pi) { + return ((pi->link_cfg.supported & 
FW_PORT_CAP_SPEED_10G) != 0); } static inline int tx_resume_threshold(struct sge_eq *eq) { + return (eq->qsize / 4); } @@ -698,6 +740,7 @@ void t4_os_portmod_changed(const struct adapter *, int); void t4_os_link_changed(struct adapter *, int, int); void t4_iterate(void (*)(struct adapter *, void *), void *); int t4_register_cpl_handler(struct adapter *, int, cpl_handler_t); +int t4_register_an_handler(struct adapter *, an_handler_t); /* t4_sge.c */ void t4_sge_modload(void); @@ -714,21 +757,45 @@ void t4_intr_all(void *); void t4_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); -int t4_mgmt_tx(struct adapter *, struct mbuf *); -int t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct mbuf *); +void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *); void t4_update_fl_bufsize(struct ifnet *); int can_resume_tx(struct sge_eq *); -static inline int t4_wrq_tx(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m) +static inline struct wrqe * +alloc_wrqe(int wr_len, struct sge_wrq *wrq) { - int rc; + int len = offsetof(struct wrqe, wr) + wr_len; + struct wrqe *wr; + + wr = malloc(len, M_CXGBE, M_NOWAIT); + if (__predict_false(wr == NULL)) + return (NULL); + wr->wr_len = wr_len; + wr->wrq = wrq; + return (wr); +} + +static inline void * +wrtod(struct wrqe *wr) +{ + return (&wr->wr[0]); +} + +static inline void +free_wrqe(struct wrqe *wr) +{ + free(wr, M_CXGBE); +} + +static inline void +t4_wrq_tx(struct adapter *sc, struct wrqe *wr) +{ + struct sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); - rc = t4_wrq_tx_locked(sc, wrq, m); + t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); - return (rc); } - #endif diff --git a/sys/dev/cxgbe/common/t4_hw.c b/sys/dev/cxgbe/common/t4_hw.c index 6f4dd8d..f629cbe 100644 --- a/sys/dev/cxgbe/common/t4_hw.c +++ b/sys/dev/cxgbe/common/t4_hw.c @@ -27,6 +27,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include "common.h" #include "t4_regs.h" #include "t4_regs_values.h" diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index f6ada9d..1ae9f1f 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -31,12 +31,6 @@ #ifndef __T4_OFFLOAD_H__ #define __T4_OFFLOAD_H__ -/* XXX: flagrant misuse of mbuf fields (during tx by TOM) */ -#define MBUF_EQ(m) (*((void **)(&(m)->m_pkthdr.rcvif))) -/* These have to work for !M_PKTHDR so we use a field from m_hdr. 
*/ -#define MBUF_TX_CREDITS(m) ((m)->m_hdr.pad[0]) -#define MBUF_DMA_MAPPED(m) ((m)->m_hdr.pad[1]) - #define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ (w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ @@ -119,7 +113,7 @@ struct t4_virt_res { /* virtualized HW resources */ struct t4_range ocq; }; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD enum { ULD_TOM = 1, }; @@ -130,13 +124,8 @@ struct uld_info { SLIST_ENTRY(uld_info) link; int refcount; int uld_id; - int (*attach)(struct adapter *, void **); - int (*detach)(void *); -}; - -struct uld_softc { - struct uld_info *uld; - void *softc; + int (*activate)(struct adapter *); + int (*deactivate)(struct adapter *); }; struct tom_tunables { @@ -148,6 +137,8 @@ struct tom_tunables { int t4_register_uld(struct uld_info *); int t4_unregister_uld(struct uld_info *); +int t4_activate_uld(struct adapter *, int); +int t4_deactivate_uld(struct adapter *, int); #endif #endif diff --git a/sys/dev/cxgbe/t4_l2t.c b/sys/dev/cxgbe/t4_l2t.c index 55491cd..8373c32 100644 --- a/sys/dev/cxgbe/t4_l2t.c +++ b/sys/dev/cxgbe/t4_l2t.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 Chelsio Communications, Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,16 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include "common/common.h" #include "common/jhash.h" @@ -72,42 +63,11 @@ __FBSDID("$FreeBSD$"); * lifetime of an L2T entry is fully contained in the lifetime of the TOE. */ -/* identifies sync vs async L2T_WRITE_REQs */ -#define S_SYNC_WR 12 -#define V_SYNC_WR(x) ((x) << S_SYNC_WR) -#define F_SYNC_WR V_SYNC_WR(1) - -enum { - L2T_STATE_VALID, /* entry is up to date */ - L2T_STATE_STALE, /* entry may be used but needs revalidation */ - L2T_STATE_RESOLVING, /* entry needs address resolution */ - L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ - - /* when state is one of the below the entry is not hashed */ - L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ - L2T_STATE_UNUSED /* entry not in use */ -}; - -struct l2t_data { - struct rwlock lock; - volatile int nfree; /* number of free entries */ - struct l2t_entry *rover;/* starting point for next allocation */ - struct l2t_entry l2tab[L2T_SIZE]; -}; - -static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, - struct mbuf *); - -#define VLAN_NONE 0xfff -#define SA(x) ((struct sockaddr *)(x)) -#define SIN(x) ((struct sockaddr_in *)(x)) -#define SINADDR(x) (SIN(x)->sin_addr.s_addr) - /* * Allocate a free L2T entry. Must be called with l2t_data.lock held. */ -static struct l2t_entry * -alloc_l2e(struct l2t_data *d) +struct l2t_entry * +t4_alloc_l2e(struct l2t_data *d) { struct l2t_entry *end, *e, **p; @@ -121,7 +81,8 @@ alloc_l2e(struct l2t_data *d) if (atomic_load_acq_int(&e->refcnt) == 0) goto found; - for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ; + for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) + continue; found: d->rover = e + 1; atomic_subtract_int(&d->nfree, 1); @@ -148,19 +109,18 @@ found: * Write an L2T entry. Must be called with the entry locked. * The write may be synchronous or asynchronous. 
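+ * A synchronous write moves the entry to L2T_STATE_SYNC_WRITE; the firmware + * acknowledges it with a CPL_L2T_WRITE_RPL (see do_l2t_write_rpl).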
*/ -static int -write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) +int +t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) { - struct mbuf *m; + struct wrqe *wr; struct cpl_l2t_write_req *req; mtx_assert(&e->lock, MA_OWNED); - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - req = mtod(m, struct cpl_l2t_write_req *); - m->m_pkthdr.len = m->m_len = sizeof(*req); + req = wrtod(wr); INIT_TP_WR(req, 0); OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx | @@ -170,7 +130,7 @@ write_l2e(struct adapter *sc, struct l2t_entry *e, int sync) req->vlan = htons(e->vlan); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); if (sync && e->state != L2T_STATE_SWITCHING) e->state = L2T_STATE_SYNC_WRITE; @@ -189,7 +149,7 @@ t4_l2t_alloc_switching(struct l2t_data *d) struct l2t_entry *e; rw_rlock(&d->lock); - e = alloc_l2e(d); + e = t4_alloc_l2e(d); if (e) { mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ e->state = L2T_STATE_SWITCHING; @@ -214,7 +174,7 @@ t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan, e->lport = port; memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN); mtx_lock(&e->lock); - rc = write_l2e(sc, e, 0); + rc = t4_write_l2e(sc, e, 0); mtx_unlock(&e->lock); return (rc); } @@ -234,10 +194,13 @@ t4_init_l2t(struct adapter *sc, int flags) rw_init(&d->lock, "L2T"); for (i = 0; i < L2T_SIZE; i++) { - d->l2tab[i].idx = i; - d->l2tab[i].state = L2T_STATE_UNUSED; - mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF); - atomic_store_rel_int(&d->l2tab[i].refcnt, 0); + struct l2t_entry *e = &d->l2tab[i]; + + e->idx = i; + e->state = L2T_STATE_UNUSED; + mtx_init(&e->lock, "L2T_E", NULL, MTX_DEF); + STAILQ_INIT(&e->wr_list); + atomic_store_rel_int(&e->refcnt, 0); } sc->l2t = d; @@ -259,6 +222,24 @@ t4_free_l2t(struct l2t_data *d) return (0); } +int +do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + + if (__predict_false(rpl->status != CPL_ERR_NONE)) { + log(LOG_ERR, + "Unexpected L2T_WRITE_RPL status %u for entry %u\n", + rpl->status, idx); + return (EINVAL); + } + + return (0); +} + #ifdef SBUF_DRAIN static inline unsigned int vlan_prio(const struct l2t_entry *e) @@ -273,7 +254,7 @@ l2e_state(const struct l2t_entry *e) case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */ case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */ case L2T_STATE_SYNC_WRITE: return 'W'; - case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R'; + case L2T_STATE_RESOLVING: return STAILQ_EMPTY(&e->wr_list) ? 'R' : 'A'; case L2T_STATE_SWITCHING: return 'X'; default: return 'U'; } @@ -311,20 +292,20 @@ sysctl_l2t(SYSCTL_HANDLER_ARGS) "Ethernet address VLAN/P LP State Users Port"); header = 1; } - if (e->state == L2T_STATE_SWITCHING || e->v6) + if (e->state == L2T_STATE_SWITCHING) ip[0] = 0; else snprintf(ip, sizeof(ip), "%s", - inet_ntoa(*(struct in_addr *)&e->addr[0])); + inet_ntoa(*(struct in_addr *)&e->addr)); - /* XXX: accessing lle probably not safe? 
*/ + /* XXX: e->ifp may not be around */ sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d" " %u %2u %c %5u %s", e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5], e->vlan & 0xfff, vlan_prio(e), e->lport, l2e_state(e), atomic_load_acq_int(&e->refcnt), - e->lle ? e->lle->lle_tbl->llt_ifp->if_xname : ""); + e->ifp->if_xname); skip: mtx_unlock(&e->lock); } @@ -335,459 +316,3 @@ skip: return (rc); } #endif - -#ifndef TCP_OFFLOAD_DISABLE -static inline void -l2t_hold(struct l2t_data *d, struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ - atomic_subtract_int(&d->nfree, 1); -} - -/* - * To avoid having to check address families we do not allow v4 and v6 - * neighbors to be on the same hash chain. We keep v4 entries in the first - * half of available hash buckets and v6 in the second. - */ -enum { - L2T_SZ_HALF = L2T_SIZE / 2, - L2T_HASH_MASK = L2T_SZ_HALF - 1 -}; - -static inline unsigned int -arp_hash(const uint32_t *key, int ifindex) -{ - return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK; -} - -static inline unsigned int -ipv6_hash(const uint32_t *key, int ifindex) -{ - uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3]; - - return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK); -} - -static inline unsigned int -addr_hash(const uint32_t *addr, int addr_len, int ifindex) -{ - return addr_len == 4 ? arp_hash(addr, ifindex) : - ipv6_hash(addr, ifindex); -} - -/* - * Checks if an L2T entry is for the given IP/IPv6 address. It does not check - * whether the L2T entry and the address are of the same address family. - * Callers ensure an address is only checked against L2T entries of the same - * family, something made trivial by the separation of IP and IPv6 hash chains - * mentioned above. Returns 0 if there's a match, - */ -static inline int -addreq(const struct l2t_entry *e, const uint32_t *addr) -{ - if (e->v6) - return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) | - (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]); - return e->addr[0] ^ addr[0]; -} - -/* - * Add a packet to an L2T entry's queue of packets awaiting resolution. - * Must be called with the entry's lock held. - */ -static inline void -arpq_enqueue(struct l2t_entry *e, struct mbuf *m) -{ - mtx_assert(&e->lock, MA_OWNED); - - KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__)); - if (e->arpq_head) - e->arpq_tail->m_nextpkt = m; - else - e->arpq_head = m; - e->arpq_tail = m; -} - -static inline void -send_pending(struct adapter *sc, struct l2t_entry *e) -{ - struct mbuf *m, *next; - - mtx_assert(&e->lock, MA_OWNED); - - for (m = e->arpq_head; m; m = next) { - next = m->m_nextpkt; - m->m_nextpkt = NULL; - t4_wrq_tx(sc, MBUF_EQ(m), m); - } - e->arpq_head = e->arpq_tail = NULL; -} - -#ifdef INET -/* - * Looks up and fills up an l2t_entry's lle. We grab all the locks that we need - * ourself, and update e->state at the end if e->lle was successfully filled. - * - * The lle passed in comes from arpresolve and is ignored as it does not appear - * to be of much use. 
- */ -static int -l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused) -{ - int rc = 0; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - struct llentry *lle; - - bzero(&sin, sizeof(struct sockaddr_in)); - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - - mtx_assert(&e->lock, MA_NOTOWNED); - KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__)); - - IF_AFDATA_LOCK(ifp); - lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin)); - IF_AFDATA_UNLOCK(ifp); - if (!LLE_IS_VALID(lle)) - return (ENOMEM); - if (!(lle->la_flags & LLE_VALID)) { - rc = EINVAL; - goto done; - } - - LLE_ADDREF(lle); - - mtx_lock(&e->lock); - if (e->state == L2T_STATE_RESOLVING) { - KASSERT(e->lle == NULL, ("%s: lle already valid", __func__)); - e->lle = lle; - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - KASSERT(e->lle == lle, ("%s: lle changed", __func__)); - LLE_REMREF(lle); - } - mtx_unlock(&e->lock); -done: - LLE_WUNLOCK(lle); - return (rc); -} -#endif - -int -t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e) -{ -#ifndef INET - return (EINVAL); -#else - struct llentry *lle = NULL; - struct sockaddr_in sin; - struct ifnet *ifp = e->ifp; - - if (e->v6) - panic("%s: IPv6 L2 resolution not supported yet.", __func__); - - bzero(&sin, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in)); - -again: - switch (e->state) { - case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - - /* Fall through */ - - case L2T_STATE_VALID: /* fast-path, send the packet on */ - return t4_wrq_tx(sc, MBUF_EQ(m), m); - - case L2T_STATE_RESOLVING: - case L2T_STATE_SYNC_WRITE: - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SYNC_WRITE && - e->state != L2T_STATE_RESOLVING) { - /* state changed by the time we got here */ - mtx_unlock(&e->lock); - goto again; - } - arpq_enqueue(e, m); - mtx_unlock(&e->lock); - - if (e->state == L2T_STATE_RESOLVING && - arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0) - l2t_fill_lle(sc, e, lle); - } - - return (0); -#endif -} - -/* - * Called when an L2T entry has no more users. The entry is left in the hash - * table since it is likely to be reused but we also bump nfree to indicate - * that the entry can be reallocated for a different neighbor. We also drop - * the existing neighbor reference in case the neighbor is going away and is - * waiting on our reference. - * - * Because entries can be reallocated to other neighbors once their ref count - * drops to 0 we need to take the entry's lock to avoid races with a new - * incarnation. - */ -static void -t4_l2e_free(struct l2t_entry *e) -{ - struct llentry *lle = NULL; - struct l2t_data *d; - - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ - lle = e->lle; - e->lle = NULL; - /* - * Don't need to worry about the arpq, an L2T entry can't be - * released if any packets are waiting for resolution as we - * need to be able to communicate with the device to close a - * connection. 
- */ - } - mtx_unlock(&e->lock); - - d = container_of(e, struct l2t_data, l2tab[e->idx]); - atomic_add_int(&d->nfree, 1); - - if (lle) - LLE_FREE(lle); -} - -void -t4_l2t_release(struct l2t_entry *e) -{ - if (atomic_fetchadd_int(&e->refcnt, -1) == 1) - t4_l2e_free(e); -} - -static int -do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, - struct mbuf *m) -{ - struct adapter *sc = iq->adapter; - const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); - unsigned int tid = GET_TID(rpl); - unsigned int idx = tid & (L2T_SIZE - 1); - - if (__predict_false(rpl->status != CPL_ERR_NONE)) { - log(LOG_ERR, - "Unexpected L2T_WRITE_RPL status %u for entry %u\n", - rpl->status, idx); - return (EINVAL); - } - - if (tid & F_SYNC_WR) { - struct l2t_entry *e = &sc->l2t->l2tab[idx]; - - mtx_lock(&e->lock); - if (e->state != L2T_STATE_SWITCHING) { - send_pending(sc, e); - e->state = L2T_STATE_VALID; - } - mtx_unlock(&e->lock); - } - - return (0); -} - -/* - * Reuse an L2T entry that was previously used for the same next hop. - */ -static void -reuse_entry(struct l2t_entry *e) -{ - struct llentry *lle; - - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - lle = e->lle; - if (lle) { - KASSERT(lle->la_flags & LLE_VALID, - ("%s: invalid lle stored in l2t_entry", __func__)); - - if (lle->la_expire >= time_uptime) - e->state = L2T_STATE_STALE; - else - e->state = L2T_STATE_VALID; - } else - e->state = L2T_STATE_RESOLVING; - mtx_unlock(&e->lock); -} - -/* - * The TOE wants an L2 table entry that it can use to reach the next hop over - * the specified port. Produce such an entry - create one if needed. - * - * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on - * top of the real cxgbe interface. - */ -struct l2t_entry * -t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) -{ - struct l2t_entry *e; - struct l2t_data *d = pi->adapter->l2t; - int addr_len; - uint32_t *addr; - int hash; - struct sockaddr_in6 *sin6; - unsigned int smt_idx = pi->port_id; - - if (sa->sa_family == AF_INET) { - addr = (uint32_t *)&SINADDR(sa); - addr_len = sizeof(SINADDR(sa)); - } else if (sa->sa_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)sa; - addr = (uint32_t *)&sin6->sin6_addr.s6_addr; - addr_len = sizeof(sin6->sin6_addr.s6_addr); - } else - return (NULL); - -#ifndef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - return (NULL); -#endif - - hash = addr_hash(addr, addr_len, ifp->if_index); - - rw_wlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){ - l2t_hold(d, e); - if (atomic_load_acq_int(&e->refcnt) == 1) - reuse_entry(e); - goto done; - } - } - - /* Need to allocate a new entry */ - e = alloc_l2e(d); - if (e) { - mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ - e->state = L2T_STATE_RESOLVING; - memcpy(e->addr, addr, addr_len); - e->ifindex = ifp->if_index; - e->smt_idx = smt_idx; - e->ifp = ifp; - e->hash = hash; - e->lport = pi->lport; - e->v6 = (addr_len == 16); - e->lle = NULL; - atomic_store_rel_int(&e->refcnt, 1); -#ifdef VLAN_TAG - if (ifp->if_type == IFT_L2VLAN) - VLAN_TAG(ifp, &e->vlan); - else - e->vlan = VLAN_NONE; -#endif - e->next = d->l2tab[hash].first; - d->l2tab[hash].first = e; - mtx_unlock(&e->lock); - } -done: - rw_wunlock(&d->lock); - return e; -} - -/* - * Called when the host's neighbor layer makes a change to some entry that is - * loaded into the HW L2 table. 
- */ -void -t4_l2t_update(struct adapter *sc, struct llentry *lle) -{ - struct l2t_entry *e; - struct l2t_data *d = sc->l2t; - struct sockaddr *sa = L3_ADDR(lle); - struct llentry *old_lle = NULL; - uint32_t *addr = (uint32_t *)&SINADDR(sa); - struct ifnet *ifp = lle->lle_tbl->llt_ifp; - int hash = addr_hash(addr, sizeof(*addr), ifp->if_index); - - KASSERT(d != NULL, ("%s: no L2 table", __func__)); - LLE_WLOCK_ASSERT(lle); - KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED, - ("%s: entry neither valid nor deleted.", __func__)); - - rw_rlock(&d->lock); - for (e = d->l2tab[hash].first; e; e = e->next) { - if (!addreq(e, addr) && e->ifp == ifp) { - mtx_lock(&e->lock); - if (atomic_load_acq_int(&e->refcnt)) - goto found; - e->state = L2T_STATE_STALE; - mtx_unlock(&e->lock); - break; - } - } - rw_runlock(&d->lock); - - /* The TOE has no interest in this LLE */ - return; - - found: - rw_runlock(&d->lock); - - if (atomic_load_acq_int(&e->refcnt)) { - - /* Entry is referenced by at least 1 offloaded connection. */ - - /* Handle deletes first */ - if (lle->la_flags & LLE_DELETED) { - if (lle == e->lle) { - e->lle = NULL; - e->state = L2T_STATE_RESOLVING; - LLE_REMREF(lle); - } - goto done; - } - - if (lle != e->lle) { - old_lle = e->lle; - LLE_ADDREF(lle); - e->lle = lle; - } - - if (e->state == L2T_STATE_RESOLVING || - memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) { - - /* unresolved -> resolved; or dmac changed */ - - memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN); - write_l2e(sc, e, 1); - } else { - - /* +ve reinforcement of a valid or stale entry */ - - } - - e->state = L2T_STATE_VALID; - - } else { - /* - * Entry was used previously but is unreferenced right now. - * e->lle has been released and NULL'd out by t4_l2t_free, or - * l2t_release is about to call t4_l2t_free and do that. - * - * Either way this is of no interest to us. - */ - } - -done: - mtx_unlock(&e->lock); - if (old_lle) - LLE_FREE(old_lle); -} - -#endif diff --git a/sys/dev/cxgbe/t4_l2t.h b/sys/dev/cxgbe/t4_l2t.h index 5dfce83..0303885 100644 --- a/sys/dev/cxgbe/t4_l2t.h +++ b/sys/dev/cxgbe/t4_l2t.h @@ -30,8 +30,25 @@ #ifndef __T4_L2T_H #define __T4_L2T_H +/* identifies sync vs async L2T_WRITE_REQs */ +#define S_SYNC_WR 12 +#define V_SYNC_WR(x) ((x) << S_SYNC_WR) +#define F_SYNC_WR V_SYNC_WR(1) + enum { L2T_SIZE = 4096 }; /* # of L2T entries */ +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_FAILED, /* failed to resolve */ + L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */ + + /* when state is one of the below the entry is not hashed */ + L2T_STATE_SWITCHING, /* entry is being used by a switching filter */ + L2T_STATE_UNUSED /* entry not in use */ +}; + /* * Each L2T entry plays multiple roles. 
First of all, it keeps state for the * corresponding entry of the HW L2 table and maintains a queue of offload @@ -43,39 +60,49 @@ enum { L2T_SIZE = 4096 }; /* # of L2T entries */ struct l2t_entry { uint16_t state; /* entry state */ uint16_t idx; /* entry index */ - uint32_t addr[4]; /* next hop IP or IPv6 address */ + uint32_t addr; /* next hop IP address */ struct ifnet *ifp; /* outgoing interface */ uint16_t smt_idx; /* SMT index */ uint16_t vlan; /* VLAN TCI (id: 0-11, prio: 13-15) */ - int ifindex; /* interface index */ - struct llentry *lle; /* llentry for next hop */ struct l2t_entry *first; /* start of hash chain */ struct l2t_entry *next; /* next l2t_entry on chain */ - struct mbuf *arpq_head; /* list of mbufs awaiting resolution */ - struct mbuf *arpq_tail; + STAILQ_HEAD(, wrqe) wr_list; /* list of WRs awaiting resolution */ struct mtx lock; volatile int refcnt; /* entry reference count */ uint16_t hash; /* hash bucket the entry is on */ - uint8_t v6; /* whether entry is for IPv6 */ uint8_t lport; /* associated offload logical port */ uint8_t dmac[ETHER_ADDR_LEN]; /* next hop's MAC address */ }; +struct l2t_data { + struct rwlock lock; + volatile int nfree; /* number of free entries */ + struct l2t_entry *rover;/* starting point for next allocation */ + struct l2t_entry l2tab[L2T_SIZE]; +}; + + int t4_init_l2t(struct adapter *, int); int t4_free_l2t(struct l2t_data *); +struct l2t_entry *t4_alloc_l2e(struct l2t_data *); struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *); int t4_l2t_set_switching(struct adapter *, struct l2t_entry *, uint16_t, uint8_t, uint8_t *); -void t4_l2t_release(struct l2t_entry *); +int t4_write_l2e(struct adapter *, struct l2t_entry *, int); +int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); + +static inline void +t4_l2t_release(struct l2t_entry *e) +{ + struct l2t_data *d = container_of(e, struct l2t_data, l2tab[e->idx]); + + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) + atomic_add_int(&d->nfree, 1); +} + + #ifdef SBUF_DRAIN int sysctl_l2t(SYSCTL_HANDLER_ARGS); #endif -#ifndef TCP_OFFLOAD_DISABLE -struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *, - struct sockaddr *); -int t4_l2t_send(struct adapter *, struct mbuf *, struct l2t_entry *); -void t4_l2t_update(struct adapter *, struct llentry *); -#endif - #endif /* __T4_L2T_H */ diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 874a6ad..a91363b 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -119,9 +119,13 @@ static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4 Ethernet driver and services"); +/* + * Correct lock order when you need to acquire multiple locks is t4_list_lock, + * then ADAPTER_LOCK, then t4_uld_list_lock. 
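+ *
+ * A minimal sketch of that order (illustrative only; assumes a thread that
+ * really needs all three):
+ *
+ *	mtx_lock(&t4_list_lock);
+ *	ADAPTER_LOCK(sc);
+ *	mtx_lock(&t4_uld_list_lock);
+ *	...
+ *	mtx_unlock(&t4_uld_list_lock);
+ *	ADAPTER_UNLOCK(sc);
+ *	mtx_unlock(&t4_list_lock);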
+ */ static struct mtx t4_list_lock; static SLIST_HEAD(, adapter) t4_list; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static struct mtx t4_uld_list_lock; static SLIST_HEAD(, uld_info) t4_uld_list; #endif @@ -149,7 +153,7 @@ TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); @@ -237,7 +241,7 @@ struct intrs_and_queues { int nrxq10g; /* # of NIC rxq's for each 10G port */ int ntxq1g; /* # of NIC txq's for each 1G port */ int nrxq1g; /* # of NIC rxq's for each 1G port */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int nofldtxq10g; /* # of TOE txq's for each 10G port */ int nofldrxq10g; /* # of TOE rxq's for each 10G port */ int nofldtxq1g; /* # of TOE txq's for each 1G port */ @@ -297,8 +301,10 @@ static void reg_block_dump(struct adapter *, uint8_t *, unsigned int, unsigned int); static void t4_get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void cxgbe_tick(void *); +static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t); static int cpl_not_handled(struct sge_iq *, const struct rss_header *, struct mbuf *); +static int an_not_handled(struct sge_iq *, const struct rsp_ctrl *); static int t4_sysctls(struct adapter *); static int cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); @@ -342,10 +348,8 @@ static int filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int read_card_mem(struct adapter *, struct t4_mem_range *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *, int); -static int activate_uld(struct adapter *, int, struct uld_softc *); -static int deactivate_uld(struct uld_softc *); #endif static int t4_mod_event(module_t, int, void *); @@ -368,8 +372,12 @@ struct t4_pciids { {0x440a, 4, "Chelsio T404-BT"}, }; -#ifndef TCP_OFFLOAD_DISABLE -/* This is used in service_iq() to get to the fl associated with an iq. */ +#ifdef TCP_OFFLOAD +/* + * service_iq() has an iq and needs the fl. Offset of fl from the iq should be + * exactly the same for both rxq and ofld_rxq. + */ +CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif @@ -401,7 +409,7 @@ t4_attach(device_t dev) int rc = 0, i, n10g, n1g, rqidx, tqidx; struct intrs_and_queues iaq; struct sge *s; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif @@ -436,6 +444,7 @@ t4_attach(device_t dev) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); + sc->an_handler = an_not_handled; for (i = 0; i < ARRAY_SIZE(sc->cpl_handler); i++) sc->cpl_handler[i] = cpl_not_handled; t4_register_cpl_handler(sc, CPL_SET_TCB_RPL, filter_rpl); @@ -595,7 +604,7 @@ t4_attach(device_t dev) s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; @@ -631,7 +640,7 @@ t4_attach(device_t dev) * tx queues that each port should get. 
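 * (Illustrative example: with two ports and 8 rx queues each, port 0 would
 * get rxq indices 0-7 and port 1 indices 8-15.)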
*/ rqidx = tqidx = 0; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif for_each_port(sc, i) { @@ -653,7 +662,7 @@ t4_attach(device_t dev) rqidx += pi->nrxq; tqidx += pi->ntxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { pi->first_ofld_rxq = ofld_rqidx; pi->first_ofld_txq = ofld_tqidx; @@ -761,7 +770,7 @@ t4_detach(device_t dev) if (sc->l2t) t4_free_l2t(sc->l2t); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif @@ -832,7 +841,7 @@ cxgbe_attach(device_t dev) ifp->if_qflush = cxgbe_qflush; ifp->if_capabilities = T4_CAP; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) ifp->if_capabilities |= IFCAP_TOE4; #endif @@ -844,9 +853,12 @@ cxgbe_attach(device_t dev) cxgbe_media_status); build_medialist(pi); + pi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, + EVENTHANDLER_PRI_ANY); + ether_ifattach(ifp, pi->hw_addr); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { device_printf(dev, "%d txq, %d rxq (NIC); %d txq, %d rxq (TOE)\n", @@ -876,6 +888,9 @@ cxgbe_detach(device_t dev) SET_BUSY(sc); ADAPTER_UNLOCK(sc); + if (pi->vlan_c) + EVENTHANDLER_DEREGISTER(vlan_config, pi->vlan_c); + PORT_LOCK(pi); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&pi->tick); @@ -1042,7 +1057,7 @@ fail: } #endif } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; @@ -1292,7 +1307,7 @@ cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, iaq->ntxq1g = t4_ntxq1g; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; @@ -1364,7 +1379,7 @@ restart: n++; } iaq->nrxq10g = min(n, nrxq10g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif } @@ -1379,7 +1394,7 @@ restart: n++; } iaq->nrxq1g = min(n, nrxq1g); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif } @@ -1392,7 +1407,7 @@ restart: * Least desirable option: one interrupt vector for everything. */ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif @@ -2305,7 +2320,7 @@ adapter_full_init(struct adapter *sc) struct irq *irq; struct port_info *pi; struct sge_rxq *rxq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif @@ -2369,7 +2384,7 @@ adapter_full_init(struct adapter *sc) for_each_port(sc, p) { pi = sc->port[p]; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the NIC queues if they aren't taking direct * interrupts. @@ -2386,7 +2401,7 @@ adapter_full_init(struct adapter *sc) rid++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD /* * Skip over the offload queues if they aren't taking * direct interrupts. 
@@ -2494,7 +2509,7 @@ port_full_uninit(struct port_info *pi) int i; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -2507,7 +2522,7 @@ port_full_uninit(struct port_info *pi) quiesce_eq(sc, &txq->eq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { quiesce_eq(sc, &ofld_txq->eq); } @@ -2518,7 +2533,7 @@ port_full_uninit(struct port_info *pi) quiesce_fl(sc, &rxq->fl); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); @@ -2892,14 +2907,27 @@ cxgbe_tick(void *arg) PORT_UNLOCK(pi); } +static void +cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) +{ + struct ifnet *vlan; + + if (arg != ifp) + return; + + vlan = VLAN_DEVAT(ifp, vid); + VLAN_SETCOOKIE(vlan, ifp); +} + static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { + #ifdef INVARIANTS - panic("%s: opcode %02x on iq %p with payload %p", + panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else - log(LOG_ERR, "%s: opcode %02x on iq %p with payload %p", + log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); m_freem(m); #endif @@ -2922,6 +2950,31 @@ t4_register_cpl_handler(struct adapter *sc, int opcode, cpl_handler_t h) } static int +an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) +{ + +#ifdef INVARIANTS + panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); +#else + log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)", + __func__, iq, ctrl); +#endif + return (EDOOFUS); +} + +int +t4_register_an_handler(struct adapter *sc, an_handler_t h) +{ + uintptr_t *loc, new; + + new = h ? (uintptr_t)h : (uintptr_t)an_not_handled; + loc = (uintptr_t *) &sc->an_handler; + atomic_store_rel_ptr(loc, new); + + return (0); +} + +static int t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; @@ -3072,7 +3125,7 @@ t4_sysctls(struct adapter *sc) sysctl_tx_rate, "A", "Tx rate"); #endif -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { /* * dev.t4nex.X.toe. 
@@ -3125,7 +3178,7 @@ cxgbe_sysctls(struct port_info *pi) SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &pi->first_txq, 0, "index of first tx queue"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(pi->adapter)) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &pi->nofldrxq, 0, @@ -4543,7 +4596,7 @@ set_filter_mode(struct adapter *sc, uint32_t mode) goto done; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->offload_map) { rc = EBUSY; goto done; @@ -4734,7 +4787,7 @@ static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4755,12 +4808,11 @@ set_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); @@ -4830,7 +4882,7 @@ set_filter_wr(struct adapter *sc, int fidx) f->pending = 1; sc->tids.ftids_in_use++; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -4838,7 +4890,7 @@ static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; - struct mbuf *m; + struct wrqe *wr; struct fw_filter_wr *fwr; unsigned int ftid; @@ -4846,18 +4898,16 @@ del_filter_wr(struct adapter *sc, int fidx) ftid = sc->tids.ftid_base + fidx; - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) + wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq); + if (wr == NULL) return (ENOMEM); - - fwr = mtod(m, struct fw_filter_wr *); - m->m_len = m->m_pkthdr.len = sizeof(*fwr); + fwr = wrtod(wr); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; - t4_mgmt_tx(sc, m); + t4_wrq_tx(sc, wr); return (0); } @@ -5215,7 +5265,7 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int toe_capability(struct port_info *pi, int enable) { @@ -5228,13 +5278,28 @@ toe_capability(struct port_info *pi, int enable) return (ENODEV); if (enable) { + if (!(sc->flags & FULL_INIT_DONE)) { + log(LOG_WARNING, + "You must enable a cxgbe interface first\n"); + return (EAGAIN); + } + if (isset(&sc->offload_map, pi->port_id)) return (0); - if (sc->offload_map == 0) { - rc = activate_uld(sc, ULD_TOM, &sc->tom); + if (!(sc->flags & TOM_INIT_DONE)) { + rc = t4_activate_uld(sc, ULD_TOM); + if (rc == EAGAIN) { + log(LOG_WARNING, + "You must kldload t4_tom.ko before trying " + "to enable TOE on a cxgbe interface.\n"); + } if (rc != 0) return (rc); + KASSERT(sc->tom_softc != NULL, + ("%s: TOM activated but softc NULL", __func__)); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM activated but flag not set", __func__)); } setbit(&sc->offload_map, pi->port_id); @@ -5242,15 +5307,9 @@ toe_capability(struct port_info *pi, int enable) if (!isset(&sc->offload_map, pi->port_id)) return (0); + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); - - if (sc->offload_map == 0) { - rc = deactivate_uld(&sc->tom); - if (rc != 0) { - setbit(&sc->offload_map, pi->port_id); - return (rc); - } - } } return (0); @@ -5305,8 +5364,8 @@ done: return (rc); } -static int -activate_uld(struct adapter *sc, int id, struct 
uld_softc *usc) +int +t4_activate_uld(struct adapter *sc, int id) { int rc = EAGAIN; struct uld_info *ui; @@ -5315,13 +5374,9 @@ activate_uld(struct adapter *sc, int id, struct uld_softc *usc) SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { - rc = ui->attach(sc, &usc->softc); - if (rc == 0) { - KASSERT(usc->softc != NULL, - ("%s: ULD %d has no state", __func__, id)); + rc = ui->activate(sc); + if (rc == 0) ui->refcount++; - usc->uld = ui; - } goto done; } } @@ -5331,25 +5386,21 @@ done: return (rc); } -static int -deactivate_uld(struct uld_softc *usc) +int +t4_deactivate_uld(struct adapter *sc, int id) { - int rc; + int rc = EINVAL; + struct uld_info *ui; mtx_lock(&t4_uld_list_lock); - if (usc->uld == NULL || usc->softc == NULL) { - rc = EINVAL; - goto done; - } - - rc = usc->uld->detach(usc->softc); - if (rc == 0) { - KASSERT(usc->uld->refcount > 0, - ("%s: ULD has bad refcount", __func__)); - usc->uld->refcount--; - usc->uld = NULL; - usc->softc = NULL; + SLIST_FOREACH(ui, &t4_uld_list, link) { + if (ui->uld_id == id) { + rc = ui->deactivate(sc); + if (rc == 0) + ui->refcount--; + goto done; + } } done: mtx_unlock(&t4_uld_list_lock); @@ -5379,7 +5430,7 @@ tweak_tunables(void) if (t4_nrxq1g < 1) t4_nrxq1g = min(nc, NRXQ_1G); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); @@ -5426,7 +5477,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) t4_sge_modload(); mtx_init(&t4_list_lock, "T4 adapters", 0, MTX_DEF); SLIST_INIT(&t4_list); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_init(&t4_uld_list_lock, "T4 ULDs", 0, MTX_DEF); SLIST_INIT(&t4_uld_list); #endif @@ -5434,7 +5485,7 @@ t4_mod_event(module_t mod, int cmd, void *arg) break; case MOD_UNLOAD: -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD mtx_lock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 8f39f10..92c9212 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -51,7 +52,6 @@ __FBSDID("$FreeBSD$"); #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" -#include "t4_l2t.h" struct fl_buf_info { int size; @@ -115,14 +115,14 @@ static int free_mgmtq(struct adapter *); static int alloc_rxq(struct port_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct port_info *, struct sge_rxq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct port_info *, struct sge_ofld_rxq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct port_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct port_info *, struct sge_eq *); @@ -397,7 +397,7 @@ first_vector(struct port_info *pi) if (i == pi->port_id) break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) rc += pi->nrxq + pi->nofldrxq; else @@ -434,7 +434,7 @@ port_intr_iq(struct port_info *pi, int idx) if (sc->intr_count == 1) return (&sc->sge.fwq); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (sc->flags & INTR_DIRECT) { idx %= pi->nrxq + pi->nofldrxq; @@ -475,19 
+475,20 @@ t4_setup_port_queues(struct port_info *pi) struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; + struct sysctl_oid *oid2 = NULL; #endif char name[16]; struct adapter *sc = pi->adapter; - struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev), *oid2 = NULL; + struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD if (is_offload(sc)) { oid2 = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, @@ -515,7 +516,7 @@ t4_setup_port_queues(struct port_info *pi) init_fl(&rxq->fl, pi->qsize_rxq / 8, pi->ifp->if_mtu, name); if (sc->flags & INTR_DIRECT -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD || (sc->intr_count > 1 && pi->nrxq >= pi->nofldrxq) #endif ) { @@ -527,7 +528,7 @@ t4_setup_port_queues(struct port_info *pi) } } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { snprintf(name, sizeof(name), "%s ofld_rxq%d-iq", @@ -567,7 +568,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) continue; @@ -603,7 +604,7 @@ t4_setup_port_queues(struct port_info *pi) j++; } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(pi, i, ofld_txq) { @@ -655,7 +656,7 @@ t4_teardown_port_queues(struct port_info *pi) struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif @@ -677,7 +678,7 @@ t4_teardown_port_queues(struct port_info *pi) free_txq(pi, txq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_txq(pi, i, ofld_txq) { free_wrq(sc, ofld_txq); } @@ -693,7 +694,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(pi, ofld_rxq); @@ -709,7 +710,7 @@ t4_teardown_port_queues(struct port_info *pi) free_rxq(pi, rxq); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD for_each_ofld_rxq(pi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(pi, ofld_rxq); @@ -775,7 +776,7 @@ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; - struct sge_rxq *rxq = (void *)iq; /* Use iff iq is part of rxq */ + struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl = &rxq->fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct rsp_ctrl *ctrl; @@ -862,7 +863,8 @@ service_iq(struct sge_iq *iq, int budget) break; default: - panic("%s: rsp_type %u", __func__, rsp_type); + sc->an_handler(iq, ctrl); + break; } iq_next(iq); @@ -1076,42 +1078,33 @@ t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) return (0); } -int -t4_mgmt_tx(struct adapter *sc, struct mbuf *m) -{ - return t4_wrq_tx(sc, &sc->sge.mgmtq, m); -} - /* * Doesn't fail. Holds on to work requests it can't send right away. 
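 * Deferred requests stay on wrq->wr_list; the EQ is marked EQ_STALLED and
 * the tx callout retries once enough descriptors have been reclaimed.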
*/ -int -t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) +void +t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { struct sge_eq *eq = &wrq->eq; int can_reclaim; caddr_t dst; - struct mbuf *wr, *next; TXQ_LOCK_ASSERT_OWNED(wrq); +#ifdef TCP_OFFLOAD KASSERT((eq->flags & EQ_TYPEMASK) == EQ_OFLD || (eq->flags & EQ_TYPEMASK) == EQ_CTRL, ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#else + KASSERT((eq->flags & EQ_TYPEMASK) == EQ_CTRL, + ("%s: eq type %d", __func__, eq->flags & EQ_TYPEMASK)); +#endif - if (__predict_true(m0 != NULL)) { - if (wrq->head) - wrq->tail->m_nextpkt = m0; - else - wrq->head = m0; - while (m0->m_nextpkt) - m0 = m0->m_nextpkt; - wrq->tail = m0; - } + if (__predict_true(wr != NULL)) + STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); can_reclaim = reclaimable(eq); if (__predict_false(eq->flags & EQ_STALLED)) { if (can_reclaim < tx_resume_threshold(eq)) - return (0); + return; eq->flags &= ~EQ_STALLED; eq->unstalled++; } @@ -1120,39 +1113,34 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (__predict_false(eq->cidx >= eq->cap)) eq->cidx -= eq->cap; - for (wr = wrq->head; wr; wr = next) { + while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL) { int ndesc; - struct mbuf *m; - next = wr->m_nextpkt; - wr->m_nextpkt = NULL; + if (__predict_false(wr->wr_len < 0 || + wr->wr_len > SGE_MAX_WR_LEN || (wr->wr_len & 0x7))) { - M_ASSERTPKTHDR(wr); - KASSERT(wr->m_pkthdr.len > 0 && (wr->m_pkthdr.len & 0x7) == 0, - ("%s: work request len %d.", __func__, wr->m_pkthdr.len)); - - if (wr->m_pkthdr.len > SGE_MAX_WR_LEN) { #ifdef INVARIANTS - panic("%s: oversized work request", __func__); -#else - log(LOG_ERR, "%s: %s work request too long (%d)", - device_get_nameunit(sc->dev), __func__, - wr->m_pkthdr.len); - m_freem(wr); - continue; + panic("%s: work request with length %d", __func__, + wr->wr_len); #endif +#ifdef KDB + kdb_backtrace(); +#endif + log(LOG_ERR, "%s: %s work request with length %d", + device_get_nameunit(sc->dev), __func__, wr->wr_len); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); + continue; } - ndesc = howmany(wr->m_pkthdr.len, EQ_ESIZE); + ndesc = howmany(wr->wr_len, EQ_ESIZE); if (eq->avail < ndesc) { - wr->m_nextpkt = next; wrq->no_desc++; break; } dst = (void *)&eq->desc[eq->pidx]; - for (m = wr; m; m = m->m_next) - copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); + copy_to_txd(eq, wrtod(wr), &dst, wr->wr_len); eq->pidx += ndesc; eq->avail -= ndesc; @@ -1164,7 +1152,8 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) ring_eq_db(sc, eq); wrq->tx_wrs++; - m_freem(wr); + STAILQ_REMOVE_HEAD(&wrq->wr_list, link); + free_wrqe(wr); if (eq->avail < 8) { can_reclaim = reclaimable(eq); @@ -1178,20 +1167,11 @@ t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct mbuf *m0) if (eq->pending) ring_eq_db(sc, eq); - if (wr == NULL) - wrq->head = wrq->tail = NULL; - else { - wrq->head = wr; - - KASSERT(wrq->tail->m_nextpkt == NULL, - ("%s: wrq->tail grew a tail of its own", __func__)); - + if (wr != NULL) { eq->flags |= EQ_STALLED; if (callout_pending(&eq->tx_callout) == 0) callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq); } - - return (0); } /* Per-packet header in a coalesced tx WR, before the SGL starts (in flits) */ @@ -1792,6 +1772,7 @@ alloc_mgmtq(struct adapter *sc) static int free_mgmtq(struct adapter *sc) { + return free_wrq(sc, &sc->sge.mgmtq); } @@ -1885,7 +1866,7 @@ free_rxq(struct port_info *pi, struct sge_rxq *rxq) return (rc); 
} -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct port_info *pi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) @@ -2031,7 +2012,7 @@ eth_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) return (rc); } -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) { @@ -2103,7 +2084,7 @@ alloc_eq(struct adapter *sc, struct port_info *pi, struct sge_eq *eq) rc = eth_eq_alloc(sc, pi, eq); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, pi, eq); break; @@ -2141,7 +2122,7 @@ free_eq(struct adapter *sc, struct sge_eq *eq) eq->cntxt_id); break; -#ifndef TCP_OFFLOAD_DISABLE +#ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); @@ -2183,6 +2164,7 @@ alloc_wrq(struct adapter *sc, struct port_info *pi, struct sge_wrq *wrq, return (rc); wrq->adapter = sc; + STAILQ_INIT(&wrq->wr_list); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); @@ -3179,7 +3161,7 @@ write_sgl_to_txd(struct sge_eq *eq, struct sgl *sgl, caddr_t *to) static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { - if ((uintptr_t)(*to) + len <= (uintptr_t)eq->spg) { + if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)eq->spg)) { bcopy(from, *to, len); (*to) += len; } else { diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c new file mode 100644 index 0000000..bc59171 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#ifdef TCP_OFFLOAD
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#define TCPSTATES
+#include
+#include
+
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* atid services */
+static int alloc_atid(struct adapter *, void *);
+static void *lookup_atid(struct adapter *, int);
+static void free_atid(struct adapter *, int);
+
+static int
+alloc_atid(struct adapter *sc, void *ctx)
+{
+	struct tid_info *t = &sc->tids;
+	int atid = -1;
+
+	mtx_lock(&t->atid_lock);
+	if (t->afree) {
+		union aopen_entry *p = t->afree;
+
+		atid = p - t->atid_tab;
+		t->afree = p->next;
+		p->data = ctx;
+		t->atids_in_use++;
+	}
+	mtx_unlock(&t->atid_lock);
+	return (atid);
+}
+
+static void *
+lookup_atid(struct adapter *sc, int atid)
+{
+	struct tid_info *t = &sc->tids;
+
+	return (t->atid_tab[atid].data);
+}
+
+static void
+free_atid(struct adapter *sc, int atid)
+{
+	struct tid_info *t = &sc->tids;
+	union aopen_entry *p = &t->atid_tab[atid];
+
+	mtx_lock(&t->atid_lock);
+	p->next = t->afree;
+	t->afree = p;
+	t->atids_in_use--;
+	mtx_unlock(&t->atid_lock);
+}
+
+/*
+ * Active open succeeded.
+ */
+static int
+do_act_establish(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_act_establish *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	unsigned int atid = G_TID_TID(ntohl(cpl->tos_atid));
+	struct toepcb *toep = lookup_atid(sc, atid);
+	struct inpcb *inp = toep->inp;
+
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__));
+
+	CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid);
+	free_atid(sc, atid);
+
+	INP_WLOCK(inp);
+	toep->tid = tid;
+	insert_tid(sc, tid, toep);
+	if (inp->inp_flags & INP_DROPPED) {
+
+		/* socket closed by the kernel before hw told us it connected */
+
+		send_flowc_wr(toep, NULL);
+		send_reset(sc, toep, be32toh(cpl->snd_isn));
+		goto done;
+	}
+
+	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
+done:
+	INP_WUNLOCK(inp);
+	return (0);
+}
+
+static inline int
+act_open_has_tid(unsigned int status)
+{
+
+	return (status != CPL_ERR_TCAM_FULL &&
+	    status != CPL_ERR_TCAM_PARITY &&
+	    status != CPL_ERR_CONN_EXIST &&
+	    status != CPL_ERR_ARP_MISS);
+}
+
+/*
+ * Convert an ACT_OPEN_RPL status to an errno.
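+ * (Note: the caller special-cases CPL_ERR_TCAM_FULL and reports EAGAIN
+ * instead of using this table; see do_act_open_rpl below.)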
+ */ +static inline int +act_open_rpl_status_to_errno(int status) +{ + + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static int +do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); + unsigned int atid = G_TID_TID(G_AOPEN_ATID(be32toh(cpl->atid_status))); + unsigned int status = G_AOPEN_STATUS(be32toh(cpl->atid_status)); + struct toepcb *toep = lookup_atid(sc, atid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp = intotcpcb(inp); + struct toedev *tod = &toep->td->tod; + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == atid, ("%s: toep tid/atid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: atid %u, status %u ", __func__, atid, status); + + /* Ignore negative advice */ + if (status == CPL_ERR_RTX_NEG_ADVICE) + return (0); + + free_atid(sc, atid); + toep->tid = -1; + + if (status && act_open_has_tid(status)) + release_tid(sc, GET_TID(cpl), toep->ctrlq); + + if (status == CPL_ERR_TCAM_FULL) { + INP_WLOCK(inp); + toe_connect_failed(tod, tp, EAGAIN); + final_cpl_received(toep); /* unlocks inp */ + } else { + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + toe_connect_failed(tod, tp, act_open_rpl_status_to_errno(status)); + final_cpl_received(toep); /* unlocks inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); + } + + return (0); +} + +/* + * Options2 for active open. + */ +static uint32_t +calc_opt2a(struct socket *so) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct port_info *pi = toep->port; + struct adapter *sc = pi->adapter; + uint32_t opt2 = 0; + + if (tp->t_flags & TF_SACK_PERMIT) + opt2 |= F_SACK_EN; + + if (tp->t_flags & TF_REQ_TSTMP) + opt2 |= F_TSTAMPS_EN; + + if (tp->t_flags & TF_REQ_SCALE) + opt2 |= F_WND_SCALE_EN; + + if (V_tcp_do_ecn) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); + + return (htobe32(opt2)); +} + + +void +t4_init_connect_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_ACT_ESTABLISH, do_act_establish); + t4_register_cpl_handler(sc, CPL_ACT_OPEN_RPL, do_act_open_rpl); +} + +/* + * active open (soconnect). + * + * State of affairs on entry: + * soisconnecting (so_state |= SS_ISCONNECTING) + * tcbinfo not locked (This has changed - used to be WLOCKed) + * inp WLOCKed + * tp->t_state = TCPS_SYN_SENT + * rtalloc1, RT_UNLOCK on rt. 
+ */ +int +t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, + struct sockaddr *nam) +{ + struct adapter *sc = tod->tod_softc; + struct toepcb *toep = NULL; + struct wrqe *wr = NULL; + struct cpl_act_open_req *cpl; + struct l2t_entry *e = NULL; + struct ifnet *rt_ifp = rt->rt_ifp; + struct port_info *pi; + int atid = -1, mtu_idx, rscale, qid_atid, rc = ENOMEM; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + + INP_WLOCK_ASSERT(inp); + + if (nam->sa_family != AF_INET) + CXGBE_UNIMPLEMENTED("IPv6 connect"); + + if (rt_ifp->if_type == IFT_ETHER) + pi = rt_ifp->if_softc; + else if (rt_ifp->if_type == IFT_L2VLAN) { + struct ifnet *ifp = VLAN_COOKIE(rt_ifp); + + pi = ifp->if_softc; + } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) + return (ENOSYS); /* XXX: implement lagg support */ + else + return (ENOTSUP); + + toep = alloc_toepcb(pi, -1, -1, M_NOWAIT); + if (toep == NULL) + goto failed; + + atid = alloc_atid(sc, toep); + if (atid < 0) + goto failed; + + e = t4_l2t_get(pi, rt_ifp, + rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); + if (e == NULL) + goto failed; + + wr = alloc_wrqe(sizeof(*cpl), toep->ctrlq); + if (wr == NULL) + goto failed; + cpl = wrtod(wr); + + toep->tid = atid; + toep->l2te = e; + toep->ulp_mode = ULP_MODE_NONE; + SOCKBUF_LOCK(&so->so_rcv); + /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ + toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + SOCKBUF_UNLOCK(&so->so_rcv); + + offload_socket(so, toep); + + /* + * The kernel sets request_r_scale based on sb_max whereas we need to + * take hardware's MAX_RCV_WND into account too. This is normally a + * no-op as MAX_RCV_WND is much larger than the default sb_max. + */ + if (tp->t_flags & TF_REQ_SCALE) + rscale = tp->request_r_scale = select_rcv_wscale(); + else + rscale = 0; + mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); + qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | atid; + + INIT_TP_WR(cpl, 0); + OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); + inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, + &cpl->peer_port); + cpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, toep->rx_credits, + toep->ulp_mode); + cpl->params = select_ntuple(pi, e, sc->filter_mode); + cpl->opt2 = calc_opt2a(so); + + CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, + toep->tid, tcpstates[tp->t_state], toep, inp); + + rc = t4_l2t_send(sc, wr, e); + if (rc == 0) { + toepcb_set_flag(toep, TPF_CPL_PENDING); + return (0); + } + + undo_offload_socket(so); +failed: + CTR5(KTR_CXGBE, "%s: FAILED, atid %d, toep %p, l2te %p, wr %p", + __func__, atid, toep, e, wr); + + if (e) + t4_l2t_release(e); + if (wr) + free_wrqe(wr); + if (atid >= 0) + free_atid(sc, atid); + if (toep) + free_toepcb(toep); + + return (rc); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c new file mode 100644 index 0000000..161fc12 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -0,0 +1,1276 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define TCPSTATES +#include +#include +#include + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +VNET_DECLARE(int, tcp_do_autosndbuf); +#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) +VNET_DECLARE(int, tcp_autosndbuf_inc); +#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) +VNET_DECLARE(int, tcp_autosndbuf_max); +#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +VNET_DECLARE(int, tcp_do_autorcvbuf); +#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) +VNET_DECLARE(int, tcp_autorcvbuf_inc); +#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) +VNET_DECLARE(int, tcp_autorcvbuf_max); +#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) + +void +send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp) +{ + struct wrqe *wr; + struct fw_flowc_wr *flowc; + unsigned int nparams = ftxp ? 
8 : 4, flowclen; + struct port_info *pi = toep->port; + struct adapter *sc = pi->adapter; + unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN; + struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; + + KASSERT(!toepcb_flag(toep, TPF_FLOWC_WR_SENT), + ("%s: flowc for tid %u sent already", __func__, toep->tid)); + + CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid); + + flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); + + wr = alloc_wrqe(roundup(flowclen, 16), toep->ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + flowc = wrtod(wr); + memset(flowc, 0, wr->wr_len); + + flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | + V_FW_FLOWC_WR_NPARAMS(nparams)); + flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | + V_FW_WR_FLOWID(toep->tid)); + + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htobe32(pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htobe32(pi->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htobe32(pi->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htobe32(toep->ofld_rxq->iq.abs_id); + if (ftxp) { + uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf); + + flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT; + flowc->mnemval[4].val = htobe32(ftxp->snd_nxt); + flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT; + flowc->mnemval[5].val = htobe32(ftxp->rcv_nxt); + flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF; + flowc->mnemval[6].val = htobe32(sndbuf); + flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; + flowc->mnemval[7].val = htobe32(ftxp->mss); + } + + txsd->tx_credits = howmany(flowclen, 16); + txsd->plen = 0; + KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, + ("%s: not enough credits (%d)", __func__, toep->tx_credits)); + toep->tx_credits -= txsd->tx_credits; + if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) + toep->txsd_pidx = 0; + toep->txsd_avail--; + + toepcb_set_flag(toep, TPF_FLOWC_WR_SENT); + t4_wrq_tx(sc, wr); +} + +void +send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) +{ + struct wrqe *wr; + struct cpl_abort_req *req; + int tid = toep->tid; + struct inpcb *inp = toep->inp; + struct tcpcb *tp = intotcpcb(inp); /* don't use if INP_DROPPED */ + + INP_WLOCK_ASSERT(inp); + + CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s", + __func__, toep->tid, + inp->inp_flags & INP_DROPPED ? "inp dropped" : + tcpstates[tp->t_state], + toep->flags, inp->inp_flags, + toepcb_flag(toep, TPF_ABORT_SHUTDOWN) ? + " (abort already in progress)" : ""); + + if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) + return; /* abort already in progress */ + + toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN); + + KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT), + ("%s: flowc_wr not sent for tid %d.", __func__, tid)); + + wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + req = wrtod(wr); + + INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid); + if (inp->inp_flags & INP_DROPPED) + req->rsvd0 = htobe32(snd_nxt); + else + req->rsvd0 = htobe32(tp->snd_nxt); + req->rsvd1 = !toepcb_flag(toep, TPF_TX_DATA_SENT); + req->cmd = CPL_ABORT_SEND_RST; + + /* + * XXX: What's the correct way to tell that the inp hasn't been detached + * from its socket? Should I even be flushing the snd buffer here? 
+ */ + if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { + struct socket *so = inp->inp_socket; + + if (so != NULL) /* because I'm not sure. See comment above */ + sbflush(&so->so_snd); + } + + t4_l2t_send(sc, wr, toep->l2te); +} + +/* + * Called when a connection is established to translate the TCP options + * reported by HW to FreeBSD's native format. + */ +static void +assign_rxopt(struct tcpcb *tp, unsigned int opt) +{ + struct toepcb *toep = tp->t_toe; + struct adapter *sc = td_adapter(toep->td); + + INP_LOCK_ASSERT(tp->t_inpcb); + + tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40; + + if (G_TCPOPT_TSTAMP(opt)) { + tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ + tp->ts_recent = 0; /* hmmm */ + tp->ts_recent_age = tcp_ts_getticks(); + tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; + } + + if (G_TCPOPT_SACK(opt)) + tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ + else + tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */ + + if (G_TCPOPT_WSCALE_OK(opt)) + tp->t_flags |= TF_RCVD_SCALE; + + /* Doing window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == + (TF_RCVD_SCALE | TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + tp->snd_scale = G_TCPOPT_SND_WSCALE(opt); + } +} + +/* + * Completes some final bits of initialization for just established connections + * and changes their state to TCPS_ESTABLISHED. + * + * The ISNs are from after the exchange of SYNs. i.e., the true ISN + 1. + */ +void +make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, + uint16_t opt) +{ + struct inpcb *inp = toep->inp; + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); + long bufsize; + uint32_t iss = be32toh(snd_isn) - 1; /* true ISS */ + uint32_t irs = be32toh(rcv_isn) - 1; /* true IRS */ + uint16_t tcpopt = be16toh(opt); + struct flowc_tx_params ftxp; + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_state == TCPS_SYN_SENT || + tp->t_state == TCPS_SYN_RECEIVED, + ("%s: TCP state %s", __func__, tcpstates[tp->t_state])); + + CTR4(KTR_CXGBE, "%s: tid %d, toep %p, inp %p", + __func__, toep->tid, toep, inp); + + tp->t_state = TCPS_ESTABLISHED; + tp->t_starttime = ticks; + TCPSTAT_INC(tcps_connects); + + tp->irs = irs; + tcp_rcvseqinit(tp); + tp->rcv_wnd = toep->rx_credits << 10; + tp->rcv_adv += tp->rcv_wnd; + tp->last_ack_sent = tp->rcv_nxt; + + /* + * If we were unable to send all rx credits via opt0, save the remainder + * in rx_credits so that they can be handed over with the next credit + * update. 
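+ * Credit updates are returned to the chip as CPL_RX_DATA_ACK messages;
+ * see send_rx_credits() and t4_rcvd() below.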
+ */ + SOCKBUF_LOCK(&so->so_rcv); + bufsize = select_rcv_wnd(so); + SOCKBUF_UNLOCK(&so->so_rcv); + toep->rx_credits = bufsize - tp->rcv_wnd; + + tp->iss = iss; + tcp_sendseqinit(tp); + tp->snd_una = iss + 1; + tp->snd_nxt = iss + 1; + tp->snd_max = iss + 1; + + assign_rxopt(tp, tcpopt); + + SOCKBUF_LOCK(&so->so_snd); + if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf) + bufsize = V_tcp_autosndbuf_max; + else + bufsize = sbspace(&so->so_snd); + SOCKBUF_UNLOCK(&so->so_snd); + + ftxp.snd_nxt = tp->snd_nxt; + ftxp.rcv_nxt = tp->rcv_nxt; + ftxp.snd_space = bufsize; + ftxp.mss = tp->t_maxseg; + send_flowc_wr(toep, &ftxp); + + soisconnected(so); +} + +static int +send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits) +{ + struct wrqe *wr; + struct cpl_rx_data_ack *req; + uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + wr = alloc_wrqe(sizeof(*req), toep->ctrlq); + if (wr == NULL) + return (0); + req = wrtod(wr); + + INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid); + req->credit_dack = htobe32(dack | V_RX_CREDITS(credits)); + + t4_wrq_tx(sc, wr); + return (credits); +} + +void +t4_rcvd(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct sockbuf *so_rcv = &so->so_rcv; + struct toepcb *toep = tp->t_toe; + int must_send; + + INP_WLOCK_ASSERT(inp); + + SOCKBUF_LOCK(so_rcv); + KASSERT(toep->enqueued >= so_rcv->sb_cc, + ("%s: so_rcv->sb_cc > enqueued", __func__)); + toep->rx_credits += toep->enqueued - so_rcv->sb_cc; + toep->enqueued = so_rcv->sb_cc; + SOCKBUF_UNLOCK(so_rcv); + + must_send = toep->rx_credits + 16384 >= tp->rcv_wnd; + if (must_send || toep->rx_credits >= 15 * 1024) { + int credits; + + credits = send_rx_credits(sc, toep, toep->rx_credits); + toep->rx_credits -= credits; + tp->rcv_wnd += credits; + tp->rcv_adv += credits; + } +} + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. + */ +static int +close_conn(struct adapter *sc, struct toepcb *toep) +{ + struct wrqe *wr; + struct cpl_close_con_req *req; + unsigned int tid = toep->tid; + + CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid, + toepcb_flag(toep, TPF_FIN_SENT) ? ", IGNORED" : ""); + + if (toepcb_flag(toep, TPF_FIN_SENT)) + return (0); + + KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT), + ("%s: flowc_wr not sent for tid %u.", __func__, tid)); + + wr = alloc_wrqe(sizeof(*req), toep->ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + req = wrtod(wr); + + req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | + V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr))); + req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) | + V_FW_WR_FLOWID(tid)); + req->wr.wr_lo = cpu_to_be64(0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = 0; + + toepcb_set_flag(toep, TPF_FIN_SENT); + toepcb_clr_flag(toep, TPF_SEND_FIN); + t4_l2t_send(sc, wr, toep->l2te); + + return (0); +} + +#define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16) +#define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16)) + +/* Maximum amount of immediate data we could stuff in a WR */ +static inline int +max_imm_payload(int tx_credits) +{ + const int n = 2; /* Use only up to 2 desc for imm. 
data WR */ + + KASSERT(tx_credits >= 0 && + tx_credits <= MAX_OFLD_TX_CREDITS, + ("%s: %d credits", __func__, tx_credits)); + + if (tx_credits < MIN_OFLD_TX_CREDITS) + return (0); + + if (tx_credits >= (n * EQ_ESIZE) / 16) + return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr)); + else + return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr)); +} + +/* Maximum number of SGL entries we could stuff in a WR */ +static inline int +max_dsgl_nsegs(int tx_credits) +{ + int nseg = 1; /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */ + int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS; + + KASSERT(tx_credits >= 0 && + tx_credits <= MAX_OFLD_TX_CREDITS, + ("%s: %d credits", __func__, tx_credits)); + + if (tx_credits < MIN_OFLD_TX_CREDITS) + return (0); + + nseg += 2 * (sge_pair_credits * 16 / 24); + if ((sge_pair_credits * 16) % 24 == 16) + nseg++; + + return (nseg); +} + +static inline void +write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen, + unsigned int plen, uint8_t credits, int more_to_come) +{ + struct fw_ofld_tx_data_wr *txwr = dst; + int shove = !more_to_come; + int compl = 1; + + /* + * We always request completion notifications from the firmware. The + * only exception is when we know we'll get more data to send shortly + * and that we'll have some tx credits remaining to transmit that data. + */ + if (more_to_come && toep->tx_credits - credits >= MIN_OFLD_TX_CREDITS) + compl = 0; + + txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) | + V_FW_WR_COMPL(compl) | V_FW_WR_IMMDLEN(immdlen)); + txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) | + V_FW_WR_LEN16(credits)); + txwr->tunnel_to_proxy = + htobe32(V_FW_OFLD_TX_DATA_WR_ULPMODE(toep->ulp_mode) | + V_FW_OFLD_TX_DATA_WR_URGENT(0) | /* XXX */ + V_FW_OFLD_TX_DATA_WR_SHOVE(shove)); + txwr->plen = htobe32(plen); +} + +/* + * Generate a DSGL from a starting mbuf. The total number of segments and the + * maximum segments in any one mbuf are provided. + */ +static void +write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n) +{ + struct mbuf *m; + struct ulptx_sgl *usgl = dst; + int i, j, rc; + struct sglist sg; + struct sglist_seg segs[n]; + + KASSERT(nsegs > 0, ("%s: nsegs 0", __func__)); + + sglist_init(&sg, n, segs); + usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | + V_ULPTX_NSGE(nsegs)); + + i = -1; + for (m = start; m != stop; m = m->m_next) { + rc = sglist_append(&sg, mtod(m, void *), m->m_len); + if (__predict_false(rc != 0)) + panic("%s: sglist_append %d", __func__, rc); + + for (j = 0; j < sg.sg_nseg; i++, j++) { + if (i < 0) { + usgl->len0 = htobe32(segs[j].ss_len); + usgl->addr0 = htobe64(segs[j].ss_paddr); + } else { + usgl->sge[i / 2].len[i & 1] = + htobe32(segs[j].ss_len); + usgl->sge[i / 2].addr[i & 1] = + htobe64(segs[j].ss_paddr); + } +#ifdef INVARIANTS + nsegs--; +#endif + } + sglist_reset(&sg); + } + if (i & 1) + usgl->sge[i / 2].len[1] = htobe32(0); + KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p", + __func__, nsegs, start, stop)); +} + +/* + * Max number of SGL entries an offload tx work request can have. This is 41 + * (1 + 40) for a full 512B work request. + * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40) + */ +#define OFLD_SGL_LEN (41) + +/* + * Send data and/or a FIN to the peer. + * + * The socket's so_snd buffer consists of a stream of data starting with sb_mb + * and linked together with m_next. sb_sndptr, if set, is the last mbuf that + * was transmitted. 
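+ *
+ * Each trip around the loop below gathers as many unsent mbufs as the
+ * tx credits on hand allow and ships them out, either as immediate
+ * data within the work request itself (small payloads) or as a DSGL
+ * (everything else).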
+ */ +static void +t4_push_frames(struct adapter *sc, struct toepcb *toep) +{ + struct mbuf *sndptr, *m, *sb_sndptr; + struct fw_ofld_tx_data_wr *txwr; + struct wrqe *wr; + unsigned int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf; + struct inpcb *inp = toep->inp; + struct tcpcb *tp = intotcpcb(inp); + struct socket *so = inp->inp_socket; + struct sockbuf *sb = &so->so_snd; + int tx_credits; + struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; + + INP_WLOCK_ASSERT(inp); + KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT), + ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); + + if (toep->ulp_mode != ULP_MODE_NONE) + CXGBE_UNIMPLEMENTED("ulp_mode"); + + /* + * This function doesn't resume by itself. Someone else must clear the + * flag and call this function. + */ + if (__predict_false(toepcb_flag(toep, TPF_TX_SUSPENDED))) + return; + + do { + tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); + max_imm = max_imm_payload(tx_credits); + max_nsegs = max_dsgl_nsegs(tx_credits); + + SOCKBUF_LOCK(sb); + sb_sndptr = sb->sb_sndptr; + sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb; + plen = 0; + nsegs = 0; + max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */ + for (m = sndptr; m != NULL; m = m->m_next) { + int n = sglist_count(mtod(m, void *), m->m_len); + + nsegs += n; + plen += m->m_len; + + /* This mbuf sent us _over_ the nsegs limit, back out */ + if (plen > max_imm && nsegs > max_nsegs) { + nsegs -= n; + plen -= m->m_len; + if (plen == 0) { + /* Too few credits */ + toepcb_set_flag(toep, TPF_TX_SUSPENDED); + SOCKBUF_UNLOCK(sb); + return; + } + break; + } + + if (max_nsegs_1mbuf < n) + max_nsegs_1mbuf = n; + sb_sndptr = m; /* new sb->sb_sndptr if all goes well */ + + /* This mbuf put us right at the max_nsegs limit */ + if (plen > max_imm && nsegs == max_nsegs) { + m = m->m_next; + break; + } + } + + if (sb->sb_flags & SB_AUTOSIZE && + V_tcp_do_autosndbuf && + sb->sb_hiwat < V_tcp_autosndbuf_max && + sbspace(sb) < sb->sb_hiwat / 8 * 7) { + int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc, + V_tcp_autosndbuf_max); + + if (!sbreserve_locked(sb, newsize, so, NULL)) + sb->sb_flags &= ~SB_AUTOSIZE; + else { + sowwakeup_locked(so); /* room available */ + SOCKBUF_UNLOCK_ASSERT(sb); + goto unlocked; + } + } + SOCKBUF_UNLOCK(sb); +unlocked: + + /* nothing to send */ + if (plen == 0) { + KASSERT(m == NULL, + ("%s: nothing to send, but m != NULL", __func__)); + break; + } + + if (__predict_false(toepcb_flag(toep, TPF_FIN_SENT))) + panic("%s: excess tx.", __func__); + + if (plen <= max_imm) { + + /* Immediate data tx */ + + wr = alloc_wrqe(roundup(sizeof(*txwr) + plen, 16), + toep->ofld_txq); + if (wr == NULL) { + /* XXX: how will we recover from this? */ + toepcb_set_flag(toep, TPF_TX_SUSPENDED); + return; + } + txwr = wrtod(wr); + credits = howmany(wr->wr_len, 16); + write_tx_wr(txwr, toep, plen, plen, credits, + tp->t_flags & TF_MORETOCOME); + m_copydata(sndptr, 0, plen, (void *)(txwr + 1)); + } else { + int wr_len; + + /* DSGL tx */ + + wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) + + ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8; + wr = alloc_wrqe(roundup(wr_len, 16), toep->ofld_txq); + if (wr == NULL) { + /* XXX: how will we recover from this? 
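+ * Presumably via do_fw4_ack, which clears TPF_TX_SUSPENDED and calls
+ * t4_push_frames again once enough credits have been returned.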
*/ + toepcb_set_flag(toep, TPF_TX_SUSPENDED); + return; + } + txwr = wrtod(wr); + credits = howmany(wr_len, 16); + write_tx_wr(txwr, toep, 0, plen, credits, + tp->t_flags & TF_MORETOCOME); + write_tx_sgl(txwr + 1, sndptr, m, nsegs, + max_nsegs_1mbuf); + if (wr_len & 0xf) { + uint64_t *pad = (uint64_t *) + ((uintptr_t)txwr + wr_len); + *pad = 0; + } + } + + KASSERT(toep->tx_credits >= credits, + ("%s: not enough credits", __func__)); + + toep->tx_credits -= credits; + + tp->snd_nxt += plen; + tp->snd_max += plen; + + SOCKBUF_LOCK(sb); + KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__)); + sb->sb_sndptr = sb_sndptr; + SOCKBUF_UNLOCK(sb); + + toepcb_set_flag(toep, TPF_TX_DATA_SENT); + + KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__)); + txsd->plen = plen; + txsd->tx_credits = credits; + txsd++; + if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) { + toep->txsd_pidx = 0; + txsd = &toep->txsd[0]; + } + toep->txsd_avail--; + + t4_l2t_send(sc, wr, toep->l2te); + } while (m != NULL); + + /* Send a FIN if requested, but only if there's no more data to send */ + if (m == NULL && toepcb_flag(toep, TPF_SEND_FIN)) + close_conn(sc, toep); +} + +int +t4_tod_output(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; +#ifdef INVARIANTS + struct inpcb *inp = tp->t_inpcb; +#endif + struct toepcb *toep = tp->t_toe; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("%s: inp %p dropped.", __func__, inp)); + KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); + + t4_push_frames(sc, toep); + + return (0); +} + +int +t4_send_fin(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; +#ifdef INVARIANTS + struct inpcb *inp = tp->t_inpcb; +#endif + struct toepcb *toep = tp->t_toe; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("%s: inp %p dropped.", __func__, inp)); + KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); + + toepcb_set_flag(toep, TPF_SEND_FIN); + t4_push_frames(sc, toep); + + return (0); +} + +int +t4_send_rst(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; +#if defined(INVARIANTS) + struct inpcb *inp = tp->t_inpcb; +#endif + struct toepcb *toep = tp->t_toe; + + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_DROPPED) == 0, + ("%s: inp %p dropped.", __func__, inp)); + KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); + + /* hmmmm */ + KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT), + ("%s: flowc for tid %u [%s] not sent already", + __func__, toep->tid, tcpstates[tp->t_state])); + + send_reset(sc, toep, 0); + return (0); +} + +/* + * Peer has sent us a FIN. + */ +static int +do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_peer_close *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp = NULL; + struct socket *so = NULL; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PEER_CLOSE, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = intotcpcb(inp); + + CTR5(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x, inp %p", __func__, + tid, tp ? 
tcpstates[tp->t_state] : "no tp", toep->flags, inp); + + if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) + goto done; + + so = inp->inp_socket; + + socantrcvmore(so); + tp->rcv_nxt++; /* FIN */ + KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), + ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, + be32toh(cpl->rcv_nxt))); + + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + case TCPS_FIN_WAIT_2: + tcp_twstart(tp); + INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); + + INP_WLOCK(inp); + final_cpl_received(toep); + return (0); + + default: + log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n", + __func__, tid, tp->t_state); + } +done: + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (0); +} + +/* + * Peer has ACK'd our FIN. + */ +static int +do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp = NULL; + struct socket *so = NULL; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_CLOSE_CON_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = intotcpcb(inp); + + CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x", + __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags); + + if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) + goto done; + + so = inp->inp_socket; + tp->snd_una = be32toh(cpl->snd_nxt) - 1; /* exclude FIN */ + + switch (tp->t_state) { + case TCPS_CLOSING: /* see TCPS_FIN_WAIT_2 in do_peer_close too */ + tcp_twstart(tp); +release: + INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ + INP_INFO_WUNLOCK(&V_tcbinfo); + + INP_WLOCK(inp); + final_cpl_received(toep); /* no more CPLs expected */ + + return (0); + case TCPS_LAST_ACK: + if (tcp_close(tp)) + INP_WUNLOCK(inp); + goto release; + + case TCPS_FIN_WAIT_1: + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + soisdisconnected(so); + tp->t_state = TCPS_FIN_WAIT_2; + break; + + default: + log(LOG_ERR, + "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n", + __func__, tid, tcpstates[tp->t_state]); + } +done: + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (0); +} + +void +send_abort_rpl(struct adapter *sc, struct sge_wrq *ofld_txq, int tid, + int rst_status) +{ + struct wrqe *wr; + struct cpl_abort_rpl *cpl; + + wr = alloc_wrqe(sizeof(*cpl), ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + cpl = wrtod(wr); + + INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid); + cpl->cmd = rst_status; + + t4_wrq_tx(sc, wr); +} + +static int +abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason) +{ + switch (abort_reason) { + case CPL_ERR_BAD_SYN: + case CPL_ERR_CONN_RESET: + return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: + return (ETIMEDOUT); + default: + return (EIO); + } +} + +/* + * TCP RST from the peer, timeout, or some other such critical error. + */ +static int +do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct sge_wrq *ofld_txq = toep->ofld_txq; + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_REQ_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + + if (toepcb_flag(toep, TPF_SYNQE)) + return (do_abort_req_synqe(iq, rss, m)); + + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + if (cpl->status == CPL_ERR_RTX_NEG_ADVICE || + cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) { + CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)", + __func__, cpl->status, tid, toep->flags); + return (0); /* Ignore negative advice */ + } + + inp = toep->inp; + INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_WLOCK(inp); + + tp = intotcpcb(inp); + so = inp->inp_socket; + + CTR6(KTR_CXGBE, + "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d", + __func__, tid, tcpstates[tp->t_state], toep->flags, inp->inp_flags, + cpl->status); + + /* + * If we'd initiated an abort earlier the reply to it is responsible for + * cleaning up resources. Otherwise we tear everything down right here + * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
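+ * In the teardown case the socket picks up the translated error,
+ * tcp_close drops the connection, and final_cpl_received releases the
+ * offload state; the CPL_ABORT_RPL goes out at "done" either way.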
+ */ + if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) { + INP_WUNLOCK(inp); + goto done; + } + toepcb_set_flag(toep, TPF_ABORT_SHUTDOWN); + + so_error_set(so, abort_status_to_errno(tp, cpl->status)); + tp = tcp_close(tp); + if (tp == NULL) + INP_WLOCK(inp); /* re-acquire */ + + final_cpl_received(toep); +done: + INP_INFO_WUNLOCK(&V_tcbinfo); + send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); + return (0); +} + +/* + * Reply to the CPL_ABORT_REQ (send_reset) + */ +static int +do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp = toep->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_RPL_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + + if (toepcb_flag(toep, TPF_SYNQE)) + return (do_abort_rpl_synqe(iq, rss, m)); + + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d", + __func__, tid, toep, inp, cpl->status); + + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: wasn't expecting abort reply", __func__)); + + INP_WLOCK(inp); + final_cpl_received(toep); + + return (0); +} + +static int +do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_rx_data *cpl = mtod(m, const void *); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp = toep->inp; + struct tcpcb *tp; + struct socket *so; + struct sockbuf *so_rcv; + + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + /* + * do_pass_establish failed and must be attempting to abort the + * synqe's tid. Meanwhile, the T4 has sent us data for such a + * connection. 
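+ * Nothing useful can be done with the payload; drop it and let the
+ * abort already in flight clean up the tid.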
+ */ + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: synqe and tid isn't being aborted.", __func__)); + m_freem(m); + return (0); + } + + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + /* strip off CPL header */ + m_adj(m, sizeof(*cpl)); + + INP_WLOCK(inp); + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { + CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", + __func__, tid, m->m_pkthdr.len, inp->inp_flags); + INP_WUNLOCK(inp); + m_freem(m); + return (0); + } + + tp = intotcpcb(inp); + +#ifdef INVARIANTS + if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, rcv_nxt %x\n", + __func__, be32toh(cpl->seq), toep->tid, tp->rcv_nxt); + } +#endif + + tp->rcv_nxt += m->m_pkthdr.len; + KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, + ("%s: negative window size", __func__)); + tp->rcv_wnd -= m->m_pkthdr.len; + tp->t_rcvtime = ticks; + + so = inp_inpcbtosocket(inp); + so_rcv = &so->so_rcv; + SOCKBUF_LOCK(so_rcv); + + if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", + __func__, tid, m->m_pkthdr.len); + m_freem(m); + SOCKBUF_UNLOCK(so_rcv); + INP_WUNLOCK(inp); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tp = tcp_drop(tp, ECONNRESET); + if (tp) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (0); + } + + /* receive buffer autosize */ + if (so_rcv->sb_flags & SB_AUTOSIZE && + V_tcp_do_autorcvbuf && + so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && + m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) { + unsigned int hiwat = so_rcv->sb_hiwat; + unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); + + if (!sbreserve_locked(so_rcv, newsize, so, NULL)) + so_rcv->sb_flags &= ~SB_AUTOSIZE; + else + toep->rx_credits += newsize - hiwat; + } + toep->enqueued += m->m_pkthdr.len; + sbappendstream_locked(so_rcv, m); + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(so_rcv); + + INP_WUNLOCK(inp); + return (0); +} + +#define S_CPL_FW4_ACK_OPCODE 24 +#define M_CPL_FW4_ACK_OPCODE 0xff +#define V_CPL_FW4_ACK_OPCODE(x) ((x) << S_CPL_FW4_ACK_OPCODE) +#define G_CPL_FW4_ACK_OPCODE(x) \ + (((x) >> S_CPL_FW4_ACK_OPCODE) & M_CPL_FW4_ACK_OPCODE) + +#define S_CPL_FW4_ACK_FLOWID 0 +#define M_CPL_FW4_ACK_FLOWID 0xffffff +#define V_CPL_FW4_ACK_FLOWID(x) ((x) << S_CPL_FW4_ACK_FLOWID) +#define G_CPL_FW4_ACK_FLOWID(x) \ + (((x) >> S_CPL_FW4_ACK_FLOWID) & M_CPL_FW4_ACK_FLOWID) + +#define S_CPL_FW4_ACK_CR 24 +#define M_CPL_FW4_ACK_CR 0xff +#define V_CPL_FW4_ACK_CR(x) ((x) << S_CPL_FW4_ACK_CR) +#define G_CPL_FW4_ACK_CR(x) (((x) >> S_CPL_FW4_ACK_CR) & M_CPL_FW4_ACK_CR) + +#define S_CPL_FW4_ACK_SEQVAL 0 +#define M_CPL_FW4_ACK_SEQVAL 0x1 +#define V_CPL_FW4_ACK_SEQVAL(x) ((x) << S_CPL_FW4_ACK_SEQVAL) +#define G_CPL_FW4_ACK_SEQVAL(x) \ + (((x) >> S_CPL_FW4_ACK_SEQVAL) & M_CPL_FW4_ACK_SEQVAL) +#define F_CPL_FW4_ACK_SEQVAL V_CPL_FW4_ACK_SEQVAL(1U) + +static int +do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); + unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); + struct toepcb *toep = lookup_tid(sc, tid); + struct inpcb *inp; + struct tcpcb *tp; + struct socket *so; + uint8_t credits = cpl->credits; + struct ofld_tx_sdesc *txsd; + int plen; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + /* + * Very unusual case: we'd sent a flowc + abort_req for a synq 
entry and + * now this comes back carrying the credits for the flowc. + */ + if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { + KASSERT(toepcb_flag(toep, TPF_ABORT_SHUTDOWN), + ("%s: credits for a synq entry %p", __func__, toep)); + return (0); + } + + inp = toep->inp; + + KASSERT(opcode == CPL_FW4_ACK, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + + INP_WLOCK(inp); + + if (__predict_false(toepcb_flag(toep, TPF_ABORT_SHUTDOWN))) { + INP_WUNLOCK(inp); + return (0); + } + + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("%s: inp_flags 0x%x", __func__, inp->inp_flags)); + + tp = intotcpcb(inp); + + if (cpl->seq_vld) { + tcp_seq snd_una = be32toh(cpl->snd_una); + +#ifdef INVARIANTS + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { + log(LOG_ERR, + "%s: unexpected seq# %x for TID %u, snd_una %x\n", + __func__, snd_una, toep->tid, tp->snd_una); + } +#endif + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = tcp_ts_getticks(); + } + } + + so = inp->inp_socket; + txsd = &toep->txsd[toep->txsd_cidx]; + plen = 0; + while (credits) { + KASSERT(credits >= txsd->tx_credits, + ("%s: too many (or partial) credits", __func__)); + credits -= txsd->tx_credits; + toep->tx_credits += txsd->tx_credits; + plen += txsd->plen; + txsd++; + toep->txsd_avail++; + KASSERT(toep->txsd_avail <= toep->txsd_total, + ("%s: txsd avail > total", __func__)); + if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) { + txsd = &toep->txsd[0]; + toep->txsd_cidx = 0; + } + } + + if (plen > 0) { + struct sockbuf *sb = &so->so_snd; + + SOCKBUF_LOCK(sb); + sbdrop_locked(sb, plen); + sowwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + } + + /* XXX */ + if ((toepcb_flag(toep, TPF_TX_SUSPENDED) && + toep->tx_credits >= MIN_OFLD_TX_CREDITS) || + toep->tx_credits == toep->txsd_total * + howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) { + toepcb_clr_flag(toep, TPF_TX_SUSPENDED); + t4_push_frames(sc, toep); + } + INP_WUNLOCK(inp); + + return (0); +} + +void +t4_init_cpl_io_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_PEER_CLOSE, do_peer_close); + t4_register_cpl_handler(sc, CPL_CLOSE_CON_RPL, do_close_con_rpl); + t4_register_cpl_handler(sc, CPL_ABORT_REQ_RSS, do_abort_req); + t4_register_cpl_handler(sc, CPL_ABORT_RPL_RSS, do_abort_rpl); + t4_register_cpl_handler(sc, CPL_RX_DATA, do_rx_data); + t4_register_cpl_handler(sc, CPL_FW4_ACK, do_fw4_ack); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c new file mode 100644 index 0000000..895e57a --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -0,0 +1,1362 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define TCPSTATES +#include +#include + +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +/* stid services */ +static int alloc_stid(struct adapter *, void *); +static void *lookup_stid(struct adapter *, int); +static void free_stid(struct adapter *, int); + +/* lctx services */ +static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, + struct port_info *); +static int free_lctx(struct adapter *, struct listen_ctx *); +static void hold_lctx(struct listen_ctx *); +static void listen_hash_add(struct adapter *, struct listen_ctx *); +static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); +static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); +static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); + +static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *); +static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); +static void send_reset_synqe(struct toedev *, struct synq_entry *); + +/* XXX: won't work for IPv6 */ +static int +alloc_stid(struct adapter *sc, void *ctx) +{ + struct tid_info *t = &sc->tids; + int stid = -1; + + mtx_lock(&t->stid_lock); + if (t->sfree) { + union serv_entry *p = t->sfree; + + stid = p - t->stid_tab; + stid += t->stid_base; + t->sfree = p->next; + p->data = ctx; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return (stid); +} + +static void * +lookup_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + + return (t->stid_tab[stid - t->stid_base].data); +} + +static void +free_stid(struct adapter *sc, int stid) +{ + struct tid_info *t = &sc->tids; + union serv_entry *p = &t->stid_tab[stid - t->stid_base]; + + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} + +static struct listen_ctx * +alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi) +{ + struct listen_ctx *lctx; + + INP_WLOCK_ASSERT(inp); + + lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); + if (lctx == NULL) + return (NULL); + + lctx->stid = alloc_stid(sc, lctx); + if (lctx->stid < 0) { + free(lctx, M_CXGBE); + return (NULL); + } + + lctx->ctrlq = &sc->sge.ctrlq[pi->port_id]; + lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq]; + refcount_init(&lctx->refcount, 1); + TAILQ_INIT(&lctx->synq); + + lctx->inp = inp; + 
in_pcbref(inp); + + return (lctx); +} + +/* Don't call this directly, use release_lctx instead */ +static int +free_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + + INP_WLOCK_ASSERT(inp); + KASSERT(lctx->refcount == 0, + ("%s: refcount %d", __func__, lctx->refcount)); + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", + __func__, lctx->stid, lctx, lctx->inp); + + free_stid(sc, lctx->stid); + free(lctx, M_CXGBE); + + return (in_pcbrele_wlocked(inp)); +} + +static void +hold_lctx(struct listen_ctx *lctx) +{ + + refcount_acquire(&lctx->refcount); +} + +static inline uint32_t +listen_hashfn(void *key, u_long mask) +{ + + return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); +} + +/* + * Add a listen_ctx entry to the listen hash table. + */ +static void +listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(lctx->inp, td->listen_mask); + + mtx_lock(&td->lctx_hash_lock); + LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); + td->lctx_count++; + mtx_unlock(&td->lctx_hash_lock); +} + +/* + * Look for the listening socket's context entry in the hash and return it. + */ +static struct listen_ctx * +listen_hash_find(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { + if (lctx->inp == inp) + break; + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Removes the listen_ctx structure for inp from the hash and returns it. + */ +static struct listen_ctx * +listen_hash_del(struct adapter *sc, struct inpcb *inp) +{ + struct tom_data *td = sc->tom_softc; + int bucket = listen_hashfn(inp, td->listen_mask); + struct listen_ctx *lctx, *l; + + mtx_lock(&td->lctx_hash_lock); + LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { + if (lctx->inp == inp) { + LIST_REMOVE(lctx, link); + td->lctx_count--; + break; + } + } + mtx_unlock(&td->lctx_hash_lock); + + return (lctx); +} + +/* + * Releases a hold on the lctx. Must be called with the listening socket's inp + * locked. The inp may be freed by this function and it returns NULL to + * indicate this. + */ +static struct inpcb * +release_lctx(struct adapter *sc, struct listen_ctx *lctx) +{ + struct inpcb *inp = lctx->inp; + int inp_freed = 0; + + INP_WLOCK_ASSERT(inp); + if (refcount_release(&lctx->refcount)) + inp_freed = free_lctx(sc, lctx); + + return (inp_freed ? NULL : inp); +} + +static void +send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) +{ + struct adapter *sc = tod->tod_softc; + struct mbuf *m = synqe->syn; + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct port_info *pi = ifp->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + struct wrqe *wr; + struct fw_flowc_wr *flowc; + struct cpl_abort_req *req; + int txqid, rxqid, flowclen; + struct sge_wrq *ofld_txq; + struct sge_ofld_rxq *ofld_rxq; + const int nparams = 4; + unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN; + + INP_WLOCK_ASSERT(synqe->lctx->inp); + + CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s", + __func__, synqe, synqe->tid, + synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ? 
+ " (abort already in progress)" : ""); + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) + return; /* abort already in progress */ + synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN); + + get_qids_from_mbuf(m, &txqid, &rxqid); + ofld_txq = &sc->sge.ofld_txq[txqid]; + ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + /* The wrqe will have two WRs - a flowc followed by an abort_req */ + flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); + + wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + flowc = wrtod(wr); + req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE)); + + /* First the flowc ... */ + memset(flowc, 0, wr->wr_len); + flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | + V_FW_FLOWC_WR_NPARAMS(nparams)); + flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | + V_FW_WR_FLOWID(synqe->tid)); + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; + flowc->mnemval[0].val = htobe32(pfvf); + flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; + flowc->mnemval[1].val = htobe32(pi->tx_chan); + flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; + flowc->mnemval[2].val = htobe32(pi->tx_chan); + flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; + flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); + synqe_set_flag(synqe, TPF_FLOWC_WR_SENT); + + /* ... then ABORT request */ + INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); + req->rsvd0 = 0; /* don't have a snd_nxt */ + req->rsvd1 = 1; /* no data sent yet */ + req->cmd = CPL_ABORT_SEND_RST; + + t4_l2t_send(sc, wr, e); +} + +static int +create_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_pass_open_req *req; + struct in_conninfo *inc = &lctx->inp->inp_inc; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + log(LOG_ERR, "%s: allocation failure", __func__); + return (ENOMEM); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); + req->local_port = inc->inc_lport; + req->peer_port = 0; + req->local_ip = inc->inc_laddr.s_addr; + req->peer_ip = 0; + req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); + req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | + F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); + + t4_wrq_tx(sc, wr); + return (0); +} + +static int +destroy_server(struct adapter *sc, struct listen_ctx *lctx) +{ + struct wrqe *wr; + struct cpl_close_listsvr_req *req; + + wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); + if (wr == NULL) { + /* XXX */ + panic("%s: allocation failure.", __func__); + } + req = wrtod(wr); + + INIT_TP_WR(req, 0); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, + lctx->stid)); + req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); + req->rsvd = htobe16(0); + + t4_wrq_tx(sc, wr); + return (0); +} + +/* + * Start a listening server by sending a passive open request to HW. + * + * Can't take adapter lock here and access to sc->flags, sc->open_device_map, + * sc->offload_map, if_capenable are all race prone. 
+ */ +int +t4_listen_start(struct toedev *tod, struct tcpcb *tp) +{ + struct adapter *sc = tod->tod_softc; + struct port_info *pi; + struct inpcb *inp = tp->t_inpcb; + struct listen_ctx *lctx; + int i; + + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_vflag & INP_IPV4) == 0) + return (0); + +#if 0 + ADAPTER_LOCK(sc); + if (IS_BUSY(sc)) { + log(LOG_ERR, "%s: listen request ignored, %s is busy", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + + KASSERT(sc->flags & TOM_INIT_DONE, + ("%s: TOM not initialized", __func__)); +#endif + + if ((sc->open_device_map & sc->offload_map) == 0) + goto done; /* no port that's UP with IFCAP_TOE enabled */ + + /* + * Find a running port with IFCAP_TOE4. We'll use the first such port's + * queues to send the passive open and receive the reply to it. + * + * XXX: need a way to mark a port in use by offload. if_cxgbe should + * then reject any attempt to bring down such a port (and maybe reject + * attempts to disable IFCAP_TOE on that port too?). + */ + for_each_port(sc, i) { + if (isset(&sc->open_device_map, i) && + sc->port[i]->ifp->if_capenable & IFCAP_TOE4) + break; + } + KASSERT(i < sc->params.nports, + ("%s: no running port with TOE capability enabled.", __func__)); + pi = sc->port[i]; + + if (listen_hash_find(sc, inp) != NULL) + goto done; /* already setup */ + + lctx = alloc_lctx(sc, inp, pi); + if (lctx == NULL) { + log(LOG_ERR, + "%s: listen request ignored, %s couldn't allocate lctx\n", + __func__, device_get_nameunit(sc->dev)); + goto done; + } + listen_hash_add(sc, lctx); + + CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__, + lctx->stid, tcpstates[tp->t_state], lctx, inp); + + if (create_server(sc, lctx) != 0) { + log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__, + device_get_nameunit(sc->dev)); + (void) listen_hash_del(sc, inp); + inp = release_lctx(sc, lctx); + /* can't be freed, host stack has a reference */ + KASSERT(inp != NULL, ("%s: inp freed", __func__)); + goto done; + } + lctx->flags |= LCTX_RPL_PENDING; +done: +#if 0 + ADAPTER_UNLOCK(sc); +#endif + return (0); +} + +int +t4_listen_stop(struct toedev *tod, struct tcpcb *tp) +{ + struct listen_ctx *lctx; + struct adapter *sc = tod->tod_softc; + struct inpcb *inp = tp->t_inpcb; + struct synq_entry *synqe; + + INP_WLOCK_ASSERT(inp); + + lctx = listen_hash_del(sc, inp); + if (lctx == NULL) + return (ENOENT); /* no hardware listener for this inp */ + + CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, + lctx, lctx->flags); + + /* + * If the reply to the PASS_OPEN is still pending we'll wait for it to + * arrive and clean up when it does. + */ + if (lctx->flags & LCTX_RPL_PENDING) { + KASSERT(TAILQ_EMPTY(&lctx->synq), + ("%s: synq not empty.", __func__)); + return (EINPROGRESS); + } + + /* + * The host stack will abort all the connections on the listening + * socket's so_comp. It doesn't know about the connections on the synq + * so we need to take care of those. 
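+ * Each synqe on the list gets an abort (a flowc and an abort_req in a
+ * single wrqe); the reply to that abort does the final cleanup.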
+ */ + TAILQ_FOREACH(synqe, &lctx->synq, link) + send_reset_synqe(tod, synqe); + + destroy_server(sc, lctx); + return (0); +} + +static inline void +hold_synqe(struct synq_entry *synqe) +{ + + refcount_acquire(&synqe->refcnt); +} + +static inline void +release_synqe(struct synq_entry *synqe) +{ + + if (refcount_release(&synqe->refcnt)) { + int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE); + + m_freem(synqe->syn); + if (needfree) + free(synqe, M_CXGBE); + } +} + +void +t4_syncache_added(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + hold_synqe(synqe); +} + +void +t4_syncache_removed(struct toedev *tod __unused, void *arg) +{ + struct synq_entry *synqe = arg; + + release_synqe(synqe); +} + +/* XXX */ +extern void tcp_dooptions(struct tcpopt *, u_char *, int, int); + +int +t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; + struct wrqe *wr; + struct l2t_entry *e; + struct tcpopt to; + struct ip *ip = mtod(m, struct ip *); + struct tcphdr *th = (void *)(ip + 1); + + wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr); + if (wr == NULL) + return (EALREADY); + + bzero(&to, sizeof(to)); + tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th), + TO_SYN); + + /* save these for later */ + synqe->iss = be32toh(th->th_seq); + synqe->ts = to.to_tsval; + + e = &sc->l2t->l2tab[synqe->l2e_idx]; + t4_l2t_send(sc, wr, e); + + m_freem(m); /* don't need this any more */ + return (0); +} + +static int +do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PASS_OPEN_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + INP_WLOCK(inp); + + CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", + __func__, stid, status, lctx->flags); + + lctx->flags &= ~LCTX_RPL_PENDING; + + if (status != CPL_ERR_NONE) + log(LOG_ERR, "listener with stid %u failed: %d", stid, status); + +#ifdef INVARIANTS + /* + * If the inp has been dropped (listening socket closed) then + * listen_stop must have run and taken the inp out of the hash. + */ + if (inp->inp_flags & INP_DROPPED) { + KASSERT(listen_hash_del(sc, inp) == NULL, + ("%s: inp %p still in listen hash", __func__, inp)); + } +#endif + + if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* + * Listening socket stopped listening earlier and now the chip tells us + * it has started the hardware listener. Stop it; the lctx will be + * released in do_close_server_rpl. + */ + if (inp->inp_flags & INP_DROPPED) { + destroy_server(sc, lctx); + INP_WUNLOCK(inp); + return (status); + } + + /* + * Failed to start hardware listener. Take inp out of the hash and + * release our reference on it. An error message has been logged + * already. 
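+ * Note that release_lctx may free the inp along with the lctx, which
+ * is why the INP_WUNLOCK below is conditional.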
+ */ + if (status != CPL_ERR_NONE) { + listen_hash_del(sc, inp); + if (release_lctx(sc, lctx) != NULL) + INP_WUNLOCK(inp); + return (status); + } + + /* hardware listener open for business */ + + INP_WUNLOCK(inp); + return (status); +} + +static int +do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); + int stid = GET_TID(cpl); + unsigned int status = cpl->status; + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); + + if (status != CPL_ERR_NONE) { + log(LOG_ERR, "%s: failed (%u) to close listener for stid %u", + __func__, status, stid); + return (status); + } + + INP_WLOCK(inp); + inp = release_lctx(sc, lctx); + if (inp != NULL) + INP_WUNLOCK(inp); + + return (status); +} + +static void +done_with_synqe(struct adapter *sc, struct synq_entry *synqe) +{ + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc; + struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; + + INP_WLOCK_ASSERT(inp); + + TAILQ_REMOVE(&lctx->synq, synqe, link); + inp = release_lctx(sc, lctx); + if (inp) + INP_WUNLOCK(inp); + remove_tid(sc, synqe->tid); + release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]); + t4_l2t_release(e); + release_synqe(synqe); /* removed from synq list */ +} + +int +do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + int txqid; + struct sge_wrq *ofld_txq; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_REQ_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + if (cpl->status == CPL_ERR_RTX_NEG_ADVICE || + cpl->status == CPL_ERR_PERSIST_NEG_ADVICE) + return (0); /* Ignore negative advice */ + + INP_WLOCK(inp); + + get_qids_from_mbuf(synqe->syn, &txqid, NULL); + ofld_txq = &sc->sge.ofld_txq[txqid]; + + /* + * If we'd initiated an abort earlier the reply to it is responsible for + * cleaning up resources. Otherwise we tear everything down right here + * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 
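+ * done_with_synqe below releases the tid, the l2t entry, and the
+ * synqe's place on the lctx's synq (and possibly the lctx itself).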
+ */ + if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) { + INP_WUNLOCK(inp); + goto done; + } + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ +done: + send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); + return (0); +} + +int +do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_ABORT_RPL_RSS, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", + __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); + + INP_WLOCK(inp); + KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: wasn't expecting abort reply for synqe %p (0x%x)", + __func__, synqe, synqe->flags)); + + done_with_synqe(sc, synqe); + /* inp lock released by done_with_synqe */ + + return (0); +} + +void +t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) +{ + struct adapter *sc = tod->tod_softc; + struct synq_entry *synqe = arg; +#ifdef INVARIANTS + struct inpcb *inp = sotoinpcb(so); +#endif + struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); + struct toepcb *toep = *(struct toepcb **)(cpl + 1); + + INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_WLOCK_ASSERT(inp); + KASSERT(synqe_flag(synqe, TPF_SYNQE), + ("%s: %p not a synq_entry?", __func__, arg)); + + offload_socket(so, toep); + make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); + toepcb_set_flag(toep, TPF_CPL_PENDING); + update_tid(sc, synqe->tid, toep); +} + +static inline void +save_qids_in_mbuf(struct mbuf *m, struct port_info *pi) +{ + uint32_t txqid, rxqid; + + txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq; + rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq; + + m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); +} + +static inline void +get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid) +{ + + if (txqid) + *txqid = m->m_pkthdr.flowid >> 16; + if (rxqid) + *rxqid = m->m_pkthdr.flowid & 0xffff; +} + +/* + * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to + * store some state temporarily. 
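+ * If the mbuf doesn't have enough trailing room the synq_entry is
+ * malloc'd instead and flagged TPF_SYNQE_NEEDFREE so that
+ * release_synqe knows to free it.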
+ */ +static struct synq_entry * +mbuf_to_synqe(struct mbuf *m) +{ + int len = roundup(sizeof (struct synq_entry), 8); + int tspace = M_TRAILINGSPACE(m); + struct synq_entry *synqe = NULL; + + if (tspace < len) { + synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT); + if (synqe == NULL) + return (NULL); + } else + synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe)); + + synqe->flags = 0; + synqe_set_flag(synqe, TPF_SYNQE); + if (tspace < len) + synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE); + + return (synqe); +} + +static void +t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) +{ + bzero(to, sizeof(*to)); + + if (t4opt->mss) { + to->to_flags |= TOF_MSS; + to->to_mss = be16toh(t4opt->mss); + } + + if (t4opt->wsf) { + to->to_flags |= TOF_SCALE; + to->to_wscale = t4opt->wsf; + } + + if (t4opt->tstamp) + to->to_flags |= TOF_TS; + + if (t4opt->sack) + to->to_flags |= TOF_SACKPERM; +} + +/* + * Options2 for passive open. + */ +static uint32_t +calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, + const struct tcp_options *tcpopt, struct tcphdr *th) +{ + uint32_t opt2 = 0; + struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; + + if (V_tcp_do_rfc1323) { + if (tcpopt->tstamp) + opt2 |= F_TSTAMPS_EN; + if (tcpopt->sack) + opt2 |= F_SACK_EN; + if (tcpopt->wsf > 0) + opt2 |= F_WND_SCALE_EN; + } + + if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) + opt2 |= F_CCTRL_ECN; + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); + opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); + + return htobe32(opt2); +} + +/* XXX: duplication. */ +static inline void +tcp_fields_to_host(struct tcphdr *th) +{ + + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); +} + +static void +pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc, + struct tcphdr *th) +{ + const struct cpl_pass_accept_req *cpl = mtod(m, const void *); + const struct ether_header *eh; + unsigned int hlen = be32toh(cpl->hdr_len); + const struct ip *ip; + const struct tcphdr *tcp; + + eh = (const void *)(cpl + 1); + ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); + tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen)); + + if (inc) { + bzero(inc, sizeof(*inc)); + inc->inc_faddr = ip->ip_src; + inc->inc_laddr = ip->ip_dst; + inc->inc_fport = tcp->th_sport; + inc->inc_lport = tcp->th_dport; + if (ip->ip_v == 6) + inc->inc_flags |= INC_ISIPV6; + } + + if (th) { + bcopy(tcp, th, sizeof(*th)); + tcp_fields_to_host(th); /* just like tcp_input */ + } +} + +#define REJECT_PASS_ACCEPT() do { \ + reject_reason = __LINE__; \ + goto reject; \ +} while (0) + +/* + * The context associated with a tid entry via insert_tid could be a synq_entry + * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. + */ +CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); + +/* + * Incoming SYN on a listening socket. + * + * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, + * etc. 
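+ *
+ * The handler vets the SYN (TOE capability, vlan tag, route back to
+ * the peer, unknown TCP options, 4-tuple collisions), sets up a
+ * synq_entry, and hands the request to the kernel syncache; the
+ * PASS_ACCEPT_RPL is sent from t4_syncache_respond.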
+ */ +static int +do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + struct toedev *tod; + const struct cpl_pass_accept_req *cpl = mtod(m, const void *); + struct cpl_pass_accept_rpl *rpl; + struct wrqe *wr; + unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); + unsigned int tid = GET_TID(cpl); + struct listen_ctx *lctx = lookup_stid(sc, stid); + struct inpcb *inp; + struct socket *so; + struct in_conninfo inc; + struct tcphdr th; + struct tcpopt to; + struct port_info *pi; + struct ifnet *ifp, *ifp_vlan = NULL; + struct l2t_entry *e = NULL; + struct rtentry *rt; + struct sockaddr_in nam; + int rscale, mtu_idx, rx_credits, rxqid; + struct synq_entry *synqe = NULL; + int reject_reason; + uint16_t vid; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PASS_ACCEPT_REQ, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + + CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, + lctx); + + pass_accept_req_to_protohdrs(m, &inc, &th); + t4opt_to_tcpopt(&cpl->tcpopt, &to); + + pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))]; + ifp = pi->ifp; + m->m_pkthdr.rcvif = ifp; + tod = TOEDEV(ifp); + + /* + * Don't offload if the interface that received the SYN doesn't have + * IFCAP_TOE enabled. + */ + if ((ifp->if_capenable & IFCAP_TOE4) == 0) + REJECT_PASS_ACCEPT(); + + /* Don't offload IPv6 connections. XXX: add IPv6 support */ + if (inc.inc_flags & INC_ISIPV6) + REJECT_PASS_ACCEPT(); + + /* + * Don't offload if the SYN had a VLAN tag and the vid doesn't match + * anything on this interface. + */ + vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); + if (vid != 0xfff) { + ifp_vlan = VLAN_DEVAT(ifp, vid); + if (ifp_vlan == NULL) + REJECT_PASS_ACCEPT(); + } + + /* + * Don't offload if the peer requested a TCP option that's not known to + * the silicon. + */ + if (cpl->tcpopt.unknown) + REJECT_PASS_ACCEPT(); + + /* + * Don't offload if the outgoing interface for the route back to the + * peer is not the same as the interface that received the SYN. + * XXX: too restrictive. + */ + nam.sin_len = sizeof(nam); + nam.sin_family = AF_INET; + nam.sin_addr = inc.inc_faddr; + rt = rtalloc1((struct sockaddr *)&nam, 0, 0); + if (rt == NULL) + REJECT_PASS_ACCEPT(); + else { + struct sockaddr *nexthop; + + RT_UNLOCK(rt); + nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : + (struct sockaddr *)&nam; + if (rt->rt_ifp == ifp || + (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan)) + e = t4_l2t_get(pi, rt->rt_ifp, nexthop); + RTFREE(rt); + if (e == NULL) + REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */ + } + + synqe = mbuf_to_synqe(m); + if (synqe == NULL) + REJECT_PASS_ACCEPT(); + + wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]); + if (wr == NULL) + REJECT_PASS_ACCEPT(); + rpl = wrtod(wr); + + INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check, syncache_add */ + + /* Don't offload if the 4-tuple is already in use */ + if (toe_4tuple_check(&inc, &th, ifp) != 0) { + INP_INFO_WUNLOCK(&V_tcbinfo); + free(wr, M_CXGBE); + REJECT_PASS_ACCEPT(); + } + + inp = lctx->inp; /* listening socket, not owned by TOE */ + INP_WLOCK(inp); + + /* Don't offload if the listening socket has closed */ + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* + * The listening socket has closed. 
The reply from the TOE to + * our CPL_CLOSE_LISTSRV_REQ will ultimately release all + * resources tied to this listen context. + */ + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + free(wr, M_CXGBE); + REJECT_PASS_ACCEPT(); + } + so = inp->inp_socket; + + mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss)); + rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0; + SOCKBUF_LOCK(&so->so_rcv); + /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ + rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); + SOCKBUF_UNLOCK(&so->so_rcv); + + save_qids_in_mbuf(m, pi); + get_qids_from_mbuf(m, NULL, &rxqid); + + INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); + rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, + ULP_MODE_NONE); + rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th); + + synqe->tid = tid; + synqe->lctx = lctx; + synqe->syn = m; + m = NULL; + refcount_init(&synqe->refcnt, 1); /* 1 so that it is held for the + duration of this function */ + synqe->l2e_idx = e->idx; + synqe->rcv_bufsize = rx_credits; + atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr); + + insert_tid(sc, tid, synqe); + TAILQ_INSERT_TAIL(&lctx->synq, synqe, link); + hold_synqe(synqe); /* hold for the duration it's in the synq */ + hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */ + + /* + * If all goes well t4_syncache_respond will get called during + * syncache_add. Also note that syncache_add releases both pcbinfo and + * pcb locks. + */ + toe_syncache_add(&inc, &to, &th, inp, tod, synqe); + INP_UNLOCK_ASSERT(inp); /* ok to assert, we have a ref on the inp */ + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + + /* + * If we replied during syncache_add (synqe->wr has been consumed), + * good. Otherwise, set it to 0 so that further syncache_respond + * attempts by the kernel will be ignored. + * + * The extra hold on the synqe makes sure that it is still around, even + * if the listener has been dropped and the synqe was aborted and the + * reply to the abort has removed and released the synqe from the synq + * list. + */ + if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) { + + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* listener closed. synqe must have been aborted. */ + KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: listener %p closed but synqe %p not aborted", + __func__, inp, synqe)); + + CTR5(KTR_CXGBE, + "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED", + __func__, stid, tid, lctx, synqe); + INP_WUNLOCK(inp); + free(wr, M_CXGBE); + release_synqe(synqe); /* about to exit function */ + return (__LINE__); + } + + /* + * synqe aborted before TOM replied to PASS_ACCEPT_REQ. But + * that can only happen if the listener was closed and we just + * checked for that. + */ + KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: synqe %p aborted, but listener %p not dropped.", + __func__, synqe, inp)); + + /* Yank the synqe out of the lctx synq. */ + TAILQ_REMOVE(&lctx->synq, synqe, link); + release_synqe(synqe); /* removed from synq list */ + inp = release_lctx(sc, lctx); + if (inp) + INP_WUNLOCK(inp); + + /* + * syncache may or may not have a hold on the synqe, which may + * or may not be stashed in the original SYN mbuf passed to us. + * Just copy it over instead of dealing with all possibilities. 
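+ * The copy ends up at ifp->if_input via the reject path, so the host
+ * stack still gets to handle this SYN in software.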
+ */ + m = m_dup(synqe->syn, M_DONTWAIT); + if (m) + m->m_pkthdr.rcvif = ifp; + + release_synqe(synqe); /* about to exit function */ + free(wr, M_CXGBE); + REJECT_PASS_ACCEPT(); + } + release_synqe(synqe); /* about to exit function */ + CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK", + __func__, stid, tid, lctx, synqe); + return (0); +reject: + CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, + reject_reason); + + if (e) + t4_l2t_release(e); + release_tid(sc, tid, lctx->ctrlq); + + if (__predict_true(m != NULL)) { + m_adj(m, sizeof(*cpl)); + m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + ifp->if_input(ifp, m); + } + + return (reject_reason); +} + +static void +synqe_to_protohdrs(struct synq_entry *synqe, + const struct cpl_pass_establish *cpl, struct in_conninfo *inc, + struct tcphdr *th, struct tcpopt *to) +{ + uint16_t tcp_opt = be16toh(cpl->tcp_opt); + + /* start off with the original SYN */ + pass_accept_req_to_protohdrs(synqe->syn, inc, th); + + /* modify parts to make it look like the ACK to our SYN|ACK */ + th->th_flags = TH_ACK; + th->th_ack = synqe->iss + 1; + th->th_seq = be32toh(cpl->rcv_isn); + bzero(to, sizeof(*to)); + if (G_TCPOPT_TSTAMP(tcp_opt)) { + to->to_flags |= TOF_TS; + to->to_tsecr = synqe->ts; + } +} + +static int +do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + struct port_info *pi; + struct ifnet *ifp; + const struct cpl_pass_establish *cpl = (const void *)(rss + 1); +#if defined(KTR) || defined(INVARIANTS) + unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); +#endif + unsigned int tid = GET_TID(cpl); + struct synq_entry *synqe = lookup_tid(sc, tid); + struct listen_ctx *lctx = synqe->lctx; + struct inpcb *inp = lctx->inp; + struct socket *so; + struct tcphdr th; + struct tcpopt to; + struct in_conninfo inc; + struct toepcb *toep; + u_int txqid, rxqid; +#ifdef INVARIANTS + unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); +#endif + + KASSERT(opcode == CPL_PASS_ESTABLISH, + ("%s: unexpected opcode 0x%x", __func__, opcode)); + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); + KASSERT(synqe_flag(synqe, TPF_SYNQE), + ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); + + INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_WLOCK(inp); + + CTR6(KTR_CXGBE, + "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", + __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); + + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* + * The listening socket has closed. The TOM must have aborted + * all the embryonic connections (including this one) that were + * on the lctx's synq. do_abort_rpl for the tid is responsible + * for cleaning up. + */ + KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN), + ("%s: listen socket dropped but tid %u not aborted.", + __func__, tid)); + + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (0); + } + + ifp = synqe->syn->m_pkthdr.rcvif; + pi = ifp->if_softc; + KASSERT(pi->adapter == sc, + ("%s: pi %p, sc %p mismatch", __func__, pi, sc)); + + get_qids_from_mbuf(synqe->syn, &txqid, &rxqid); + KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], + ("%s: CPL arrived on unexpected rxq. 
%d %d", __func__, rxqid, + (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); + + toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT); + if (toep == NULL) { +reset: + /* The reply to this abort will perform final cleanup */ + send_reset_synqe(TOEDEV(ifp), synqe); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (0); + } + toep->tid = tid; + toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx]; + toep->ulp_mode = ULP_MODE_NONE; + /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ + toep->rx_credits = synqe->rcv_bufsize; + + so = inp->inp_socket; + KASSERT(so != NULL, ("%s: socket is NULL", __func__)); + + /* Come up with something that syncache_expand should be ok with. */ + synqe_to_protohdrs(synqe, cpl, &inc, &th, &to); + + /* + * No more need for anything in the mbuf that carried the + * CPL_PASS_ACCEPT_REQ. Drop the CPL_PASS_ESTABLISH and toep pointer + * there. XXX: bad form but I don't want to increase the size of synqe. + */ + m = synqe->syn; + KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len, + ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len)); + bcopy(cpl, mtod(m, void *), sizeof(*cpl)); + *(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep; + + if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) { + free_toepcb(toep); + goto reset; + } + + /* Done with the synqe */ + TAILQ_REMOVE(&lctx->synq, synqe, link); + inp = release_lctx(sc, lctx); + if (inp != NULL) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + release_synqe(synqe); + + return (0); +} + +void +t4_init_listen_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl); + t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); + t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req); + t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c new file mode 100644 index 0000000..c6e9a1f --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#ifdef TCP_OFFLOAD
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "tom/t4_tom_l2t.h"
+#include "tom/t4_tom.h"
+
+/* Module ops */
+static int t4_tom_mod_load(void);
+static int t4_tom_mod_unload(void);
+static int t4_tom_modevent(module_t, int, void *);
+
+/* ULD ops and helpers */
+static int t4_tom_activate(struct adapter *);
+static int t4_tom_deactivate(struct adapter *);
+
+static struct uld_info tom_uld_info = {
+	.uld_id = ULD_TOM,
+	.activate = t4_tom_activate,
+	.deactivate = t4_tom_deactivate,
+};
+
+static void queue_tid_release(struct adapter *, int);
+static void release_offload_resources(struct toepcb *);
+static int alloc_tid_tabs(struct tid_info *);
+static void free_tid_tabs(struct tid_info *);
+static void free_tom_data(struct adapter *, struct tom_data *);
+
+struct toepcb *
+alloc_toepcb(struct port_info *pi, int txqid, int rxqid, int flags)
+{
+	struct adapter *sc = pi->adapter;
+	struct toepcb *toep;
+	int tx_credits, txsd_total, len;
+
+	/*
+	 * The firmware counts tx work request credits in units of 16 bytes
+	 * each.  Reserve room for an ABORT_REQ so the driver never has to
+	 * worry about tx credits if it wants to abort a connection.
+	 */
+	tx_credits = sc->params.ofldq_wr_cred;
+	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
+
+	/*
+	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
+	 * immediate payload, and firmware counts tx work request credits in
+	 * units of 16 bytes.  Calculate the maximum work requests possible.
+	 */
+	txsd_total = tx_credits /
+	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16);
+
+	if (txqid < 0)
+		txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
+	KASSERT(txqid >= pi->first_ofld_txq &&
+	    txqid < pi->first_ofld_txq + pi->nofldtxq,
+	    ("%s: txqid %d for port %p (first %d, n %d)", __func__, txqid, pi,
+	    pi->first_ofld_txq, pi->nofldtxq));
+
+	if (rxqid < 0)
+		rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
+	KASSERT(rxqid >= pi->first_ofld_rxq &&
+	    rxqid < pi->first_ofld_rxq + pi->nofldrxq,
+	    ("%s: rxqid %d for port %p (first %d, n %d)", __func__, rxqid, pi,
+	    pi->first_ofld_rxq, pi->nofldrxq));
+
+	len = offsetof(struct toepcb, txsd) +
+	    txsd_total * sizeof(struct ofld_tx_sdesc);
+
+	toep = malloc(len, M_CXGBE, M_ZERO | flags);
+	if (toep == NULL)
+		return (NULL);
+
+	toep->td = sc->tom_softc;
+	toep->port = pi;
+	toep->tx_credits = tx_credits;
+	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
+	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
+	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
+	toep->txsd_total = txsd_total;
+	toep->txsd_avail = txsd_total;
+	toep->txsd_pidx = 0;
+	toep->txsd_cidx = 0;
+
+	return (toep);
+}
+
+void
+free_toepcb(struct toepcb *toep)
+{
+
+	KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0,
+	    ("%s: attached to an inpcb", __func__));
+	KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0,
+	    ("%s: CPL pending", __func__));
+
+	free(toep, M_CXGBE);
+}
+
+/*
+ * Set up the socket for TCP offload.
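+ *
+ * In outline: mark both socket buffers SB_NOCOALESCE, hook the tcpcb up
+ * to the TOE driver (tp->tod, tp->t_toe, TF_TOE), take an extra
+ * reference on the inpcb, and put the toepcb on the TOE device's list
+ * of offloaded connections.  Called with the inpcb lock held.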
+ */ +void +offload_socket(struct socket *so, struct toepcb *toep) +{ + struct tom_data *td = toep->td; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + struct sockbuf *sb; + + INP_WLOCK_ASSERT(inp); + + /* Update socket */ + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + sb->sb_flags |= SB_NOCOALESCE; + SOCKBUF_UNLOCK(sb); + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + sb->sb_flags |= SB_NOCOALESCE; + SOCKBUF_UNLOCK(sb); + + /* Update TCP PCB */ + tp->tod = &td->tod; + tp->t_toe = toep; + tp->t_flags |= TF_TOE; + + /* Install an extra hold on inp */ + toep->inp = inp; + toepcb_set_flag(toep, TPF_ATTACHED); + in_pcbref(inp); + + /* Add the TOE PCB to the active list */ + mtx_lock(&td->toep_list_lock); + TAILQ_INSERT_HEAD(&td->toep_list, toep, link); + mtx_unlock(&td->toep_list_lock); +} + +/* This is _not_ the normal way to "unoffload" a socket. */ +void +undo_offload_socket(struct socket *so) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + struct toepcb *toep = tp->t_toe; + struct tom_data *td = toep->td; + struct sockbuf *sb; + + INP_WLOCK_ASSERT(inp); + + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + sb->sb_flags &= ~SB_NOCOALESCE; + SOCKBUF_UNLOCK(sb); + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + sb->sb_flags &= ~SB_NOCOALESCE; + SOCKBUF_UNLOCK(sb); + + tp->tod = NULL; + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + + toep->inp = NULL; + toepcb_clr_flag(toep, TPF_ATTACHED); + if (in_pcbrele_wlocked(inp)) + panic("%s: inp freed.", __func__); + + mtx_lock(&td->toep_list_lock); + TAILQ_REMOVE(&td->toep_list, toep, link); + mtx_unlock(&td->toep_list_lock); +} + +static void +release_offload_resources(struct toepcb *toep) +{ + struct tom_data *td = toep->td; + struct adapter *sc = td_adapter(td); + int tid = toep->tid; + + KASSERT(toepcb_flag(toep, TPF_CPL_PENDING) == 0, + ("%s: %p has CPL pending.", __func__, toep)); + KASSERT(toepcb_flag(toep, TPF_ATTACHED) == 0, + ("%s: %p is still attached.", __func__, toep)); + + CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)", + __func__, toep, tid, toep->l2te); + + if (toep->l2te) + t4_l2t_release(toep->l2te); + + if (tid >= 0) { + remove_tid(sc, tid); + release_tid(sc, tid, toep->ctrlq); + } + + mtx_lock(&td->toep_list_lock); + TAILQ_REMOVE(&td->toep_list, toep, link); + mtx_unlock(&td->toep_list_lock); + + free_toepcb(toep); +} + +/* + * The kernel is done with the TCP PCB and this is our opportunity to unhook the + * toepcb hanging off of it. If the TOE driver is also done with the toepcb (no + * pending CPL) then it is time to release all resources tied to the toepcb. + * + * Also gets called when an offloaded active open fails and the TOM wants the + * kernel to take the TCP PCB back. 
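+ *
+ * Teardown is split across two flags: this function clears TPF_ATTACHED
+ * and final_cpl_received() clears TPF_CPL_PENDING; whichever of the two
+ * runs last calls release_offload_resources().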
+ */ +static void +t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp) +{ +#if defined(KTR) || defined(INVARIANTS) + struct inpcb *inp = tp->t_inpcb; +#endif + struct toepcb *toep = tp->t_toe; + + INP_WLOCK_ASSERT(inp); + + KASSERT(toep != NULL, ("%s: toep is NULL", __func__)); + KASSERT(toepcb_flag(toep, TPF_ATTACHED), + ("%s: not attached", __func__)); + +#ifdef KTR + if (tp->t_state == TCPS_SYN_SENT) { + CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)", + __func__, toep->tid, toep, toep->flags, inp, + inp->inp_flags); + } else { + CTR6(KTR_CXGBE, + "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)", + toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp, + inp->inp_flags); + } +#endif + + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + toepcb_clr_flag(toep, TPF_ATTACHED); + + if (toepcb_flag(toep, TPF_CPL_PENDING) == 0) + release_offload_resources(toep); +} + +/* + * The TOE driver will not receive any more CPLs for the tid associated with the + * toepcb; release the hold on the inpcb. + */ +void +final_cpl_received(struct toepcb *toep) +{ + struct inpcb *inp = toep->inp; + + KASSERT(inp != NULL, ("%s: inp is NULL", __func__)); + INP_WLOCK_ASSERT(inp); + KASSERT(toepcb_flag(toep, TPF_CPL_PENDING), + ("%s: CPL not pending already?", __func__)); + + CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)", + __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags); + + toep->inp = NULL; + toepcb_clr_flag(toep, TPF_CPL_PENDING); + + if (toepcb_flag(toep, TPF_ATTACHED) == 0) + release_offload_resources(toep); + + if (!in_pcbrele_wlocked(inp)) + INP_WUNLOCK(inp); +} + +void +insert_tid(struct adapter *sc, int tid, void *ctx) +{ + struct tid_info *t = &sc->tids; + + t->tid_tab[tid] = ctx; + atomic_add_int(&t->tids_in_use, 1); +} + +void * +lookup_tid(struct adapter *sc, int tid) +{ + struct tid_info *t = &sc->tids; + + return (t->tid_tab[tid]); +} + +void +update_tid(struct adapter *sc, int tid, void *ctx) +{ + struct tid_info *t = &sc->tids; + + t->tid_tab[tid] = ctx; +} + +void +remove_tid(struct adapter *sc, int tid) +{ + struct tid_info *t = &sc->tids; + + t->tid_tab[tid] = NULL; + atomic_subtract_int(&t->tids_in_use, 1); +} + +void +release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) +{ + struct wrqe *wr; + struct cpl_tid_release *req; + + wr = alloc_wrqe(sizeof(*req), ctrlq); + if (wr == NULL) { + queue_tid_release(sc, tid); /* defer */ + return; + } + req = wrtod(wr); + + INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); + + t4_wrq_tx(sc, wr); +} + +static void +queue_tid_release(struct adapter *sc, int tid) +{ + + CXGBE_UNIMPLEMENTED("deferred tid release"); +} + +/* + * What mtu_idx to use, given a 4-tuple and/or an MSS cap + */ +int +find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) +{ + unsigned short *mtus = &sc->params.mtus[0]; + int i = 0, mss; + + KASSERT(inc != NULL || pmss > 0, + ("%s: at least one of inc/pmss must be specified", __func__)); + + mss = inc ? tcp_mssopt(inc) : pmss; + if (pmss > 0 && mss > pmss) + mss = pmss; + + while (i < NMTUS - 1 && mtus[i + 1] <= mss + 40) + ++i; + + return (i); +} + +/* + * Determine the receive window size for a socket. 
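+ * The result is clamped to [MIN_RCV_WND, MAX_RCV_WND]; roughly:
+ *
+ *	wnd = max(MIN_RCV_WND, min(sbspace(&so->so_rcv), MAX_RCV_WND));
+ *
+ * The caller must hold the receive socket buffer lock.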
+ */ +u_long +select_rcv_wnd(struct socket *so) +{ + unsigned long wnd; + + SOCKBUF_LOCK_ASSERT(&so->so_rcv); + + wnd = sbspace(&so->so_rcv); + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + return min(wnd, MAX_RCV_WND); +} + +int +select_rcv_wscale(void) +{ + int wscale = 0; + unsigned long space = sb_max; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + + while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space) + wscale++; + + return (wscale); +} + +extern int always_keepalive; +#define VIID_SMACIDX(v) (((unsigned int)(v) & 0x7f) << 1) + +/* + * socket so could be a listening socket too. + */ +uint64_t +calc_opt0(struct socket *so, struct port_info *pi, struct l2t_entry *e, + int mtu_idx, int rscale, int rx_credits, int ulp_mode) +{ + uint64_t opt0; + + KASSERT(rx_credits <= M_RCV_BUFSIZ, + ("%s: rcv_bufsiz too high", __func__)); + + opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) | + V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits); + + if (so != NULL) { + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + int keepalive = always_keepalive || + so_options_get(so) & SO_KEEPALIVE; + + opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0); + opt0 |= V_KEEP_ALIVE(keepalive != 0); + } + + if (e != NULL) + opt0 |= V_L2T_IDX(e->idx); + + if (pi != NULL) { + opt0 |= V_SMAC_SEL(VIID_SMACIDX(pi->viid)); + opt0 |= V_TX_CHAN(pi->tx_chan); + } + + return htobe64(opt0); +} + +#define FILTER_SEL_WIDTH_P_FC (3 + 1) +#define FILTER_SEL_WIDTH_VIN_P_FC (6 + 7 + FILTER_SEL_WIDTH_P_FC) +#define FILTER_SEL_WIDTH_TAG_P_FC (3 + FILTER_SEL_WIDTH_VIN_P_FC) +#define FILTER_SEL_WIDTH_VLD_TAG_P_FC (1 + FILTER_SEL_WIDTH_TAG_P_FC) +#define VLAN_NONE 0xfff +#define FILTER_SEL_VLAN_NONE 0xffff + +uint32_t +select_ntuple(struct port_info *pi, struct l2t_entry *e, uint32_t filter_mode) +{ + uint16_t viid = pi->viid; + uint32_t ntuple = 0; + + if (filter_mode == HW_TPL_FR_MT_PR_IV_P_FC) { + if (e->vlan == VLAN_NONE) + ntuple |= FILTER_SEL_VLAN_NONE << FILTER_SEL_WIDTH_P_FC; + else { + ntuple |= e->vlan << FILTER_SEL_WIDTH_P_FC; + ntuple |= 1 << FILTER_SEL_WIDTH_VLD_TAG_P_FC; + } + ntuple |= e->lport << S_PORT; + ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC; + } else if (filter_mode == HW_TPL_FR_MT_PR_OV_P_FC) { + ntuple |= G_FW_VIID_VIN(viid) << FILTER_SEL_WIDTH_P_FC; + ntuple |= G_FW_VIID_PFN(viid) << FILTER_SEL_WIDTH_VIN_P_FC; + ntuple |= G_FW_VIID_VIVLD(viid) << FILTER_SEL_WIDTH_TAG_P_FC; + ntuple |= e->lport << S_PORT; + ntuple |= IPPROTO_TCP << FILTER_SEL_WIDTH_VLD_TAG_P_FC; + } + + return (htobe32(ntuple)); +} + +static int +alloc_tid_tabs(struct tid_info *t) +{ + size_t size; + unsigned int i; + + size = t->ntids * sizeof(*t->tid_tab) + + t->natids * sizeof(*t->atid_tab) + + t->nstids * sizeof(*t->stid_tab); + + t->tid_tab = malloc(size, M_CXGBE, M_ZERO | M_NOWAIT); + if (t->tid_tab == NULL) + return (ENOMEM); + + mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); + t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids]; + t->afree = t->atid_tab; + t->atids_in_use = 0; + for (i = 1; i < t->natids; i++) + t->atid_tab[i - 1].next = &t->atid_tab[i]; + t->atid_tab[t->natids - 1].next = NULL; + + mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF); + t->stid_tab = (union serv_entry *)&t->atid_tab[t->natids]; + t->sfree = t->stid_tab; + t->stids_in_use = 0; + for (i = 1; i < t->nstids; i++) + t->stid_tab[i - 1].next = &t->stid_tab[i]; + t->stid_tab[t->nstids - 1].next = NULL; + + atomic_store_rel_int(&t->tids_in_use, 0); + + return (0); +} + +static void 
+free_tid_tabs(struct tid_info *t)
+{
+	KASSERT(t->tids_in_use == 0,
+	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
+	KASSERT(t->atids_in_use == 0,
+	    ("%s: %d atids still in use.", __func__, t->atids_in_use));
+	KASSERT(t->stids_in_use == 0,
+	    ("%s: %d stids still in use.", __func__, t->stids_in_use));
+
+	free(t->tid_tab, M_CXGBE);
+	t->tid_tab = NULL;
+
+	if (mtx_initialized(&t->atid_lock))
+		mtx_destroy(&t->atid_lock);
+	if (mtx_initialized(&t->stid_lock))
+		mtx_destroy(&t->stid_lock);
+}
+
+static void
+free_tom_data(struct adapter *sc, struct tom_data *td)
+{
+	KASSERT(TAILQ_EMPTY(&td->toep_list),
+	    ("%s: TOE PCB list is not empty.", __func__));
+	KASSERT(td->lctx_count == 0,
+	    ("%s: lctx hash table is not empty.", __func__));
+
+	t4_uninit_l2t_cpl_handlers(sc);
+
+	if (td->listen_mask != 0)
+		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
+
+	if (mtx_initialized(&td->lctx_hash_lock))
+		mtx_destroy(&td->lctx_hash_lock);
+	if (mtx_initialized(&td->toep_list_lock))
+		mtx_destroy(&td->toep_list_lock);
+
+	free_tid_tabs(&sc->tids);
+	free(td, M_CXGBE);
+}
+
+/*
+ * Ground control to Major TOM
+ * Commencing countdown, engines on
+ */
+static int
+t4_tom_activate(struct adapter *sc)
+{
+	struct tom_data *td;
+	struct toedev *tod;
+	int i, rc;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	/* per-adapter softc for TOM */
+	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
+	if (td == NULL)
+		return (ENOMEM);
+
+	/* List of TOE PCBs and associated lock */
+	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
+	TAILQ_INIT(&td->toep_list);
+
+	/* Listen context */
+	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
+	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
+	    &td->listen_mask, HASH_NOWAIT);
+
+	/* TID tables */
+	rc = alloc_tid_tabs(&sc->tids);
+	if (rc != 0)
+		goto done;
+
+	/* CPL handlers */
+	t4_init_connect_cpl_handlers(sc);
+	t4_init_l2t_cpl_handlers(sc);
+	t4_init_listen_cpl_handlers(sc);
+	t4_init_cpl_io_handlers(sc);
+
+	/* toedev ops */
+	tod = &td->tod;
+	init_toedev(tod);
+	tod->tod_softc = sc;
+	tod->tod_connect = t4_connect;
+	tod->tod_listen_start = t4_listen_start;
+	tod->tod_listen_stop = t4_listen_stop;
+	tod->tod_rcvd = t4_rcvd;
+	tod->tod_output = t4_tod_output;
+	tod->tod_send_rst = t4_send_rst;
+	tod->tod_send_fin = t4_send_fin;
+	tod->tod_pcb_detach = t4_pcb_detach;
+	tod->tod_l2_update = t4_l2_update;
+	tod->tod_syncache_added = t4_syncache_added;
+	tod->tod_syncache_removed = t4_syncache_removed;
+	tod->tod_syncache_respond = t4_syncache_respond;
+	tod->tod_offload_socket = t4_offload_socket;
+
+	for_each_port(sc, i)
+		TOEDEV(sc->port[i]->ifp) = &td->tod;
+
+	sc->tom_softc = td;
+	sc->flags |= TOM_INIT_DONE;
+	register_toedev(sc->tom_softc);
+
+done:
+	if (rc != 0)
+		free_tom_data(sc, td);
+	return (rc);
+}
+
+static int
+t4_tom_deactivate(struct adapter *sc)
+{
+	int rc = 0;
+	struct tom_data *td = sc->tom_softc;
+
+	ADAPTER_LOCK_ASSERT_OWNED(sc);	/* for sc->flags */
+
+	if (td == NULL)
+		return (0);	/* XXX. KASSERT?
*/ + + if (sc->offload_map != 0) + return (EBUSY); /* at least one port has IFCAP_TOE enabled */ + + mtx_lock(&td->toep_list_lock); + if (!TAILQ_EMPTY(&td->toep_list)) + rc = EBUSY; + mtx_unlock(&td->toep_list_lock); + + mtx_lock(&td->lctx_hash_lock); + if (td->lctx_count > 0) + rc = EBUSY; + mtx_unlock(&td->lctx_hash_lock); + + if (rc == 0) { + unregister_toedev(sc->tom_softc); + free_tom_data(sc, td); + sc->tom_softc = NULL; + sc->flags &= ~TOM_INIT_DONE; + } + + return (rc); +} + +static int +t4_tom_mod_load(void) +{ + int rc; + + rc = t4_register_uld(&tom_uld_info); + if (rc != 0) + t4_tom_mod_unload(); + + return (rc); +} + +static void +tom_uninit(struct adapter *sc, void *arg __unused) +{ + /* Try to free resources (works only if no port has IFCAP_TOE) */ + ADAPTER_LOCK(sc); + if (sc->flags & TOM_INIT_DONE) + t4_deactivate_uld(sc, ULD_TOM); + ADAPTER_UNLOCK(sc); +} + +static int +t4_tom_mod_unload(void) +{ + t4_iterate(tom_uninit, NULL); + + if (t4_unregister_uld(&tom_uld_info) == EBUSY) + return (EBUSY); + + return (0); +} +#endif /* TCP_OFFLOAD */ + +static int +t4_tom_modevent(module_t mod, int cmd, void *arg) +{ + int rc = 0; + +#ifdef TCP_OFFLOAD + switch (cmd) { + case MOD_LOAD: + rc = t4_tom_mod_load(); + break; + + case MOD_UNLOAD: + rc = t4_tom_mod_unload(); + break; + + default: + rc = EINVAL; + } +#else + printf("t4_tom: compiled without TCP_OFFLOAD support.\n"); + rc = EOPNOTSUPP; +#endif + return (rc); +} + +static moduledata_t t4_tom_moddata= { + "t4_tom", + t4_tom_modevent, + 0 +}; + +MODULE_VERSION(t4_tom, 1); +MODULE_DEPEND(t4_tom, toecore, 1, 1, 1); +MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1); +DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h new file mode 100644 index 0000000..4e171e7 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -0,0 +1,248 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef __T4_TOM_H__ +#define __T4_TOM_H__ + +#define KTR_CXGBE KTR_SPARE3 +#define LISTEN_HASH_SIZE 32 + +/* + * Min receive window. 
We want it to be large enough to accommodate receive + * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. + */ +#define MIN_RCV_WND (24 * 1024U) + +/* + * Max receive window supported by HW in bytes. Only a small part of it can + * be set through option0, the rest needs to be set through RX_DATA_ACK. + */ +#define MAX_RCV_WND ((1U << 27) - 1) + +/* TOE PCB flags */ +enum { + TPF_ATTACHED, /* a tcpcb refers to this toepcb */ + TPF_FLOWC_WR_SENT, /* firmware flow context WR sent */ + TPF_TX_DATA_SENT, /* some data sent */ + TPF_TX_SUSPENDED, /* tx suspended for lack of resources */ + TPF_SEND_FIN, /* send FIN after sending all pending data */ + TPF_FIN_SENT, /* FIN has been sent */ + TPF_ABORT_SHUTDOWN, /* connection abort is in progress */ + TPF_CPL_PENDING, /* haven't received the last CPL */ + TPF_SYNQE, /* synq_entry, not really a toepcb */ + TPF_SYNQE_NEEDFREE, /* synq_entry was allocated externally */ +}; + +struct ofld_tx_sdesc { + uint32_t plen; /* payload length */ + uint8_t tx_credits; /* firmware tx credits (unit is 16B) */ +}; + +struct toepcb { + TAILQ_ENTRY(toepcb) link; /* toep_list */ + unsigned int flags; /* miscellaneous flags */ + struct tom_data *td; + struct inpcb *inp; /* backpointer to host stack's PCB */ + struct port_info *port; /* physical port */ + struct sge_wrq *ofld_txq; + struct sge_ofld_rxq *ofld_rxq; + struct sge_wrq *ctrlq; + struct l2t_entry *l2te; /* L2 table entry used by this connection */ + int tid; /* Connection identifier */ + unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */ + unsigned int enqueued; /* # of bytes added to so_rcv (not yet read) */ + int rx_credits; /* rx credits (in bytes) to be returned to hw */ + + unsigned int ulp_mode; /* ULP mode */ + + /* Tx software descriptor */ + uint8_t txsd_total; + uint8_t txsd_pidx; + uint8_t txsd_cidx; + uint8_t txsd_avail; + struct ofld_tx_sdesc txsd[]; +}; + +struct flowc_tx_params { + uint32_t snd_nxt; + uint32_t rcv_nxt; + unsigned int snd_space; + unsigned int mss; +}; + +static inline int +toepcb_flag(struct toepcb *toep, int flag) +{ + + return isset(&toep->flags, flag); +} + +static inline void +toepcb_set_flag(struct toepcb *toep, int flag) +{ + + setbit(&toep->flags, flag); +} + +static inline void +toepcb_clr_flag(struct toepcb *toep, int flag) +{ + + clrbit(&toep->flags, flag); +} + +/* + * Compressed state for embryonic connections for a listener. Barely fits in + * 64B, try not to grow it further. 
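+ *
+ * The wr field doubles as a synchronization point: do_pass_accept_req()
+ * parks the CPL_PASS_ACCEPT_RPL work request in it, and whoever swaps
+ * it out atomically (the syncache respond path, or the cleanup code in
+ * do_pass_accept_req() itself) owns the work request from then on.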
+ */ +struct synq_entry { + TAILQ_ENTRY(synq_entry) link; /* listen_ctx's synq link */ + int flags; /* same as toepcb's tp_flags */ + int tid; + struct listen_ctx *lctx; /* backpointer to listen ctx */ + struct mbuf *syn; + uint32_t iss; + uint32_t ts; + volatile uintptr_t wr; + volatile u_int refcnt; + uint16_t l2e_idx; + uint16_t rcv_bufsize; +}; + +static inline int +synqe_flag(struct synq_entry *synqe, int flag) +{ + + return isset(&synqe->flags, flag); +} + +static inline void +synqe_set_flag(struct synq_entry *synqe, int flag) +{ + + setbit(&synqe->flags, flag); +} + +static inline void +synqe_clr_flag(struct synq_entry *synqe, int flag) +{ + + clrbit(&synqe->flags, flag); +} + +/* listen_ctx flags */ +#define LCTX_RPL_PENDING 1 /* waiting for a CPL_PASS_OPEN_RPL */ + +struct listen_ctx { + LIST_ENTRY(listen_ctx) link; /* listen hash linkage */ + volatile int refcount; + int stid; + int flags; + struct inpcb *inp; /* listening socket's inp */ + struct sge_wrq *ctrlq; + struct sge_ofld_rxq *ofld_rxq; + TAILQ_HEAD(, synq_entry) synq; +}; + +struct tom_data { + struct toedev tod; + + /* toepcb's associated with this TOE device */ + struct mtx toep_list_lock; + TAILQ_HEAD(, toepcb) toep_list; + + LIST_HEAD(, listen_ctx) *listen_hash; + u_long listen_mask; + int lctx_count; /* # of lctx in the hash table */ + struct mtx lctx_hash_lock; +}; + +static inline struct tom_data * +tod_td(struct toedev *tod) +{ + + return (member2struct(tom_data, tod, tod)); +} + +static inline struct adapter * +td_adapter(struct tom_data *td) +{ + + return (td->tod.tod_softc); +} + +/* t4_tom.c */ +struct toepcb *alloc_toepcb(struct port_info *, int, int, int); +void free_toepcb(struct toepcb *); +void offload_socket(struct socket *, struct toepcb *); +void undo_offload_socket(struct socket *); +void final_cpl_received(struct toepcb *); +void insert_tid(struct adapter *, int, void *); +void *lookup_tid(struct adapter *, int); +void update_tid(struct adapter *, int, void *); +void remove_tid(struct adapter *, int); +void release_tid(struct adapter *, int, struct sge_wrq *); +int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int); +u_long select_rcv_wnd(struct socket *); +int select_rcv_wscale(void); +uint64_t calc_opt0(struct socket *, struct port_info *, struct l2t_entry *, + int, int, int, int); +uint32_t select_ntuple(struct port_info *, struct l2t_entry *, uint32_t); + +/* t4_connect.c */ +void t4_init_connect_cpl_handlers(struct adapter *); +int t4_connect(struct toedev *, struct socket *, struct rtentry *, + struct sockaddr *); + +/* t4_listen.c */ +void t4_init_listen_cpl_handlers(struct adapter *); +int t4_listen_start(struct toedev *, struct tcpcb *); +int t4_listen_stop(struct toedev *, struct tcpcb *); +void t4_syncache_added(struct toedev *, void *); +void t4_syncache_removed(struct toedev *, void *); +int t4_syncache_respond(struct toedev *, void *, struct mbuf *); +int do_abort_req_synqe(struct sge_iq *, const struct rss_header *, + struct mbuf *); +int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *, + struct mbuf *); +void t4_offload_socket(struct toedev *, void *, struct socket *); + +/* t4_cpl_io.c */ +void t4_init_cpl_io_handlers(struct adapter *); +void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int); +void send_flowc_wr(struct toepcb *, struct flowc_tx_params *); +void send_reset(struct adapter *, struct toepcb *, uint32_t); +void make_established(struct toepcb *, uint32_t, uint32_t, uint16_t); +void t4_rcvd(struct toedev *, struct tcpcb *); +int 
t4_tod_output(struct toedev *, struct tcpcb *); +int t4_send_fin(struct toedev *, struct tcpcb *); +int t4_send_rst(struct toedev *, struct tcpcb *); + +#endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.c b/sys/dev/cxgbe/tom/t4_tom_l2t.c new file mode 100644 index 0000000..ffe64c5 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.c @@ -0,0 +1,405 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#ifdef TCP_OFFLOAD +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/common.h" +#include "common/jhash.h" +#include "common/t4_msg.h" +#include "tom/t4_tom_l2t.h" +#include "tom/t4_tom.h" + +#define VLAN_NONE 0xfff + +#define SA(x) ((struct sockaddr *)(x)) +#define SIN(x) ((struct sockaddr_in *)(x)) +#define SINADDR(x) (SIN(x)->sin_addr.s_addr) + +static inline void +l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */ + atomic_subtract_int(&d->nfree, 1); +} + +static inline unsigned int +arp_hash(const uint32_t key, int ifindex) +{ + return jhash_2words(key, ifindex, 0) & (L2T_SIZE - 1); +} + +/* + * Add a WR to an L2T entry's queue of work requests awaiting resolution. + * Must be called with the entry's lock held. 
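+ *
+ * Typical usage (simplified), as in t4_l2t_send_slow() below:
+ *
+ *	mtx_lock(&e->lock);
+ *	arpq_enqueue(e, wr);
+ *	mtx_unlock(&e->lock);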
+ */ +static inline void +arpq_enqueue(struct l2t_entry *e, struct wrqe *wr) +{ + mtx_assert(&e->lock, MA_OWNED); + + STAILQ_INSERT_TAIL(&e->wr_list, wr, link); +} + +static inline void +send_pending(struct adapter *sc, struct l2t_entry *e) +{ + struct wrqe *wr; + + mtx_assert(&e->lock, MA_OWNED); + + while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) { + STAILQ_REMOVE_HEAD(&e->wr_list, link); + t4_wrq_tx(sc, wr); + } +} + +static void +resolution_failed_for_wr(struct wrqe *wr) +{ + log(LOG_ERR, "%s: leaked work request %p, wr_len %d", __func__, wr, + wr->wr_len); + + /* free(wr, M_CXGBE); */ +} + +static void +resolution_failed(struct l2t_entry *e) +{ + struct wrqe *wr; + + mtx_assert(&e->lock, MA_OWNED); + + while ((wr = STAILQ_FIRST(&e->wr_list)) != NULL) { + STAILQ_REMOVE_HEAD(&e->wr_list, link); + resolution_failed_for_wr(wr); + } +} + +static void +update_entry(struct adapter *sc, struct l2t_entry *e, uint8_t *lladdr, + uint16_t vtag) +{ + + mtx_assert(&e->lock, MA_OWNED); + + /* + * The entry may be in active use (e->refcount > 0) or not. We update + * it even when it's not as this simplifies the case where we decide to + * reuse the entry later. + */ + + if (lladdr == NULL && + (e->state == L2T_STATE_RESOLVING || e->state == L2T_STATE_FAILED)) { + /* + * Never got a valid L2 address for this one. Just mark it as + * failed instead of removing it from the hash (for which we'd + * need to wlock the table). + */ + e->state = L2T_STATE_FAILED; + resolution_failed(e); + return; + + } else if (lladdr == NULL) { + + /* Valid or already-stale entry was deleted (or expired) */ + + KASSERT(e->state == L2T_STATE_VALID || + e->state == L2T_STATE_STALE, + ("%s: lladdr NULL, state %d", __func__, e->state)); + + e->state = L2T_STATE_STALE; + + } else { + + if (e->state == L2T_STATE_RESOLVING || + e->state == L2T_STATE_FAILED || + memcmp(e->dmac, lladdr, ETHER_ADDR_LEN)) { + + /* unresolved -> resolved; or dmac changed */ + + memcpy(e->dmac, lladdr, ETHER_ADDR_LEN); + e->vlan = vtag; + t4_write_l2e(sc, e, 1); + } + e->state = L2T_STATE_VALID; + } +} + +static int +resolve_entry(struct adapter *sc, struct l2t_entry *e) +{ + struct tom_data *td = sc->tom_softc; + struct toedev *tod = &td->tod; + struct sockaddr_in sin = {0}; + uint8_t dmac[ETHER_ADDR_LEN]; + uint16_t vtag = VLAN_NONE; + int rc; + + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + SINADDR(&sin) = e->addr; + + rc = toe_l2_resolve(tod, e->ifp, SA(&sin), dmac, &vtag); + if (rc == EWOULDBLOCK) + return (rc); + + mtx_lock(&e->lock); + update_entry(sc, e, rc == 0 ? 
dmac : NULL, vtag); + mtx_unlock(&e->lock); + + return (rc); +} + +int +t4_l2t_send_slow(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e) +{ + +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + + if (resolve_entry(sc, e) != EWOULDBLOCK) + goto again; /* entry updated, re-examine state */ + + /* Fall through */ + + case L2T_STATE_VALID: /* fast-path, send the packet on */ + + t4_wrq_tx(sc, wr); + return (0); + + case L2T_STATE_RESOLVING: + case L2T_STATE_SYNC_WRITE: + + mtx_lock(&e->lock); + if (e->state != L2T_STATE_SYNC_WRITE && + e->state != L2T_STATE_RESOLVING) { + /* state changed by the time we got here */ + mtx_unlock(&e->lock); + goto again; + } + arpq_enqueue(e, wr); + mtx_unlock(&e->lock); + + if (resolve_entry(sc, e) == EWOULDBLOCK) + break; + + mtx_lock(&e->lock); + if (e->state == L2T_STATE_VALID && !STAILQ_EMPTY(&e->wr_list)) + send_pending(sc, e); + if (e->state == L2T_STATE_FAILED) + resolution_failed(e); + mtx_unlock(&e->lock); + break; + + case L2T_STATE_FAILED: + resolution_failed_for_wr(wr); + return (EHOSTUNREACH); + } + + return (0); +} + +/* + * Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. + */ + +static int +do_l2t_write_rpl2(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(rpl); + unsigned int idx = tid & (L2T_SIZE - 1); + int rc; + + rc = do_l2t_write_rpl(iq, rss, m); + if (rc != 0) + return (rc); + + if (tid & F_SYNC_WR) { + struct l2t_entry *e = &sc->l2t->l2tab[idx]; + + mtx_lock(&e->lock); + if (e->state != L2T_STATE_SWITCHING) { + send_pending(sc, e); + e->state = L2T_STATE_VALID; + } + mtx_unlock(&e->lock); + } + + return (0); +} + +void +t4_init_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl2); +} + +void +t4_uninit_l2t_cpl_handlers(struct adapter *sc) +{ + + t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl); +} + +/* + * The TOE wants an L2 table entry that it can use to reach the next hop over + * the specified port. Produce such an entry - create one if needed. + * + * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on + * top of the real cxgbe interface. 
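+ *
+ * An entry matches on (address, ifnet, SMT index).  A newly allocated
+ * entry starts out in L2T_STATE_RESOLVING with a single reference that
+ * the caller eventually gives up with t4_l2t_release().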
+ */ +struct l2t_entry * +t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa) +{ + struct l2t_entry *e; + struct l2t_data *d = pi->adapter->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + unsigned int smt_idx = pi->port_id; + + if (sa->sa_family != AF_INET) + return (NULL); /* XXX: no IPv6 support right now */ + +#ifndef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + return (NULL); +#endif + + rw_wlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp && e->smt_idx == smt_idx) { + l2t_hold(d, e); + goto done; + } + } + + /* Need to allocate a new entry */ + e = t4_alloc_l2e(d); + if (e) { + mtx_lock(&e->lock); /* avoid race with t4_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifp = ifp; + e->smt_idx = smt_idx; + e->hash = hash; + e->lport = pi->lport; + atomic_store_rel_int(&e->refcnt, 1); +#ifdef VLAN_TAG + if (ifp->if_type == IFT_L2VLAN) + VLAN_TAG(ifp, &e->vlan); + else + e->vlan = VLAN_NONE; +#endif + mtx_unlock(&e->lock); + } +done: + rw_wunlock(&d->lock); + return e; +} + +/* + * Called when the host's ARP layer makes a change to some entry that is loaded + * into the HW L2 table. + */ +void +t4_l2_update(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t vtag) +{ + struct adapter *sc = tod->tod_softc; + struct l2t_entry *e; + struct l2t_data *d = sc->l2t; + uint32_t addr = SINADDR(sa); + int hash = arp_hash(addr, ifp->if_index); + + KASSERT(d != NULL, ("%s: no L2 table", __func__)); + + rw_rlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) { + if (e->addr == addr && e->ifp == ifp) { + mtx_lock(&e->lock); + if (atomic_load_acq_int(&e->refcnt)) + goto found; + e->state = L2T_STATE_STALE; + mtx_unlock(&e->lock); + break; + } + } + rw_runlock(&d->lock); + + /* + * This is of no interest to us. We've never had an offloaded + * connection to this destination, and we aren't attempting one right + * now. + */ + return; + +found: + rw_runlock(&d->lock); + + KASSERT(e->state != L2T_STATE_UNUSED, + ("%s: unused entry in the hash.", __func__)); + + update_entry(sc, e, lladdr, vtag); + mtx_unlock(&e->lock); +} +#endif diff --git a/sys/dev/cxgbe/tom/t4_tom_l2t.h b/sys/dev/cxgbe/tom/t4_tom_l2t.h new file mode 100644 index 0000000..3d76735 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_tom_l2t.h @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef __T4_TOM_L2T_H +#define __T4_TOM_L2T_H + +#include "t4_l2t.h" + +int t4_l2t_send_slow(struct adapter *, struct wrqe *, struct l2t_entry *); +struct l2t_entry *t4_l2t_get(struct port_info *, struct ifnet *, + struct sockaddr *); +void t4_l2_update(struct toedev *, struct ifnet *, struct sockaddr *, + uint8_t *, uint16_t); +void t4_init_l2t_cpl_handlers(struct adapter *); +void t4_uninit_l2t_cpl_handlers(struct adapter *); + +static inline int +t4_l2t_send(struct adapter *sc, struct wrqe *wr, struct l2t_entry *e) +{ + if (__predict_true(e->state == L2T_STATE_VALID)) { + t4_wrq_tx(sc, wr); + return (0); + } else + return (t4_l2t_send_slow(sc, wr, e)); +} + +#endif /* __T4_TOM_L2T_H */ diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index b857e24..3ff96c9 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -30,6 +30,7 @@ options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols +options TCP_OFFLOAD # TCP offload options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support diff --git a/sys/i386/conf/XEN b/sys/i386/conf/XEN index 0a289ea..a9c838c 100644 --- a/sys/i386/conf/XEN +++ b/sys/i386/conf/XEN @@ -7,7 +7,7 @@ cpu I686_CPU ident XEN makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols -makeoptions WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve sound sym trm xfs" +makeoptions WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve rdma sound sym trm xfs" options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 0e66091..80ac632 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -314,6 +314,7 @@ SUBDIR= \ ${_ti} \ tl \ tmpfs \ + ${_toecore} \ ${_tpm} \ trm \ ${_twa} \ @@ -392,6 +393,7 @@ _random= random .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _carp= carp +_toecore= toecore .endif .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES) diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index a6af817..325c705 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,39 +1,12 @@ # $FreeBSD$ SUBDIR= cxgb -SUBDIR+= ${_toecore} +SUBDIR+= cxgb_t3fw SUBDIR+= ${_tom} SUBDIR+= ${_iw_cxgb} -SUBDIR+= cxgb_t3fw - -.if defined(SYSDIR) -_sysdir = ${SYSDIR} -.endif - -# Based on bsd.kmod.mk but we don't modify SYSDIR in this one. -.for _dir in ${.CURDIR}/../.. ${.CURDIR}/../../.. ${.CURDIR}/../../../.. 
\ - /sys /usr/src/sys -.if !defined(_sysdir) && exists(${_dir}/kern/) && exists(${_dir}/conf/kmod.mk) -_sysdir = ${_dir} -.endif -.endfor -.if !defined(_sysdir) || !exists(${_sysdir}/kern/) || \ - !exists(${_sysdir}/conf/kmod.mk) -.error "can't find kernel source tree" -.endif - -_toe_header = ${_sysdir}/netinet/toedev.h - -.if exists(${_toe_header}) -_toecore = toecore -#_tom = tom -.endif - -.if ${MACHINE_CPUARCH} == "i386" && exists(${_toe_header}) -_iw_cxgb = iw_cxgb -.endif -.if ${MACHINE_CPUARCH} == "amd64" && exists(${_toe_header}) -_iw_cxgb = iw_cxgb +.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" +_tom= tom +_iw_cxgb= iw_cxgb .endif .include diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile index ed4e229..8807ece 100644 --- a/sys/modules/cxgb/cxgb/Makefile +++ b/sys/modules/cxgb/cxgb/Makefile @@ -8,7 +8,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb KMOD= if_cxgb SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c cxgb_aq100x.c -SRCS+= cxgb_sge.c cxgb_offload.c cxgb_tn1010.c +SRCS+= cxgb_sge.c cxgb_tn1010.c SRCS+= device_if.h bus_if.h pci_if.h SRCS+= opt_inet.h opt_inet6.h opt_zero.h opt_sched.h SRCS+= uipc_mvec.c @@ -19,6 +19,7 @@ CFLAGS+= -g -DDEFAULT_JUMBO -I${CXGB} .if ${MK_INET_SUPPORT} != "no" opt_inet.h: @echo "#define INET 1" > ${.TARGET} + @echo "#define TCP_OFFLOAD 1" >> ${.TARGET} .endif .if ${MK_INET6_SUPPORT} != "no" diff --git a/sys/modules/cxgb/iw_cxgb/Makefile b/sys/modules/cxgb/iw_cxgb/Makefile index 2cf2ef8..f633bd5 100644 --- a/sys/modules/cxgb/iw_cxgb/Makefile +++ b/sys/modules/cxgb/iw_cxgb/Makefile @@ -1,5 +1,7 @@ # $FreeBSD$ +.include + CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB}/ulp/iw_cxgb @@ -8,8 +10,15 @@ SRCS= iw_cxgb.c iw_cxgb_cm.c iw_cxgb_hal.c SRCS+= iw_cxgb_provider.c iw_cxgb_qp.c iw_cxgb_resource.c SRCS+= iw_cxgb_ev.c iw_cxgb_mem.c iw_cxgb_dbg.c iw_cxgb_cq.c SRCS+= bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h opt_ktr.h -SRCS+= opt_inet.h -CFLAGS+= -g -I${CXGB} -#CFLAGS+= -DDEBUG +SRCS+= opt_inet.h opt_ofed.h vnode_if.h +CFLAGS+= -I${CXGB} -I${.CURDIR}/../../../ofed/include -DLINUX_TYPES_DEFINED + +.if !defined(KERNBUILDDIR) +.if ${MK_INET_SUPPORT} != "no" +opt_inet.h: + echo "#define INET 1" > ${.TARGET} + echo "#define TCP_OFFLOAD 1" >> ${.TARGET} +.endif +.endif .include diff --git a/sys/modules/cxgb/toecore/Makefile b/sys/modules/cxgb/toecore/Makefile deleted file mode 100644 index 4342519..0000000 --- a/sys/modules/cxgb/toecore/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# $FreeBSD$ - -CXGB = ${.CURDIR}/../../../dev/cxgb -.PATH: ${CXGB}/ulp/toecore - -KMOD= toecore -SRCS= toedev.c -SRCS+= device_if.h bus_if.h pci_if.h opt_sched.h opt_inet.h -CFLAGS+= -g -I${CXGB} - -.include diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile index 4c22670..8b08fe2 100644 --- a/sys/modules/cxgb/tom/Makefile +++ b/sys/modules/cxgb/tom/Makefile @@ -1,15 +1,25 @@ # $FreeBSD$ +.include + CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB}/ulp/tom -KMOD= tom -SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c +KMOD= t3_tom +SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_l2t.c SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h SRCS+= device_if.h bus_if.h pci_if.h CFLAGS+= -g -I${CXGB} #CFLAGS+= -DDEBUG_PRINT -DDEBUG + +.if !defined(KERNBUILDDIR) +.if 
${MK_INET_SUPPORT} != "no" +opt_inet.h: + echo "#define INET 1" > ${.TARGET} + echo "#define TCP_OFFLOAD 1" >> ${.TARGET} +.endif +.endif + .include diff --git a/sys/modules/cxgbe/Makefile b/sys/modules/cxgbe/Makefile index 1d69f76..5ec8dcc 100644 --- a/sys/modules/cxgbe/Makefile +++ b/sys/modules/cxgbe/Makefile @@ -4,5 +4,10 @@ SUBDIR = if_cxgbe SUBDIR+= firmware +SUBDIR+= ${_tom} + +.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" +_tom= tom +.endif .include diff --git a/sys/modules/cxgbe/tom/Makefile b/sys/modules/cxgbe/tom/Makefile new file mode 100644 index 0000000..647984d --- /dev/null +++ b/sys/modules/cxgbe/tom/Makefile @@ -0,0 +1,15 @@ +# +# $FreeBSD$ +# + +CXGBE = ${.CURDIR}/../../../dev/cxgbe +.PATH: ${CXGBE}/tom + +KMOD = t4_tom +SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c +SRCS+= device_if.h bus_if.h pci_if.h +SRCS+= opt_inet.h + +CFLAGS+= -I${CXGBE} + +.include diff --git a/sys/modules/rdma/krping/Makefile b/sys/modules/rdma/krping/Makefile index 5e4871b..ae6cc80 100644 --- a/sys/modules/rdma/krping/Makefile +++ b/sys/modules/rdma/krping/Makefile @@ -6,5 +6,7 @@ RDMA= ${.CURDIR}/../../../contrib/rdma/krping KMOD= krping SRCS= krping.c krping_dev.c getopt.c SRCS+= bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h +SRCS+= vnode_if.h +CFLAGS+= -I${.CURDIR}/../../../ofed/include .include diff --git a/sys/modules/toecore/Makefile b/sys/modules/toecore/Makefile new file mode 100644 index 0000000..0b30ab0 --- /dev/null +++ b/sys/modules/toecore/Makefile @@ -0,0 +1,9 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../netinet + +KMOD= toecore +SRCS= toecore.c +SRCS+= opt_ofed.h + +.include diff --git a/sys/net/if_var.h b/sys/net/if_var.h index ab078b8..3d57953 100644 --- a/sys/net/if_var.h +++ b/sys/net/if_var.h @@ -712,6 +712,8 @@ drbr_inuse(struct ifnet *ifp, struct buf_ring *br) #define IF_MINMTU 72 #define IF_MAXMTU 65535 +#define TOEDEV(ifp) ((ifp)->if_llsoftc) + #endif /* _KERNEL */ /* diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index 95ed455..1225680 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -746,8 +746,8 @@ vlan_modevent(module_t mod, int type, void *data) vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; - vlan_cookie_p = vlan_cookie; - vlan_setcookie_p = vlan_setcookie; + vlan_cookie_p = NULL; + vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCK_DESTROY(); if (bootverbose) @@ -1503,6 +1503,22 @@ vlan_capabilities(struct ifvlan *ifv) ifp->if_capenable &= ~(p->if_capenable & IFCAP_TSO); ifp->if_hwassist &= ~(p->if_hwassist & CSUM_TSO); } + + /* + * If the parent interface can offload TCP connections over VLANs then + * propagate its TOE capability to the VLAN interface. + * + * All TOE drivers in the tree today can deal with VLANs. If this + * changes then IFCAP_VLAN_TOE should be promoted to a full capability + * with its own bit. 
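+	 *
+	 * Both the capability and the enabled bit are copied from the
+	 * parent, and the VLAN ifnet shares the parent's TOE device
+	 * pointer, so offloaded connections over the VLAN are handled by
+	 * the parent's TOE.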
+ */ +#define IFCAP_VLAN_TOE IFCAP_TOE + if (p->if_capabilities & IFCAP_VLAN_TOE) + ifp->if_capabilities |= p->if_capabilities & IFCAP_TOE; + if (p->if_capenable & IFCAP_VLAN_TOE) { + TOEDEV(ifp) = TOEDEV(p); + ifp->if_capenable |= p->if_capenable & IFCAP_TOE; + } } static void diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index bdb4efc..d6a7fd1 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -180,6 +180,17 @@ arptimer(void *arg) callout_active(&lle->la_timer)) { callout_stop(&lle->la_timer); LLE_REMREF(lle); + + if (lle->la_flags != LLE_DELETED) { + int evt; + + if (lle->la_flags & LLE_VALID) + evt = LLENTRY_EXPIRED; + else + evt = LLENTRY_TIMEDOUT; + EVENTHANDLER_INVOKE(lle_event, lle, evt); + } + pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); @@ -726,7 +737,7 @@ match: (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; - EVENTHANDLER_INVOKE(arp_update_event, la); + EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { int canceled; diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h index 8d44d35..e37a964 100644 --- a/sys/netinet/if_ether.h +++ b/sys/netinet/if_ether.h @@ -122,8 +122,14 @@ void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); void arp_ifscrub(struct ifnet *, uint32_t); #include -typedef void (*llevent_arp_update_fn)(void *, struct llentry *); -EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn); +enum { + LLENTRY_RESOLVED, + LLENTRY_TIMEDOUT, + LLENTRY_DELETED, + LLENTRY_EXPIRED, +}; +typedef void (*lle_event_fn)(void *, struct llentry *, int); +EVENTHANDLER_DECLARE(lle_event, lle_event_fn); #endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c index c1cbcb1..1beddd6 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1469,7 +1469,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags = LLE_DELETED; - EVENTHANDLER_INVOKE(arp_update_event, lle); + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); LLE_WUNLOCK(lle); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 4d3234f..6d8ebee 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -105,6 +105,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif #ifdef IPSEC #include @@ -958,6 +961,14 @@ relocked: goto dropwithreset; } +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_input(tp, m); + m = NULL; /* consumed by the TOE driver */ + goto dropunlock; + } +#endif + /* * We've identified a valid inpcb, but it could be that we need an * inpcbinfo write lock but don't hold it. In this case, attempt to @@ -1320,7 +1331,7 @@ relocked: (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); - syncache_add(&inc, &to, th, inp, &so, m); + syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); /* * Entry added to syncache and mbuf consumed. * Everything already unlocked by syncache_add(). diff --git a/sys/netinet/tcp_offload.c b/sys/netinet/tcp_offload.c index 899a37c..b772418 100644 --- a/sys/netinet/tcp_offload.c +++ b/sys/netinet/tcp_offload.c @@ -1,145 +1,176 @@ /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #include #include #include -#include -#include -#include #include #include #include - +#include #include -#include -#include #include -#include - #include -#include #include #include #include #include -#include +#define TCPOUTFLAGS +#include +#include -uint32_t toedev_registration_count; +int registered_toedevs; +/* + * Provide an opportunity for a TOE driver to offload. + */ int tcp_offload_connect(struct socket *so, struct sockaddr *nam) { struct ifnet *ifp; - struct toedev *tdev; + struct toedev *tod; struct rtentry *rt; - int error; - - if (toedev_registration_count == 0) - return (EINVAL); - - /* - * Look up the route used for the connection to - * determine if it uses an interface capable of - * offloading the connection. 
- */ - rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/); - if (rt) + int error = EOPNOTSUPP; + + INP_WLOCK_ASSERT(sotoinpcb(so)); + KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, + ("%s: called with sa_family %d", __func__, nam->sa_family)); + + if (registered_toedevs == 0) + return (error); + + rt = rtalloc1(nam, 0, 0); + if (rt) RT_UNLOCK(rt); - else + else return (EHOSTUNREACH); ifp = rt->rt_ifp; - if ((ifp->if_capenable & IFCAP_TOE) == 0) { - error = EINVAL; - goto fail; - } - - tdev = TOEDEV(ifp); - if (tdev == NULL) { - error = EPERM; - goto fail; - } - - if (tdev->tod_can_offload(tdev, so) == 0) { - error = EPERM; - goto fail; - } - - return (tdev->tod_connect(tdev, so, rt, nam)); -fail: + + if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) + goto done; + if (nam->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)) + goto done; + + tod = TOEDEV(ifp); + if (tod != NULL) + error = tod->tod_connect(tod, so, rt, nam); +done: RTFREE(rt); return (error); } +void +tcp_offload_listen_start(struct tcpcb *tp) +{ -/* - * This file contains code as a short-term staging area before it is moved in - * to sys/netinet/tcp_offload.c - */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); +} void -tcp_offload_twstart(struct tcpcb *tp) +tcp_offload_listen_stop(struct tcpcb *tp) { - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); } -struct tcpcb * -tcp_offload_close(struct tcpcb *tp) +void +tcp_offload_input(struct tcpcb *tp, struct mbuf *m) { + struct toedev *tod = tp->tod; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_close(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + tod->tod_input(tod, tp, m); } -struct tcpcb * -tcp_offload_drop(struct tcpcb *tp, int error) +int +tcp_offload_output(struct tcpcb *tp) { + struct toedev *tod = tp->tod; + int error, flags; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_drop(tp, error); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + flags = tcp_outflags[tp->t_state]; + + if (flags & TH_RST) { + /* XXX: avoid repeated calls like we do for FIN */ + error = tod->tod_send_rst(tod, tp); + } else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) && + (tp->t_flags & TF_SENTFIN) == 0) { + error = tod->tod_send_fin(tod, tp); + if (error == 0) + tp->t_flags |= TF_SENTFIN; + } else + error = tod->tod_output(tod, tp); + + return (error); +} + +void +tcp_offload_rcvd(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_rcvd(tod, tp); +} + +void +tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name); } +void +tcp_offload_detach(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_pcb_detach(tod, tp); 
+} diff --git a/sys/netinet/tcp_offload.h b/sys/netinet/tcp_offload.h index 313185f..a052366 100644 --- a/sys/netinet/tcp_offload.h +++ b/sys/netinet/tcp_offload.h @@ -1,30 +1,30 @@ /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ + * */ #ifndef _NETINET_TCP_OFFLOAD_H_ @@ -34,321 +34,15 @@ #error "no user-serviceable parts inside" #endif -/* - * A driver publishes that it provides offload services - * by setting IFCAP_TOE in the ifnet. The offload connect - * will bypass any further work if the interface that a - * connection would use does not support TCP offload. - * - * The TOE API assumes that the tcp offload engine can offload the - * the entire connection from set up to teardown, with some provision - * being made to allowing the software stack to handle time wait. If - * the device does not meet these criteria, it is the driver's responsibility - * to overload the functions that it needs to in tcp_usrreqs and make - * its own calls to tcp_output if it needs to do so. 
- * - * There is currently no provision for the device advertising the congestion - * control algorithms it supports as there is currently no API for querying - * an operating system for the protocols that it has loaded. This is a desirable - * future extension. - * - * - * - * It is assumed that individuals deploying TOE will want connections - * to be offloaded without software changes so all connections on an - * interface providing TOE are offloaded unless the SO_NO_OFFLOAD - * flag is set on the socket. - * - * - * The toe_usrreqs structure constitutes the TOE driver's - * interface to the TCP stack for functionality that doesn't - * interact directly with userspace. If one wants to provide - * (optional) functionality to do zero-copy to/from - * userspace one still needs to override soreceive/sosend - * with functions that fault in and pin the user buffers. - * - * + tu_send - * - tells the driver that new data may have been added to the - * socket's send buffer - the driver should not fail if the - * buffer is in fact unchanged - * - the driver is responsible for providing credits (bytes in the send window) - * back to the socket by calling sbdrop() as segments are acknowledged. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_rcvd - * - returns credits to the driver and triggers window updates - * to the peer (a credit as used here is a byte in the peer's receive window) - * - the driver is expected to determine how many bytes have been - * consumed and credit that back to the card so that it can grow - * the window again by maintaining its own state between invocations. - * - In principle this could be used to shrink the window as well as - * grow the window, although it is not used for that now. - * - this function needs to correctly handle being called any number of - * times without any bytes being consumed from the receive buffer. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_disconnect - * - tells the driver to send FIN to peer - * - driver is expected to send the remaining data and then do a clean half close - * - disconnect implies at least half-close so only send, reset, and detach - * are legal - * - the driver is expected to handle transition through the shutdown - * state machine and allow the stack to support SO_LINGER. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_reset - * - closes the connection and sends a RST to peer - * - driver is expectd to trigger an RST and detach the toepcb - * - no further calls are legal after reset - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. 
- * - * The following fields in the tcpcb are expected to be referenced by the driver: - * + iss - * + rcv_nxt - * + rcv_wnd - * + snd_isn - * + snd_max - * + snd_nxt - * + snd_una - * + t_flags - * + t_inpcb - * + t_maxseg - * + t_toe - * - * The following fields in the inpcb are expected to be referenced by the driver: - * + inp_lport - * + inp_fport - * + inp_laddr - * + inp_fport - * + inp_socket - * + inp_ip_tos - * - * The following fields in the socket are expected to be referenced by the - * driver: - * + so_comp - * + so_error - * + so_linger - * + so_options - * + so_rcv - * + so_snd - * + so_state - * + so_timeo - * - * These functions all return 0 on success and can return the following errors - * as appropriate: - * + EPERM: - * + ENOBUFS: memory allocation failed - * + EMSGSIZE: MTU changed during the call - * + EHOSTDOWN: - * + EHOSTUNREACH: - * + ENETDOWN: - * * ENETUNREACH: the peer is no longer reachable - * - * + tu_detach - * - tells driver that the socket is going away so disconnect - * the toepcb and free appropriate resources - * - allows the driver to cleanly handle the case of connection state - * outliving the socket - * - no further calls are legal after detach - * - the driver is expected to provide its own synchronization between - * detach and receiving new data. - * - * + tu_syncache_event - * - even if it is not actually needed, the driver is expected to - * call syncache_add for the initial SYN and then syncache_expand - * for the SYN,ACK - * - tells driver that a connection either has not been added or has - * been dropped from the syncache - * - the driver is expected to maintain state that lives outside the - * software stack so the syncache needs to be able to notify the - * toe driver that the software stack is not going to create a connection - * for a received SYN - * - The driver is responsible for any synchronization required between - * the syncache dropping an entry and the driver processing the SYN,ACK. - * - */ -struct toe_usrreqs { - int (*tu_send)(struct tcpcb *tp); - int (*tu_rcvd)(struct tcpcb *tp); - int (*tu_disconnect)(struct tcpcb *tp); - int (*tu_reset)(struct tcpcb *tp); - void (*tu_detach)(struct tcpcb *tp); - void (*tu_syncache_event)(int event, void *toep); -}; - -/* - * Proxy for struct tcpopt between TOE drivers and TCP functions. - */ -struct toeopt { - u_int64_t to_flags; /* see tcpopt in tcp_var.h */ - u_int16_t to_mss; /* maximum segment size */ - u_int8_t to_wscale; /* window scaling */ +extern int registered_toedevs; - u_int8_t _pad1; /* explicit pad for 64bit alignment */ - u_int32_t _pad2; /* explicit pad for 64bit alignment */ - u_int64_t _pad3[4]; /* TBD */ -}; +int tcp_offload_connect(struct socket *, struct sockaddr *); +void tcp_offload_listen_start(struct tcpcb *); +void tcp_offload_listen_stop(struct tcpcb *); +void tcp_offload_input(struct tcpcb *, struct mbuf *); +int tcp_offload_output(struct tcpcb *); +void tcp_offload_rcvd(struct tcpcb *); +void tcp_offload_ctloutput(struct tcpcb *, int, int); +void tcp_offload_detach(struct tcpcb *); -#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ -#define TOE_SC_DROP 2 /* connection was timed out */ - -/* - * Because listen is a one-to-many relationship (a socket can be listening - * on all interfaces on a machine some of which may be using different TCP - * offload devices), listen uses a publish/subscribe mechanism. The TCP - * offload driver registers a listen notification function with the stack. 
- * When a listen socket is created all TCP offload devices are notified - * so that they can do the appropriate set up to offload connections on the - * port to which the socket is bound. When the listen socket is closed, - * the offload devices are notified so that they will stop listening on that - * port and free any associated resources as well as sending RSTs on any - * connections in the SYN_RCVD state. - * - */ - -typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); -typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); - -EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); -EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); - -/* - * Check if the socket can be offloaded by the following steps: - * - determine the egress interface - * - check the interface for TOE capability and TOE is enabled - * - check if the device has resources to offload the connection - */ -int tcp_offload_connect(struct socket *so, struct sockaddr *nam); - -/* - * The tcp_output_* routines are wrappers around the toe_usrreqs calls - * which trigger packet transmission. In the non-offloaded case they - * translate to tcp_output. The tcp_offload_* routines notify TOE - * of specific events. I the non-offloaded case they are no-ops. - * - * Listen is a special case because it is a 1 to many relationship - * and there can be more than one offload driver in the system. - */ - -/* - * Connection is offloaded - */ -#define tp_offload(tp) ((tp)->t_flags & TF_TOE) - -/* - * hackish way of allowing this file to also be included by TOE - * which needs to be kept ignorant of socket implementation details - */ -#ifdef _SYS_SOCKETVAR_H_ -/* - * The socket has not been marked as "do not offload" - */ -#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) - -static __inline int -tcp_output_connect(struct socket *so, struct sockaddr *nam) -{ - struct tcpcb *tp = sototcpcb(so); - int error; - - /* - * If offload has been disabled for this socket or the - * connection cannot be offloaded just call tcp_output - * to start the TCP state machine. 
- */ -#ifndef TCP_OFFLOAD_DISABLE - if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) -#endif - error = tcp_output(tp); - return (error); -} - -static __inline int -tcp_output_send(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_send(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_rcvd(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_rcvd(tp)); #endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_disconnect(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_disconnect(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_reset(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_reset(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline void -tcp_offload_detach(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - tp->t_tu->tu_detach(tp); -#endif -} - -static __inline void -tcp_offload_listen_open(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) - EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); -#endif -} - -static __inline void -tcp_offload_listen_close(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); -#endif -} -#undef SO_OFFLOADABLE -#endif /* _SYS_SOCKETVAR_H_ */ -#undef tp_offload - -void tcp_offload_twstart(struct tcpcb *tp); -struct tcpcb *tcp_offload_close(struct tcpcb *tp); -struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error); - -#endif /* _NETINET_TCP_OFFLOAD_H_ */ diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 8e0f369..1881c54 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -75,6 +75,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include #endif +#ifdef TCP_OFFLOAD +#include +#endif #ifdef IPSEC #include @@ -191,6 +194,11 @@ tcp_output(struct tcpcb *tp) INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return (tcp_offload_output(tp)); +#endif + /* * Determine length of data that should be transmitted, * and flags that will be used. diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 05da82e..9d35e0a 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -85,7 +85,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #ifdef INET6 #include #endif @@ -96,6 +95,9 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include #endif +#ifdef TCP_OFFLOAD +#include +#endif #ifdef IPSEC #include @@ -824,7 +826,7 @@ tcp_drop(struct tcpcb *tp, int errno) if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; - (void) tcp_output_reset(tp); + (void) tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -924,8 +926,12 @@ tcp_discardcb(struct tcpcb *tp) /* free the reassembly queue, if any */ tcp_reass_flush(tp); + +#ifdef TCP_OFFLOAD /* Disconnect offload device, if any. 
*/ - tcp_offload_detach(tp); + if (tp->t_flags & TF_TOE) + tcp_offload_detach(tp); +#endif tcp_free_sackholes(tp); @@ -954,9 +960,10 @@ tcp_close(struct tcpcb *tp) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); - /* Notify any offload devices of listener close */ +#ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) - tcp_offload_listen_close(tp); + tcp_offload_listen_stop(tp); +#endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); @@ -1695,7 +1702,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output_send(tp); + tcp_output(tp); return (inp); } diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 21a72f4..0f7637d 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -81,10 +81,12 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #ifdef INET6 #include #endif +#ifdef TCP_OFFLOAD +#include +#endif #ifdef IPSEC #include @@ -110,10 +112,8 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); -#ifdef TCP_OFFLOAD_DISABLE -#define TOEPCB_ISSET(sc) (0) -#else -#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#ifdef TCP_OFFLOAD +#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); @@ -332,6 +332,14 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_added(tod, sc->sc_todctx); + } +#endif + /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; @@ -356,10 +364,14 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch) TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); -#endif +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif + syncache_free(sc); V_tcp_syncache.cache_count--; } @@ -846,6 +858,18 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) if (sc->sc_rxmits > 1) tp->snd_cwnd = tp->t_maxseg; +#ifdef TCP_OFFLOAD + /* + * Allow a TOE driver to install its hooks. Note that we hold the + * pcbinfo lock too and that prevents tcp_usr_accept from accepting a + * new connection before the TOE driver has done its thing. + */ + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_offload_socket(tod, sc->sc_todctx, so); + } +#endif /* * Copy and activate timers. */ @@ -926,6 +950,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif V_tcp_syncache.cache_count--; SCH_UNLOCK(sch); } @@ -934,7 +965,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). 
*/ - if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { + if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); @@ -945,9 +976,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ - if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || - SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && - !TOEPCB_ISSET(sc)) { + if (SEQ_LEQ(th->th_seq, sc->sc_irs) || + SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); @@ -964,8 +994,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ - if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && - !TOEPCB_ISSET(sc)) { + if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", @@ -993,25 +1022,6 @@ failed: return (0); } -int -tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m) -{ - struct tcpopt to; - int rc; - - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - rc = syncache_expand(inc, &to, th, lsop, m); - INP_INFO_WUNLOCK(&V_tcbinfo); - - return (rc); -} - /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: @@ -1025,10 +1035,10 @@ tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. 
*/ -static void -_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, - struct toe_usrreqs *tu, void *toepcb) +void +syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + void *todctx) { struct tcpcb *tp; struct socket *so; @@ -1114,11 +1124,6 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, - sc->sc_toepcb); -#endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* @@ -1151,7 +1156,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1202,9 +1207,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } -#ifndef TCP_OFFLOAD_DISABLE - sc->sc_tu = tu; - sc->sc_toepcb = toepcb; +#ifdef TCP_OFFLOAD + sc->sc_tod = tod; + sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); @@ -1299,7 +1304,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* * Do a standard 3-way handshake. */ - if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1491,37 +1496,21 @@ syncache_respond(struct syncache *sc) m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); + + return (error); + } +#endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } -void -syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m) -{ - _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); -} - -void -tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct inpcb *inp, struct socket **lsop, - struct toe_usrreqs *tu, void *toepcb) -{ - struct tcpopt to; - - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - - _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb); -} - /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 5783b6d..d18ee07 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -34,8 +34,6 @@ #define _NETINET_TCP_SYNCACHE_H_ #ifdef _KERNEL -struct toeopt; - void syncache_init(void); #ifdef VIMAGE void syncache_destroy(void); @@ -43,14 +41,9 @@ void syncache_destroy(void); void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); 
-int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m); void syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); -void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *, - struct tcphdr *, struct inpcb *, struct socket **, - struct toe_usrreqs *tu, void *toepcb); - + struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); int syncache_pcbcount(void); @@ -75,10 +68,10 @@ struct syncache { u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; u_int16_t sc_flags; -#ifndef TCP_OFFLOAD_DISABLE - struct toe_usrreqs *sc_tu; /* TOE operations */ - void *sc_toepcb; /* TOE protocol block */ -#endif +#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE) + struct toedev *sc_tod; /* entry added by this TOE */ + void *sc_todctx; /* TOE driver context */ +#endif struct label *sc_label; /* MAC label reference */ struct ucred *sc_cred; /* cred cache for jail checks */ diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 9c3c749..b3ddacc 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -602,6 +602,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) struct inpcb *inp = tp->t_inpcb; int cpu = INP_CPU(inp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return; +#endif + switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index a9045f3..b69961e 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -87,7 +87,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include #endif +#ifdef TCP_OFFLOAD #include +#endif /* * TCP protocol interface to socket abstraction. 
@@ -367,7 +369,9 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); - tcp_offload_listen_open(tp); +#ifdef TCP_OFFLOAD + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); @@ -409,6 +413,9 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); +#ifdef TCP_OFFLOAD + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); @@ -459,7 +466,13 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); @@ -519,7 +532,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + error = tcp_output(tp); goto out; } #endif @@ -530,7 +548,13 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); @@ -709,7 +733,7 @@ tcp_usr_shutdown(struct socket *so) socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output_disconnect(tp); + error = tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); @@ -739,7 +763,11 @@ tcp_usr_rcvd(struct socket *so, int flags) } tp = intotcpcb(inp); TCPDEBUG1(); - tcp_output_rcvd(tp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + tcp_offload_rcvd(tp); +#endif + tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); @@ -835,7 +863,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (!(inp->inp_flags & INP_DROPPED)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output_send(tp); + error = tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -884,7 +912,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; - error = tcp_output_send(tp); + error = tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } out: @@ -1119,7 +1147,6 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1192,7 +1219,6 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1323,9 +1349,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; #endif /* TCP_SIGNATURE 
*/ + case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); @@ -1351,6 +1377,13 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= opt; else tp->t_flags &= ~opt; +unlock_and_done: +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_ctloutput(tp, sopt->sopt_dir, + sopt->sopt_name); + } +#endif INP_WUNLOCK(inp); break; @@ -1369,8 +1402,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if (TCPS_HAVEESTABLISHED(tp->t_state)) error = tcp_output(tp); } - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); @@ -1385,8 +1417,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_maxseg = optval; else error = EINVAL; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); @@ -1438,8 +1469,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } } CC_LIST_RUNLOCK(); - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_KEEPIDLE: case TCP_KEEPINTVL: @@ -1491,8 +1521,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) TP_KEEPINIT(tp)); break; } - INP_WUNLOCK(inp); - break; + goto unlock_and_done; default: INP_WUNLOCK(inp); @@ -1635,7 +1664,7 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output_disconnect(tp); + tcp_output(tp); } } @@ -1658,7 +1687,9 @@ tcp_usrclosed(struct tcpcb *tp) switch (tp->t_state) { case TCPS_LISTEN: - tcp_offload_listen_close(tp); +#ifdef TCP_OFFLOAD + tcp_offload_listen_stop(tp); +#endif /* FALLTHROUGH */ case TCPS_CLOSED: tp->t_state = TCPS_CLOSED; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 3b8bdf7..90ecca1 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -194,7 +194,7 @@ struct tcpcb { int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ - struct toe_usrreqs *t_tu; /* offload operations vector */ + struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c new file mode 100644 index 0000000..4b4efb7 --- /dev/null +++ b/sys/netinet/toecore.c @@ -0,0 +1,575 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define TCPSTATES +#include +#include +#include +#include +#include +#include +#include + +static struct mtx toedev_lock; +static TAILQ_HEAD(, toedev) toedev_list; +static eventhandler_tag listen_start_eh; +static eventhandler_tag listen_stop_eh; +static eventhandler_tag lle_event_eh; +static eventhandler_tag route_redirect_eh; + +static int +toedev_connect(struct toedev *tod __unused, struct socket *so __unused, + struct rtentry *rt __unused, struct sockaddr *nam __unused) +{ + + return (ENOTSUP); +} + +static int +toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static int +toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static void +toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused, + struct mbuf *m) +{ + + m_freem(m); + return; +} + +static void +toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return; +} + +static int +toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return (ENOTSUP); +} + +static void +toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused) +{ + + return; +} + +static void +toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused, + struct sockaddr *sa __unused, uint8_t *lladdr __unused, + uint16_t vtag __unused) +{ + + return; +} + +static void +toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused, + struct rtentry *rt0 __unused, struct rtentry *rt1 __unused) +{ + + return; +} + +static void +toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused) +{ + + return; +} + +static void +toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused) +{ + + return; +} + +static int +toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused, + struct mbuf *m) +{ + + m_freem(m); + return (0); +} + +static void +toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused, + struct socket *so __unused) +{ + + return; +} + +static void +toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused, + int sopt_dir __unused, int sopt_name __unused) +{ + + return; +} + +/* + * Inform one or more TOE devices about a listening socket. 
+ */ +static void +toe_listen_start(struct inpcb *inp, void *arg) +{ + struct toedev *t, *tod; + struct tcpcb *tp; + + INP_WLOCK_ASSERT(inp); + KASSERT(inp->inp_pcbinfo == &V_tcbinfo, + ("%s: inp is not a TCP inp", __func__)); + + if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) + return; + + tp = intotcpcb(inp); + if (tp->t_state != TCPS_LISTEN) + return; + + t = arg; + mtx_lock(&toedev_lock); + TAILQ_FOREACH(tod, &toedev_list, link) { + if (t == NULL || t == tod) + tod->tod_listen_start(tod, tp); + } + mtx_unlock(&toedev_lock); +} + +static void +toe_listen_start_event(void *arg __unused, struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_state == TCPS_LISTEN, + ("%s: t_state %s", __func__, tcpstates[tp->t_state])); + + toe_listen_start(inp, NULL); +} + +static void +toe_listen_stop_event(void *arg __unused, struct tcpcb *tp) +{ + struct toedev *tod; +#ifdef INVARIANTS + struct inpcb *inp = tp->t_inpcb; +#endif + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_state == TCPS_LISTEN, + ("%s: t_state %s", __func__, tcpstates[tp->t_state])); + + mtx_lock(&toedev_lock); + TAILQ_FOREACH(tod, &toedev_list, link) + tod->tod_listen_stop(tod, tp); + mtx_unlock(&toedev_lock); +} + +/* + * Fill up a freshly allocated toedev struct with reasonable defaults. + */ +void +init_toedev(struct toedev *tod) +{ + + tod->tod_softc = NULL; + + /* + * Provide no-op defaults so that the kernel can call any toedev + * function without having to check whether the TOE driver supplied one + * or not. + */ + tod->tod_connect = toedev_connect; + tod->tod_listen_start = toedev_listen_start; + tod->tod_listen_stop = toedev_listen_stop; + tod->tod_input = toedev_input; + tod->tod_rcvd = toedev_rcvd; + tod->tod_output = toedev_output; + tod->tod_send_rst = toedev_output; + tod->tod_send_fin = toedev_output; + tod->tod_pcb_detach = toedev_pcb_detach; + tod->tod_l2_update = toedev_l2_update; + tod->tod_route_redirect = toedev_route_redirect; + tod->tod_syncache_added = toedev_syncache_added; + tod->tod_syncache_removed = toedev_syncache_removed; + tod->tod_syncache_respond = toedev_syncache_respond; + tod->tod_offload_socket = toedev_offload_socket; + tod->tod_ctloutput = toedev_ctloutput; +} + +/* + * Register an active TOE device with the system. This allows it to receive + * notifications from the kernel. + */ +int +register_toedev(struct toedev *tod) +{ + struct toedev *t; + + mtx_lock(&toedev_lock); + TAILQ_FOREACH(t, &toedev_list, link) { + if (t == tod) { + mtx_unlock(&toedev_lock); + return (EEXIST); + } + } + + TAILQ_INSERT_TAIL(&toedev_list, tod, link); + registered_toedevs++; + mtx_unlock(&toedev_lock); + + inp_apply_all(toe_listen_start, tod); + + return (0); +} + +/* + * Remove the TOE device from the global list of active TOE devices. It is the + * caller's responsibility to ensure that the TOE device is quiesced prior to + * this call. 
+ */ +int +unregister_toedev(struct toedev *tod) +{ + struct toedev *t, *t2; + int rc = ENODEV; + + mtx_lock(&toedev_lock); + TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) { + if (t == tod) { + TAILQ_REMOVE(&toedev_list, tod, link); + registered_toedevs--; + rc = 0; + break; + } + } + KASSERT(registered_toedevs >= 0, + ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs)); + mtx_unlock(&toedev_lock); + return (rc); +} + +void +toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, void *tod, void *todctx) +{ + struct socket *lso = inp->inp_socket; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx); +} + +int +toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to, + struct tcphdr *th, struct socket **lsop) +{ + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + return (syncache_expand(inc, to, th, lsop, NULL)); +} + +/* + * General purpose check to see if a 4-tuple is in use by the kernel. If a TCP + * header (presumably for an incoming SYN) is also provided, an existing 4-tuple + * in TIME_WAIT may be assassinated freeing it up for re-use. + * + * Note that the TCP header must have been run through tcp_fields_to_host() or + * equivalent. + */ +int +toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp) +{ + struct inpcb *inp; + + if (inc->inc_flags & INC_ISIPV6) + return (ENOSYS); /* XXX: implement */ + + inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport, + inc->inc_laddr, inc->inc_lport, INPLOOKUP_WLOCKPCB, ifp); + if (inp != NULL) { + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* for twcheck */ + if (!tcp_twcheck(inp, NULL, th, NULL, 0)) + return (EADDRINUSE); + } else { + INP_WUNLOCK(inp); + return (EADDRINUSE); + } + } + + return (0); +} + +static void +toe_lle_event(void *arg __unused, struct llentry *lle, int evt) +{ + struct toedev *tod; + struct ifnet *ifp; + struct sockaddr *sa; + uint8_t *lladdr; + uint16_t vtag; + + LLE_WLOCK_ASSERT(lle); + + ifp = lle->lle_tbl->llt_ifp; + sa = L3_ADDR(lle); + + KASSERT(sa->sa_family == AF_INET || sa->sa_family == AF_INET6, + ("%s: lle_event %d for lle %p but sa %p !INET && !INET6", + __func__, evt, lle, sa)); + + /* + * Not interested if the interface's TOE capability is not enabled. + */ + if ((sa->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) || + (sa->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6))) + return; + + tod = TOEDEV(ifp); + if (tod == NULL) + return; + + vtag = 0xfff; + if (evt != LLENTRY_RESOLVED) { + + /* + * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean + * this entry is going to be deleted. + */ + + lladdr = NULL; + } else { + + KASSERT(lle->la_flags & LLE_VALID, + ("%s: %p resolved but not valid?", __func__, lle)); + + lladdr = (uint8_t *)&lle->ll_addr; +#ifdef VLAN_TAG + VLAN_TAG(ifp, &vtag); +#endif + } + + tod->tod_l2_update(tod, ifp, sa, lladdr, vtag); +} + +/* + * XXX: implement. + */ +static void +toe_route_redirect_event(void *arg __unused, struct rtentry *rt0, + struct rtentry *rt1, struct sockaddr *sa) +{ + + return; +} + +/* + * Returns 0 or EWOULDBLOCK on success (any other value is an error). 0 means + * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's + * tod_l2_update will be called later, when the entry is resolved or times out. 
+ */ +int +toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa, + uint8_t *lladdr, uint16_t *vtag) +{ + struct llentry *lle; + int rc; + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + rc = arpresolve(ifp, NULL, NULL, sa, lladdr, &lle); + break; +#endif +#ifdef INET6 + case AF_INET6: + rc = nd6_storelladdr(ifp, NULL, sa, lladdr, &lle); + break; +#endif + default: + return (EPROTONOSUPPORT); + } + + if (rc == 0) { +#ifdef VLAN_TAG + if (VLAN_TAG(ifp, vtag) != 0) +#endif + *vtag = 0xfff; + } + + return (rc); +} + +void +toe_connect_failed(struct toedev *tod, struct tcpcb *tp, int err) +{ + struct inpcb *inp = tp->t_inpcb; + + INP_WLOCK_ASSERT(inp); + KASSERT(tp->t_flags & TF_TOE, + ("%s: tp %p not offloaded.", __func__, tp)); + + if (!(inp->inp_flags & INP_DROPPED)) { + if (err == EAGAIN) { + + /* + * Temporary failure during offload, take this PCB back. + * Detach from the TOE driver and do the rest of what + * TCP's pru_connect would have done if the connection + * wasn't offloaded. + */ + + tod->tod_pcb_detach(tod, tp); + KASSERT(!(tp->t_flags & TF_TOE), + ("%s: tp %p still offloaded.", __func__, tp)); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + (void) tcp_output(tp); + } else { + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp = tcp_drop(tp, err); + if (tp == NULL) + INP_WLOCK(inp); /* re-acquire */ + } + } + INP_WLOCK_ASSERT(inp); +} + +static int +toecore_load(void) +{ + + mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF); + TAILQ_INIT(&toedev_list); + + listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start, + toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY); + listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, + toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY); + lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL, + EVENTHANDLER_PRI_ANY); + route_redirect_eh = EVENTHANDLER_REGISTER(route_redirect_event, + toe_route_redirect_event, NULL, EVENTHANDLER_PRI_ANY); + + return (0); +} + +static int +toecore_unload(void) +{ + + mtx_lock(&toedev_lock); + if (!TAILQ_EMPTY(&toedev_list)) { + mtx_unlock(&toedev_lock); + return (EBUSY); + } + + EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh); + EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh); + EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); + EVENTHANDLER_DEREGISTER(route_redirect_event, route_redirect_eh); + + mtx_unlock(&toedev_lock); + mtx_destroy(&toedev_lock); + + return (0); +} + +static int +toecore_mod_handler(module_t mod, int cmd, void *arg) +{ + + if (cmd == MOD_LOAD) + return (toecore_load()); + + if (cmd == MOD_UNLOAD) + return (toecore_unload()); + + return (EOPNOTSUPP); +} + +static moduledata_t mod_data= { + "toecore", + toecore_mod_handler, + 0 +}; + +MODULE_VERSION(toecore, 1); +DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/netinet/toecore.h b/sys/netinet/toecore.h new file mode 100644 index 0000000..a381825 --- /dev/null +++ b/sys/netinet/toecore.h @@ -0,0 +1,130 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TOE_H_ +#define _NETINET_TOE_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct tcpopt; +struct tcphdr; +struct in_conninfo; + +struct toedev { + TAILQ_ENTRY(toedev) link; /* glue for toedev_list */ + void *tod_softc; /* TOE driver private data */ + + /* + * Active open. If a failure occurs, it is reported back by the driver + * via toe_connect_failed. + */ + int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *, + struct sockaddr *); + + /* Passive open. */ + int (*tod_listen_start)(struct toedev *, struct tcpcb *); + int (*tod_listen_stop)(struct toedev *, struct tcpcb *); + + /* + * The kernel uses this routine to pass on any frame it receives for an + * offloaded connection to the TOE driver. This is an unusual event. + */ + void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *); + + /* + * This is called by the kernel during pru_rcvd for an offloaded TCP + * connection and provides an opportunity for the TOE driver to manage + * its rx window and credits. + */ + void (*tod_rcvd)(struct toedev *, struct tcpcb *); + + /* + * Transmit routine. The kernel calls this to have the TOE driver + * evaluate whether there is data to be transmitted, and transmit it. + */ + int (*tod_output)(struct toedev *, struct tcpcb *); + + /* Immediate teardown: send RST to peer. */ + int (*tod_send_rst)(struct toedev *, struct tcpcb *); + + /* Initiate orderly disconnect by sending FIN to the peer. */ + int (*tod_send_fin)(struct toedev *, struct tcpcb *); + + /* Called to indicate that the kernel is done with this TCP PCB. */ + void (*tod_pcb_detach)(struct toedev *, struct tcpcb *); + + /* + * The kernel calls this once it has information about an L2 entry that + * the TOE driver enquired about previously (via toe_l2_resolve). + */ + void (*tod_l2_update)(struct toedev *, struct ifnet *, + struct sockaddr *, uint8_t *, uint16_t); + + /* XXX. Route has been redirected. */ + void (*tod_route_redirect)(struct toedev *, struct ifnet *, + struct rtentry *, struct rtentry *); + + /* Syncache interaction. 
*/ + void (*tod_syncache_added)(struct toedev *, void *); + void (*tod_syncache_removed)(struct toedev *, void *); + int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *); + void (*tod_offload_socket)(struct toedev *, void *, struct socket *); + + /* TCP socket option */ + void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int); +}; + +#include +typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); +typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); +EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); +EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); + +void init_toedev(struct toedev *); +int register_toedev(struct toedev *); +int unregister_toedev(struct toedev *); + +/* + * General interface for looking up L2 information for an IP address. If an + * answer is not available right away then the TOE driver's tod_l2_update will + * be called later. + */ +int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *, + uint8_t *, uint16_t *); + +void toe_connect_failed(struct toedev *, struct tcpcb *, int); + +void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct inpcb *, void *, void *); +int toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct socket **); + +int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *); +#endif diff --git a/sys/netinet/toedev.h b/sys/netinet/toedev.h deleted file mode 100644 index 7edaca1..0000000 --- a/sys/netinet/toedev.h +++ /dev/null @@ -1,162 +0,0 @@ -/*- - * Copyright (c) 2007, Chelsio Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _NETINET_TOEDEV_H_ -#define _NETINET_TOEDEV_H_ - -#ifndef _KERNEL -#error "no user-serviceable parts inside" -#endif - -extern uint32_t toedev_registration_count; - -/* Parameter values for offload_get_phys_egress(). */ -enum { - TOE_OPEN, - TOE_FAILOVER, -}; - -/* Parameter values for toe_failover(). */ -enum { - TOE_ACTIVE_SLAVE, - TOE_LINK_DOWN, - TOE_LINK_UP, - TOE_RELEASE, - TOE_RELEASE_ALL, -}; - -#define TOENAMSIZ 16 - -/* Get the toedev associated with a ifnet. 
*/ -#define TOEDEV(ifp) ((ifp)->if_llsoftc) - -struct offload_id { - unsigned int id; - unsigned long data; -}; - -struct ifnet; -struct rt_entry; -struct tom_info; -struct sysctl_oid; -struct socket; -struct mbuf; - -struct toedev { - TAILQ_ENTRY(toedev) entry; - char tod_name[TOENAMSIZ]; /* TOE device name */ - unsigned int tod_ttid; /* TOE type id */ - unsigned long tod_flags; /* device flags */ - unsigned int tod_mtu; /* max TX offloaded data */ - unsigned int tod_nconn; /* max # of offloaded - * connections - */ - struct ifnet *tod_lldev; /* first interface */ - const struct tom_info *tod_offload_mod; /* TCP offload module */ - - /* - * This TOE device is capable of offloading the connection for socket so - */ - int (*tod_can_offload)(struct toedev *dev, struct socket *so); - - /* - * Establish a connection to nam using the TOE device dev - */ - int (*tod_connect)(struct toedev *dev, struct socket *so, - struct rtentry *rt, struct sockaddr *nam); - /* - * Send an mbuf down to the toe device - */ - int (*tod_send)(struct toedev *dev, struct mbuf *m); - /* - * Receive an array of mbufs from the TOE device dev - */ - int (*tod_recv)(struct toedev *dev, struct mbuf **m, int n); - /* - * Device specific ioctl interface - */ - int (*tod_ctl)(struct toedev *dev, unsigned int req, void *data); - /* - * Update L2 entry in toedev - */ - void (*tod_arp_update)(struct toedev *dev, struct rtentry *neigh); - /* - * Failover from one toe device to another - */ - void (*tod_failover)(struct toedev *dev, struct ifnet *bond_ifp, - struct ifnet *ndev, int event); - void *tod_priv; /* driver private data */ - void *tod_l2opt; /* optional layer 2 data */ - void *tod_l3opt; /* optional layer 3 data */ - void *tod_l4opt; /* optional layer 4 data */ - void *tod_ulp; /* upper lever protocol */ -}; - -struct tom_info { - TAILQ_ENTRY(tom_info) entry; - int (*ti_attach)(struct toedev *dev, - const struct offload_id *entry); - int (*ti_detach)(struct toedev *dev); - const char *ti_name; - const struct offload_id *ti_id_table; -}; - -static __inline void -init_offload_dev(struct toedev *dev) -{ -} - -int register_tom(struct tom_info *t); -int unregister_tom(struct tom_info *t); -int register_toedev(struct toedev *dev, const char *name); -int unregister_toedev(struct toedev *dev); -int activate_offload(struct toedev *dev); -int toe_send(struct toedev *dev, struct mbuf *m); -void toe_arp_update(struct rtentry *rt); -struct ifnet *offload_get_phys_egress(struct ifnet *ifp, - struct socket *so, int context); -int toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n); - -static __inline void -toe_neigh_update(struct ifnet *ifp) -{ -} - -static __inline void -toe_failover(struct ifnet *bond_ifp, struct ifnet *fail_ifp, int event) -{ -} - -static __inline int -toe_enslave(struct ifnet *bond_ifp, struct ifnet *slave_ifp) -{ - return (0); -} - -#endif /* _NETINET_TOEDEV_H_ */ diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c index 750d2d5..9867f10 100644 --- a/sys/ofed/drivers/infiniband/core/cma.c +++ b/sys/ofed/drivers/infiniband/core/cma.c @@ -59,10 +59,10 @@ static int tavor_quirk = 0; module_param_named(tavor_quirk, tavor_quirk, int, 0644); MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0"); -int unify_tcp_port_space = 0; +int unify_tcp_port_space = 1; module_param(unify_tcp_port_space, int, 0644); MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " - "space allocation (default=0)"); + "space allocation 
(default=1)"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 @@ -1478,6 +1478,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) struct sockaddr_in *sin; id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, + id_priv->sock, iw_conn_req_handler, id_priv); if (IS_ERR(id_priv->cm_id.iw)) @@ -2055,7 +2056,16 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; } } - return rdma_bind_addr(id, src_addr); + if (!cma_any_addr(src_addr)) + return rdma_bind_addr(id, src_addr); + else { + struct sockaddr_in addr_in; + + memset(&addr_in, 0, sizeof addr_in); + addr_in.sin_family = dst_addr->sa_family; + addr_in.sin_len = sizeof addr_in; + return rdma_bind_addr(id, (struct sockaddr *) &addr_in); + } } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2247,6 +2257,7 @@ static int cma_get_tcp_port(struct rdma_id_private *id_priv) sock_release(sock); return ret; } + size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, @@ -2255,6 +2266,7 @@ static int cma_get_tcp_port(struct rdma_id_private *id_priv) sock_release(sock); return ret; } + id_priv->sock = sock; return 0; } @@ -2604,7 +2616,8 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, int ret; struct iw_cm_conn_param iw_param; - cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); + cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, + cma_iw_handler, id_priv); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto out; diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c index 625fec5..b13e53a 100644 --- a/sys/ofed/drivers/infiniband/core/iwcm.c +++ b/sys/ofed/drivers/infiniband/core/iwcm.c @@ -189,6 +189,7 @@ static void rem_ref(struct iw_cm_id *cm_id) static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + struct socket *so, iw_cm_handler cm_handler, void *context) { @@ -205,6 +206,7 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; + cm_id_priv->id.so = so; spin_lock_init(&cm_id_priv->lock); atomic_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); @@ -629,6 +631,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, spin_unlock_irqrestore(&listen_id_priv->lock, flags); cm_id = iw_create_cm_id(listen_id_priv->id.device, + iw_event->so, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ diff --git a/sys/ofed/include/linux/net.h b/sys/ofed/include/linux/net.h index 6e2aff3..f47acf9 100644 --- a/sys/ofed/include/linux/net.h +++ b/sys/ofed/include/linux/net.h @@ -48,12 +48,12 @@ sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len, int error; nam = NULL; - if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) - return (-ENOTCONN); + if (peer) { + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (-ENOTCONN); - if (peer) error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, nam); - else + } else error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, nam); if (error) return (-error); diff --git a/sys/ofed/include/net/netevent.h b/sys/ofed/include/net/netevent.h index 
index db5b50e..c7bbc5f 100644
--- a/sys/ofed/include/net/netevent.h
+++ b/sys/ofed/include/net/netevent.h
@@ -42,7 +42,7 @@ enum netevent_notif_type {
 struct llentry;
 
 static inline void
-_handle_arp_update_event(void *arg, struct llentry *lle)
+_handle_arp_update_event(void *arg, struct llentry *lle, int evt __unused)
 {
 	struct notifier_block *nb;
 
@@ -54,7 +54,7 @@ static inline int
 register_netevent_notifier(struct notifier_block *nb)
 {
 	nb->tags[NETEVENT_NEIGH_UPDATE] = EVENTHANDLER_REGISTER(
-	    arp_update_event, _handle_arp_update_event, nb, 0);
+	    lle_event, _handle_arp_update_event, nb, 0);
 	return (0);
 }
 
@@ -62,8 +62,7 @@ static inline int
 unregister_netevent_notifier(struct notifier_block *nb)
 {
-	EVENTHANDLER_DEREGISTER(arp_update_event,
-	    nb->tags[NETEVENT_NEIGH_UPDATE]);
+	EVENTHANDLER_DEREGISTER(lle_event, nb->tags[NETEVENT_NEIGH_UPDATE]);
 	return (0);
 }
 
diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h
index cbb822e..412320e 100644
--- a/sys/ofed/include/rdma/iw_cm.h
+++ b/sys/ofed/include/rdma/iw_cm.h
@@ -63,6 +63,7 @@ struct iw_cm_event {
 	void *private_data;
 	u8 private_data_len;
 	void *provider_data;
+	struct socket *so;
 };
 
 /**
@@ -98,6 +99,7 @@ struct iw_cm_id {
 	/* Used by provider to add and remove refs on IW cm_id */
 	void (*add_ref)(struct iw_cm_id *);
 	void (*rem_ref)(struct iw_cm_id *);
+	struct socket *so;
 };
 
 struct iw_cm_conn_param {
@@ -139,7 +141,7 @@ struct iw_cm_verbs {
 * returned IW CM identifier.
 * @context: User specified context associated with the id.
 */
-struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so,
 				 iw_cm_handler cm_handler,
 				 void *context);
 
diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c
index 50d2bdd..9d8c97e 100644
--- a/usr.bin/netstat/inet.c
+++ b/usr.bin/netstat/inet.c
@@ -461,7 +461,10 @@ protopr(u_long off, const char *name, int af1, int proto)
 #endif
 		vchar = ((inp->inp_vflag & INP_IPV4) != 0) ?
 		    "4 " : "  ";
-		printf("%-3.3s%-2.2s ", name, vchar);
+		if (istcp && (tp->t_flags & TF_TOE) != 0)
+			printf("%-3.3s%-2.2s ", "toe", vchar);
+		else
+			printf("%-3.3s%-2.2s ", name, vchar);
 		if (Lflag) {
 			char buf1[15];
diff --git a/usr.bin/sockstat/sockstat.c b/usr.bin/sockstat/sockstat.c
index 334fb93..f60d9fb 100644
--- a/usr.bin/sockstat/sockstat.c
+++ b/usr.bin/sockstat/sockstat.c
@@ -325,6 +325,7 @@ gather_inet(int proto)
 		}
 		inp = &xtp->xt_inp;
 		so = &xtp->xt_socket;
+		protoname = xtp->xt_tp.t_flags & TF_TOE ? "toe" : "tcp";
 		break;
 	case IPPROTO_UDP:
 	case IPPROTO_DIVERT:
-- 
cgit v1.1