diff options
46 files changed, 11279 insertions, 114 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 9edface..66b36de 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -38,6 +38,7 @@ source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ipath/Kconfig" source "drivers/infiniband/hw/ehca/Kconfig" source "drivers/infiniband/hw/amso1100/Kconfig" +source "drivers/infiniband/hw/cxgb3/Kconfig" source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index 2b5d109..da2066c 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index d2bb5a9..a91001c 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -373,7 +373,7 @@ static struct notifier_block nb = { static int addr_init(void) { - addr_wq = create_singlethread_workqueue("ib_addr_wq"); + addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9e0ab04..db88e60 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -71,6 +71,7 @@ static struct workqueue_struct *cma_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); +static int next_port; struct cma_device { struct list_head list; @@ -1722,33 +1723,74 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; - int port, start, ret; + int port, ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; - start = snum ? snum : sysctl_local_port_range[0]; + do { + ret = idr_get_new_above(ps, bind_list, snum, &port); + } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); + + if (ret) + goto err1; + + if (port != snum) { + ret = -EADDRNOTAVAIL; + goto err2; + } + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); + return 0; +err2: + idr_remove(ps, port); +err1: + kfree(bind_list); + return ret; +} +static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + int port, ret; + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + +retry: do { - ret = idr_get_new_above(ps, bind_list, start, &port); + ret = idr_get_new_above(ps, bind_list, next_port, &port); } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) - goto err; + goto err1; - if ((snum && port != snum) || - (!snum && port > sysctl_local_port_range[1])) { - idr_remove(ps, port); + if (port > sysctl_local_port_range[1]) { + if (next_port != sysctl_local_port_range[0]) { + idr_remove(ps, port); + next_port = sysctl_local_port_range[0]; + goto retry; + } ret = -EADDRNOTAVAIL; - goto err; + goto err2; } + if (port == sysctl_local_port_range[1]) + next_port = sysctl_local_port_range[0]; + else + next_port = port + 1; + bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; -err: +err2: + idr_remove(ps, port); +err1: kfree(bind_list); return ret; } @@ -1811,7 +1853,7 @@ static int cma_get_port(struct rdma_id_private *id_priv) mutex_lock(&lock); if (cma_any_port(&id_priv->id.route.addr.src_addr)) - ret = cma_alloc_port(ps, id_priv, 0); + ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); @@ -2448,7 +2490,11 @@ static int cma_init(void) { int ret; - cma_wq = create_singlethread_workqueue("rdma_cm_wq"); + get_random_bytes(&next_port, sizeof next_port); + next_port = (next_port % (sysctl_local_port_range[1] - + sysctl_local_port_range[0])) + + sysctl_local_port_range[0]; + cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 63d2a39..7fabb42 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -36,6 +36,7 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/mutex.h> @@ -93,7 +94,7 @@ static int ib_device_check_mandatory(struct ib_device *device) }; int i; - for (i = 0; i < sizeof mandatory_table / sizeof mandatory_table[0]; ++i) { + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig new file mode 100644 index 0000000..77977f5 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Kconfig @@ -0,0 +1,27 @@ +config INFINIBAND_CXGB3 + tristate "Chelsio RDMA Driver" + depends on CHELSIO_T3 && INFINIBAND && INET + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T3 1GbE and + 10GbE adapters. + + For general information about Chelsio and our products, visit + our website at <http://www.chelsio.com>. + + For customer support, please visit our customer support page at + <http://www.chelsio.com/support.htm>. + + Please send feedback to <linux-bugs@chelsio.com>. + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb3. + +config INFINIBAND_CXGB3_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_CXGB3 + default n + ---help--- + This option causes the Chelsio RDMA driver to produce copious + amounts of debug messages. Select this if you are developing + the driver or trying to diagnose a problem. diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile new file mode 100644 index 0000000..0e110f3 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Makefile @@ -0,0 +1,12 @@ +EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3 \ + -I$(TOPDIR)/drivers/infiniband/hw/cxgb3/core + +obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o + +iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \ + iwch_provider.o iwch.o cxio_hal.o cxio_resource.o + +ifdef CONFIG_INFINIBAND_CXGB3_DEBUG +EXTRA_CFLAGS += -DDEBUG +iw_cxgb3-y += cxio_dbg.o +endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c new file mode 100644 index 0000000..5a7306f --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_dbg.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef DEBUG +#include <linux/types.h> +#include "common.h" +#include "cxgb3_ioctl.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + PDBG("%s TPT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + PDBG("%s PBL addr 0x%x len %d depth %d\n", + __FUNCTION__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + __be64 *data = (__be64 *)wqe; + uint size = (uint)(be64_to_cpu(*data) & 0xff); + + if (size == 0) + size = 8; + while (size > 0) { + PDBG("WQE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + __be64 *data = (__be64 *)wce; + int size = sizeof(*wce); + + while (size > 0) { + PDBG("WCE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + PDBG("%s RQT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + u32 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + PDBG("%s TCB %d len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u32 *)m->buf; + while (size > 0) { + printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + kfree(m); +} +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c new file mode 100644 index 0000000..82fa720 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <asm/delay.h> + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/pci.h> + +#include "cxio_resource.h" +#include "cxio_hal.h" +#include "cxgb3_offload.h" +#include "sge_defs.h" + +static LIST_HEAD(rdev_list); +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static inline struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +static inline struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev + *tdev) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return ret; + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. + */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. + */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + udelay(1); + if (i++ > 1000000) { + BUG_ON(1); + printk(KERN_ERR "%s: stalled rnic\n", + rdev_p->dev_name); + return -EIO; + } + } + } + return 0; +} + +static inline int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __FUNCTION__); + return -ENOMEM; + } + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 1, qpid, 7); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + skb->priority = CPL_PRIORITY_CONTROL; + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return -ENOMEM; + cq->sw_queue = kzalloc(size, GFP_KERNEL); + if (!cq->sw_queue) + return -ENOMEM; + cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) * + sizeof(struct t3_cqe), + &(cq->dma_addr), GFP_KERNEL); + if (!cq->queue) { + kfree(cq->sw_queue); + return -ENOMEM; + } + pci_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type == T3B) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + u32 qpid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct cxio_qpid_list, + entry); + list_del(&entry->entry); + qpid = entry->qpid; + kfree(entry); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + break; + entry->qpid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + return qpid; +} + +static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + entry->qpid = qpid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct cxio_qpid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct cxio_qpid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, entry->qpid); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + mutex_init(&uctx->lock); +} + +int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return -ENOMEM; + + wq->rq = kzalloc(depth * sizeof(u64), GFP_KERNEL); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL); + if (!wq->sq) + goto err3; + + wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + depth * sizeof(union t3_wr), + &(wq->dma_addr), GFP_KERNEL); + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); + pci_unmap_addr_set(wq, mapping, wq->dma_addr); + wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __FUNCTION__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + kfree(wq->sq); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + kfree(wq->rq); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return -ENOMEM; +} + +int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + kfree(cq->sw_queue); + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + pci_unmap_addr(cq, mapping)); + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + pci_unmap_addr(wq, mapping)); + kfree(wq->sq); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + kfree(wq->rq); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + + PDBG("%s wq %p cq %p\n", __FUNCTION__, wq, cq); + + /* flush RQ */ + PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __FUNCTION__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) + insert_recv_cqe(wq, cq); +} + +static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp += count; + while (ptr != wq->sq_wptr) { + insert_sq_cqe(wq, cq, sqp); + sqp++; + ptr++; + } +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + */ +void cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + PDBG("%s cq %p cqid 0x%x\n", __FUNCTION__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n", + __FUNCTION__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static inline int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count); +} + +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __FUNCTION__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count); +} + +static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + + + if (!skb) { + PDBG("%s alloc_skb failed\n", __FUNCTION__); + return -ENOMEM; + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + PDBG("%s err %d initializing ctrl_cq\n", __FUNCTION__, err); + return err; + } + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + &(rdev_p->rnic_info.pdev->dev), + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + GFP_KERNEL); + if (!rdev_p->ctrl_qp.workq) { + PDBG("%s dma_alloc_coherent failed\n", __FUNCTION__); + return -ENOMEM; + } + pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); + rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mutex_init(&rdev_p->ctrl_qp.lock); + init_waitqueue_head(&rdev_p->ctrl_qp.waitq); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 1, + T3_CTL_QP_TID, 7); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + wqe->ctx1 = cpu_to_be64(ctx1); + wqe->ctx0 = cpu_to_be64(ctx0); + PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + skb->priority = CPL_PRIORITY_CONTROL; + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + pci_unmap_addr(&rdev_p->ctrl_qp, mapping)); + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller aquires the ctrl_qp lock before the call + */ +static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data, int completion) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* lenght in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */ + PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n", + __FUNCTION__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d\n", __FUNCTION__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + PDBG("%s ctrl_qp workq interrupted\n", + __FUNCTION__); + return -ERESTARTSYS; + } + PDBG("%s ctrl_qp wakeup, continue posting work request " + "i %d\n", __FUNCTION__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = completion ? T3_COMPLETION_FLAG : 0; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + PDBG("%s force completion at i %d\n", __FUNCTION__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = cpu_to_be64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size + * OUT: stag index, actual pbl_size, pbl_addr allocated. + * TBD: shared memory region support + */ +static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, + u32 *pbl_size, u32 *pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + int rereg = (*stag != T3_STAG_UNSET); + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return -ENOMEM; + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __FUNCTION__, stag_state, type, pdid, stag_idx); + + if (reset_tpt_entry) + cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); + else if (!rereg) { + *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); + if (!*pbl_addr) { + return -ENOMEM; + } + } + + mutex_lock(&rdev_p->ctrl_qp.lock); + + /* write PBL first if any - update pbl only if pbl list exist */ + if (pbl) { + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base, + *pbl_size); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + (*pbl_addr >> 5), + (*pbl_size << 3), pbl, 0); + if (err) + goto ret; + } + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + BUG_ON(page_size >= 28); + tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) | + F_TPT_MW_BIND_ENABLE | + V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : + cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + tpt.len = cpu_to_be32(len); + tpt.va_hi = cpu_to_be32((u32) (to >> 32)); + tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : + cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt, 1); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); +ret: + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + return err; +} + +/* IN : stag key, pdid, pbl_size + * Out: stag index, actaul pbl_size, and pbl_addr allocated. + */ +int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr) +{ + *stag = T3_STAG_UNSET; + return (__cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR, + perm, 0, 0ULL, 0, 0, NULL, pbl_size, pbl_addr)); +} + +int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + &pbl_size, &pbl_addr); +} + +int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + u32 pbl_size = 0; + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + NULL, &pbl_size, NULL); +} + +int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + NULL, NULL); +} + +int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + PDBG("%s rdev_p %p\n", __FUNCTION__, rdev_p); + wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe)); + wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = cpu_to_be32(attr->qpid); + wqe->pdid = cpu_to_be32(attr->pdid); + wqe->scqid = cpu_to_be32(attr->scqid); + wqe->rcqid = cpu_to_be32(attr->rcqid); + wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = cpu_to_be32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); + wqe->flags = cpu_to_be32(attr->flags); + wqe->ord = cpu_to_be32(attr->ord); + wqe->ird = cpu_to_be32(attr->ird); + wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); + wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); + wqe->rsvd = 0; + skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x" + " se %0x notify %0x cqbranch %0x creditth %0x\n", + cnt, __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg), + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d " + "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + PDBG("%s called by t3cdev %p with null ulp\n", __FUNCTION__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wake_up_interruptible(&rdev_p->ctrl_qp.waitq); + dev_kfree_skb_irq(skb); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + dev_kfree_skb_irq(skb); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, skb); + else + dev_kfree_skb_irq(skb); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct net_device *netdev_p = NULL; + int err = 0; + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return -EBUSY; + } + netdev_p = dev_get_by_name(rdev_p->dev_name); + if (!netdev_p) { + return -EINVAL; + } + dev_put(netdev_p); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) { + return -EBUSY; + } + netdev_p = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + PDBG("%s t3cdev_p or dev_name must be set\n", __FUNCTION__); + return -EINVAL; + } + + list_add_tail(&rdev_p->entry, &rdev_list); + + PDBG("%s opening rnic dev %s\n", __FUNCTION__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = T3CDEV(netdev_p); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. + */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d " + "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n", + __FUNCTION__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p), + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x\n", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing ctrl_qp.\n", + __FUNCTION__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + printk(KERN_ERR "%s error %d initializing hal resources.\n", + __FUNCTION__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing pbl mem pool.\n", + __FUNCTION__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing rqt mem pool.\n", + __FUNCTION__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + list_del(&rdev_p->entry); + return err; +} + +void cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + list_del(&rdev_p->entry); + rdev_p->t3cdev_p->ulp = NULL; + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + } +} + +int __init cxio_hal_init(void) +{ + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return -ENOMEM; + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void __exit cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + list_for_each_entry_safe(rdev, tmp, &rdev_list, entry) + cxio_rdev_close(rdev); + cxio_hal_destroy_rhdl_resource(); +} + +static inline void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n", + __FUNCTION__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static inline void create_read_req_cqe(struct t3_wq *wq, + struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static inline void advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __FUNCTION__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe), + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + PDBG("%s out of order completion going in swsq at idx %ld\n", + __FUNCTION__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + PDBG("%s completing sq idx %ld\n", __FUNCTION__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = (wq->sq + + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id; + wq->sq_rptr++; + } else { + PDBG("%s completing rq idx %ld\n", __FUNCTION__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n", + __FUNCTION__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n", + __FUNCTION__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h new file mode 100644 index 0000000..1b97e80 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ + +#include <linux/list.h> +#include <linux/mutex.h> + +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxio_wr.h" + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +/* TBD */ +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_NUM_STAG (1<<15) + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mutex lock; /* for the wtpr, can sleep */ + wait_queue_head_t waitq;/* wait for RspQ/CQE msg */ + union t3_wr *workq; /* the work request queue */ + dma_addr_t dma_addr; /* pci bus address of the workq */ + DECLARE_PCI_UNMAP_ADDR(mapping) + void __iomem *doorbell; +}; + +struct cxio_hal_resource { + struct kfifo *tpt_fifo; + spinlock_t tpt_fifo_lock; + struct kfifo *qpid_fifo; + spinlock_t qpid_fifo_lock; + struct kfifo *cqid_fifo; + spinlock_t cqid_fifo_lock; + struct kfifo *pdid_fifo; + spinlock_t pdid_fifo_lock; +}; + +struct cxio_qpid_list { + struct list_head entry; + u32 qpid; +}; + +struct cxio_ucontext { + struct list_head qpids; + struct mutex lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct list_head entry; +}; + +static inline int cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct sk_buff * skb); + +#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit); +int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev, u32 qpid); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_allocate_stag(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_rhdl(void); +void cxio_hal_put_rhdl(u32 rhdl); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int __init cxio_hal_init(void); +void __exit cxio_hal_exit(void); +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); + +#define MOD "iw_cxgb3: " +#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args) + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c new file mode 100644 index 0000000..997aa32 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include <linux/kernel.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/kfifo.h> +#include <linux/spinlock.h> +#include <linux/errno.h> +#include "cxio_resource.h" +#include "cxio_hal.h" + +static struct kfifo *rhdl_fifo; +static spinlock_t rhdl_fifo_lock; + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct kfifo **fifo, + spinlock_t *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int random) +{ + u32 i, j, entry = 0, idx; + u32 random_bytes; + u32 rarray[16]; + spin_lock_init(fifo_lock); + + *fifo = kfifo_alloc(nr * sizeof(u32), GFP_KERNEL, fifo_lock); + if (IS_ERR(*fifo)) + return -ENOMEM; + + for (i = 0; i < skip_low + skip_high; i++) + __kfifo_put(*fifo, (unsigned char *) &entry, sizeof(u32)); + if (random) { + j = 0; + random_bytes = random32(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = random32(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + __kfifo_put(*fifo, + (unsigned char *) &rarray[idx], + sizeof(u32)); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + __kfifo_put(*fifo, + (unsigned char *) &rarray[i], + sizeof(u32)); + } else + for (i = skip_low; i < nr - skip_high; i++) + __kfifo_put(*fifo, (unsigned char *) &i, sizeof(u32)); + + for (i = 0; i < skip_low + skip_high; i++) + kfifo_get(*fifo, (unsigned char *) &entry, sizeof(u32)); + return 0; +} + +static int cxio_init_resource_fifo(struct kfifo **fifo, spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct kfifo **fifo, + spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); + + rdev_p->rscp->qpid_fifo = kfifo_alloc(T3_MAX_NUM_QP * sizeof(u32), + GFP_KERNEL, + &rdev_p->rscp->qpid_fifo_lock); + if (IS_ERR(rdev_p->rscp->qpid_fifo)) + return -ENOMEM; + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + __kfifo_put(rdev_p->rscp->qpid_fifo, + (unsigned char *) &i, sizeof(u32)); + return 0; +} + +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + kfifo_free(rhdl_fifo); +} + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = kmalloc(sizeof(*rscp), GFP_KERNEL); + if (!rscp) + return -ENOMEM; + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + kfifo_free(rscp->cqid_fifo); +cqid_err: + kfifo_free(rscp->qpid_fifo); +qpid_err: + kfifo_free(rscp->tpt_fifo); +tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +static inline u32 cxio_hal_get_resource(struct kfifo *fifo) +{ + u32 entry; + if (kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32))) + return entry; + else + return 0; /* fifo emptry */ +} + +static inline void cxio_hal_put_resource(struct kfifo *fifo, u32 entry) +{ + BUG_ON(kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)) == 0); +} + +u32 cxio_hal_get_rhdl(void) +{ + return cxio_hal_get_resource(rhdl_fifo); +} + +void cxio_hal_put_rhdl(u32 rhdl) +{ + cxio_hal_put_resource(rhdl_fifo, rhdl); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->tpt_fifo); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(rscp->tpt_fifo, stag); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo); + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + cxio_hal_put_resource(rscp->qpid_fifo, qpid); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->cqid_fifo); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(rscp->cqid_fifo, cqid); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->pdid_fifo); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) +{ + cxio_hal_put_resource(rscp->pdid_fifo, pdid); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + kfifo_free(rscp->tpt_fifo); + kfifo_free(rscp->cqid_fifo); + kfifo_free(rscp->qpid_fifo); + kfifo_free(rscp->pdid_fifo); + kfree(rscp); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ +#define PBL_CHUNK 2*1024*1024 + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (rdev_p->pbl_pool) + for (i = rdev_p->rnic_info.pbl_base; + i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; + i += PBL_CHUNK) + gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); + return rdev_p->pbl_pool ? 0 : -ENOMEM; +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (rdev_p->rqt_pool) + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + return rdev_p->rqt_pool ? 0 : -ENOMEM; +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h new file mode 100644 index 0000000..a6bbe83 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +#include <linux/kernel.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/kfifo.h> +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/genalloc.h> +#include "cxio_hal.h" + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h new file mode 100644 index 0000000..103fc42 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ + +#include <asm/io.h> +#include <linux/pci.h> +#include <linux/timer.h> +#include "firmware_exports.h" + +#define T3_MAX_SGE 4 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr))) +#define Q_COUNT(rptr,wptr) ((wptr)-(rptr)) +#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1)) + +static inline void ring_doorbell(void __iomem *doorbell, u32 qpid) +{ + writel(((1<<31) | qpid), doorbell); +} + +#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 )) + +enum t3_wr_flags { + T3_COMPLETION_FLAG = 0x01, + T3_NOTIFY_FLAG = 0x02, + T3_SOLICITED_EVENT_FLAG = 0x04, + T3_READ_FENCE_FLAG = 0x08, + T3_LOCAL_FENCE_FLAG = 0x10 +} __attribute__ ((packed)); + +enum t3_wr_opcode { + T3_WR_BP = FW_WROPCODE_RI_BYPASS, + T3_WR_SEND = FW_WROPCODE_RI_SEND, + T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE, + T3_WR_READ = FW_WROPCODE_RI_RDMA_READ, + T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV, + T3_WR_BIND = FW_WROPCODE_RI_BIND_MW, + T3_WR_RCV = FW_WROPCODE_RI_RECEIVE, + T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT, + T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP +} __attribute__ ((packed)); + +enum t3_rdma_opcode { + T3_RDMA_WRITE, /* IETF RDMAP v1.0 ... */ + T3_READ_REQ, + T3_READ_RESP, + T3_SEND, + T3_SEND_WITH_INV, + T3_SEND_WITH_SE, + T3_SEND_WITH_SE_INV, + T3_TERMINATE, + T3_RDMA_INIT, /* CHELSIO RI specific ... */ + T3_BIND_MW, + T3_FAST_REGISTER, + T3_LOCAL_INV, + T3_QP_MOD, + T3_BYPASS +} __attribute__ ((packed)); + +static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop) +{ + switch (wrop) { + case T3_WR_BP: return T3_BYPASS; + case T3_WR_SEND: return T3_SEND; + case T3_WR_WRITE: return T3_RDMA_WRITE; + case T3_WR_READ: return T3_READ_REQ; + case T3_WR_INV_STAG: return T3_LOCAL_INV; + case T3_WR_BIND: return T3_BIND_MW; + case T3_WR_INIT: return T3_RDMA_INIT; + case T3_WR_QP_MOD: return T3_QP_MOD; + default: break; + } + return -1; +} + + +/* Work request id */ +union t3_wrid { + struct { + u32 hi; + u32 low; + } id0; + u64 id1; +}; + +#define WRID(wrid) (wrid.id1) +#define WRID_GEN(wrid) (wrid.id0.wr_gen) +#define WRID_IDX(wrid) (wrid.id0.wr_idx) +#define WRID_LO(wrid) (wrid.id0.wr_lo) + +struct fw_riwrh { + __be32 op_seop_flags; + __be32 gen_tid_len; +}; + +#define S_FW_RIWR_OP 24 +#define M_FW_RIWR_OP 0xff +#define V_FW_RIWR_OP(x) ((x) << S_FW_RIWR_OP) +#define G_FW_RIWR_OP(x) ((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + +struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved3; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +enum t3_addr_type { + T3_VA_BASED_TO = 0x0, + T3_ZERO_BASED_TO = 0x1 +} __attribute__ ((packed)); + +enum t3_mem_perms { + T3_MEM_ACCESS_LOCAL_READ = 0x1, + T3_MEM_ACCESS_LOCAL_WRITE = 0x2, + T3_MEM_ACCESS_REM_READ = 0x4, + T3_MEM_ACCESS_REM_WRITE = 0x8 +} __attribute__ ((packed)); + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + u32 flags; +}; + +struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be32 flags; /* bits 31-1 - reservered */ + /* bit 0 - set if RECV posted */ + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + u32 rsvd; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +enum rdma_init_wr_flags { + RECVS_POSTED = 1, +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + u64 flit[16]; +}; + +#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags)); +} + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len) +{ + wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... */ + ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<<S_CQE_OOO) + +#define S_CQE_QPID 12 +#define M_CQE_QPID 0x7FFFF +#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<<S_CQE_QPID) + +#define S_CQE_SWCQE 11 +#define M_CQE_SWCQE 0x1 +#define G_CQE_SWCQE(x) ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<<S_CQE_SWCQE) + +#define S_CQE_GENBIT 10 +#define M_CQE_GENBIT 0x1 +#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT) + +#define S_CQE_STATUS 5 +#define M_CQE_STATUS 0x1F +#define G_CQE_STATUS(x) ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<<S_CQE_STATUS) + +#define S_CQE_TYPE 4 +#define M_CQE_TYPE 0x1 +#define G_CQE_TYPE(x) ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<<S_CQE_TYPE) + +#define S_CQE_OPCODE 0 +#define M_CQE_OPCODE 0xF +#define G_CQE_OPCODE(x) ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<<S_CQE_OPCODE) + +#define SW_CQE(x) (G_CQE_SWCQE(be32_to_cpu((x).header))) +#define CQE_OOO(x) (G_CQE_OOO(be32_to_cpu((x).header))) +#define CQE_QPID(x) (G_CQE_QPID(be32_to_cpu((x).header))) +#define CQE_GENBIT(x) (G_CQE_GENBIT(be32_to_cpu((x).header))) +#define CQE_TYPE(x) (G_CQE_TYPE(be32_to_cpu((x).header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (G_CQE_STATUS(be32_to_cpu((x).header))) +#define CQE_OPCODE(x) (G_CQE_OPCODE(be32_to_cpu((x).header))) + +#define CQE_LEN(x) (be32_to_cpu((x).len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32_to_cpu((x).u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32_to_cpu((x).u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_WPTR(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_WPTR(x) ((x).u.scqe.wrid_low) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_LOW(x) ((x).u.scqe.wrid_low) + +#define TPT_ERR_SUCCESS 0x0 +#define TPT_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define TPT_ERR_PDID 0x2 /* PDID mismatch */ +#define TPT_ERR_QPID 0x3 /* QPID mismatch */ +#define TPT_ERR_ACCESS 0x4 /* Invalid access right */ +#define TPT_ERR_WRAP 0x5 /* Wrap error */ +#define TPT_ERR_BOUND 0x6 /* base and bounds voilation */ +#define TPT_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_ECC 0x9 /* ECC error detected */ +#define TPT_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define TPT_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define TPT_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define TPT_ERR_CRC 0x10 /* CRC error */ +#define TPT_ERR_MARKER 0x11 /* Marker error */ +#define TPT_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define TPT_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define TPT_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define TPT_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define TPT_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define TPT_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */ +#define TPT_ERR_MSN 0x18 /* MSN error */ +#define TPT_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define TPT_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define TPT_ERR_MSN_GAP 0x1B +#define TPT_ERR_MSN_RANGE 0x1C +#define TPT_ERR_IRD_OVERFLOW 0x1D +#define TPT_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define TPT_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ + +struct t3_swsq { + __u64 wr_id; + struct t3_cqe cqe; + __u32 sq_wptr; + __be32 read_len; + int opcode; + int complete; + int signaled; +}; + +/* + * A T3 WQ implements both the SQ and RQ. + */ +struct t3_wq { + union t3_wr *queue; /* DMA accessable memory */ + dma_addr_t dma_addr; /* DMA address for HW */ + DECLARE_PCI_UNMAP_ADDR(mapping) /* unmap kruft */ + u32 error; /* 1 once we go to ERROR */ + u32 qpid; + u32 wptr; /* idx to next available WR slot */ + u32 size_log2; /* total wq size */ + struct t3_swsq *sq; /* SW SQ */ + struct t3_swsq *oldest_read; /* tracks oldest pending read */ + u32 sq_wptr; /* sq_wptr - sq_rptr == count of */ + u32 sq_rptr; /* pending wrs */ + u32 sq_size_log2; /* sq size */ + u64 *rq; /* SW RQ (holds consumer wr_ids */ + u32 rq_wptr; /* rq_wptr - rq_rptr == count of */ + u32 rq_rptr; /* pending wrs */ + u64 *rq_oldest_wr; /* oldest wr on the SW RQ */ + u32 rq_size_log2; /* rq size */ + u32 rq_addr; /* rq adapter address */ + void __iomem *doorbell; /* kernel db */ + u64 udb; /* user db if any */ +}; + +struct t3_cq { + u32 cqid; + u32 rptr; + u32 wptr; + u32 size_log2; + dma_addr_t dma_addr; + DECLARE_PCI_UNMAP_ADDR(mapping) + struct t3_cqe *queue; + struct t3_cqe *sw_queue; + u32 sw_rptr; + u32 sw_wptr; +}; + +#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \ + CQE_GENBIT(*cqe)) + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->flit[13] = 1; +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c new file mode 100644 index 0000000..4611afa --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/module.h> +#include <linux/moduleparam.h> + +#include <rdma/ib_verbs.h> + +#include "cxgb3_offload.h" +#include "iwch_provider.h" +#include "iwch_user.h" +#include "iwch.h" +#include "iwch_cm.h" + +#define DRV_VERSION "1.1" + +MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); +MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); + +struct cxgb3_client t3c_client = { + .name = "iw_cxgb3", + .add = open_rnic_dev, + .remove = close_rnic_dev, + .handlers = t3c_handlers, + .redirect = iwch_ep_redirect +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(dev_mutex); + +static void rnic_init(struct iwch_dev *rnicp) +{ + PDBG("%s iwch_dev %p\n", __FUNCTION__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + spin_lock_init(&rnicp->lock); + + rnicp->attr.vendor_id = 0x168; + rnicp->attr.vendor_part_id = 7; + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = (1UL << 24) - 1; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + static int vers_printed; + + PDBG("%s t3cdev %p\n", __FUNCTION__, tdev); + if (!vers_printed++) + printk(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mutex_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mutex_unlock(&dev_mutex); + printk(KERN_ERR MOD "Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + list_add_tail(&rnicp->entry, &dev_list); + mutex_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printk(KERN_ERR MOD "Unable to register device\n"); + close_rnic_dev(tdev); + } + printk(KERN_INFO MOD "Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); + return; +} + +static void close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + PDBG("%s t3cdev %p\n", __FUNCTION__, tdev); + mutex_lock(&dev_mutex); + list_for_each_entry_safe(dev, tmp, &dev_list, entry) { + if (dev->rdev.t3cdev_p == tdev) { + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); + break; + } + } + mutex_unlock(&dev_mutex); +} + +static int __init iwch_init_module(void) +{ + int err; + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + cxgb3_register_client(&t3c_client); + return 0; +} + +static void __exit iwch_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +module_init(iwch_init_module); +module_exit(iwch_exit_module); diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h new file mode 100644 index 0000000..6517ef8 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_H__ +#define __IWCH_H__ + +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/idr.h> + +#include <rdma/ib_verbs.h> + +#include "cxio_hal.h" +#include "cxgb3_offload.h" + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + +struct iwch_rnic_attributes { + u32 vendor_id; + u32 vendor_part_id; + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. + */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct list_head entry; +}; + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + int ret; + u32 newid; + + do { + if (!idr_pre_get(idr, GFP_KERNEL)) { + return -ENOMEM; + } + spin_lock_irq(&rhp->lock); + ret = idr_get_new_above(idr, handle, id, &newid); + BUG_ON(newid != id); + spin_unlock_irq(&rhp->lock); + } while (ret == -EAGAIN); + + return ret; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) +{ + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + spin_unlock_irq(&rhp->lock); +} + +extern struct cxgb3_client t3c_client; +extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb); + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c new file mode 100644 index 0000000..a522b1b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -0,0 +1,2081 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/module.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/skbuff.h> +#include <linux/timer.h> +#include <linux/notifier.h> + +#include <net/neighbour.h> +#include <net/netevent.h> +#include <net/route.h> + +#include "tcb.h" +#include "cxgb3_offload.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +static int ep_timeout_secs = 10; +module_param(ep_timeout_secs, int, 0444); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=10)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0444); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is spec compliant. (default=1)"); + +static int markers_enabled = 0; +module_param(markers_enabled, int, 0444); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0444); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0444); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); + +static int snd_win = 32 * 1024; +module_param(snd_win, int, 0444); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +module_param(nocong, uint, 0444); +MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +module_param(cong_flavor, uint, 0444); +MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); + +static void process_work(struct work_struct *work); +static struct workqueue_struct *workq; +static DECLARE_WORK(skb_work, process_work); + +static struct sk_buff_head rxq; +static cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS]; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); + +static void start_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __FUNCTION__, ep); + if (timer_pending(&ep->timer)) { + PDBG("%s stopped / restarted timer ep %p\n", __FUNCTION__, ep); + del_timer_sync(&ep->timer); + } else + get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static void stop_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __FUNCTION__, ep); + del_timer_sync(&ep->timer); + put_ep(&ep->com); +} + +static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + skb->priority = CPL_PRIORITY_SETUP; + tdev->send(tdev, skb); + return; +} + +int iwch_quiesce_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + skb->priority = CPL_PRIORITY_DATA; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +int iwch_resume_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + skb->priority = CPL_PRIORITY_DATA; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static void set_emss(struct iwch_ep *ep, u16 opt) +{ + PDBG("%s ep %p opt %u\n", __FUNCTION__, ep, opt); + ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40; + if (G_TCPOPT_TSTAMP(opt)) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + PDBG("emss=%d\n", ep->emss); +} + +static enum iwch_ep_state state_read(struct iwch_ep_common *epc) +{ + unsigned long flags; + enum iwch_ep_state state; + + spin_lock_irqsave(&epc->lock, flags); + state = epc->state; + spin_unlock_irqrestore(&epc->lock, flags); + return state; +} + +static inline void __state_set(struct iwch_ep_common *epc, + enum iwch_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + unsigned long flags; + + spin_lock_irqsave(&epc->lock, flags); + PDBG("%s - %s -> %s\n", __FUNCTION__, states[epc->state], states[new]); + __state_set(epc, new); + spin_unlock_irqrestore(&epc->lock, flags); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct iwch_ep_common *epc; + + epc = kmalloc(size, gfp); + if (epc) { + memset(epc, 0, size); + kref_init(&epc->kref); + spin_lock_init(&epc->lock); + init_waitqueue_head(&epc->waitq); + } + PDBG("%s alloc ep %p\n", __FUNCTION__, epc); + return epc; +} + +void __free_ep(struct kref *kref) +{ + struct iwch_ep_common *epc; + epc = container_of(kref, struct iwch_ep_common, kref); + PDBG("%s ep %p state %s\n", __FUNCTION__, epc, states[state_read(epc)]); + kfree(epc); +} + +static void release_ep_resources(struct iwch_ep *ep) +{ + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + if (ep->com.tdev->type == T3B) + release_tid(ep->com.tdev, ep->hwtid, NULL); + put_ep(&ep->com); +} + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + void *ep; + struct t3cdev *tdev; + int ret; + + while ((skb = skb_dequeue(&rxq))) { + ep = *((void **) (skb->cb)); + tdev = *((struct t3cdev **) (skb->cb + sizeof(void *))); + ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep); + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + + /* + * ep was referenced in sched(), and is freed here. + */ + put_ep((struct iwch_ep_common *)ep); + } +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb) { + BUG_ON(skb_cloned(skb)); + skb_trim(skb, 0); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + } + return skb; +} + +static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = peer_ip, + .saddr = local_ip, + .tos = tos} + }, + .proto = IPPROTO_TCP, + .uli_u = { + .ports = { + .sport = local_port, + .dport = peer_port} + } + }; + + if (ip_route_output_flow(&rt, &fl, NULL, 0)) + return NULL; + return rt; +} + +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + PDBG("%s t3cdev %p\n", __FUNCTION__, dev); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s t3cdev %p\n", __FUNCTION__, dev); + req->cmd = CPL_ABORT_NO_RST; + cxgb3_ofld_send(dev, skb); +} + +static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(skb, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_connect(struct iwch_ep *ep) +{ + struct cpl_act_open_req *req; + struct sk_buff *skb; + u32 opt0h, opt0l, opt2; + unsigned int mtu_idx; + int wscale; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __FUNCTION__); + return -ENOMEM; + } + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor); + skb->priority = CPL_PRIORITY_SETUP; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid)); + req->local_port = ep->com.local_addr.sin_port; + req->peer_port = ep->com.remote_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; + req->opt0h = htonl(opt0h); + req->opt0l = htonl(opt0l); + req->params = 0; + req->opt2 = htonl(opt2); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + + PDBG("%s ep %p pd_len %d\n", __FUNCTION__, ep, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (skb->data + mpalen + sizeof(*req) > skb->end) { + kfree_skb(skb); + skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + } + skb_trim(skb, 0); + skb_reserve(skb, sizeof(*req)); + skb_put(skb, mpalen); + skb->priority = CPL_PRIORITY_DATA; + mpa = (struct mpa_message *) skb->data; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + l2t_send(ep->com.tdev, skb, ep->l2t); + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__); + return -ENOMEM; + } + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(mpalen); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_MORE | F_TX_IMM_ACK | F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + ep->mpa_skb = skb; + state_set(&ep->com, MPA_REP_SENT); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, tid); + + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid); + + ep->snd_seq = ntohl(req->snd_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + cxgb3_free_atid(ep->com.tdev, ep->atid); + + /* start MPA negotiation */ + send_mpa_req(ep, skb); + + return 0; +} + +static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p\n", __FILE__, ep); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, gfp); +} + +static void close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %d\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p status %d\n", __FUNCTION__, ep, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + + if ((status == 0) || (status == -ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d status %d\n", __FUNCTION__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + if (state_read(&ep->parent_ep->com) != DEAD) + ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static int update_rx_credits(struct iwch_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + + PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); + req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); + skb->priority = CPL_PRIORITY_ACK; + ep->com.tdev->send(ep->com.tdev, skb); + return credits; +} + +static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (!err) + goto out; +err: + abort_connection(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return; +} + +static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +} + +static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + + PDBG("%s ep %p dlen %u\n", __FUNCTION__, ep, dlen); + + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + process_mpa_request(ep, skb); + break; + case MPA_REP_SENT: + break; + default: + printk(KERN_ERR MOD "%s Unexpected streaming data." + " ep %p state %d tid %d\n", + __FUNCTION__, ep, state_read(&ep->com), ep->hwtid); + + /* + * The ep will timeout and inform the ULP of the failure. + * See ep_timeout(). + */ + break; + } + + /* update RX credits */ + update_rx_credits(ep, dlen); + + return CPL_RET_BUF_DONE; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. + */ +static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + enum iwch_qp_attr_mask mask; + + PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits); + + if (credits == 0) + return CPL_RET_BUF_DONE; + BUG_ON(credits != 1); + BUG_ON(ep->mpa_skb == NULL); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + dst_confirm(ep->dst); + if (state_read(&ep->com) == MPA_REP_SENT) { + struct iwch_qp_attributes attrs; + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + ep->com.rpl_err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (!ep->com.rpl_err) { + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + } + + ep->com.rpl_done = 1; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + } + return CPL_RET_BUF_DONE; +} + +static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + close_complete_upcall(ep); + state_set(&ep->com, DEAD); + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %u errno %d\n", __FUNCTION__, ep, rpl->status, + status2errno(rpl->status)); + connect_reply_upcall(ep, status2errno(rpl->status)); + state_set(&ep->com, DEAD); + if (ep->com.tdev->type == T3B) + release_tid(ep->com.tdev, GET_TID(rpl), NULL); + cxgb3_free_atid(ep->com.tdev, ep->atid); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_pass_open_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n"); + return -ENOMEM; + } + + req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); + req->local_port = ep->com.local_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + skb->priority = 1; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %d error %d\n", __FUNCTION__, ep, + rpl->status, status2errno(rpl->status)); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + + return CPL_RET_BUF_DONE; +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_close_listserv_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); + return -ENOMEM; + } + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); + skb->priority = 1; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, + void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_close_listserv_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + return CPL_RET_BUF_DONE; +} + +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u32 opt0h, opt0l, opt2; + int wscale; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(*rpl)); + skb_get(skb); + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor); + + rpl = cplhdr(skb); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(opt0h); + rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT); + rpl->opt2 = htonl(opt2); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + skb->priority = CPL_PRIORITY_SETUP; + l2t_send(ep->com.tdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, + struct sk_buff *skb) +{ + PDBG("%s t3cdev %p tid %u peer_ip %x\n", __FUNCTION__, tdev, hwtid, + peer_ip); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(struct cpl_tid_release)); + skb_get(skb); + + if (tdev->type == T3B) + release_tid(tdev, hwtid, skb); + else { + struct cpl_pass_accept_rpl *rpl; + + rpl = cplhdr(skb); + skb->priority = CPL_PRIORITY_SETUP; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; + tdev->send(tdev, skb); + } +} + +static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *child_ep, *parent_ep = ctx; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + struct l2t_entry *l2t; + struct rtable *rt; + struct iff_mac tim; + + PDBG("%s parent ep %p tid %u\n", __FUNCTION__, parent_ep, hwtid); + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __FUNCTION__); + goto reject; + } + + /* + * Find the netdev for this connection request. + */ + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + printk(KERN_ERR + "%s bad dst mac %02x %02x %02x %02x %02x %02x\n", + __FUNCTION__, + req->dst_mac[0], + req->dst_mac[1], + req->dst_mac[2], + req->dst_mac[3], + req->dst_mac[4], + req->dst_mac[5]); + goto reject; + } + + /* Find output route */ + rt = find_route(tdev, + req->local_ip, + req->peer_ip, + req->local_port, + req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid))); + if (!rt) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __FUNCTION__); + goto reject; + } + dst = &rt->u.dst; + l2t = t3_l2t_get(tdev, dst->neighbour, dst->neighbour->dev); + if (!l2t) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __FUNCTION__); + dst_release(dst); + goto reject; + } + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __FUNCTION__); + l2t_release(L2DATA(tdev), l2t); + dst_release(dst); + goto reject; + } + state_set(&child_ep->com, CONNECTING); + child_ep->com.tdev = tdev; + child_ep->com.cm_id = NULL; + child_ep->com.local_addr.sin_family = PF_INET; + child_ep->com.local_addr.sin_port = req->local_port; + child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; + child_ep->com.remote_addr.sin_family = PF_INET; + child_ep->com.remote_addr.sin_port = req->peer_port; + child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); + child_ep->l2t = l2t; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + init_timer(&child_ep->timer); + cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid); + accept_cr(child_ep, req->peer_ip, skb); + goto out; +reject: + reject_cr(tdev, hwtid, req->peer_ip, skb); +out: + return CPL_RET_BUF_DONE; +} + +static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_pass_establish *req = cplhdr(skb); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->snd_seq = ntohl(req->snd_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + + return CPL_RET_BUF_DONE; +} + +static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int disconnect = 1; + int release = 0; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + dst_confirm(ep->dst); + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. + */ + __state_set(&ep->com, CLOSING); + get_ep(&ep->com); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case FPDU_MODE: + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + start_ep_timer(ep); + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (disconnect) + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct iwch_ep *ep = ctx; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct iwch_qp_attributes attrs; + int ret; + int state; + + if (is_neg_adv_abort(req->status)) { + PDBG("%s neg_adv_abort ep %p tid %d\n", __FUNCTION__, ep, + ep->hwtid); + t3_l2t_send_event(ep->com.tdev, ep->l2t); + return CPL_RET_BUF_DONE; + } + + state = state_read(&ep->com); + PDBG("%s ep %p state %u\n", __FUNCTION__, ep, state); + switch (state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + break; + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. + */ + get_ep(&ep->com); + break; + case MORIBUND: + stop_ep_timer(ep); + case FPDU_MODE: + case CLOSING: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __FUNCTION__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __FUNCTION__); + return CPL_RET_BUF_DONE; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __FUNCTION__); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; + } + rpl_skb->priority = CPL_PRIORITY_DATA; + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + ep->com.tdev->send(ep->com.tdev, rpl_skb); + if (state != ABORTING) { + state_set(&ep->com, DEAD); + release_ep_resources(ep); + } + return CPL_RET_BUF_DONE; +} + +static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case CLOSING: + start_ep_timer(ep); + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case DEAD: + default: + BUG_ON(1); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcde cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... + */ +static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb_pull(skb, sizeof(struct cpl_rdma_terminate)); + PDBG("%s saving %d bytes of term msg\n", __FUNCTION__, skb->len); + memcpy(ep->com.qp->attr.terminate_buffer, skb->data, skb->len); + ep->com.qp->attr.terminate_msg_len = skb->len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_rdma_ec_status *rep = cplhdr(skb); + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p tid %u status %d\n", __FUNCTION__, ep, ep->hwtid, + rep->status); + if (rep->status) { + struct iwch_qp_attributes attrs; + + printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n", + __FUNCTION__, ep->hwtid); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep, NULL, GFP_KERNEL); + } + return CPL_RET_BUF_DONE; +} + +static void ep_timeout(unsigned long arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + unsigned long flags; + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p tid %u state %d\n", __FUNCTION__, ep, ep->hwtid, + ep->com.state); + switch (ep->com.state) { + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + break; + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + break; + default: + BUG(); + } + __state_set(&ep->com, CLOSING); + spin_unlock_irqrestore(&ep->com.lock, flags); + abort_connection(ep, NULL, GFP_ATOMIC); + put_ep(&ep->com); +} + +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + state_set(&ep->com, CLOSING); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = send_halfclose(ep, GFP_KERNEL); + } + return 0; +} + +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid); + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep, NULL, GFP_KERNEL); + return -EINVAL; + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + PDBG("%s %d ird %d ord %d\n", __FUNCTION__, __LINE__, ep->ird, ep->ord); + get_ep(&ep->com); + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) { + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + abort_connection(ep, NULL, GFP_KERNEL); + put_ep(&ep->com); + return err; + } + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (err) { + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + abort_connection(ep, NULL, GFP_KERNEL); + } else { + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + } + put_ep(&ep->com); + return err; +} + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtable *rt; + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + ep->com.tdev = h->rdev.t3cdev_p; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + BUG_ON(!ep->com.qp); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __FUNCTION__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(h->rdev.t3cdev_p, + cm_id->local_addr.sin_addr.s_addr, + cm_id->remote_addr.sin_addr.s_addr, + cm_id->local_addr.sin_port, + cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); + if (!rt) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __FUNCTION__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->u.dst; + + /* get a l2t entry */ + ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst->neighbour, + ep->dst->neighbour->dev); + if (!ep->l2t) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __FUNCTION__); + err = -ENOMEM; + goto fail4; + } + + state_set(&ep->com, CONNECTING); + ep->tos = IPTOS_LOWDELAY; + ep->com.local_addr = cm_id->local_addr; + ep->com.remote_addr = cm_id->remote_addr; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + l2t_release(L2DATA(h->rdev.t3cdev_p), ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb3_free_atid(ep->com.tdev, ep->atid); +fail2: + put_ep(&ep->com); +out: + return err; +} + +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_listen_ep *ep; + + + might_sleep(); + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->com.tdev = h->rdev.t3cdev_p; + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + ep->com.local_addr = cm_id->local_addr; + + /* + * Allocate a server TID. + */ + ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__); + err = -ENOMEM; + goto fail2; + } + + state_set(&ep->com, LISTEN); + err = listen_start(ep); + if (err) + goto fail3; + + /* wait for pass_open_rpl */ + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (!err) { + cm_id->provider_data = ep; + goto out; + } +fail3: + cxgb3_free_stid(ep->com.tdev, ep->stid); +fail2: + put_ep(&ep->com); +fail1: +out: + return err; +} + +int iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + err = listen_stop(ep); + wait_event(ep->com.waitq, ep->com.rpl_done); + cxgb3_free_stid(ep->com.tdev, ep->stid); + err = ep->com.rpl_err; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) +{ + int ret=0; + unsigned long flags; + int close = 0; + + spin_lock_irqsave(&ep->com.lock, flags); + + PDBG("%s ep %p state %s, abrupt %d\n", __FUNCTION__, ep, + states[ep->com.state], abrupt); + + if (ep->com.state == DEAD) { + PDBG("%s already dead ep %p\n", __FUNCTION__, ep); + goto out; + } + + if (abrupt) { + if (ep->com.state != ABORTING) { + ep->com.state = ABORTING; + close = 1; + } + goto out; + } + + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + ep->com.state = CLOSING; + close = 1; + break; + case CLOSING: + start_ep_timer(ep); + ep->com.state = MORIBUND; + close = 1; + break; + case MORIBUND: + break; + default: + BUG(); + break; + } +out: + spin_unlock_irqrestore(&ep->com.lock, flags); + if (close) { + if (abrupt) + ret = send_abort(ep, NULL, gfp); + else + ret = send_halfclose(ep, gfp); + } + return ret; +} + +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t) +{ + struct iwch_ep *ep = ctx; + + if (ep->dst != old) + return 0; + + PDBG("%s ep %p redirect to dst %p l2t %p\n", __FUNCTION__, ep, new, + l2t); + dst_hold(new); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + ep->l2t = l2t; + dst_release(old); + ep->dst = new; + return 1; +} + +/* + * All the CM events are handled on a work queue to have a safe context. + */ +static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep_common *epc = ctx; + + get_ep(epc); + + /* + * Save ctx and tdev in the skb->cb area. + */ + *((void **) skb->cb) = ctx; + *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +int __init iwch_cm_init(void) +{ + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb3"); + if (!workq) + return -ENOMEM; + + /* + * All upcalls from the T3 Core go to sched() to + * schedule the processing on a work queue. + */ + t3c_handlers[CPL_ACT_ESTABLISH] = sched; + t3c_handlers[CPL_ACT_OPEN_RPL] = sched; + t3c_handlers[CPL_RX_DATA] = sched; + t3c_handlers[CPL_TX_DMA_ACK] = sched; + t3c_handlers[CPL_ABORT_RPL_RSS] = sched; + t3c_handlers[CPL_ABORT_RPL] = sched; + t3c_handlers[CPL_PASS_OPEN_RPL] = sched; + t3c_handlers[CPL_CLOSE_LISTSRV_RPL] = sched; + t3c_handlers[CPL_PASS_ACCEPT_REQ] = sched; + t3c_handlers[CPL_PASS_ESTABLISH] = sched; + t3c_handlers[CPL_PEER_CLOSE] = sched; + t3c_handlers[CPL_CLOSE_CON_RPL] = sched; + t3c_handlers[CPL_ABORT_REQ_RSS] = sched; + t3c_handlers[CPL_RDMA_TERMINATE] = sched; + t3c_handlers[CPL_RDMA_EC_STATUS] = sched; + + /* + * These are the real handlers that are called from a + * work queue. + */ + work_handlers[CPL_ACT_ESTABLISH] = act_establish; + work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl; + work_handlers[CPL_RX_DATA] = rx_data; + work_handlers[CPL_TX_DMA_ACK] = tx_ack; + work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl; + work_handlers[CPL_ABORT_RPL] = abort_rpl; + work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl; + work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl; + work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req; + work_handlers[CPL_PASS_ESTABLISH] = pass_establish; + work_handlers[CPL_PEER_CLOSE] = peer_close; + work_handlers[CPL_ABORT_REQ_RSS] = peer_abort; + work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl; + work_handlers[CPL_RDMA_TERMINATE] = terminate; + work_handlers[CPL_RDMA_EC_STATUS] = ec_status; + return 0; +} + +void __exit iwch_cm_term(void) +{ + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h new file mode 100644 index 0000000..7c810d9 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ + +#include <linux/inet.h> +#include <linux/wait.h> +#include <linux/spinlock.h> +#include <linux/kref.h> + +#include <rdma/ib_verbs.h> +#include <rdma/iw_cm.h> + +#include "cxgb3_offload.h" +#include "iwch_provider.h" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_put(&((ep)->kref), __free_ep); \ +} + +#define get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +struct iwch_ep_common { + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + struct kref kref; + spinlock_t lock; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + wait_queue_head_t waitq; + int rpl_done; + int rpl_err; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct timer_list timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<<wscale) < win) + wscale++; + return wscale; +} + +/* CM prototypes */ + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog); +int iwch_destroy_listen(struct iw_cm_id *cm_id); +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp); +int iwch_quiesce_tid(struct iwch_ep *ep); +int iwch_resume_tid(struct iwch_ep *ep); +void __free_ep(struct kref *kref); +void iwch_rearp(struct iwch_ep *ep); +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t); + +int __init iwch_cm_init(void); +void __exit iwch_cm_term(void); + +#endif /* _IWCH_CM_H_ */ diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c new file mode 100644 index 0000000..98b3bdb --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "iwch_provider.h" +#include "iwch.h" + +/* + * Get one cq entry from cxio and map it to openib. + * + * Returns: + * 0 EMPTY; + * 1 cqe returned + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct ib_wc *wc) +{ + struct iwch_qp *qhp = NULL; + struct t3_cqe cqe, *rd_cqe; + struct t3_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie; + int ret = 1; + + rd_cqe = cxio_next_cqe(&chp->cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + PDBG("%s updating %d cq credits on id %d\n", __FUNCTION__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __FUNCTION__, + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe), + CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + /* these aren't supported yet */ + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + case T3_LOCAL_INV: + case T3_FAST_REGISTER: + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + BUG_ON(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + + if (err < 0) + return err; + else { + return npolled; + } +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c new file mode 100644 index 0000000..a6efa8fe --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/slab.h> +#include <linux/mman.h> +#include <net/sock.h> +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + struct iwch_qp *qhp; + + printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + spin_lock(&rnicp->lock); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + + if (!qhp) { + printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n", + __FUNCTION__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + PDBG("%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x\n", __FUNCTION__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + atomic_inc(&qhp->refcnt); + spin_unlock(&rnicp->lock); + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); +} + +void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) +{ + struct iwch_dev *rnicp; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + + rnicp = (struct iwch_dev *) rdev_p->ulp; + spin_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. + */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + PDBG("%s QPID 0x%x ep %p disconnecting\n", + __FUNCTION__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } else { + PDBG("%s post REQ_ERR AE QPID 0x%x\n", __FUNCTION__, + qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: + + /* + * Confirm the destination entry if this is a RECV completion. + */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + printk(KERN_ERR "%s - CQE Err qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + iwch_qp_rem_ref(&qhp->ibqp); +out: + dev_kfree_skb_irq(skb); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c new file mode 100644 index 0000000..2b6cd53 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <asm/byteorder.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list) +{ + u32 stag; + u32 mmid; + + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return -ENOMEM; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp); + return 0; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages) +{ + u32 stag; + u32 mmid; + + + /* We could support this... */ + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return -ENOMEM; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp); + return 0; +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if (num_phys_buf > 1) { + if ((1ULL << *shift) & mask) + break; + } else + if (1ULL << *shift >= + buffer_list[0].size + + (buffer_list[0].addr & ((1ULL << *shift) - 1))) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __FUNCTION__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c new file mode 100644 index 0000000..6861087 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -0,0 +1,1203 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/device.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/ethtool.h> + +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/byteorder.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_user_verbs.h> + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" +#include "iwch_user.h" + +static int iwch_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + return -ENOSYS; +} + +static struct ib_ah *iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int iwch_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __FUNCTION__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + return &context->ibucontext; +} + +static int iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + PDBG("%s ib_cq %p\n", __FUNCTION__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return 0; +} + +static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + + PDBG("%s ib_dev %p entries %d\n", __FUNCTION__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + kfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq)) { + kfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = (1 << chp->cq.size_log2) - 1; + spin_lock_init(&chp->lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof (struct t3_cqe)); + insert_mmap(ucontext, mm); + } + PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + PDBG("%s ib_cq %p cqe %d\n", __FUNCTION__, cq, cqe); + + /* We don't downsize... */ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return -ENOMEM; + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return ret; + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return ret; + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n", + __FUNCTION__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return -ENOSYS; +#endif +} + +static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + unsigned long flag; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if (notify == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (get_user(rptr, chp->user_rptr_addr)) + return -EFAULT; + spin_lock_irqsave(&chp->lock, flag); + chp->cq.rptr = rptr; + } else + spin_lock_irqsave(&chp->lock, flag); + PDBG("%s rptr 0x%x\n", __FUNCTION__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + spin_unlock_irqrestore(&chp->lock, flag); + if (err) + printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err, + chp->cq.cqid); + return err; +} + +static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + + PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __FUNCTION__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return -EINVAL; + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return -EINVAL; + kfree(mm); + + if ((mm->addr >= rdev_p->rnic_info.udbell_physbase) && + (mm->addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return -EPERM; + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + mm->addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... + */ + ret = remap_pfn_range(vma, vma->vm_start, + mm->addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __FUNCTION__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + kfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + PDBG("%s pdid 0x%0x ptr 0x%p\n", __FUNCTION__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __FUNCTION__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + PDBG("%s mmid 0x%x ptr %p\n", __FUNCTION__, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + acc = iwch_convert_access(acc); + + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + /* NOTE: TPT perms are backwards from BIND WR perms! */ + mhp->attr.perms = (acc & 0x1) << 3; + mhp->attr.perms |= (acc & 0x2) << 1; + mhp->attr.perms |= (acc & 0x4) >> 1; + mhp->attr.perms |= (acc & 0x8) >> 3; + + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift, page_list); + kfree(page_list); + if (ret) { + goto err; + } + return &mhp->ibmr; +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + struct iwch_pd *php; + struct iwch_dev *rhp; + int new_acc; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __FUNCTION__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + new_acc = mhp->attr.perms; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_convert_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + + ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + kfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = acc; + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, + int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, j, k; + int err = 0; + struct ib_umem_chunk *chunk; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + shift = ffs(region->page_size) - 1; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + n = 0; + list_for_each_entry(chunk, ®ion->chunk_list, list) + n += chunk->nents; + + pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err; + } + + acc = iwch_convert_access(acc); + + i = n = 0; + + list_for_each_entry(chunk, ®ion->chunk_list, list) + for (j = 0; j < chunk->nmap; ++j) { + len = sg_dma_len(&chunk->page_list[j]) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address( + &chunk->page_list[j]) + + region->page_size * k); + } + } + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = (acc & 0x1) << 3; + mhp->attr.perms |= (acc & 0x2) << 1; + mhp->attr.perms |= (acc & 0x4) >> 1; + mhp->attr.perms |= (acc & 0x8) >> 3; + mhp->attr.va_fbo = region->virt_base; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) region->length; + mhp->attr.pbl_size = i; + err = iwch_register_mem(rhp, php, mhp, shift, pages); + kfree(pages); + if (err) + goto err; + + if (udata && t3b_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + PDBG("%s user resp pbl_addr 0x%x\n", __FUNCTION__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = -EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err: + kfree(mhp); + return ERR_PTR(err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; + struct ib_mr *ibmr; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + + /* + * T3 only supports 32 bits of size. + */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __FUNCTION__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + kfree(mhp); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __FUNCTION__, mw, mmid, mhp); + return 0; +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + } + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __FUNCTION__, + ib_qp, qhp->wq.qpid, qhp); + kfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. + */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + PDBG("%s wqsize %d sqsize %d rqsize %d\n", __FUNCTION__, + wqsize, sqsize, rqsize); + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + spin_lock_init(&qhp->lock); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + kfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm1); + kfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = virt_to_phys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + init_timer(&(qhp->timer)); + PDBG("%s sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d\n", + __FUNCTION__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __FUNCTION__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __FUNCTION__, qp); + atomic_inc(&(to_iwch_qp(qp)->refcnt)); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __FUNCTION__, qp); + if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt))) + wake_up(&(to_iwch_qp(qp)->wait)); +} + +struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __FUNCTION__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __FUNCTION__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + BUG_ON(port == 0 || port > 2); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6); + return 0; +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + props->device_cap_flags = dev->device_cap_flags; + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; + props->max_mr_size = ~0ull; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 2; + props->active_speed = 2; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + PDBG("%s class dev 0x%p\n", __FUNCTION__, dev); + return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor, + dev->rdev.rnic_info.pdev->device); +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *iwch_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; + int i; + + PDBG("%s iwch_dev %p\n", __FUNCTION__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = + (IB_DEVICE_ZERO_STAG | + IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW); + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.class_dev.dev = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.modify_port = iwch_modify_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; + dev->ibdev.mmap = iwch_mmap; + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + + + dev->ibdev.iwcm = + (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs), + GFP_KERNEL); + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = class_device_create_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ + int i; + + PDBG("%s iwch_dev %p\n", __FUNCTION__, dev); + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + class_device_remove_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + return; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h new file mode 100644 index 0000000..61e3278 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <rdma/ib_verbs.h> +#include <asm/types.h> +#include "t3cdev.h" +#include "iwch.h" +#include "cxio_wr.h" +#include "cxio_hal.h" + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + u32 __user *user_rptr_addr; +}; + +static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + enum IWCH_QP_FLAGS flags; + struct timer_list timer; +}; + +static inline int qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); +struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct iwch_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct iwch_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static inline int iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +enum iwch_mem_perms { + IWCH_MEM_ACCESS_LOCAL_READ = 1 << 0, + IWCH_MEM_ACCESS_LOCAL_WRITE = 1 << 1, + IWCH_MEM_ACCESS_REMOTE_READ = 1 << 2, + IWCH_MEM_ACCESS_REMOTE_WRITE = 1 << 3, + IWCH_MEM_ACCESS_ATOMICS = 1 << 4, + IWCH_MEM_ACCESS_BINDING = 1 << 5, + IWCH_MEM_ACCESS_LOCAL = + (IWCH_MEM_ACCESS_LOCAL_READ | IWCH_MEM_ACCESS_LOCAL_WRITE), + IWCH_MEM_ACCESS_REMOTE = + (IWCH_MEM_ACCESS_REMOTE_WRITE | IWCH_MEM_ACCESS_REMOTE_READ) + /* cannot go beyond 1 << 31 */ +} __attribute__ ((packed)); + +static inline u32 iwch_convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? IWCH_MEM_ACCESS_REMOTE_WRITE : 0) + | (acc & IB_ACCESS_REMOTE_READ ? IWCH_MEM_ACCESS_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? IWCH_MEM_ACCESS_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? IWCH_MEM_ACCESS_BINDING : 0) | + IWCH_MEM_ACCESS_LOCAL_READ; +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +int iwch_quiesce_qps(struct iwch_cq *chp); +int iwch_resume_qps(struct iwch_cq *chp); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c new file mode 100644 index 0000000..e066727 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" + +#define NO_SUPPORT -1 + +static inline int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; +#if 0 /* Not currently supported */ + case TYPE_SEND_INVALIDATE: + case TYPE_SEND_INVALIDATE_IMMEDIATE: + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + break; + case TYPE_SEND_SE_INVALIDATE: + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + break; +#endif + default: + break; + } + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + plen = 4; + wqe->send.sgl[0].stag = wr->imm_data; + wqe->send.sgl[0].len = __constant_cpu_to_be32(0); + wqe->send.num_sgle = __constant_cpu_to_be32(0); + *flit_cnt = 5; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + } + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static inline int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->imm_data; + wqe->write.sgl[0].len = __constant_cpu_to_be32(0); + wqe->write.num_sgle = __constant_cpu_to_be32(0); + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + cpu_to_be64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = cpu_to_be32(plen); + return 0; +} + +static inline int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return -EINVAL; + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.reserved[2] = 0; + wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length); + wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +/* + * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now. + */ +static inline int iwch_sgl2pbl_map(struct iwch_dev *rhp, + struct ib_sge *sg_list, u32 num_sgle, + u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u32 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + if (!mhp->attr.state) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + if (mhp->attr.zbva) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += ((u32) mhp->attr.va_fbo) % + (1UL << (12 + mhp->attr.page_size)); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static inline int iwch_build_rdma_recv(struct iwch_dev *rhp, + union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i, err = 0; + u32 pbl_addr[4]; + u8 page_size[4]; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + err = iwch_sgl2pbl_map(rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + + /* to in the WQE == the offset into the page */ + wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) % + (1UL << (12 + page_size[i]))); + + /* pbl_addr is the adapters address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 t3_wr_flit_cnt; + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs <= 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + t3_wr_opcode = T3_WR_SEND; + err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + qhp->wq.oldest_read = sqp; + break; + default: + PDBG("%s post of type=%d TBD!\n", __FUNCTION__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt); + PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n", + __FUNCTION__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + } + spin_unlock_irqrestore(&qhp->lock, flag); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + while (wr) { + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + err = iwch_build_rdma_recv(qhp->rhp, wqe, wr); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] = + wr->wr_id; + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3); + PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p \n", __FUNCTION__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + spin_unlock_irqrestore(&qhp->lock, flag); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + unsigned long flag; + struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if ((num_wrs) <= 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __FUNCTION__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->addr; + sgl.lkey = mw_bind->mr->lkey; + sgl.length = mw_bind->length; + wqe->bind.reserved = 0; + wqe->bind.type = T3_VA_BASED_TO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_convert_access(mw_bind->mw_access_flags); + wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey); + wqe->bind.mw_stag = cpu_to_be32(mw->rkey); + wqe->bind.mw_len = cpu_to_be32(mw_bind->length); + wqe->bind.mw_va = cpu_to_be64(mw_bind->addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + spin_unlock_irqrestore(&qhp->lock, flag); + return err; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + spin_unlock_irqrestore(&qhp->lock, flag); + + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(int t3err, u8 *layer_type, u8 *ecode, + int tagged) +{ + switch (t3err) { + case TPT_ERR_STAG: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_STAG; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_STAG_NOT_ASSOC; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + } + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + int status; + int tagged = 0; + struct sk_buff *skb; + + PDBG("%s %d\n", __FUNCTION__, __LINE__); + skb = alloc_skb(40, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "%s cannot send TERMINATE!\n", __FUNCTION__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, 40); + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + if (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE) + tagged = 1; + if ((CQE_OPCODE(rsp_msg->cqe) == T3_READ_REQ) || + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) + tagged = 2; + } else { + status = TPT_ERR_INTERNAL_ERR; + } + build_term_codes(status, &term->layer_etype, &term->ecode, tagged); + build_fw_riwrh((void *)wqe, T3_WR_SEND, + T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 1, + qhp->ep->hwtid, 5); + skb->priority = CPL_PRIORITY_DATA; + return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) +{ + struct iwch_cq *rchp, *schp; + int count; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + PDBG("%s qhp %p rchp %p schp %p\n", __FUNCTION__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + atomic_inc(&qhp->refcnt); + spin_unlock_irqrestore(&qhp->lock, *flag); + + /* locking heirarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&rchp->lock, *flag); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + cxio_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, *flag); + + /* locking heirarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&schp->lock, *flag); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + cxio_flush_sq(&qhp->wq, &schp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&schp->lock, *flag); + + /* deref */ + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); + + spin_lock_irqsave(&qhp->lock, *flag); +} + +static inline void flush_qp(struct iwch_qp *qhp, unsigned long *flag) +{ + if (t3b_device(qhp->rhp)) + cxio_set_wq_in_error(&qhp->wq); + else + __flush_qp(qhp, flag); +} + + +/* + * Return non zero if at least one RECV was pre-posted. + */ +static inline int rqes_posted(struct iwch_qp *qhp) +{ + return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + /* + * XXX - The IWCM doesn't quite handle getting these + * attrs set before going into RTS. For now, just turn + * them on always... + */ +#if 0 + init_attr.qpcaps = qhp->attr.enableRdmaRead | + (qhp->attr.enableRdmaWrite << 1) | + (qhp->attr.enableBind << 2) | + (qhp->attr.enable_stag0_fastreg << 3) | + (qhp->attr.enable_stag0_fastreg << 4); +#else + init_attr.qpcaps = 0x1f; +#endif + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x\n", __FUNCTION__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + PDBG("%s ret %d\n", __FUNCTION__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + unsigned long flag; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __FUNCTION__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); + + spin_lock_irqsave(&qhp->lock, flag); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + get_ep(&qhp->ep->com); + spin_unlock_irqrestore(&qhp->lock, flag); + ret = rdma_init(rhp, qhp, mask, attrs); + spin_lock_irqsave(&qhp->lock, flag); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp, &flag); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + } + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case IWCH_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + memset(&qhp->attr, 0, sizeof(qhp->attr)); + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __FUNCTION__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep %p qpid 0x%x\n", __FUNCTION__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp, &flag); +out: + spin_unlock_irqrestore(&qhp->lock, flag); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) + iwch_ep_disconnect(ep, abort, GFP_KERNEL); + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + PDBG("%s exit state %d\n", __FUNCTION__, qhp->attr.state); + return ret; +} + +static int quiesce_qp(struct iwch_qp *qhp) +{ + spin_lock_irq(&qhp->lock); + iwch_quiesce_tid(qhp->ep); + qhp->flags |= QP_QUIESCED; + spin_unlock_irq(&qhp->lock); + return 0; +} + +static int resume_qp(struct iwch_qp *qhp) +{ + spin_lock_irq(&qhp->lock); + iwch_resume_tid(qhp->ep); + qhp->flags &= ~QP_QUIESCED; + spin_unlock_irq(&qhp->lock); + return 0; +} + +int iwch_quiesce_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { + quiesce_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) + quiesce_qp(qhp); + } + return 0; +} + +int iwch_resume_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { + resume_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) + resume_qp(qhp); + } + return 0; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h new file mode 100644 index 0000000..c4e7fbe --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_user.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct iwch_create_cq_req { + __u64 user_rptr_addr; +}; + +struct iwch_create_cq_resp { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + +struct iwch_create_qp_resp { + __u64 key; + __u64 db_key; + __u32 qpid; + __u32 size_log2; + __u32 sq_size_log2; + __u32 rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + __u32 pbl_addr; +}; +#endif diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h new file mode 100644 index 0000000..c702dc1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/tcb.h @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2007 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _TCB_DEFS_H +#define _TCB_DEFS_H + +#define W_TCB_T_STATE 0 +#define S_TCB_T_STATE 0 +#define M_TCB_T_STATE 0xfULL +#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE) + +#define W_TCB_TIMER 0 +#define S_TCB_TIMER 4 +#define M_TCB_TIMER 0x1ULL +#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER) + +#define W_TCB_DACK_TIMER 0 +#define S_TCB_DACK_TIMER 5 +#define M_TCB_DACK_TIMER 0x1ULL +#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER) + +#define W_TCB_DEL_FLAG 0 +#define S_TCB_DEL_FLAG 6 +#define M_TCB_DEL_FLAG 0x1ULL +#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +#define W_TCB_SMAC_SEL 0 +#define S_TCB_SMAC_SEL 18 +#define M_TCB_SMAC_SEL 0x3ULL +#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL) + +#define W_TCB_TOS 0 +#define S_TCB_TOS 20 +#define M_TCB_TOS 0x3fULL +#define V_TCB_TOS(x) ((x) << S_TCB_TOS) + +#define W_TCB_MAX_RT 0 +#define S_TCB_MAX_RT 26 +#define M_TCB_MAX_RT 0xfULL +#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT) + +#define W_TCB_T_RXTSHIFT 0 +#define S_TCB_T_RXTSHIFT 30 +#define M_TCB_T_RXTSHIFT 0xfULL +#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT) + +#define W_TCB_T_DUPACKS 1 +#define S_TCB_T_DUPACKS 2 +#define M_TCB_T_DUPACKS 0xfULL +#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS) + +#define W_TCB_T_MAXSEG 1 +#define S_TCB_T_MAXSEG 6 +#define M_TCB_T_MAXSEG 0xfULL +#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG) + +#define W_TCB_T_FLAGS1 1 +#define S_TCB_T_FLAGS1 10 +#define M_TCB_T_FLAGS1 0xffffffffULL +#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1) + +#define W_TCB_T_MIGRATION 1 +#define S_TCB_T_MIGRATION 20 +#define M_TCB_T_MIGRATION 0x1ULL +#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION) + +#define W_TCB_T_FLAGS2 2 +#define S_TCB_T_FLAGS2 10 +#define M_TCB_T_FLAGS2 0x7fULL +#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2) + +#define W_TCB_SND_SCALE 2 +#define S_TCB_SND_SCALE 17 +#define M_TCB_SND_SCALE 0xfULL +#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE) + +#define W_TCB_RCV_SCALE 2 +#define S_TCB_RCV_SCALE 21 +#define M_TCB_RCV_SCALE 0xfULL +#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE) + +#define W_TCB_SND_UNA_RAW 2 +#define S_TCB_SND_UNA_RAW 25 +#define M_TCB_SND_UNA_RAW 0x7ffffffULL +#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW) + +#define W_TCB_SND_NXT_RAW 3 +#define S_TCB_SND_NXT_RAW 20 +#define M_TCB_SND_NXT_RAW 0x7ffffffULL +#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW) + +#define W_TCB_RCV_NXT 4 +#define S_TCB_RCV_NXT 15 +#define M_TCB_RCV_NXT 0xffffffffULL +#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT) + +#define W_TCB_RCV_ADV 5 +#define S_TCB_RCV_ADV 15 +#define M_TCB_RCV_ADV 0xffffULL +#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV) + +#define W_TCB_SND_MAX_RAW 5 +#define S_TCB_SND_MAX_RAW 31 +#define M_TCB_SND_MAX_RAW 0x7ffffffULL +#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW) + +#define W_TCB_SND_CWND 6 +#define S_TCB_SND_CWND 26 +#define M_TCB_SND_CWND 0x7ffffffULL +#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND) + +#define W_TCB_SND_SSTHRESH 7 +#define S_TCB_SND_SSTHRESH 21 +#define M_TCB_SND_SSTHRESH 0x7ffffffULL +#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH) + +#define W_TCB_T_RTT_TS_RECENT_AGE 8 +#define S_TCB_T_RTT_TS_RECENT_AGE 16 +#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL +#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE) + +#define W_TCB_T_RTSEQ_RECENT 9 +#define S_TCB_T_RTSEQ_RECENT 16 +#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL +#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT) + +#define W_TCB_T_SRTT 10 +#define S_TCB_T_SRTT 16 +#define M_TCB_T_SRTT 0xffffULL +#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT) + +#define W_TCB_T_RTTVAR 11 +#define S_TCB_T_RTTVAR 0 +#define M_TCB_T_RTTVAR 0xffffULL +#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR) + +#define W_TCB_TS_LAST_ACK_SENT_RAW 11 +#define S_TCB_TS_LAST_ACK_SENT_RAW 16 +#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL +#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW) + +#define W_TCB_DIP 12 +#define S_TCB_DIP 11 +#define M_TCB_DIP 0xffffffffULL +#define V_TCB_DIP(x) ((x) << S_TCB_DIP) + +#define W_TCB_SIP 13 +#define S_TCB_SIP 11 +#define M_TCB_SIP 0xffffffffULL +#define V_TCB_SIP(x) ((x) << S_TCB_SIP) + +#define W_TCB_DP 14 +#define S_TCB_DP 11 +#define M_TCB_DP 0xffffULL +#define V_TCB_DP(x) ((x) << S_TCB_DP) + +#define W_TCB_SP 14 +#define S_TCB_SP 27 +#define M_TCB_SP 0xffffULL +#define V_TCB_SP(x) ((x) << S_TCB_SP) + +#define W_TCB_TIMESTAMP 15 +#define S_TCB_TIMESTAMP 11 +#define M_TCB_TIMESTAMP 0xffffffffULL +#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP) + +#define W_TCB_TIMESTAMP_OFFSET 16 +#define S_TCB_TIMESTAMP_OFFSET 11 +#define M_TCB_TIMESTAMP_OFFSET 0xfULL +#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET) + +#define W_TCB_TX_MAX 16 +#define S_TCB_TX_MAX 15 +#define M_TCB_TX_MAX 0xffffffffULL +#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX) + +#define W_TCB_TX_HDR_PTR_RAW 17 +#define S_TCB_TX_HDR_PTR_RAW 15 +#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL +#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW) + +#define W_TCB_TX_LAST_PTR_RAW 18 +#define S_TCB_TX_LAST_PTR_RAW 0 +#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL +#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW) + +#define W_TCB_TX_COMPACT 18 +#define S_TCB_TX_COMPACT 17 +#define M_TCB_TX_COMPACT 0x1ULL +#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT) + +#define W_TCB_RX_COMPACT 18 +#define S_TCB_RX_COMPACT 18 +#define M_TCB_RX_COMPACT 0x1ULL +#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT) + +#define W_TCB_RCV_WND 18 +#define S_TCB_RCV_WND 19 +#define M_TCB_RCV_WND 0x7ffffffULL +#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND) + +#define W_TCB_RX_HDR_OFFSET 19 +#define S_TCB_RX_HDR_OFFSET 14 +#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL +#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET) + +#define W_TCB_RX_FRAG0_START_IDX_RAW 20 +#define S_TCB_RX_FRAG0_START_IDX_RAW 9 +#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW) + +#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21 +#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4 +#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL +#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET) + +#define W_TCB_RX_FRAG0_LEN 21 +#define S_TCB_RX_FRAG0_LEN 31 +#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN) + +#define W_TCB_RX_FRAG1_LEN 22 +#define S_TCB_RX_FRAG1_LEN 26 +#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN) + +#define W_TCB_NEWRENO_RECOVER 23 +#define S_TCB_NEWRENO_RECOVER 21 +#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL +#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER) + +#define W_TCB_PDU_HAVE_LEN 24 +#define S_TCB_PDU_HAVE_LEN 16 +#define M_TCB_PDU_HAVE_LEN 0x1ULL +#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN) + +#define W_TCB_PDU_LEN 24 +#define S_TCB_PDU_LEN 17 +#define M_TCB_PDU_LEN 0xffffULL +#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN) + +#define W_TCB_RX_QUIESCE 25 +#define S_TCB_RX_QUIESCE 1 +#define M_TCB_RX_QUIESCE 0x1ULL +#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE) + +#define W_TCB_RX_PTR_RAW 25 +#define S_TCB_RX_PTR_RAW 2 +#define M_TCB_RX_PTR_RAW 0x1ffffULL +#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW) + +#define W_TCB_CPU_NO 25 +#define S_TCB_CPU_NO 19 +#define M_TCB_CPU_NO 0x7fULL +#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO) + +#define W_TCB_ULP_TYPE 25 +#define S_TCB_ULP_TYPE 26 +#define M_TCB_ULP_TYPE 0xfULL +#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE) + +#define W_TCB_RX_FRAG1_PTR_RAW 25 +#define S_TCB_RX_FRAG1_PTR_RAW 30 +#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW) + +#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26 +#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15 +#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW) + +#define W_TCB_RX_FRAG2_PTR_RAW 27 +#define S_TCB_RX_FRAG2_PTR_RAW 10 +#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW) + +#define W_TCB_RX_FRAG2_LEN_RAW 27 +#define S_TCB_RX_FRAG2_LEN_RAW 27 +#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW) + +#define W_TCB_RX_FRAG3_PTR_RAW 28 +#define S_TCB_RX_FRAG3_PTR_RAW 22 +#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW) + +#define W_TCB_RX_FRAG3_LEN_RAW 29 +#define S_TCB_RX_FRAG3_LEN_RAW 7 +#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW) + +#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30 +#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2 +#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW) + +#define W_TCB_PDU_HDR_LEN 30 +#define S_TCB_PDU_HDR_LEN 29 +#define M_TCB_PDU_HDR_LEN 0xffULL +#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN) + +#define W_TCB_SLUSH1 31 +#define S_TCB_SLUSH1 5 +#define M_TCB_SLUSH1 0x7ffffULL +#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1) + +#define W_TCB_ULP_RAW 31 +#define S_TCB_ULP_RAW 24 +#define M_TCB_ULP_RAW 0xffULL +#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW) + +#define W_TCB_DDP_RDMAP_VERSION 25 +#define S_TCB_DDP_RDMAP_VERSION 30 +#define M_TCB_DDP_RDMAP_VERSION 0x1ULL +#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION) + +#define W_TCB_MARKER_ENABLE_RX 25 +#define S_TCB_MARKER_ENABLE_RX 31 +#define M_TCB_MARKER_ENABLE_RX 0x1ULL +#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX) + +#define W_TCB_MARKER_ENABLE_TX 26 +#define S_TCB_MARKER_ENABLE_TX 0 +#define M_TCB_MARKER_ENABLE_TX 0x1ULL +#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX) + +#define W_TCB_CRC_ENABLE 26 +#define S_TCB_CRC_ENABLE 1 +#define M_TCB_CRC_ENABLE 0x1ULL +#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE) + +#define W_TCB_IRS_ULP 26 +#define S_TCB_IRS_ULP 2 +#define M_TCB_IRS_ULP 0x1ffULL +#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP) + +#define W_TCB_ISS_ULP 26 +#define S_TCB_ISS_ULP 11 +#define M_TCB_ISS_ULP 0x1ffULL +#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP) + +#define W_TCB_TX_PDU_LEN 26 +#define S_TCB_TX_PDU_LEN 20 +#define M_TCB_TX_PDU_LEN 0x3fffULL +#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN) + +#define W_TCB_TX_PDU_OUT 27 +#define S_TCB_TX_PDU_OUT 2 +#define M_TCB_TX_PDU_OUT 0x1ULL +#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT) + +#define W_TCB_CQ_IDX_SQ 27 +#define S_TCB_CQ_IDX_SQ 3 +#define M_TCB_CQ_IDX_SQ 0xffffULL +#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ) + +#define W_TCB_CQ_IDX_RQ 27 +#define S_TCB_CQ_IDX_RQ 19 +#define M_TCB_CQ_IDX_RQ 0xffffULL +#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ) + +#define W_TCB_QP_ID 28 +#define S_TCB_QP_ID 3 +#define M_TCB_QP_ID 0xffffULL +#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID) + +#define W_TCB_PD_ID 28 +#define S_TCB_PD_ID 19 +#define M_TCB_PD_ID 0xffffULL +#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID) + +#define W_TCB_STAG 29 +#define S_TCB_STAG 3 +#define M_TCB_STAG 0xffffffffULL +#define V_TCB_STAG(x) ((x) << S_TCB_STAG) + +#define W_TCB_RQ_START 30 +#define S_TCB_RQ_START 3 +#define M_TCB_RQ_START 0x3ffffffULL +#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START) + +#define W_TCB_RQ_MSN 30 +#define S_TCB_RQ_MSN 29 +#define M_TCB_RQ_MSN 0x3ffULL +#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN) + +#define W_TCB_RQ_MAX_OFFSET 31 +#define S_TCB_RQ_MAX_OFFSET 7 +#define M_TCB_RQ_MAX_OFFSET 0xfULL +#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET) + +#define W_TCB_RQ_WRITE_PTR 31 +#define S_TCB_RQ_WRITE_PTR 11 +#define M_TCB_RQ_WRITE_PTR 0x3ffULL +#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR) + +#define W_TCB_INB_WRITE_PERM 31 +#define S_TCB_INB_WRITE_PERM 21 +#define M_TCB_INB_WRITE_PERM 0x1ULL +#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM) + +#define W_TCB_INB_READ_PERM 31 +#define S_TCB_INB_READ_PERM 22 +#define M_TCB_INB_READ_PERM 0x1ULL +#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM) + +#define W_TCB_ORD_L_BIT_VLD 31 +#define S_TCB_ORD_L_BIT_VLD 23 +#define M_TCB_ORD_L_BIT_VLD 0x1ULL +#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD) + +#define W_TCB_RDMAP_OPCODE 31 +#define S_TCB_RDMAP_OPCODE 24 +#define M_TCB_RDMAP_OPCODE 0xfULL +#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE) + +#define W_TCB_TX_FLUSH 31 +#define S_TCB_TX_FLUSH 28 +#define M_TCB_TX_FLUSH 0x1ULL +#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH) + +#define W_TCB_TX_OOS_RXMT 31 +#define S_TCB_TX_OOS_RXMT 29 +#define M_TCB_TX_OOS_RXMT 0x1ULL +#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT) + +#define W_TCB_TX_OOS_TXMT 31 +#define S_TCB_TX_OOS_TXMT 30 +#define M_TCB_TX_OOS_TXMT 0x1ULL +#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT) + +#define W_TCB_SLUSH_AUX2 31 +#define S_TCB_SLUSH_AUX2 31 +#define M_TCB_SLUSH_AUX2 0x1ULL +#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2) + +#define W_TCB_RX_FRAG1_PTR_RAW2 25 +#define S_TCB_RX_FRAG1_PTR_RAW2 30 +#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2) + +#define W_TCB_RX_DDP_FLAGS 26 +#define S_TCB_RX_DDP_FLAGS 15 +#define M_TCB_RX_DDP_FLAGS 0x3ffULL +#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS) + +#define W_TCB_SLUSH_AUX3 26 +#define S_TCB_SLUSH_AUX3 31 +#define M_TCB_SLUSH_AUX3 0x1ffULL +#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3) + +#define W_TCB_RX_DDP_BUF0_OFFSET 27 +#define S_TCB_RX_DDP_BUF0_OFFSET 8 +#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET) + +#define W_TCB_RX_DDP_BUF0_LEN 27 +#define S_TCB_RX_DDP_BUF0_LEN 30 +#define M_TCB_RX_DDP_BUF0_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN) + +#define W_TCB_RX_DDP_BUF1_OFFSET 28 +#define S_TCB_RX_DDP_BUF1_OFFSET 20 +#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET) + +#define W_TCB_RX_DDP_BUF1_LEN 29 +#define S_TCB_RX_DDP_BUF1_LEN 10 +#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN) + +#define W_TCB_RX_DDP_BUF0_TAG 30 +#define S_TCB_RX_DDP_BUF0_TAG 0 +#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG) + +#define W_TCB_RX_DDP_BUF1_TAG 31 +#define S_TCB_RX_DDP_BUF1_TAG 0 +#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG) + +#define S_TF_DACK 10 +#define V_TF_DACK(x) ((x) << S_TF_DACK) + +#define S_TF_NAGLE 11 +#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE) + +#define S_TF_RECV_SCALE 12 +#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE) + +#define S_TF_RECV_TSTMP 13 +#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP) + +#define S_TF_RECV_SACK 14 +#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK) + +#define S_TF_TURBO 15 +#define V_TF_TURBO(x) ((x) << S_TF_TURBO) + +#define S_TF_KEEPALIVE 16 +#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE) + +#define S_TF_TCAM_BYPASS 17 +#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS) + +#define S_TF_CORE_FIN 18 +#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN) + +#define S_TF_CORE_MORE 19 +#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE) + +#define S_TF_MIGRATING 20 +#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING) + +#define S_TF_ACTIVE_OPEN 21 +#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN) + +#define S_TF_ASK_MODE 22 +#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE) + +#define S_TF_NON_OFFLOAD 23 +#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD) + +#define S_TF_MOD_SCHD 24 +#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD) + +#define S_TF_MOD_SCHD_REASON0 25 +#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0) + +#define S_TF_MOD_SCHD_REASON1 26 +#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1) + +#define S_TF_MOD_SCHD_RX 27 +#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX) + +#define S_TF_CORE_PUSH 28 +#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH) + +#define S_TF_RCV_COALESCE_ENABLE 29 +#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE) + +#define S_TF_RCV_COALESCE_PUSH 30 +#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH) + +#define S_TF_RCV_COALESCE_LAST_PSH 31 +#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH) + +#define S_TF_RCV_COALESCE_HEARTBEAT 32 +#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT) + +#define S_TF_HALF_CLOSE 33 +#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE) + +#define S_TF_DACK_MSS 34 +#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS) + +#define S_TF_CCTRL_SEL0 35 +#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0) + +#define S_TF_CCTRL_SEL1 36 +#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1) + +#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37 +#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY) + +#define S_TF_TX_PACE_AUTO 38 +#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO) + +#define S_TF_PEER_FIN_HELD 39 +#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD) + +#define S_TF_CORE_URG 40 +#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG) + +#define S_TF_RDMA_ERROR 41 +#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR) + +#define S_TF_SSWS_DISABLED 42 +#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED) + +#define S_TF_DUPACK_COUNT_ODD 43 +#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD) + +#define S_TF_TX_CHANNEL 44 +#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL) + +#define S_TF_RX_CHANNEL 45 +#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL) + +#define S_TF_TX_PACE_FIXED 46 +#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED) + +#define S_TF_RDMA_FLM_ERROR 47 +#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR) + +#define S_TF_RX_FLOW_CONTROL_DISABLE 48 +#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE) + +#endif /* _TCB_DEFS_H */ diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index c069be8..6c4f9f9 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -756,6 +756,8 @@ void ehca_destroy_comp_pool(void) if (cpu_online(i)) destroy_comp_task(pool, i); } + free_percpu(pool->cpu_comp_tasks); + kfree(pool); #endif return; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 968d151..7131446 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1051,7 +1051,11 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); dev_lim->max_eqs = 1 << (field & 0x7); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); - dev_lim->reserved_mtts = 1 << (field >> 4); + if (mthca_is_memfree(dev)) + dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), + MTHCA_MTT_SEG_SIZE) / MTHCA_MTT_SEG_SIZE; + else + dev_lim->reserved_mtts = 1 << (field >> 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); dev_lim->max_mrw_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index fe5cecf..b7e42ef 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -464,6 +464,8 @@ void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); +int mthca_write_mtt_size(struct mthca_dev *dev); + struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 44bc6cc..0d9b7d0 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -379,7 +379,7 @@ static int mthca_load_fw(struct mthca_dev *mdev) mdev->fw.arbel.fw_icm = mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.fw_icm) { mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; @@ -412,7 +412,7 @@ err_unmap_fa: mthca_UNMAP_FA(mdev, &status); err_free: - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); return err; } @@ -441,7 +441,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, (unsigned long long) aux_pages << 2); mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.aux_icm) { mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; @@ -464,10 +464,15 @@ static int mthca_init_icm(struct mthca_dev *mdev, goto err_unmap_aux; } + /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ + mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * MTHCA_MTT_SEG_SIZE, + dma_get_cache_alignment()) / MTHCA_MTT_SEG_SIZE; + mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, MTHCA_MTT_SEG_SIZE, mdev->limits.num_mtt_segs, - mdev->limits.reserved_mtts, 1); + mdev->limits.reserved_mtts, + 1, 0); if (!mdev->mr_table.mtt_table) { mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); err = -ENOMEM; @@ -477,7 +482,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, dev_lim->mpt_entry_sz, mdev->limits.num_mpts, - mdev->limits.reserved_mrws, 1); + mdev->limits.reserved_mrws, + 1, 1); if (!mdev->mr_table.mpt_table) { mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); err = -ENOMEM; @@ -487,7 +493,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, dev_lim->qpc_entry_sz, mdev->limits.num_qps, - mdev->limits.reserved_qps, 0); + mdev->limits.reserved_qps, + 0, 0); if (!mdev->qp_table.qp_table) { mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); err = -ENOMEM; @@ -497,7 +504,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, dev_lim->eqpc_entry_sz, mdev->limits.num_qps, - mdev->limits.reserved_qps, 0); + mdev->limits.reserved_qps, + 0, 0); if (!mdev->qp_table.eqp_table) { mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); err = -ENOMEM; @@ -507,7 +515,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, MTHCA_RDB_ENTRY_SIZE, mdev->limits.num_qps << - mdev->qp_table.rdb_shift, + mdev->qp_table.rdb_shift, 0, 0, 0); if (!mdev->qp_table.rdb_table) { mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); @@ -518,7 +526,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, dev_lim->cqc_entry_sz, mdev->limits.num_cqs, - mdev->limits.reserved_cqs, 0); + mdev->limits.reserved_cqs, + 0, 0); if (!mdev->cq_table.table) { mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); err = -ENOMEM; @@ -530,7 +539,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mthca_alloc_icm_table(mdev, init_hca->srqc_base, dev_lim->srq_entry_sz, mdev->limits.num_srqs, - mdev->limits.reserved_srqs, 0); + mdev->limits.reserved_srqs, + 0, 0); if (!mdev->srq_table.table) { mthca_err(mdev, "Failed to map SRQ context memory, " "aborting.\n"); @@ -550,7 +560,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->limits.num_amgms, mdev->limits.num_mgms + mdev->limits.num_amgms, - 0); + 0, 0); if (!mdev->mcg_table.table) { mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); err = -ENOMEM; @@ -588,7 +598,7 @@ err_unmap_aux: mthca_UNMAP_ICM_AUX(mdev, &status); err_free_aux: - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); return err; } @@ -609,7 +619,7 @@ static void mthca_free_icms(struct mthca_dev *mdev) mthca_unmap_eq_icm(mdev); mthca_UNMAP_ICM_AUX(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); } static int mthca_init_arbel(struct mthca_dev *mdev) @@ -693,7 +703,7 @@ err_free_icm: err_stop_fw: mthca_UNMAP_FA(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); err_disable: if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) @@ -712,7 +722,7 @@ static void mthca_close_hca(struct mthca_dev *mdev) mthca_free_icms(mdev); mthca_UNMAP_FA(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 6b19645..0b9d053 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -35,6 +35,9 @@ */ #include <linux/mm.h> +#include <linux/scatterlist.h> + +#include <asm/page.h> #include "mthca_memfree.h" #include "mthca_dev.h" @@ -58,22 +61,42 @@ struct mthca_user_db_table { } page[0]; }; -void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) +static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(chunk->mem[i].page, + get_order(chunk->mem[i].length)); +} + +static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) { - struct mthca_icm_chunk *chunk, *tmp; int i; + for (i = 0; i < chunk->npages; ++i) { + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(chunk->mem[i].page), + sg_dma_address(&chunk->mem[i])); + } +} + +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) +{ + struct mthca_icm_chunk *chunk, *tmp; + if (!icm) return; list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { - if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, - PCI_DMA_BIDIRECTIONAL); - - for (i = 0; i < chunk->npages; ++i) - __free_pages(chunk->mem[i].page, - get_order(chunk->mem[i].length)); + if (coherent) + mthca_free_icm_coherent(dev, chunk); + else + mthca_free_icm_pages(dev, chunk); kfree(chunk); } @@ -81,12 +104,41 @@ void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) kfree(icm); } +static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + mem->page = alloc_pages(gfp_mask, order); + if (!mem->page) + return -ENOMEM; + + mem->length = PAGE_SIZE << order; + mem->offset = 0; + return 0; +} + +static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), + gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - gfp_t gfp_mask) + gfp_t gfp_mask, int coherent) { struct mthca_icm *icm; struct mthca_icm_chunk *chunk = NULL; int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!icm) @@ -112,21 +164,28 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, while (1 << cur_order > npages) --cur_order; - chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order); - if (chunk->mem[chunk->npages].page) { - chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order; - chunk->mem[chunk->npages].offset = 0; + if (coherent) + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); - if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) { + if (!ret) { + ++chunk->npages; + + if (!coherent && chunk->npages == MTHCA_ICM_CHUNK_LEN) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); if (chunk->nsg <= 0) goto fail; + } + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) chunk = NULL; - } npages -= 1 << cur_order; } else { @@ -136,7 +195,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, } } - if (chunk) { + if (!coherent && chunk) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -148,7 +207,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, return icm; fail: - mthca_free_icm(dev, icm); + mthca_free_icm(dev, icm, coherent); return NULL; } @@ -167,7 +226,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; goto out; @@ -175,7 +234,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE, &status) || status) { - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; ret = -ENOMEM; goto out; @@ -204,16 +263,16 @@ void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int o mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; } mutex_unlock(&table->mutex); } -void *mthca_table_find(struct mthca_icm_table *table, int obj) +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) { - int idx, offset, i; + int idx, offset, dma_offset, i; struct mthca_icm_chunk *chunk; struct mthca_icm *icm; struct page *page = NULL; @@ -225,13 +284,22 @@ void *mthca_table_find(struct mthca_icm_table *table, int obj) idx = (obj & (table->num_obj - 1)) * table->obj_size; icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; - offset = idx % MTHCA_TABLE_CHUNK_SIZE; + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; if (!icm) goto out; list_for_each_entry(chunk, &icm->chunk_list, list) { for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. */ if (chunk->mem[i].length > offset) { page = chunk->mem[i].page; goto out; @@ -283,7 +351,7 @@ void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem) + int use_lowmem, int use_coherent) { struct mthca_icm_table *table; int num_icm; @@ -302,6 +370,7 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, table->num_obj = nobj; table->obj_size = obj_size; table->lowmem = use_lowmem; + table->coherent = use_coherent; mutex_init(&table->mutex); for (i = 0; i < num_icm; ++i) @@ -314,12 +383,12 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, use_coherent); if (!table->icm[i]) goto err; if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE, &status) || status) { - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; goto err; } @@ -339,7 +408,7 @@ err: mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); } kfree(table); @@ -357,7 +426,7 @@ void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table) mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); } kfree(table); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h index 6d42947..5941441 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.h +++ b/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -69,6 +69,7 @@ struct mthca_icm_table { int num_obj; int obj_size; int lowmem; + int coherent; struct mutex mutex; struct mthca_icm *icm[0]; }; @@ -82,17 +83,17 @@ struct mthca_icm_iter { struct mthca_dev; struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - gfp_t gfp_mask); -void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm); + gfp_t gfp_mask, int coherent); +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem); + int use_lowmem, int use_coherent); void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); -void *mthca_table_find(struct mthca_icm_table *table, int obj); +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, int start, int end); void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index f71ffa8..6037dd3 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -243,8 +243,8 @@ void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt) kfree(mtt); } -int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, - int start_index, u64 *buffer_list, int list_len) +static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) { struct mthca_mailbox *mailbox; __be64 *mtt_entry; @@ -295,6 +295,84 @@ out: return err; } +int mthca_write_mtt_size(struct mthca_dev *dev) +{ + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + /* + * Be friendly to WRITE_MTT command + * and leave two empty slots for the + * index and reserved fields of the + * mailbox. + */ + return PAGE_SIZE / sizeof (u64) - 2; + + /* For Arbel, all MTTs must fit in the same page. */ + return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff; +} + +void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + u64 __iomem *mtts; + int i; + + mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * MTHCA_MTT_SEG_SIZE + + start_index * sizeof (u64); + for (i = 0; i < list_len; ++i) + mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT), + mtts + i); +} + +void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* For Arbel, all MTTs must fit in the same page. */ + BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE); + /* Require full segments */ + BUG_ON(s % MTHCA_MTT_SEG_SIZE); + + mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg + + s / MTHCA_MTT_SEG_SIZE, &dma_handle); + + BUG_ON(!mtts); + + for (i = 0; i < list_len; ++i) + mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof (u64), DMA_TO_DEVICE); +} + +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + int size = mthca_write_mtt_size(dev); + int chunk; + + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); + + while (list_len > 0) { + chunk = min(size, list_len); + if (mthca_is_memfree(dev)) + mthca_arbel_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + else + mthca_tavor_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + + list_len -= chunk; + start_index += chunk; + buffer_list += chunk; + } + + return 0; +} + static inline u32 tavor_hw_index_to_key(u32 ind) { return ind; @@ -524,7 +602,7 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, if (err) goto err_out_mpt_free; - mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key); + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); BUG_ON(!mr->mem.arbel.mpt); } else mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + @@ -538,7 +616,8 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, if (mthca_is_memfree(dev)) { mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, - mr->mtt->first_seg); + mr->mtt->first_seg, + &mr->mem.arbel.dma_handle); BUG_ON(!mr->mem.arbel.mtts); } else mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; @@ -712,6 +791,9 @@ int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | MTHCA_MTT_FLAG_PRESENT); + dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + fmr->mem.arbel.mpt->key = cpu_to_be32(key); fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); @@ -761,7 +843,7 @@ void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) int mthca_init_mr_table(struct mthca_dev *dev) { unsigned long addr; - int err, i; + int mpts, mtts, err, i; err = mthca_alloc_init(&dev->mr_table.mpt_alloc, dev->limits.num_mpts, @@ -795,13 +877,21 @@ int mthca_init_mr_table(struct mthca_dev *dev) err = -EINVAL; goto err_fmr_mpt; } + mpts = mtts = 1 << i; + } else { + mpts = dev->limits.num_mtt_segs; + mtts = dev->limits.num_mpts; + } + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_FMR)) { addr = pci_resource_start(dev->pdev, 4) + ((pci_resource_len(dev->pdev, 4) - 1) & dev->mr_table.mpt_base); dev->mr_table.tavor_fmr.mpt_base = - ioremap(addr, (1 << i) * sizeof(struct mthca_mpt_entry)); + ioremap(addr, mpts * sizeof(struct mthca_mpt_entry)); if (!dev->mr_table.tavor_fmr.mpt_base) { mthca_warn(dev, "MPT ioremap for FMR failed.\n"); @@ -814,19 +904,21 @@ int mthca_init_mr_table(struct mthca_dev *dev) dev->mr_table.mtt_base); dev->mr_table.tavor_fmr.mtt_base = - ioremap(addr, (1 << i) * MTHCA_MTT_SEG_SIZE); + ioremap(addr, mtts * MTHCA_MTT_SEG_SIZE); if (!dev->mr_table.tavor_fmr.mtt_base) { mthca_warn(dev, "MTT ioremap for FMR failed.\n"); err = -ENOMEM; goto err_fmr_mtt; } + } - err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, i); + if (dev->limits.fmr_reserved_mtts) { + err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1)); if (err) goto err_fmr_mtt_buddy; /* Prevent regular MRs from using FMR keys */ - err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, i); + err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1)); if (err) goto err_reserve_fmr; diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c index 58d44aa..26bf86d 100644 --- a/drivers/infiniband/hw/mthca/mthca_profile.c +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -277,7 +277,7 @@ u64 mthca_make_profile(struct mthca_dev *dev, * out of the MR pool. They don't use additional memory, but * we assign them as part of the HCA profile anyway. */ - if (mthca_is_memfree(dev)) + if (mthca_is_memfree(dev) || BITS_PER_LONG == 64) dev->limits.fmr_reserved_mtts = 0; else dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 7b96751..0725ad7 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1015,6 +1015,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, int shift, n, len; int i, j, k; int err = 0; + int write_mtt_size; shift = ffs(region->page_size) - 1; @@ -1040,6 +1041,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, i = n = 0; + write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); + list_for_each_entry(chunk, ®ion->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; @@ -1047,14 +1050,11 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, pages[i++] = sg_dma_address(&chunk->page_list[j]) + region->page_size * k; /* - * Be friendly to WRITE_MTT command - * and leave two empty slots for the - * index and reserved fields of the - * mailbox. + * Be friendly to write_mtt and pass it chunks + * of appropriate size. */ - if (i == PAGE_SIZE / sizeof (u64) - 2) { - err = mthca_write_mtt(dev, mr->mtt, - n, pages, i); + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); if (err) goto mtt_done; n += i; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 9a5bece..1d266ac 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -89,6 +89,7 @@ struct mthca_fmr { struct { struct mthca_mpt_entry *mpt; __be64 *mtts; + dma_addr_t dma_handle; } arbel; } mem; }; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 5f5214c..224c93d 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -399,7 +399,7 @@ static int to_ib_qp_access_flags(int mthca_flags) static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, struct mthca_qp_path *path) { - memset(ib_ah_attr, 0, sizeof *path); + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 10684da..61974b0 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -116,11 +116,16 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev, struct mthca_srq *srq, struct mthca_arbel_srq_context *context) { - int logsize; + int logsize, max; memset(context, 0, sizeof *context); - logsize = ilog2(srq->max); + /* + * Put max in a temporary variable to work around gcc bug + * triggered by ilog2() on sparc64. + */ + max = srq->max; + logsize = ilog2(max); context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn); context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); context->db_index = cpu_to_be32(srq->db_index); diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index c75322d..af78ccc 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" - depends on INFINIBAND && NETDEVICES && INET + depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n) ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB @@ -8,6 +8,20 @@ config INFINIBAND_IPOIB See Documentation/infiniband/ipoib.txt for more information +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB && EXPERIMENTAL + default n + ---help--- + This option enables experimental support for IPoIB connected mode. + After enabling this option, you need to switch to connected mode through + /sys/class/net/ibXXX/mode to actually create connections, and then increase + the interface MTU with e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some + packet drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + config INFINIBAND_IPOIB_DEBUG bool "IP-over-InfiniBand debugging" if EMBEDDED depends on INFINIBAND_IPOIB diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 8935e74..98ee38e 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \ ipoib_multicast.o \ ipoib_verbs.o \ ipoib_vlan.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 07deee8..2594db2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -62,6 +62,10 @@ enum { IPOIB_ENCAP_LEN = 4, + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, IPOIB_RX_RING_SIZE = 128, IPOIB_TX_RING_SIZE = 64, IPOIB_MAX_QUEUE_SIZE = 8192, @@ -81,6 +85,8 @@ enum { IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_MCAST_STARTED = 8, + IPOIB_FLAG_NETIF_STOPPED = 9, + IPOIB_FLAG_ADMIN_CM = 10, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -90,6 +96,13 @@ enum { IPOIB_MCAST_FLAG_ATTACHED = 3, }; +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_CM_OP_SRQ (1ul << 30) +#else +#define IPOIB_CM_OP_SRQ (0) +#endif + /* structs */ struct ipoib_header { @@ -113,6 +126,59 @@ struct ipoib_tx_buf { u64 mapping; }; +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_cq *cq; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; + struct ib_wc ibwc[IPOIB_NUM_WC]; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; +}; + /* * Device private locking: tx_lock protects members used in TX fast * path (and we use LLTX so upper layers don't do extra locking). @@ -179,6 +245,10 @@ struct ipoib_dev_priv { struct list_head child_intfs; struct list_head list; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; @@ -212,6 +282,9 @@ struct ipoib_path { struct ipoib_neigh { struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif union ib_gid dgid; struct sk_buff_head queue; @@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(n->ha) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +#endif + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG void ipoib_create_debug_files(struct net_device *dev); void ipoib_delete_debug_files(struct net_device *dev); @@ -392,4 +605,6 @@ extern int ipoib_debug_level; #define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 0000000..2d48387 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1237 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <rdma/ib_cm.h> +#include <rdma/ib_cache.h> +#include <net/dst.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#include "ipoib.h" + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +struct ipoib_cm_id { + struct ib_cm_id *id; + int flags; + u32 remote_qpn; + u32 remote_mtu; +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i) + ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, + u64 mapping[IPOIB_CM_RX_SG]) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return -ENOMEM; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return -EIO; + } + + for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) { + struct page *page = alloc_page(GFP_ATOMIC); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, + 0, PAGE_SIZE, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + priv->cm.srq_ring[id].skb = skb; + return 0; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i >= 0; --i) + ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + + kfree_skb(skb); + return -ENOMEM; +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->cq, /* does not matter, we never send anything */ + .recv_cq = priv->cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.target_ack_delay = 20; /* FIXME */ + rep.srq = 1; + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long flags; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = random32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + goto err_rep; + } + + cm_id->context = p; + p->jiffies = jiffies; + spin_lock_irqsave(&priv->lock, flags); + list_add(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + return 0; + +err_rep: +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + spin_lock_irqsave(&priv->lock, flags); + if (list_empty(&p->list)) + ret = 0; /* Connection is going away already. */ + else { + list_del_init(&p->list); + ret = -ECONNRESET; + } + spin_unlock_irqrestore(&priv->lock, flags); + if (ret) { + ib_destroy_qp(p->qp); + kfree(p); + return ret; + } + return 0; + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + __free_page(frag->page); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + frag->size = size; + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ; + struct sk_buff *skb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + + ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n", + wr_id, wc->opcode, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->cm.srq_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++priv->stats.rx_dropped; + goto repost; + } + + if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) { + p = wc->qp->qp_context; + if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do + * not re-add it if it has been removed. */ + if (!list_empty(&p->list)) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + } + } + + if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++priv->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping); + memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb->mac.raw = skb->data; + skb_pull(skb, IPOIB_ENCAP_LEN); + + dev->last_rx = jiffies; + ++priv->stats.rx_packets; + priv->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_rx_ni(skb); + +repost: + if (unlikely(ipoib_cm_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive failed " + "for buf %d\n", wr_id); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge.addr = addr; + priv->tx_sge.length = len; + + priv->tx_wr.wr_id = wr_id; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_tx_buf *tx_req; + u64 addr; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++priv->stats.tx_dropped; + ++priv->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - INFINIBAND_ALEN); + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). + */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++priv->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len))) { + ipoib_warn(priv, "post_send failed\n"); + ++priv->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + } + } +} + +static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx, + struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, op %d, status: %d\n", + wr_id, wc->opcode, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++priv->stats.tx_packets; + priv->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + spin_lock_irqsave(&priv->tx_lock, flags); + ++tx->tx_tail; + if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) && + tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) { + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + netif_wake_queue(dev); + } + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + ipoib_dbg(priv, "failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + + spin_lock(&priv->lock); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + /* queue would be re-started anyway when TX is destroyed, + * but it makes sense to do it ASAP here. */ + if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) + netif_wake_queue(dev); + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock(&priv->lock); + } + + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr) +{ + struct ipoib_cm_tx *tx = tx_ptr; + int n, i; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + do { + n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc); + for (i = 0; i < n; ++i) + ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i); + } while (n == IPOIB_NUM_WC); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + return IS_ERR(priv->cm.id); + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + ib_destroy_cm_id(priv->cm.id); + return ret; + } + return 0; +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long flags; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return; + + ib_destroy_cm_id(priv->cm.id); + spin_lock_irqsave(&priv->lock, flags); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + ib_destroy_cm_id(p->id); + ib_destroy_qp(p->qp); + kfree(p); + spin_lock_irqsave(&priv->lock, flags); + } + spin_unlock_irqrestore(&priv->lock, flags); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + unsigned long flags; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n", + p->mtu, priv->dev->mtu); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irqsave(&priv->lock, flags); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = {}; + attr.recv_cq = priv->cq; + attr.srq = priv->cm.srq; + attr.cap.max_send_wr = ipoib_sendq_size; + attr.cap.max_send_sge = 1; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = cq; + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = 1; + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_KERNEL); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + + p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p, + ipoib_sendq_size + 1); + if (IS_ERR(p->cq)) { + ret = PTR_ERR(p->cq); + ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret); + goto err_cq; + } + + ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP); + if (ret) { + ipoib_warn(priv, "failed to request completion notification: %d\n", ret); + goto err_req_notify; + } + + p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n", + p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_req_notify: +err_qp: + p->qp = NULL; + ib_destroy_cq(p->cq); +err_cq: + p->cq = NULL; +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_tx_buf *tx_req; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->qp) + ib_destroy_qp(p->qp); + + if (p->cq) + ib_destroy_cq(p->cq); + + if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags)) + netif_wake_queue(p->dev); + + if (p->tx_ring) { + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + } + + kfree(p->tx_ring); + } + + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(ipoib_workqueue, &priv->cm.start_task); + return tx; +} + +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(tx->neigh->dgid)); + tx->neigh = NULL; + } +} + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + int ret; + + struct ib_sa_path_rec pathrec; + u32 qpn; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + qpn = IPOIB_QPN(neigh->neighbour->ha); + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + if (ret) { + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + } + list_del(&p->list); + kfree(p); + } + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct ipoib_cm_tx *p; + unsigned long flags; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del(&p->list); + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + ipoib_cm_tx_destroy(p); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + + unsigned mtu = priv->mcast_mtu; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); +#endif + dev_kfree_skb_any(skb); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + while (!list_empty(&priv->cm.passive_ids)) { + /* List if sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + ib_destroy_cm_id(p->id); + ib_destroy_qp(p->qp); + kfree(p); + spin_lock_irqsave(&priv->lock, flags); + } + spin_unlock_irqrestore(&priv->lock, flags); +} + + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + ipoib_flush_paths(dev); + return count; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + dev->mtu = min(priv->mcast_mtu, dev->mtu); + ipoib_flush_paths(dev); + return count; + } + + return -EINVAL; +} + +static DEVICE_ATTR(mode, S_IWUGO | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = IPOIB_CM_RX_SG + } + }; + int ret, i; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + ret = PTR_ERR(priv->cm.srq); + priv->cm.srq = NULL; + return ret; + } + + priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, + GFP_KERNEL); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].lkey = priv->mr->lkey; + + priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].length = PAGE_SIZE; + priv->cm.rx_wr.next = NULL; + priv->cm.rx_wr.sg_list = priv->cm.rx_sge; + priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + if (ipoib_cm_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->cm.srq_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[i].skb); + priv->cm.srq_ring[i].skb = NULL; + } + kfree(priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 59d9594..f2aa923 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -#define IPOIB_OP_RECV (1ul << 31) - static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, @@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->tx_lock, flags); ++priv->tx_tail; - if (netif_queue_stopped(dev) && - test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && - priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) + if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) && + priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) { + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); netif_wake_queue(dev); + } spin_unlock_irqrestore(&priv->tx_lock, flags); if (wc->status != IB_WC_SUCCESS && @@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) { - if (wc->wr_id & IPOIB_OP_RECV) + if (wc->wr_id & IPOIB_CM_OP_SRQ) + ipoib_cm_handle_rx_wc(dev, wc); + else if (wc->wr_id & IPOIB_OP_RECV) ipoib_ib_handle_rx_wc(dev, wc); else ipoib_ib_handle_tx_wc(dev, wc); @@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; u64 addr; - if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { + if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, dev->mtu + INFINIBAND_ALEN); + skb->len, priv->mcast_mtu + INFINIBAND_ALEN); ++priv->stats.tx_dropped; ++priv->stats.tx_errors; - dev_kfree_skb_any(skb); + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); return; } @@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); netif_stop_queue(dev); + set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); } } } @@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev) return -1; } + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + ipoib_ib_dev_stop(dev); + return -1; + } + clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); @@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev) clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_cm_dev_stop(dev); + /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index af5ee2e..18d27fd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,8 +49,6 @@ #include <net/dst.h> -#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) - MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); @@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); + /* * Now flush workqueue to make sure a scheduled task doesn't * bring our internal state back up. @@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) { + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) { return -EINVAL; + } priv->admin_mtu = new_mtu; @@ -414,6 +424,20 @@ static void path_rec_completion(int status, memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, sizeof(union ib_gid)); + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + continue; + } + } + while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } @@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, sizeof(union ib_gid)); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else + ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); } else { neigh->ah = NULL; @@ -538,6 +580,7 @@ err_list: err_path: ipoib_neigh_free(dev, neigh); +err_drop: ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) neigh = *to_ipoib_neigh(skb->dst->neighbour); - if (likely(neigh->ah)) { + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto out; + } + } else if (neigh->ah) { if (unlikely(memcmp(&neigh->dgid.raw, skb->dst->neighbour->ha + 4, sizeof(union ib_gid)))) { @@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) neigh->neighbour = neighbour; *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); + ipoib_cm_set(neigh, NULL); return neigh; } @@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); kfree(neigh); } @@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format, ipoib_create_debug_files(priv->dev); + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index b04b72c..fea737f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work) priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) - IPOIB_ENCAP_LEN; - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + + if (!ipoib_cm_admin_enabled(dev)) + dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 7b717c64..3cb551b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) .qp_type = IB_QPT_UD }; + int ret, size; + priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } - priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, - ipoib_sendq_size + ipoib_recvq_size + 1); + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); + goto out_free_pd; + } + + size = ipoib_sendq_size + ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) + size += ipoib_recvq_size; + + priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size); if (IS_ERR(priv->cq)) { printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); - goto out_free_pd; + goto out_free_mr; } if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP)) goto out_free_cq; - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_cq; - } - init_attr.send_cq = priv->cq; init_attr.recv_cq = priv->cq, priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); - goto out_free_mr; + goto out_free_cq; } priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; @@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) return 0; -out_free_mr: - ib_dereg_mr(priv->mr); - out_free_cq: ib_destroy_cq(priv->cq); +out_free_mr: + ib_dereg_mr(priv->mr); + out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; @@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } - if (ib_dereg_mr(priv->mr)) - ipoib_warn(priv, "ib_dereg_mr failed\n"); - if (ib_destroy_cq(priv->cq)) ipoib_warn(priv, "ib_cq_destroy failed\n"); + ipoib_cm_dev_cleanup(dev); + + if (ib_dereg_mr(priv->mr)) + ipoib_warn(priv, "ib_dereg_mr failed\n"); + if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 085eafe..6762988 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_create_debug_files(priv->dev); + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; |