summaryrefslogtreecommitdiffstats
path: root/sys/ofed
diff options
context:
space:
mode:
authornp <np@FreeBSD.org>2016-01-22 23:33:34 +0000
committernp <np@FreeBSD.org>2016-01-22 23:33:34 +0000
commit057d736604eed731d9aa730e87220c4caa227ae3 (patch)
tree2bc6c60b6ab83ffb2aa3364ef3759bd23b93757e /sys/ofed
parent9a3b34d7631b1ef522add470e0131f1a802f4968 (diff)
downloadFreeBSD-src-057d736604eed731d9aa730e87220c4caa227ae3.zip
FreeBSD-src-057d736604eed731d9aa730e87220c4caa227ae3.tar.gz
Fix for iWARP servers that listen on INADDR_ANY.
The iWARP Connection Manager (CM) on FreeBSD creates a TCP socket to represent an iWARP endpoint when the connection is over TCP. For servers, the current approach is to invoke the create_listen callback for each iWARP RNIC registered with the CM. This doesn't work well for INADDR_ANY because a listen on any TCP socket already notifies all hardware TOEs/RNICs of the new listener. This patch fixes the server side of things for FreeBSD. We've tried to keep all these modifications in the iWARP/TCP-specific parts of the OFED infrastructure as much as possible. Submitted by: Krishnamraju Eraparaju @ Chelsio (with design inputs from Steve Wise) Sponsored by: Chelsio Communications Differential Revision: https://reviews.freebsd.org/D4801
Diffstat (limited to 'sys/ofed')
-rw-r--r--sys/ofed/drivers/infiniband/core/cma.c77
-rw-r--r--sys/ofed/drivers/infiniband/core/iwcm.c292
-rw-r--r--sys/ofed/include/rdma/iw_cm.h8
-rw-r--r--sys/ofed/include/rdma/rdma_cm.h5
4 files changed, 373 insertions, 9 deletions
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
index 40c4d82..1cafced 100644
--- a/sys/ofed/drivers/infiniband/core/cma.c
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -3,6 +3,7 @@
* Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
* Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -407,6 +408,75 @@ static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_nu
return -EAGAIN;
}
+/*
+ * Look up the listening iWARP cm_id that matches a local address.
+ *
+ * local_addr: local IPv4 address/port to resolve; its sin_port is compared
+ *             against each candidate listener's local port.
+ * dev_type:   ARPHRD_INFINIBAND selects the InfiniBand link layer; any
+ *             other value selects the Ethernet link layer.
+ * cm_id:      out parameter; on success receives the matching
+ *             'struct iw_cm_id *' (stored as void *). Not written on failure.
+ *
+ * Returns 0 when a listener was found, -ENODEV otherwise (address
+ * translation failed, no matching device/port, or no matching listener).
+ */
+int
+rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type,
+ void **cm_id)
+{
+ int ret;
+ u8 port;
+ int found_dev = 0, found_cmid = 0;
+ struct rdma_id_private *id_priv;
+ struct rdma_id_private *dev_id_priv;
+ struct cma_device *cma_dev;
+ struct rdma_dev_addr dev_addr;
+ union ib_gid gid;
+ enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+
+ /* Resolve the IP address into device addressing info (dev_addr). */
+ ret = rdma_translate_ip((struct sockaddr *)local_addr,
+ &dev_addr, NULL);
+ if (ret)
+ goto err;
+
+ /* find rdma device based on MAC address/gid */
+ mutex_lock(&lock);
+
+ memcpy(&gid, dev_addr.src_dev_addr +
+ rdma_addr_gid_offset(&dev_addr), sizeof(gid));
+
+ /*
+ * Scan every port of every registered device; only ports with the
+ * requested link layer on an iWARP-transport device are probed.
+ * find_gid_port() == 0 means this device/port owns the gid; a return
+ * of 1 is treated as fatal and aborts the whole search. Any other
+ * value moves on to the next port.
+ */
+ list_for_each_entry(cma_dev, &dev_list, list)
+ for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port)
+ if ((rdma_port_get_link_layer(cma_dev->device, port) ==
+ dev_ll) &&
+ (rdma_node_get_transport(cma_dev->device->node_type) ==
+ RDMA_TRANSPORT_IWARP)) {
+ ret = find_gid_port(cma_dev->device,
+ &gid, port);
+ if (!ret) {
+ found_dev = 1;
+ goto out;
+ } else if (ret == 1) {
+ mutex_unlock(&lock);
+ goto err;
+ }
+ }
+out:
+ mutex_unlock(&lock);
+
+ /* cma_dev is only meaningful below when found_dev is set. */
+ if (!found_dev)
+ goto err;
+
+ /* Traverse through the list of listening cm_id's to find the
+ * desired cm_id based on rdma device & port number.
+ */
+ /*
+ * The walk does not stop at the first hit: if several entries match,
+ * the last one visited is the one reported.
+ * NOTE(review): this traversal runs after 'lock' has been released;
+ * presumably listen_any_list is protected by the same mutex -- confirm
+ * that nothing can modify the lists concurrently.
+ */
+ list_for_each_entry(id_priv, &listen_any_list, list)
+ list_for_each_entry(dev_id_priv, &id_priv->listen_list,
+ listen_list)
+ if (dev_id_priv->cma_dev == cma_dev)
+ if (dev_id_priv->cm_id.iw->local_addr.sin_port
+ == local_addr->sin_port) {
+ *cm_id = (void *)dev_id_priv->cm_id.iw;
+ found_cmid = 1;
+ }
+ return found_cmid ? 0 : -ENODEV;
+
+err:
+ return -ENODEV;
+}
+EXPORT_SYMBOL(rdma_find_cmid_laddr);
+
static int cma_acquire_dev(struct rdma_id_private *id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
@@ -780,6 +850,12 @@ static inline int cma_any_addr(struct sockaddr *addr)
{
return cma_zero_addr(addr) || cma_loopback_addr(addr);
}
+/*
+ * Exported wrapper around the file-local cma_any_addr() helper so that
+ * code outside cma.c can ask whether 'addr' is a wildcard address.
+ * Returns non-zero when cma_any_addr() does (zero or loopback address),
+ * 0 otherwise.
+ */
+int
+rdma_cma_any_addr(struct sockaddr *addr)
+{
+	int is_any = cma_any_addr(addr);
+
+	return is_any;
+}
+EXPORT_SYMBOL(rdma_cma_any_addr);
static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
{
@@ -1707,6 +1783,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
dev_id_priv = container_of(id, struct rdma_id_private, id);
dev_id_priv->state = RDMA_CM_ADDR_BOUND;
+ dev_id_priv->sock = id_priv->sock;
memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c
index 14d23cc..a90f907 100644
--- a/sys/ofed/drivers/infiniband/core/iwcm.c
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c
@@ -5,6 +5,7 @@
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +36,8 @@
* SOFTWARE.
*
*/
+#include "opt_inet.h"
+
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/idr.h>
@@ -47,7 +50,10 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
+#include <netinet/tcp.h>
+#include <sys/mutex.h>
+#include <rdma/rdma_cm.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_addr.h>
@@ -65,6 +71,85 @@ struct iwcm_work {
struct iw_cm_event event;
struct list_head free_list;
};
+/*
+ * Deferred-work item queued by iw_so_upcall() and consumed (and freed) by
+ * iw_so_event_handler(); carries the listening cm_id whose socket has
+ * completed connections to dequeue.
+ */
+struct iwcm_listen_work {
+ struct work_struct work;
+ struct iw_cm_id *cm_id;
+};
+
+/* List of struct listen_port_info, one entry per tracked listen port. */
+static LIST_HEAD(listen_port_list);
+
+/* Held by the add/remove helpers while they walk or modify listen_port_list. */
+static DEFINE_MUTEX(listen_port_mutex);
+/* Taken by iw_so_upcall() and iw_so_event_handler() around accept-queue work. */
+static DEFINE_MUTEX(dequeue_mutex);
+
+/*
+ * Reference-counted record of a port that has at least one listener
+ * registered through add_port_to_listenlist().
+ */
+struct listen_port_info {
+ struct list_head list;
+ /* Port exactly as passed by the caller (callers pass sin_port). */
+ uint16_t port_num;
+ /* Number of listeners currently registered on port_num. */
+ uint32_t refcnt;
+};
+
+/*
+ * Take a reference on the listen-port entry for 'port', creating the
+ * entry when this is the first listener on that port.
+ *
+ * Returns the reference count after the increment (1 for the first
+ * listener, >1 for subsequent ones), or -ENOMEM if a new entry could not
+ * be allocated.
+ */
+static int32_t
+add_port_to_listenlist(uint16_t port)
+{
+	struct listen_port_info *port_info;
+	int32_t ret;
+
+	mutex_lock(&listen_port_mutex);
+
+	list_for_each_entry(port_info, &listen_port_list, list)
+		if (port_info->port_num == port)
+			goto found_port;
+
+	port_info = kmalloc(sizeof(*port_info), GFP_KERNEL);
+	if (!port_info) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	port_info->port_num = port;
+	port_info->refcnt = 0;
+
+	list_add(&port_info->list, &listen_port_list);
+
+found_port:
+	/*
+	 * Sample the new count while the mutex is still held: once it is
+	 * dropped, a concurrent rem_port_from_listenlist() may free the
+	 * entry, so port_info must not be dereferenced after the unlock.
+	 */
+	ret = ++(port_info->refcnt);
+out:
+	mutex_unlock(&listen_port_mutex);
+	return ret;
+}
+
+/*
+ * Drop one reference on the listen-port entry for 'port'. When the count
+ * reaches zero the entry is unlinked from listen_port_list and freed.
+ *
+ * Returns the remaining reference count, or -EINVAL if no entry exists
+ * for 'port'.
+ */
+static int32_t
+rem_port_from_listenlist(uint16_t port)
+{
+	struct listen_port_info *entry;
+	struct listen_port_info *match = NULL;
+	int remaining = -EINVAL;
+
+	mutex_lock(&listen_port_mutex);
+
+	list_for_each_entry(entry, &listen_port_list, list) {
+		if (entry->port_num == port) {
+			match = entry;
+			break;
+		}
+	}
+
+	if (match != NULL) {
+		remaining = --(match->refcnt);
+		if (match->refcnt == 0) {
+			/* Last listener on this port is gone: retire the entry. */
+			list_del(&match->list);
+			kfree(match);
+		}
+	}
+
+	mutex_unlock(&listen_port_mutex);
+	return remaining;
+}
/*
* The following services provide a mechanism for pre-allocating iwcm_work
@@ -320,6 +405,167 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
}
EXPORT_SYMBOL(iw_cm_disconnect);
+/*
+ * Remove the first completed connection from the listening socket's
+ * so_comp queue and accept it.
+ *
+ * head: the listening socket.
+ *
+ * Returns the dequeued socket, referenced and marked non-blocking
+ * (SS_NBIO), or NULL when the completed-connection queue is empty.
+ * The peer address produced by soaccept() is not needed here and is
+ * freed before returning.
+ */
+static struct socket *
+dequeue_socket(struct socket *head)
+{
+	struct socket *so;
+	/*
+	 * Must start out NULL: soaccept()'s result is not checked, and if it
+	 * fails without storing an address the free() below would otherwise
+	 * be handed an uninitialised pointer.
+	 */
+	struct sockaddr_in *remote = NULL;
+
+	ACCEPT_LOCK();
+	so = TAILQ_FIRST(&head->so_comp);
+	if (!so) {
+		ACCEPT_UNLOCK();
+		return NULL;
+	}
+
+	SOCK_LOCK(so);
+	/*
+	 * Before changing the flags on the socket, we have to bump the
+	 * reference count. Otherwise, if the protocol calls sofree(),
+	 * the socket will be released due to a zero refcount.
+	 */
+	soref(so);
+	TAILQ_REMOVE(&head->so_comp, so, so_list);
+	head->so_qlen--;
+	so->so_qstate &= ~SQ_COMP;
+	so->so_head = NULL;
+	so->so_state |= SS_NBIO;
+	SOCK_UNLOCK(so);
+	ACCEPT_UNLOCK();
+	soaccept(so, (struct sockaddr **)&remote);
+
+	free(remote, M_SONAME);
+	return so;
+}
+/*
+ * Work handler queued by iw_so_upcall(): drains the listening socket's
+ * completed-connection queue and hands each new socket to the provider's
+ * newconn() callback.
+ *
+ * For a listener bound to a wildcard address, the cm_id that receives the
+ * connection is looked up from the accepted socket's local address via
+ * rdma_find_cmid_laddr(); otherwise the listening cm_id itself is used.
+ *
+ * The work item is freed here on every path inside the INET section.
+ * Nothing is done when the cm_id is no longer in the LISTEN state.
+ */
+static void
+iw_so_event_handler(struct work_struct *_work)
+{
+#ifdef INET
+	struct iwcm_listen_work *work = container_of(_work,
+	    struct iwcm_listen_work, work);
+	struct iw_cm_id *listen_cm_id = work->cm_id;
+	struct iwcm_id_private *cm_id_priv;
+	struct iw_cm_id *real_cm_id;
+	struct sockaddr_in *local;
+	struct socket *so;
+	int rc;
+
+	cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id);
+
+	if (cm_id_priv->state != IW_CM_STATE_LISTEN) {
+		kfree(work);
+		return;
+	}
+	mutex_lock(&dequeue_mutex);
+
+	/* Dequeue & process all new 'so' connection requests for this cmid */
+	while ((so = dequeue_socket(work->cm_id->so)) != NULL) {
+		if (!rdma_cma_any_addr((struct sockaddr *)
+		    &listen_cm_id->local_addr)) {
+			listen_cm_id->device->iwcm->newconn(listen_cm_id, so);
+			continue;
+		}
+
+		/* Wildcard listener: find the per-device cm_id to notify. */
+		local = NULL;
+		real_cm_id = NULL;
+		rc = in_getsockaddr(so, (struct sockaddr **)&local);
+		if (rc == 0)
+			rc = rdma_find_cmid_laddr(local, ARPHRD_ETHER,
+			    (void **)&real_cm_id);
+		free(local, M_SONAME);
+		if (rc != 0) {
+			/*
+			 * Nobody will take ownership of this connection.
+			 * It has already been removed from the accept queue
+			 * and carries the reference taken in
+			 * dequeue_socket(), so close it here instead of
+			 * leaking it, then stop processing as before.
+			 */
+			(void)soclose(so);
+			break;
+		}
+
+		real_cm_id->device->iwcm->newconn(real_cm_id, so);
+	}
+
+	mutex_unlock(&dequeue_mutex);
+	kfree(work);
+#endif
+	return;
+}
+/*
+ * Socket upcall installed on the listening socket by iw_init_sock().
+ * If at least one completed connection is pending, queue a work item so
+ * that iw_so_event_handler() dequeues it on iwcm_wq.
+ *
+ * parent_so: the listening socket.
+ * arg:       the listening 'struct iw_cm_id *' registered with the upcall.
+ * waitflag:  unused.
+ *
+ * Returns SU_OK, or -ENOMEM when the work item cannot be allocated.
+ * dequeue_mutex is released on every return path.
+ */
+static int
+iw_so_upcall(struct socket *parent_so, void *arg, int waitflag)
+{
+	struct iwcm_listen_work *work;
+	struct iw_cm_id *cm_id = arg;
+	int rc = SU_OK;
+
+	mutex_lock(&dequeue_mutex);
+
+	/* check whether iw_so_event_handler() already dequeued this 'so' */
+	if (TAILQ_FIRST(&parent_so->so_comp) == NULL)
+		goto out;
+
+	work = kzalloc(sizeof(*work), M_NOWAIT);
+	if (!work) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	work->cm_id = cm_id;
+
+	INIT_WORK(&work->work, iw_so_event_handler);
+	queue_work(iwcm_wq, &work->work);
+
+out:
+	mutex_unlock(&dequeue_mutex);
+	return rc;
+}
+
+/*
+ * Prepare cm_id->so for use as an iWARP listener: register
+ * iw_so_upcall() as its receive upcall (with cm_id as the argument),
+ * mark it non-blocking, and enable TCP_NODELAY. The result of
+ * sosetopt() is not checked.
+ */
+static void
+iw_init_sock(struct iw_cm_id *cm_id)
+{
+	struct socket *lso = cm_id->so;
+	struct sockopt nodelay_opt;
+	int enable = 1;
+
+	/* Describe the option first; it is applied after the socket unlock. */
+	nodelay_opt.sopt_dir = SOPT_SET;
+	nodelay_opt.sopt_level = IPPROTO_TCP;
+	nodelay_opt.sopt_name = TCP_NODELAY;
+	nodelay_opt.sopt_val = (caddr_t)&enable;
+	nodelay_opt.sopt_valsize = sizeof(enable);
+	nodelay_opt.sopt_td = NULL;
+
+	SOCK_LOCK(lso);
+	soupcall_set(lso, SO_RCV, iw_so_upcall, cm_id);
+	lso->so_state |= SS_NBIO;
+	SOCK_UNLOCK(lso);
+
+	sosetopt(lso, &nodelay_opt);
+}
+
+/*
+ * Detach the iWARP CM from cm_id->so: clear the receive upcall, then
+ * either close the socket or shut it down, and finally drop the cm_id's
+ * pointer to it.
+ *
+ * close: non-zero to soclose() the socket; zero to only shut it down in
+ *        both directions.
+ *
+ * Returns the result of soclose()/soshutdown().
+ */
+static int
+iw_close_socket(struct iw_cm_id *cm_id, int close)
+{
+	struct socket *so = cm_id->so;
+	int rc;
+
+	SOCK_LOCK(so);
+	soupcall_clear(so, SO_RCV);
+	SOCK_UNLOCK(so);
+
+	if (close)
+		rc = soclose(so);
+	else
+		/*
+		 * The SHUT_* values are an enumeration, not a bitmask:
+		 * SHUT_RD is 0, so "SHUT_WR | SHUT_RD" only asked for the
+		 * write side. SHUT_RDWR is the both-directions request.
+		 */
+		rc = soshutdown(so, SHUT_RDWR);
+
+	cm_id->so = NULL;
+
+	return rc;
+}
+
+/*
+ * Turn cm_id->so into a listening socket: install the upcall and socket
+ * options via iw_init_sock(), then call solisten() with the given backlog.
+ * On failure the socket is detached again with iw_close_socket(cm_id, 0).
+ *
+ * Returns 0 on success or the solisten() error.
+ */
+static int
+iw_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	int error;
+
+	iw_init_sock(cm_id);
+
+	error = solisten(cm_id->so, backlog, curthread);
+	if (error == 0)
+		return 0;
+
+	iw_close_socket(cm_id, 0);
+	return error;
+}
+
+/*
+ * Stop listening on cm_id->so. The socket is shut down rather than
+ * closed (iw_close_socket() is called with close == 0).
+ *
+ * Returns the result of iw_close_socket().
+ */
+static int
+iw_destroy_listen(struct iw_cm_id *cm_id)
+{
+	return iw_close_socket(cm_id, 0);
+}
+
+
/*
* CM_ID <-- DESTROYING
*
@@ -330,7 +576,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
unsigned long flags;
- int ret;
+ int ret = 0, refcnt;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
/*
@@ -345,8 +591,18 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
case IW_CM_STATE_LISTEN:
cm_id_priv->state = IW_CM_STATE_DESTROYING;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- /* destroy the listening endpoint */
- ret = cm_id->device->iwcm->destroy_listen(cm_id);
+ if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
+ refcnt =
+ rem_port_from_listenlist(cm_id->local_addr.sin_port);
+
+ if (refcnt == 0)
+ ret = iw_destroy_listen(cm_id);
+
+ cm_id->device->iwcm->destroy_listen_ep(cm_id);
+ } else {
+ ret = iw_destroy_listen(cm_id);
+ cm_id->device->iwcm->destroy_listen_ep(cm_id);
+ }
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
case IW_CM_STATE_ESTABLISHED:
@@ -418,7 +674,7 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
{
struct iwcm_id_private *cm_id_priv;
unsigned long flags;
- int ret;
+ int ret, refcnt;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
@@ -431,9 +687,33 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
case IW_CM_STATE_IDLE:
cm_id_priv->state = IW_CM_STATE_LISTEN;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
- if (ret)
+
+ if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
+ refcnt =
+ add_port_to_listenlist(cm_id->local_addr.sin_port);
+
+ if (refcnt == 1) {
+ ret = iw_create_listen(cm_id, backlog);
+ } else if (refcnt <= 0) {
+ ret = -EINVAL;
+ } else {
+ /* if refcnt > 1, a socket listener created
+ * already. And we need not create socket
+ * listener on other rdma devices/listen cm_id's
+ * due to TOE. That is when a socket listener is
+ * created with INADDR_ANY all registered TOE
+ * devices will get a call to start
+ * hardware listeners.
+ */
+ }
+ } else {
+ ret = iw_create_listen(cm_id, backlog);
+ }
+ if (!ret)
+ cm_id->device->iwcm->create_listen_ep(cm_id, backlog);
+ else
cm_id_priv->state = IW_CM_STATE_IDLE;
+
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
default:
diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h
index 271c2f8..a246e61 100644
--- a/sys/ofed/include/rdma/iw_cm.h
+++ b/sys/ofed/include/rdma/iw_cm.h
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -120,10 +121,13 @@ struct iw_cm_verbs {
int (*reject)(struct iw_cm_id *cm_id,
const void *pdata, u8 pdata_len);
- int (*create_listen)(struct iw_cm_id *cm_id,
+ int (*create_listen_ep)(struct iw_cm_id *cm_id,
int backlog);
- int (*destroy_listen)(struct iw_cm_id *cm_id);
+ void (*destroy_listen_ep)(struct iw_cm_id *cm_id);
+
+ void (*newconn)(struct iw_cm_id *parent_cm_id,
+ struct socket *so);
};
/**
diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h
index d699261..33be957 100644
--- a/sys/ofed/include/rdma/rdma_cm.h
+++ b/sys/ofed/include/rdma/rdma_cm.h
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -400,5 +401,7 @@ int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
* @timeout: QP timeout
*/
void rdma_set_timeout(struct rdma_cm_id *id, int timeout);
-
+
+/*
+ * rdma_cma_any_addr - Exported form of cma.c's cma_any_addr() check.
+ * @addr: address to test
+ *
+ * Returns non-zero when @addr is a zero or loopback address.
+ */
+int rdma_cma_any_addr(struct sockaddr *addr);
+
+/*
+ * rdma_find_cmid_laddr - Find the listening iWARP cm_id for a local address.
+ * @local_addr: local IPv4 address and port
+ * @dev_type: ARPHRD_* device type used to select the link layer
+ * @cm_id: on success, receives the matching struct iw_cm_id pointer
+ *
+ * Returns 0 on success, -ENODEV when no matching device or listener exists.
+ */
+int rdma_find_cmid_laddr(struct sockaddr_in *local_addr,
+ unsigned short dev_type, void **cm_id);
#endif /* RDMA_CM_H */
OpenPOWER on IntegriCloud