diff options
author | np <np@FreeBSD.org> | 2016-01-22 23:33:34 +0000 |
---|---|---|
committer | np <np@FreeBSD.org> | 2016-01-22 23:33:34 +0000 |
commit | 057d736604eed731d9aa730e87220c4caa227ae3 (patch) | |
tree | 2bc6c60b6ab83ffb2aa3364ef3759bd23b93757e /sys/ofed | |
parent | 9a3b34d7631b1ef522add470e0131f1a802f4968 (diff) | |
download | FreeBSD-src-057d736604eed731d9aa730e87220c4caa227ae3.zip FreeBSD-src-057d736604eed731d9aa730e87220c4caa227ae3.tar.gz |
Fix for iWARP servers that listen on INADDR_ANY.
The iWARP Connection Manager (CM) on FreeBSD creates a TCP socket to
represent an iWARP endpoint when the connection is over TCP. For
servers, the current approach is to invoke the create_listen callback for
each iWARP RNIC registered with the CM. This doesn't work too well for
INADDR_ANY because a listen on any TCP socket already notifies all
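hardware TOEs/RNICs of the new listener. This patch fixes the server
side of things for FreeBSD: the CM now performs the socket listen itself
(only once per port for INADDR_ANY, tracked by a per-port reference
count), asks the RNIC driver to set up its endpoint through the renamed
create_listen_ep callback, and hands accepted sockets to the driver
through the new newconn callback. We've tried to keep all these modifications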
in the iWARP/TCP-specific parts of the OFED infrastructure as much as
possible.
Submitted by: Krishnamraju Eraparaju @ Chelsio (with design inputs from Steve Wise)
Sponsored by: Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D4801
Diffstat (limited to 'sys/ofed')
-rw-r--r-- | sys/ofed/drivers/infiniband/core/cma.c | 77 | ||||
-rw-r--r-- | sys/ofed/drivers/infiniband/core/iwcm.c | 292 | ||||
-rw-r--r-- | sys/ofed/include/rdma/iw_cm.h | 8 | ||||
-rw-r--r-- | sys/ofed/include/rdma/rdma_cm.h | 5 |
4 files changed, 373 insertions, 9 deletions
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c index 40c4d82..1cafced 100644 --- a/sys/ofed/drivers/infiniband/core/cma.c +++ b/sys/ofed/drivers/infiniband/core/cma.c @@ -3,6 +3,7 @@ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -407,6 +408,75 @@ static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_nu return -EAGAIN; } +int +rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type, + void **cm_id) +{ + int ret; + u8 port; + int found_dev = 0, found_cmid = 0; + struct rdma_id_private *id_priv; + struct rdma_id_private *dev_id_priv; + struct cma_device *cma_dev; + struct rdma_dev_addr dev_addr; + union ib_gid gid; + enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ? 
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + + memset(&dev_addr, 0, sizeof(dev_addr)); + + ret = rdma_translate_ip((struct sockaddr *)local_addr, + &dev_addr, NULL); + if (ret) + goto err; + + /* find rdma device based on MAC address/gid */ + mutex_lock(&lock); + + memcpy(&gid, dev_addr.src_dev_addr + + rdma_addr_gid_offset(&dev_addr), sizeof(gid)); + + list_for_each_entry(cma_dev, &dev_list, list) + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) + if ((rdma_port_get_link_layer(cma_dev->device, port) == + dev_ll) && + (rdma_node_get_transport(cma_dev->device->node_type) == + RDMA_TRANSPORT_IWARP)) { + ret = find_gid_port(cma_dev->device, + &gid, port); + if (!ret) { + found_dev = 1; + goto out; + } else if (ret == 1) { + mutex_unlock(&lock); + goto err; + } + } +out: + mutex_unlock(&lock); + + if (!found_dev) + goto err; + + /* Traverse through the list of listening cm_id's to find the + * desired cm_id based on rdma device & port number. + */ + list_for_each_entry(id_priv, &listen_any_list, list) + list_for_each_entry(dev_id_priv, &id_priv->listen_list, + listen_list) + if (dev_id_priv->cma_dev == cma_dev) + if (dev_id_priv->cm_id.iw->local_addr.sin_port + == local_addr->sin_port) { + *cm_id = (void *)dev_id_priv->cm_id.iw; + found_cmid = 1; + } + return found_cmid ? 
0 : -ENODEV; + +err: + return -ENODEV; +} +EXPORT_SYMBOL(rdma_find_cmid_laddr); + static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; @@ -780,6 +850,12 @@ static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } +int +rdma_cma_any_addr(struct sockaddr *addr) +{ + return cma_any_addr(addr); +} +EXPORT_SYMBOL(rdma_cma_any_addr); static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) { @@ -1707,6 +1783,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, dev_id_priv = container_of(id, struct rdma_id_private, id); dev_id_priv->state = RDMA_CM_ADDR_BOUND; + dev_id_priv->sock = id_priv->sock; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c index 14d23cc..a90f907 100644 --- a/sys/ofed/drivers/infiniband/core/iwcm.c +++ b/sys/ofed/drivers/infiniband/core/iwcm.c @@ -5,6 +5,7 @@ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +36,8 @@ * SOFTWARE. 
* */ +#include "opt_inet.h" + #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/idr.h> @@ -47,7 +50,10 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> +#include <netinet/tcp.h> +#include <sys/mutex.h> +#include <rdma/rdma_cm.h> #include <rdma/iw_cm.h> #include <rdma/ib_addr.h> @@ -65,6 +71,85 @@ struct iwcm_work { struct iw_cm_event event; struct list_head free_list; }; +struct iwcm_listen_work { + struct work_struct work; + struct iw_cm_id *cm_id; +}; + +static LIST_HEAD(listen_port_list); + +static DEFINE_MUTEX(listen_port_mutex); +static DEFINE_MUTEX(dequeue_mutex); + +struct listen_port_info { + struct list_head list; + uint16_t port_num; + uint32_t refcnt; +}; + +static int32_t +add_port_to_listenlist(uint16_t port) +{ + struct listen_port_info *port_info; + int err = 0; + + mutex_lock(&listen_port_mutex); + + list_for_each_entry(port_info, &listen_port_list, list) + if (port_info->port_num == port) + goto found_port; + + port_info = kmalloc(sizeof(*port_info), GFP_KERNEL); + if (!port_info) { + err = -ENOMEM; + mutex_unlock(&listen_port_mutex); + goto out; + } + + port_info->port_num = port; + port_info->refcnt = 0; + + list_add(&port_info->list, &listen_port_list); + +found_port: + ++(port_info->refcnt); + mutex_unlock(&listen_port_mutex); + return port_info->refcnt; +out: + return err; +} + +static int32_t +rem_port_from_listenlist(uint16_t port) +{ + struct listen_port_info *port_info; + int ret, found_port = 0; + + mutex_lock(&listen_port_mutex); + + list_for_each_entry(port_info, &listen_port_list, list) + if (port_info->port_num == port) { + found_port = 1; + break; + } + + if (found_port) { + --(port_info->refcnt); + ret = port_info->refcnt; + if (port_info->refcnt == 0) { + /* Remove this entry from the list as there are no + * more listeners for this port_num. 
+ */ + list_del(&port_info->list); + kfree(port_info); + } + } else { + ret = -EINVAL; + } + mutex_unlock(&listen_port_mutex); + return ret; + +} /* * The following services provide a mechanism for pre-allocating iwcm_work @@ -320,6 +405,167 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) } EXPORT_SYMBOL(iw_cm_disconnect); +static struct socket * +dequeue_socket(struct socket *head) +{ + struct socket *so; + struct sockaddr_in *remote; + + ACCEPT_LOCK(); + so = TAILQ_FIRST(&head->so_comp); + if (!so) { + ACCEPT_UNLOCK(); + return NULL; + } + + SOCK_LOCK(so); + /* + * Before changing the flags on the socket, we have to bump the + * reference count. Otherwise, if the protocol calls sofree(), + * the socket will be released due to a zero refcount. + */ + soref(so); + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + so->so_qstate &= ~SQ_COMP; + so->so_head = NULL; + so->so_state |= SS_NBIO; + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + soaccept(so, (struct sockaddr **)&remote); + + free(remote, M_SONAME); + return so; +} +static void +iw_so_event_handler(struct work_struct *_work) +{ +#ifdef INET + struct iwcm_listen_work *work = container_of(_work, + struct iwcm_listen_work, work); + struct iw_cm_id *listen_cm_id = work->cm_id; + struct iwcm_id_private *cm_id_priv; + struct iw_cm_id *real_cm_id; + struct sockaddr_in *local; + struct socket *so; + + cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id); + + if (cm_id_priv->state != IW_CM_STATE_LISTEN) { + kfree(work); + return; + } + mutex_lock(&dequeue_mutex); + + /* Dequeue & process all new 'so' connection requests for this cmid */ + while ((so = dequeue_socket(work->cm_id->so)) != NULL) { + if (rdma_cma_any_addr((struct sockaddr *) + &listen_cm_id->local_addr)) { + in_getsockaddr(so, (struct sockaddr **)&local); + if (rdma_find_cmid_laddr(local, ARPHRD_ETHER, + (void **) &real_cm_id)) { + free(local, M_SONAME); + goto err; + } + free(local, M_SONAME); + + 
real_cm_id->device->iwcm->newconn(real_cm_id, so); + } else { + listen_cm_id->device->iwcm->newconn(listen_cm_id, so); + } + } +err: + mutex_unlock(&dequeue_mutex); + kfree(work); +#endif + return; +} +static int +iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) +{ + struct iwcm_listen_work *work; + struct socket *so; + struct iw_cm_id *cm_id = arg; + + mutex_lock(&dequeue_mutex); + /* check whether iw_so_event_handler() already dequeued this 'so' */ + so = TAILQ_FIRST(&parent_so->so_comp); + if (!so) + return SU_OK; + work = kzalloc(sizeof(*work), M_NOWAIT); + if (!work) + return -ENOMEM; + work->cm_id = cm_id; + + INIT_WORK(&work->work, iw_so_event_handler); + queue_work(iwcm_wq, &work->work); + + mutex_unlock(&dequeue_mutex); + return SU_OK; +} + +static void +iw_init_sock(struct iw_cm_id *cm_id) +{ + struct sockopt sopt; + struct socket *so = cm_id->so; + int on = 1; + + SOCK_LOCK(so); + soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); + so->so_state |= SS_NBIO; + SOCK_UNLOCK(so); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_NODELAY; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof(on); + sopt.sopt_td = NULL; + sosetopt(so, &sopt); +} + +static int +iw_close_socket(struct iw_cm_id *cm_id, int close) +{ + struct socket *so = cm_id->so; + int rc; + + + SOCK_LOCK(so); + soupcall_clear(so, SO_RCV); + SOCK_UNLOCK(so); + + if (close) + rc = soclose(so); + else + rc = soshutdown(so, SHUT_WR | SHUT_RD); + + cm_id->so = NULL; + + return rc; +} + +static int +iw_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int rc; + + iw_init_sock(cm_id); + rc = solisten(cm_id->so, backlog, curthread); + if (rc != 0) + iw_close_socket(cm_id, 0); + return rc; +} + +static int +iw_destroy_listen(struct iw_cm_id *cm_id) +{ + int rc; + rc = iw_close_socket(cm_id, 0); + return rc; +} + + /* * CM_ID <-- DESTROYING * @@ -330,7 +576,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private 
*cm_id_priv; unsigned long flags; - int ret; + int ret = 0, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); /* @@ -345,8 +591,18 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - /* destroy the listening endpoint */ - ret = cm_id->device->iwcm->destroy_listen(cm_id); + if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { + refcnt = + rem_port_from_listenlist(cm_id->local_addr.sin_port); + + if (refcnt == 0) + ret = iw_destroy_listen(cm_id); + + cm_id->device->iwcm->destroy_listen_ep(cm_id); + } else { + ret = iw_destroy_listen(cm_id); + cm_id->device->iwcm->destroy_listen_ep(cm_id); + } spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: @@ -418,7 +674,7 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; - int ret; + int ret, refcnt; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); @@ -431,9 +687,33 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->create_listen(cm_id, backlog); - if (ret) + + if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) { + refcnt = + add_port_to_listenlist(cm_id->local_addr.sin_port); + + if (refcnt == 1) { + ret = iw_create_listen(cm_id, backlog); + } else if (refcnt <= 0) { + ret = -EINVAL; + } else { + /* if refcnt > 1, a socket listener created + * already. And we need not create socket + * listener on other rdma devices/listen cm_id's + * due to TOE. That is when a socket listener is + * created with INADDR_ANY all registered TOE + * devices will get a call to start + * hardware listeners. 
+ */ + } + } else { + ret = iw_create_listen(cm_id, backlog); + } + if (!ret) + cm_id->device->iwcm->create_listen_ep(cm_id, backlog); + else cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h index 271c2f8..a246e61 100644 --- a/sys/ofed/include/rdma/iw_cm.h +++ b/sys/ofed/include/rdma/iw_cm.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -120,10 +121,13 @@ struct iw_cm_verbs { int (*reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); - int (*create_listen)(struct iw_cm_id *cm_id, + int (*create_listen_ep)(struct iw_cm_id *cm_id, int backlog); - int (*destroy_listen)(struct iw_cm_id *cm_id); + void (*destroy_listen_ep)(struct iw_cm_id *cm_id); + + void (*newconn)(struct iw_cm_id *parent_cm_id, + struct socket *so); }; /** diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h index d699261..33be957 100644 --- a/sys/ofed/include/rdma/rdma_cm.h +++ b/sys/ofed/include/rdma/rdma_cm.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -400,5 +401,7 @@ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); * @timeout: QP timeout */ void rdma_set_timeout(struct rdma_cm_id *id, int timeout); - +int rdma_cma_any_addr(struct sockaddr *addr); +int rdma_find_cmid_laddr(struct sockaddr_in *local_addr, + unsigned short dev_type, void **cm_id); #endif /* RDMA_CM_H */ |