diff options
Diffstat (limited to 'drivers/staging/lustre/lnet/klnds')
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/Makefile | 1 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile | 5 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c | 2958 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h | 1048 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 3763 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c | 296 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/Makefile | 6 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c | 2921 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h | 704 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 2586 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c | 534 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c | 184 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c | 810 |
13 files changed, 0 insertions, 15816 deletions
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile deleted file mode 100644 index c23e4f6..0000000 --- a/drivers/staging/lustre/lnet/klnds/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_LNET) += o2iblnd/ socklnd/ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile deleted file mode 100644 index 4affe1d..0000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o -ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c deleted file mode 100644 index f0b4eb42..0000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ /dev/null @@ -1,2958 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd.c - * - * Author: Eric Barton <eric@bartonsoftware.com> - */ - -#include <asm/div64.h> -#include <asm/page.h> -#include "o2iblnd.h" - -static struct lnet_lnd the_o2iblnd; - -struct kib_data kiblnd_data; - -static __u32 kiblnd_cksum(void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return !sum ? 1 : sum; -} - -static char *kiblnd_msgtype2str(int type) -{ - switch (type) { - case IBLND_MSG_CONNREQ: - return "CONNREQ"; - - case IBLND_MSG_CONNACK: - return "CONNACK"; - - case IBLND_MSG_NOOP: - return "NOOP"; - - case IBLND_MSG_IMMEDIATE: - return "IMMEDIATE"; - - case IBLND_MSG_PUT_REQ: - return "PUT_REQ"; - - case IBLND_MSG_PUT_NAK: - return "PUT_NAK"; - - case IBLND_MSG_PUT_ACK: - return "PUT_ACK"; - - case IBLND_MSG_PUT_DONE: - return "PUT_DONE"; - - case IBLND_MSG_GET_REQ: - return "GET_REQ"; - - case IBLND_MSG_GET_DONE: - return "GET_DONE"; - - default: - return "???"; - } -} - -static int kiblnd_msgtype2size(int type) -{ - const int hdr_size = offsetof(struct kib_msg, ibm_u); - - switch (type) { - case IBLND_MSG_CONNREQ: - case IBLND_MSG_CONNACK: - return hdr_size + sizeof(struct kib_connparams); - - case IBLND_MSG_NOOP: - return hdr_size; - - case IBLND_MSG_IMMEDIATE: - return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); - - case IBLND_MSG_PUT_REQ: - return hdr_size + sizeof(struct kib_putreq_msg); - - case IBLND_MSG_PUT_ACK: - return hdr_size + sizeof(struct kib_putack_msg); - - case IBLND_MSG_GET_REQ: - return hdr_size + sizeof(struct kib_get_msg); - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - return hdr_size + sizeof(struct kib_completion_msg); - default: - return -1; - } -} - -static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) -{ - struct kib_rdma_desc *rd; - int msg_size; - int nob; - int n; - int i; - - LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || - msg->ibm_type == IBLND_MSG_PUT_ACK); - - rd = msg->ibm_type == IBLND_MSG_GET_REQ ? - &msg->ibm_u.get.ibgm_rd : - &msg->ibm_u.putack.ibpam_rd; - - if (flip) { - __swab32s(&rd->rd_key); - __swab32s(&rd->rd_nfrags); - } - - n = rd->rd_nfrags; - - nob = offsetof(struct kib_msg, ibm_u) + - kiblnd_rd_msg_size(rd, msg->ibm_type, n); - - if (msg->ibm_nob < nob) { - CERROR("Short %s: %d(%d)\n", - kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); - return 1; - } - - msg_size = kiblnd_rd_size(rd); - if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) { - CERROR("Bad msg_size: %d, should be 0 < n <= %d\n", - msg_size, LNET_MAX_PAYLOAD); - return 1; - } - - if (!flip) - return 0; - - for (i = 0; i < n; i++) { - __swab32s(&rd->rd_frags[i].rf_nob); - __swab64s(&rd->rd_frags[i].rf_addr); - } - - return 0; -} - -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp) -{ - struct kib_net *net = ni->ni_data; - - /* - * CAVEAT EMPTOR! all message fields not set here should have been - * initialised previously. - */ - msg->ibm_magic = IBLND_MSG_MAGIC; - msg->ibm_version = version; - /* ibm_type */ - msg->ibm_credits = credits; - /* ibm_nob */ - msg->ibm_cksum = 0; - msg->ibm_srcnid = ni->ni_nid; - msg->ibm_srcstamp = net->ibn_incarnation; - msg->ibm_dstnid = dstnid; - msg->ibm_dststamp = dststamp; - - if (*kiblnd_tunables.kib_cksum) { - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); - } -} - -int kiblnd_unpack_msg(struct kib_msg *msg, int nob) -{ - const int hdr_size = offsetof(struct kib_msg, ibm_u); - __u32 msg_cksum; - __u16 version; - int msg_nob; - int flip; - - /* 6 bytes are enough to have received magic + version */ - if (nob < 6) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - if (msg->ibm_magic == IBLND_MSG_MAGIC) { - flip = 0; - } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { - flip = 1; - } else { - CERROR("Bad magic: %08x\n", msg->ibm_magic); - return -EPROTO; - } - - version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; - if (version != IBLND_MSG_VERSION && - version != IBLND_MSG_VERSION_1) { - CERROR("Bad version: %x\n", version); - return -EPROTO; - } - - if (nob < hdr_size) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; - if (msg_nob > nob) { - CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* - * checksum must be computed with ibm_cksum zero and BEFORE anything - * gets flipped - */ - msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; - msg->ibm_cksum = 0; - if (msg_cksum && - msg_cksum != kiblnd_cksum(msg, msg_nob)) { - CERROR("Bad checksum\n"); - return -EPROTO; - } - - msg->ibm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - msg->ibm_version = version; - BUILD_BUG_ON(sizeof(msg->ibm_type) != 1); - BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1); - msg->ibm_nob = msg_nob; - __swab64s(&msg->ibm_srcnid); - __swab64s(&msg->ibm_srcstamp); - __swab64s(&msg->ibm_dstnid); - __swab64s(&msg->ibm_dststamp); - } - - if (msg->ibm_srcnid == LNET_NID_ANY) { - CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { - CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), - msg_nob, kiblnd_msgtype2size(msg->ibm_type)); - return -EPROTO; - } - - switch (msg->ibm_type) { - default: - CERROR("Unknown message type %x\n", msg->ibm_type); - return -EPROTO; - - case IBLND_MSG_NOOP: - case IBLND_MSG_IMMEDIATE: - case IBLND_MSG_PUT_REQ: - break; - - case IBLND_MSG_PUT_ACK: - case IBLND_MSG_GET_REQ: - if (kiblnd_unpack_rd(msg, flip)) - return -EPROTO; - break; - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - if (flip) - __swab32s(&msg->ibm_u.completion.ibcm_status); - break; - - case IBLND_MSG_CONNREQ: - case IBLND_MSG_CONNACK: - if (flip) { - __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); - __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); - __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); - } - break; - } - return 0; -} - -int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp, - lnet_nid_t nid) -{ - struct kib_peer *peer; - struct kib_net *net = ni->ni_data; - int cpt = lnet_cpt_of_nid(nid); - unsigned long flags; - - LASSERT(net); - LASSERT(nid != LNET_NID_ANY); - - peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt); - if (!peer) { - CERROR("Cannot allocate peer\n"); - return -ENOMEM; - } - - peer->ibp_ni = ni; - peer->ibp_nid = nid; - peer->ibp_error = 0; - peer->ibp_last_alive = 0; - peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni); - peer->ibp_queue_depth = ni->ni_peertxcredits; - atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD(&peer->ibp_conns); - INIT_LIST_HEAD(&peer->ibp_tx_queue); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(!net->ibn_shutdown); - - /* npeers only grows with the global lock held */ - atomic_inc(&net->ibn_npeers); - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - *peerp = peer; - return 0; -} - -void kiblnd_destroy_peer(struct kib_peer *peer) -{ - struct kib_net *net = peer->ibp_ni->ni_data; - - LASSERT(net); - LASSERT(!atomic_read(&peer->ibp_refcount)); - LASSERT(!kiblnd_peer_active(peer)); - LASSERT(kiblnd_peer_idle(peer)); - LASSERT(list_empty(&peer->ibp_tx_queue)); - - kfree(peer); - - /* - * NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. - */ - atomic_dec(&net->ibn_npeers); -} - -struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid) -{ - /* - * the caller is responsible for accounting the additional reference - * that this creates - */ - struct list_head *peer_list = kiblnd_nid2peerlist(nid); - struct list_head *tmp; - struct kib_peer *peer; - - list_for_each(tmp, peer_list) { - peer = list_entry(tmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", - peer, libcfs_nid2str(nid), - atomic_read(&peer->ibp_refcount), - peer->ibp_version); - return peer; - } - return NULL; -} - -void kiblnd_unlink_peer_locked(struct kib_peer *peer) -{ - LASSERT(list_empty(&peer->ibp_conns)); - - LASSERT(kiblnd_peer_active(peer)); - list_del_init(&peer->ibp_list); - /* lose peerlist's ref */ - kiblnd_peer_decref(peer); -} - -static int kiblnd_get_peer_info(struct lnet_ni *ni, int index, - lnet_nid_t *nidp, int *count) -{ - struct kib_peer *peer; - struct list_head *ptmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { - list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *count = atomic_read(&peer->ibp_refcount); - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, - flags); - return 0; - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - return -ENOENT; -} - -static void kiblnd_del_peer_locked(struct kib_peer *peer) -{ - struct list_head *ctmp; - struct list_head *cnxt; - struct kib_conn *conn; - - if (list_empty(&peer->ibp_conns)) { - kiblnd_unlink_peer_locked(peer); - } else { - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - kiblnd_close_conn_locked(conn, 0); - } - /* NB closing peer's last conn unlinked it. */ - } - /* - * NB peer now unlinked; might even be freed if the peer table had the - * last ref on it. - */ -} - -static int kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) -{ - LIST_HEAD(zombies); - struct list_head *ptmp; - struct list_head *pnxt; - struct kib_peer *peer; - int lo; - int hi; - int i; - unsigned long flags; - int rc = -ENOENT; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) { - lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - } else { - lo = 0; - hi = kiblnd_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) - continue; - - if (!list_empty(&peer->ibp_tx_queue)) { - LASSERT(list_empty(&peer->ibp_conns)); - - list_splice_init(&peer->ibp_tx_queue, - &zombies); - } - - kiblnd_del_peer_locked(peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_txlist_done(ni, &zombies, -EIO); - - return rc; -} - -static struct kib_conn *kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) -{ - struct kib_peer *peer; - struct list_head *ptmp; - struct kib_conn *conn; - struct list_head *ctmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { - list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - list_for_each(ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry(ctmp, struct kib_conn, - ibc_list); - kiblnd_conn_addref(conn); - read_unlock_irqrestore( - &kiblnd_data.kib_global_lock, - flags); - return conn; - } - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - return NULL; -} - -int kiblnd_translate_mtu(int value) -{ - switch (value) { - default: - return -1; - case 0: - return 0; - case 256: - return IB_MTU_256; - case 512: - return IB_MTU_512; - case 1024: - return IB_MTU_1024; - case 2048: - return IB_MTU_2048; - case 4096: - return IB_MTU_4096; - } -} - -static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) -{ - int mtu; - - /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ - if (!cmid->route.path_rec) - return; - - mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); - LASSERT(mtu >= 0); - if (mtu) - cmid->route.path_rec->mtu = mtu; -} - -static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) -{ - cpumask_var_t *mask; - int vectors; - int off; - int i; - lnet_nid_t nid = conn->ibc_peer->ibp_nid; - - vectors = conn->ibc_cmid->device->num_comp_vectors; - if (vectors <= 1) - return 0; - - mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); - if (!mask) - return 0; - - /* hash NID to CPU id in this partition... */ - off = do_div(nid, cpumask_weight(*mask)); - for_each_cpu(i, *mask) { - if (!off--) - return i % vectors; - } - - LBUG(); - return 1; -} - -struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid, - int state, int version) -{ - /* - * CAVEAT EMPTOR: - * If the new conn is created successfully it takes over the caller's - * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself - * is destroyed. On failure, the caller's ref on 'peer' remains and - * she must dispose of 'cmid'. (Actually I'd block forever if I tried - * to destroy 'cmid' here since I'm called from the CM which still has - * its ref on 'cmid'). - */ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_net *net = peer->ibp_ni->ni_data; - struct kib_dev *dev; - struct ib_qp_init_attr *init_qp_attr; - struct kib_sched_info *sched; - struct ib_cq_init_attr cq_attr = {}; - struct kib_conn *conn; - struct ib_cq *cq; - unsigned long flags; - int cpt; - int rc; - int i; - - LASSERT(net); - LASSERT(!in_interrupt()); - - dev = net->ibn_dev; - - cpt = lnet_cpt_of_nid(peer->ibp_nid); - sched = kiblnd_data.kib_scheds[cpt]; - - LASSERT(sched->ibs_nthreads > 0); - - init_qp_attr = kzalloc_cpt(sizeof(*init_qp_attr), GFP_NOFS, cpt); - if (!init_qp_attr) { - CERROR("Can't allocate qp_attr for %s\n", - libcfs_nid2str(peer->ibp_nid)); - goto failed_0; - } - - conn = kzalloc_cpt(sizeof(*conn), GFP_NOFS, cpt); - if (!conn) { - CERROR("Can't allocate connection for %s\n", - libcfs_nid2str(peer->ibp_nid)); - goto failed_1; - } - - conn->ibc_state = IBLND_CONN_INIT; - conn->ibc_version = version; - conn->ibc_peer = peer; /* I take the caller's ref */ - cmid->context = conn; /* for future CM callbacks */ - conn->ibc_cmid = cmid; - conn->ibc_max_frags = peer->ibp_max_frags; - conn->ibc_queue_depth = peer->ibp_queue_depth; - - INIT_LIST_HEAD(&conn->ibc_early_rxs); - INIT_LIST_HEAD(&conn->ibc_tx_noops); - INIT_LIST_HEAD(&conn->ibc_tx_queue); - INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); - INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); - INIT_LIST_HEAD(&conn->ibc_active_txs); - spin_lock_init(&conn->ibc_lock); - - conn->ibc_connvars = kzalloc_cpt(sizeof(*conn->ibc_connvars), GFP_NOFS, cpt); - if (!conn->ibc_connvars) { - CERROR("Can't allocate in-progress connection state\n"); - goto failed_2; - } - - write_lock_irqsave(glock, flags); - if (dev->ibd_failover) { - write_unlock_irqrestore(glock, flags); - CERROR("%s: failover in progress\n", dev->ibd_ifname); - goto failed_2; - } - - if (dev->ibd_hdev->ibh_ibdev != cmid->device) { - /* wakeup failover thread and teardown connection */ - if (kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - wake_up(&kiblnd_data.kib_failover_waitq); - } - - write_unlock_irqrestore(glock, flags); - CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", - cmid->device->name, dev->ibd_ifname); - goto failed_2; - } - - kiblnd_hdev_addref_locked(dev->ibd_hdev); - conn->ibc_hdev = dev->ibd_hdev; - - kiblnd_setup_mtu_locked(cmid); - - write_unlock_irqrestore(glock, flags); - - conn->ibc_rxs = kzalloc_cpt(IBLND_RX_MSGS(conn) * sizeof(struct kib_rx), - GFP_NOFS, cpt); - if (!conn->ibc_rxs) { - CERROR("Cannot allocate RX buffers\n"); - goto failed_2; - } - - rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, - IBLND_RX_MSG_PAGES(conn)); - if (rc) - goto failed_2; - - kiblnd_map_rx_descs(conn); - - cq_attr.cqe = IBLND_CQ_ENTRIES(conn); - cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); - cq = ib_create_cq(cmid->device, - kiblnd_cq_completion, kiblnd_cq_event, conn, - &cq_attr); - if (IS_ERR(cq)) { - CERROR("Failed to create CQ with %d CQEs: %ld\n", - IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); - goto failed_2; - } - - conn->ibc_cq = cq; - - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { - CERROR("Can't request completion notification: %d\n", rc); - goto failed_2; - } - - init_qp_attr->event_handler = kiblnd_qp_event; - init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); - init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - init_qp_attr->cap.max_send_sge = 1; - init_qp_attr->cap.max_recv_sge = 1; - init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; - init_qp_attr->qp_type = IB_QPT_RC; - init_qp_attr->send_cq = cq; - init_qp_attr->recv_cq = cq; - - conn->ibc_sched = sched; - - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); - if (rc) { - CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", - rc, init_qp_attr->cap.max_send_wr, - init_qp_attr->cap.max_recv_wr); - goto failed_2; - } - - kfree(init_qp_attr); - - /* 1 ref for caller and each rxmsg */ - atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); - conn->ibc_nrx = IBLND_RX_MSGS(conn); - - /* post receives */ - for (i = 0; i < IBLND_RX_MSGS(conn); i++) { - rc = kiblnd_post_rx(&conn->ibc_rxs[i], - IBLND_POSTRX_NO_CREDIT); - if (rc) { - CERROR("Can't post rxmsg: %d\n", rc); - - /* Make posted receives complete */ - kiblnd_abort_receives(conn); - - /* - * correct # of posted buffers - * NB locking needed now I'm racing with completion - */ - spin_lock_irqsave(&sched->ibs_lock, flags); - conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - /* - * cmid will be destroyed by CM(ofed) after cm_callback - * returned, so we can't refer it anymore - * (by kiblnd_connd()->kiblnd_destroy_conn) - */ - rdma_destroy_qp(conn->ibc_cmid); - conn->ibc_cmid = NULL; - - /* Drop my own and unused rxbuffer refcounts */ - while (i++ <= IBLND_RX_MSGS(conn)) - kiblnd_conn_decref(conn); - - return NULL; - } - } - - /* Init successful! */ - LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || - state == IBLND_CONN_PASSIVE_WAIT); - conn->ibc_state = state; - - /* 1 more conn */ - atomic_inc(&net->ibn_nconns); - return conn; - - failed_2: - kiblnd_destroy_conn(conn); - kfree(conn); - failed_1: - kfree(init_qp_attr); - failed_0: - return NULL; -} - -void kiblnd_destroy_conn(struct kib_conn *conn) -{ - struct rdma_cm_id *cmid = conn->ibc_cmid; - struct kib_peer *peer = conn->ibc_peer; - int rc; - - LASSERT(!in_interrupt()); - LASSERT(!atomic_read(&conn->ibc_refcount)); - LASSERT(list_empty(&conn->ibc_early_rxs)); - LASSERT(list_empty(&conn->ibc_tx_noops)); - LASSERT(list_empty(&conn->ibc_tx_queue)); - LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); - LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); - LASSERT(list_empty(&conn->ibc_active_txs)); - LASSERT(!conn->ibc_noops_posted); - LASSERT(!conn->ibc_nsends_posted); - - switch (conn->ibc_state) { - default: - /* conn must be completely disengaged from the network */ - LBUG(); - - case IBLND_CONN_DISCONNECTED: - /* connvars should have been freed already */ - LASSERT(!conn->ibc_connvars); - break; - - case IBLND_CONN_INIT: - break; - } - - /* conn->ibc_cmid might be destroyed by CM already */ - if (cmid && cmid->qp) - rdma_destroy_qp(cmid); - - if (conn->ibc_cq) { - rc = ib_destroy_cq(conn->ibc_cq); - if (rc) - CWARN("Error destroying CQ: %d\n", rc); - } - - if (conn->ibc_rx_pages) - kiblnd_unmap_rx_descs(conn); - - kfree(conn->ibc_rxs); - kfree(conn->ibc_connvars); - - if (conn->ibc_hdev) - kiblnd_hdev_decref(conn->ibc_hdev); - - /* See CAVEAT EMPTOR above in kiblnd_create_conn */ - if (conn->ibc_state != IBLND_CONN_INIT) { - struct kib_net *net = peer->ibp_ni->ni_data; - - kiblnd_peer_decref(peer); - rdma_destroy_id(cmid); - atomic_dec(&net->ibn_nconns); - } -} - -int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why) -{ - struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_version, why); - - kiblnd_close_conn_locked(conn, why); - count++; - } - - return count; -} - -int kiblnd_close_stale_conns_locked(struct kib_peer *peer, - int version, __u64 incarnation) -{ - struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - if (conn->ibc_version == version && - conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, - "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_version, conn->ibc_incarnation, - version, incarnation); - - kiblnd_close_conn_locked(conn, -ESTALE); - count++; - } - - return count; -} - -static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) -{ - struct kib_peer *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - unsigned long flags; - int count = 0; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) { - lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - } else { - lo = 0; - hi = kiblnd_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kiblnd_close_peer_conns_locked(peer, 0); - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == LNET_NID_ANY) - return 0; - - return !count ? -ENOENT : 0; -} - -static int kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - - switch (cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - int count = 0; - - rc = kiblnd_get_peer_info(ni, data->ioc_count, - &nid, &count); - data->ioc_nid = nid; - data->ioc_count = count; - break; - } - - case IOC_LIBCFS_DEL_PEER: { - rc = kiblnd_del_peer(ni, data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - struct kib_conn *conn; - - rc = 0; - conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); - if (!conn) { - rc = -ENOENT; - break; - } - - LASSERT(conn->ibc_cmid); - data->ioc_nid = conn->ibc_peer->ibp_nid; - if (!conn->ibc_cmid->route.path_rec) - data->ioc_u32[0] = 0; /* iWarp has no path MTU */ - else - data->ioc_u32[0] = - ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); - kiblnd_conn_decref(conn); - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kiblnd_close_matching_conns(ni, data->ioc_nid); - break; - } - - default: - break; - } - - return rc; -} - -static void kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, - unsigned long *when) -{ - unsigned long last_alive = 0; - unsigned long now = jiffies; - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer *peer; - unsigned long flags; - - read_lock_irqsave(glock, flags); - - peer = kiblnd_find_peer_locked(nid); - if (peer) - last_alive = peer->ibp_last_alive; - - read_unlock_irqrestore(glock, flags); - - if (last_alive) - *when = last_alive; - - /* - * peer is not persistent in hash, trigger peer creation - * and connection establishment with a NULL tx - */ - if (!peer) - kiblnd_launch_tx(ni, NULL, nid); - - CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", - libcfs_nid2str(nid), peer, - last_alive ? (now - last_alive) / HZ : -1); -} - -static void kiblnd_free_pages(struct kib_pages *p) -{ - int npages = p->ibp_npages; - int i; - - for (i = 0; i < npages; i++) { - if (p->ibp_pages[i]) - __free_page(p->ibp_pages[i]); - } - - kfree(p); -} - -int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) -{ - struct kib_pages *p; - int i; - - p = kzalloc_cpt(offsetof(struct kib_pages, ibp_pages[npages]), - GFP_NOFS, cpt); - if (!p) { - CERROR("Can't allocate descriptor for %d pages\n", npages); - return -ENOMEM; - } - - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_pages_node( - cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_NOFS, 0); - if (!p->ibp_pages[i]) { - CERROR("Can't allocate page %d of %d\n", i, npages); - kiblnd_free_pages(p); - return -ENOMEM; - } - } - - *pp = p; - return 0; -} - -void kiblnd_unmap_rx_descs(struct kib_conn *conn) -{ - struct kib_rx *rx; - int i; - - LASSERT(conn->ibc_rxs); - LASSERT(conn->ibc_hdev); - - for (i = 0; i < IBLND_RX_MSGS(conn); i++) { - rx = &conn->ibc_rxs[i]; - - LASSERT(rx->rx_nob >= 0); /* not posted */ - - kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, - KIBLND_UNMAP_ADDR(rx, rx_msgunmap, - rx->rx_msgaddr), - IBLND_MSG_SIZE, DMA_FROM_DEVICE); - } - - kiblnd_free_pages(conn->ibc_rx_pages); - - conn->ibc_rx_pages = NULL; -} - -void kiblnd_map_rx_descs(struct kib_conn *conn) -{ - struct kib_rx *rx; - struct page *pg; - int pg_off; - int ipg; - int i; - - for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { - pg = conn->ibc_rx_pages->ibp_pages[ipg]; - rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); - - rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, - rx->rx_msg, - IBLND_MSG_SIZE, - DMA_FROM_DEVICE); - LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, - rx->rx_msgaddr)); - KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); - - CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", - i, rx->rx_msg, rx->rx_msgaddr, - (__u64)(page_to_phys(pg) + pg_off)); - - pg_off += IBLND_MSG_SIZE; - LASSERT(pg_off <= PAGE_SIZE); - - if (pg_off == PAGE_SIZE) { - pg_off = 0; - ipg++; - LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); - } - } -} - -static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) -{ - struct kib_hca_dev *hdev = tpo->tpo_hdev; - struct kib_tx *tx; - int i; - - LASSERT(!tpo->tpo_pool.po_allocated); - - if (!hdev) - return; - - for (i = 0; i < tpo->tpo_pool.po_size; i++) { - tx = &tpo->tpo_tx_descs[i]; - kiblnd_dma_unmap_single(hdev->ibh_ibdev, - KIBLND_UNMAP_ADDR(tx, tx_msgunmap, - tx->tx_msgaddr), - IBLND_MSG_SIZE, DMA_TO_DEVICE); - } - - kiblnd_hdev_decref(hdev); - tpo->tpo_hdev = NULL; -} - -static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev) -{ - struct kib_hca_dev *hdev; - unsigned long flags; - int i = 0; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - while (dev->ibd_failover) { - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - if (!(i++ % 50)) - CDEBUG(D_NET, "%s: Wait for failover\n", - dev->ibd_ifname); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 100); - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - } - - kiblnd_hdev_addref_locked(dev->ibd_hdev); - hdev = dev->ibd_hdev; - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - return hdev; -} - -static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo) -{ - struct kib_pages *txpgs = tpo->tpo_tx_pages; - struct kib_pool *pool = &tpo->tpo_pool; - struct kib_net *net = pool->po_owner->ps_net; - struct kib_dev *dev; - struct page *page; - struct kib_tx *tx; - int page_offset; - int ipage; - int i; - - LASSERT(net); - - dev = net->ibn_dev; - - /* pre-mapped messages are not bigger than 1 page */ - BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE); - - tpo->tpo_hdev = kiblnd_current_hdev(dev); - - for (ipage = page_offset = i = 0; i < pool->po_size; i++) { - page = txpgs->ibp_pages[ipage]; - tx = &tpo->tpo_tx_descs[i]; - - tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + - page_offset); - - tx->tx_msgaddr = kiblnd_dma_map_single( - tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, - IBLND_MSG_SIZE, DMA_TO_DEVICE); - LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, - tx->tx_msgaddr)); - KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); - - list_add(&tx->tx_list, &pool->po_free_list); - - page_offset += IBLND_MSG_SIZE; - LASSERT(page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT(ipage <= txpgs->ibp_npages); - } - } -} - -static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) -{ - LASSERT(!fpo->fpo_map_count); - - if (fpo->fpo_is_fmr) { - if (fpo->fmr.fpo_fmr_pool) - ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); - } else { - struct kib_fast_reg_descriptor *frd, *tmp; - int i = 0; - - list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, - frd_list) { - list_del(&frd->frd_list); - ib_dereg_mr(frd->frd_mr); - kfree(frd); - i++; - } - if (i < fpo->fast_reg.fpo_pool_size) - CERROR("FastReg pool still has %d regions registered\n", - fpo->fast_reg.fpo_pool_size - i); - } - - if (fpo->fpo_hdev) - kiblnd_hdev_decref(fpo->fpo_hdev); - - kfree(fpo); -} - -static void kiblnd_destroy_fmr_pool_list(struct list_head *head) -{ - struct kib_fmr_pool *fpo, *tmp; - - list_for_each_entry_safe(fpo, tmp, head, fpo_list) { - list_del(&fpo->fpo_list); - kiblnd_destroy_fmr_pool(fpo); - } -} - -static int -kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, - int ncpts) -{ - int size = tunables->lnd_fmr_pool_size / ncpts; - - return max(IBLND_FMR_POOL, size); -} - -static int -kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, - int ncpts) -{ - int size = tunables->lnd_fmr_flush_trigger / ncpts; - - return max(IBLND_FMR_POOL_FLUSH, size); -} - -static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) -{ - struct ib_fmr_pool_param param = { - .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE, - .page_shift = PAGE_SHIFT, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE), - .pool_size = fps->fps_pool_size, - .dirty_watermark = fps->fps_flush_trigger, - .flush_function = NULL, - .flush_arg = NULL, - .cache = !!fps->fps_cache }; - int rc = 0; - - fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, - ¶m); - if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { - rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); - if (rc != -ENOSYS) - CERROR("Failed to create FMR pool: %d\n", rc); - else - CERROR("FMRs are not supported\n"); - } - - return rc; -} - -static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) -{ - struct kib_fast_reg_descriptor *frd, *tmp; - int i, rc; - - INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); - fpo->fast_reg.fpo_pool_size = 0; - for (i = 0; i < fps->fps_pool_size; i++) { - frd = kzalloc_cpt(sizeof(*frd), GFP_NOFS, fps->fps_cpt); - if (!frd) { - CERROR("Failed to allocate a new fast_reg descriptor\n"); - rc = -ENOMEM; - goto out; - } - - frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, - IB_MR_TYPE_MEM_REG, - LNET_MAX_PAYLOAD / PAGE_SIZE); - if (IS_ERR(frd->frd_mr)) { - rc = PTR_ERR(frd->frd_mr); - CERROR("Failed to allocate ib_alloc_mr: %d\n", rc); - frd->frd_mr = NULL; - goto out_middle; - } - - frd->frd_valid = true; - - list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); - fpo->fast_reg.fpo_pool_size++; - } - - return 0; - -out_middle: - if (frd->frd_mr) - ib_dereg_mr(frd->frd_mr); - kfree(frd); - -out: - list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, - frd_list) { - list_del(&frd->frd_list); - ib_dereg_mr(frd->frd_mr); - kfree(frd); - } - - return rc; -} - -static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, - struct kib_fmr_pool **pp_fpo) -{ - struct kib_dev *dev = fps->fps_net->ibn_dev; - struct ib_device_attr *dev_attr; - struct kib_fmr_pool *fpo; - int rc; - - fpo = kzalloc_cpt(sizeof(*fpo), GFP_NOFS, fps->fps_cpt); - if (!fpo) - return -ENOMEM; - - fpo->fpo_hdev = kiblnd_current_hdev(dev); - dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; - - /* Check for FMR or FastReg support */ - fpo->fpo_is_fmr = 0; - if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { - LCONSOLE_INFO("Using FMR for registration\n"); - fpo->fpo_is_fmr = 1; - } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - LCONSOLE_INFO("Using FastReg for registration\n"); - } else { - rc = -ENOSYS; - LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); - goto out_fpo; - } - - if (fpo->fpo_is_fmr) - rc = kiblnd_alloc_fmr_pool(fps, fpo); - else - rc = kiblnd_alloc_freg_pool(fps, fpo); - if (rc) - goto out_fpo; - - fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - fpo->fpo_owner = fps; - *pp_fpo = fpo; - - return 0; - -out_fpo: - kiblnd_hdev_decref(fpo->fpo_hdev); - kfree(fpo); - return rc; -} - -static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, - struct list_head *zombies) -{ - if (!fps->fps_net) /* initialized? */ - return; - - spin_lock(&fps->fps_lock); - - while (!list_empty(&fps->fps_pool_list)) { - struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, - struct kib_fmr_pool, fpo_list); - fpo->fpo_failed = 1; - list_del(&fpo->fpo_list); - if (!fpo->fpo_map_count) - list_add(&fpo->fpo_list, zombies); - else - list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); - } - - spin_unlock(&fps->fps_lock); -} - -static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) -{ - if (fps->fps_net) { /* initialized? */ - kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); - kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); - } -} - -static int -kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, - struct kib_net *net, - struct lnet_ioctl_config_o2iblnd_tunables *tunables) -{ - struct kib_fmr_pool *fpo; - int rc; - - memset(fps, 0, sizeof(*fps)); - - fps->fps_net = net; - fps->fps_cpt = cpt; - - fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); - fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); - fps->fps_cache = tunables->lnd_fmr_cache; - - spin_lock_init(&fps->fps_lock); - INIT_LIST_HEAD(&fps->fps_pool_list); - INIT_LIST_HEAD(&fps->fps_failed_pool_list); - - rc = kiblnd_create_fmr_pool(fps, &fpo); - if (!rc) - list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); - - return rc; -} - -static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now) -{ - if (fpo->fpo_map_count) /* still in use */ - return 0; - if (fpo->fpo_failed) - return 1; - return time_after_eq(now, fpo->fpo_deadline); -} - -static int -kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) -{ - __u64 *pages = tx->tx_pages; - struct kib_hca_dev *hdev; - int npages; - int size; - int i; - - hdev = tx->tx_pool->tpo_hdev; - - for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { - for (size = 0; size < rd->rd_frags[i].rf_nob; - size += hdev->ibh_page_size) { - pages[npages++] = (rd->rd_frags[i].rf_addr & - hdev->ibh_page_mask) + size; - } - } - - return npages; -} - -void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) -{ - LIST_HEAD(zombies); - struct kib_fmr_pool *fpo = fmr->fmr_pool; - struct kib_fmr_poolset *fps; - unsigned long now = jiffies; - struct kib_fmr_pool *tmp; - int rc; - - if (!fpo) - return; - - fps = fpo->fpo_owner; - if (fpo->fpo_is_fmr) { - if (fmr->fmr_pfmr) { - rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); - LASSERT(!rc); - fmr->fmr_pfmr = NULL; - } - - if (status) { - rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); - LASSERT(!rc); - } - } else { - struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; - - if (frd) { - frd->frd_valid = false; - spin_lock(&fps->fps_lock); - list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); - spin_unlock(&fps->fps_lock); - fmr->fmr_frd = NULL; - } - } - fmr->fmr_pool = NULL; - - spin_lock(&fps->fps_lock); - fpo->fpo_map_count--; /* decref the pool */ - - list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { - /* the first pool is persistent */ - if (fps->fps_pool_list.next == &fpo->fpo_list) - continue; - - if (kiblnd_fmr_pool_is_idle(fpo, now)) { - list_move(&fpo->fpo_list, &zombies); - fps->fps_version++; - } - } - spin_unlock(&fps->fps_lock); - - if (!list_empty(&zombies)) - kiblnd_destroy_fmr_pool_list(&zombies); -} - -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, __u32 nob, __u64 iov, - struct kib_fmr *fmr) -{ - __u64 *pages = tx->tx_pages; - bool is_rx = (rd != tx->tx_rd); - bool tx_pages_mapped = false; - struct kib_fmr_pool *fpo; - int npages = 0; - __u64 version; - int rc; - - again: - spin_lock(&fps->fps_lock); - version = fps->fps_version; - list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { - fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - fpo->fpo_map_count++; - - if (fpo->fpo_is_fmr) { - struct ib_pool_fmr *pfmr; - - spin_unlock(&fps->fps_lock); - - if (!tx_pages_mapped) { - npages = kiblnd_map_tx_pages(tx, rd); - tx_pages_mapped = 1; - } - - pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool, - pages, npages, iov); - if (likely(!IS_ERR(pfmr))) { - fmr->fmr_key = is_rx ? pfmr->fmr->rkey : - pfmr->fmr->lkey; - fmr->fmr_frd = NULL; - fmr->fmr_pfmr = pfmr; - fmr->fmr_pool = fpo; - return 0; - } - rc = PTR_ERR(pfmr); - } else { - if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { - struct kib_fast_reg_descriptor *frd; - struct ib_reg_wr *wr; - struct ib_mr *mr; - int n; - - frd = list_first_entry(&fpo->fast_reg.fpo_pool_list, - struct kib_fast_reg_descriptor, - frd_list); - list_del(&frd->frd_list); - spin_unlock(&fps->fps_lock); - - mr = frd->frd_mr; - - if (!frd->frd_valid) { - __u32 key = is_rx ? mr->rkey : mr->lkey; - struct ib_send_wr *inv_wr; - - inv_wr = &frd->frd_inv_wr; - memset(inv_wr, 0, sizeof(*inv_wr)); - inv_wr->opcode = IB_WR_LOCAL_INV; - inv_wr->wr_id = IBLND_WID_MR; - inv_wr->ex.invalidate_rkey = key; - - /* Bump the key */ - key = ib_inc_rkey(key); - ib_update_fast_reg_key(mr, key); - } - - n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, NULL, PAGE_SIZE); - if (unlikely(n != tx->tx_nfrags)) { - CERROR("Failed to map mr %d/%d elements\n", - n, tx->tx_nfrags); - return n < 0 ? n : -EINVAL; - } - - mr->iova = iov; - - /* Prepare FastReg WR */ - wr = &frd->frd_fastreg_wr; - memset(wr, 0, sizeof(*wr)); - wr->wr.opcode = IB_WR_REG_MR; - wr->wr.wr_id = IBLND_WID_MR; - wr->wr.num_sge = 0; - wr->wr.send_flags = 0; - wr->mr = mr; - wr->key = is_rx ? mr->rkey : mr->lkey; - wr->access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - - fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; - fmr->fmr_frd = frd; - fmr->fmr_pfmr = NULL; - fmr->fmr_pool = fpo; - return 0; - } - spin_unlock(&fps->fps_lock); - rc = -EAGAIN; - } - - spin_lock(&fps->fps_lock); - fpo->fpo_map_count--; - if (rc != -EAGAIN) { - spin_unlock(&fps->fps_lock); - return rc; - } - - /* EAGAIN and ... */ - if (version != fps->fps_version) { - spin_unlock(&fps->fps_lock); - goto again; - } - } - - if (fps->fps_increasing) { - spin_unlock(&fps->fps_lock); - CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n"); - schedule(); - goto again; - } - - if (time_before(jiffies, fps->fps_next_retry)) { - /* someone failed recently */ - spin_unlock(&fps->fps_lock); - return -EAGAIN; - } - - fps->fps_increasing = 1; - spin_unlock(&fps->fps_lock); - - CDEBUG(D_NET, "Allocate new FMR pool\n"); - rc = kiblnd_create_fmr_pool(fps, &fpo); - spin_lock(&fps->fps_lock); - fps->fps_increasing = 0; - if (!rc) { - fps->fps_version++; - list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); - } else { - fps->fps_next_retry = jiffies + IBLND_POOL_RETRY * HZ; - } - spin_unlock(&fps->fps_lock); - - goto again; -} - -static void kiblnd_fini_pool(struct kib_pool *pool) -{ - LASSERT(list_empty(&pool->po_free_list)); - LASSERT(!pool->po_allocated); - - CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); -} - -static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) -{ - CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); - - memset(pool, 0, sizeof(*pool)); - INIT_LIST_HEAD(&pool->po_free_list); - pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - pool->po_owner = ps; - pool->po_size = size; -} - -static void kiblnd_destroy_pool_list(struct list_head *head) -{ - struct kib_pool *pool; - - while (!list_empty(head)) { - pool = list_entry(head->next, struct kib_pool, po_list); - list_del(&pool->po_list); - - LASSERT(pool->po_owner); - pool->po_owner->ps_pool_destroy(pool); - } -} - -static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) -{ - if (!ps->ps_net) /* initialized? */ - return; - - spin_lock(&ps->ps_lock); - while (!list_empty(&ps->ps_pool_list)) { - struct kib_pool *po = list_entry(ps->ps_pool_list.next, - struct kib_pool, po_list); - po->po_failed = 1; - list_del(&po->po_list); - if (!po->po_allocated) - list_add(&po->po_list, zombies); - else - list_add(&po->po_list, &ps->ps_failed_pool_list); - } - spin_unlock(&ps->ps_lock); -} - -static void kiblnd_fini_poolset(struct kib_poolset *ps) -{ - if (ps->ps_net) { /* initialized? */ - kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); - kiblnd_destroy_pool_list(&ps->ps_pool_list); - } -} - -static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt, - struct kib_net *net, char *name, int size, - kib_ps_pool_create_t po_create, - kib_ps_pool_destroy_t po_destroy, - kib_ps_node_init_t nd_init, - kib_ps_node_fini_t nd_fini) -{ - struct kib_pool *pool; - int rc; - - memset(ps, 0, sizeof(*ps)); - - ps->ps_cpt = cpt; - ps->ps_net = net; - ps->ps_pool_create = po_create; - ps->ps_pool_destroy = po_destroy; - ps->ps_node_init = nd_init; - ps->ps_node_fini = nd_fini; - ps->ps_pool_size = size; - if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) - >= sizeof(ps->ps_name)) - return -E2BIG; - spin_lock_init(&ps->ps_lock); - INIT_LIST_HEAD(&ps->ps_pool_list); - INIT_LIST_HEAD(&ps->ps_failed_pool_list); - - rc = ps->ps_pool_create(ps, size, &pool); - if (!rc) - list_add(&pool->po_list, &ps->ps_pool_list); - else - CERROR("Failed to create the first pool for %s\n", ps->ps_name); - - return rc; -} - -static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now) -{ - if (pool->po_allocated) /* still in use */ - return 0; - if (pool->po_failed) - return 1; - return time_after_eq(now, pool->po_deadline); -} - -void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) -{ - LIST_HEAD(zombies); - struct kib_poolset *ps = pool->po_owner; - struct kib_pool *tmp; - unsigned long now = jiffies; - - spin_lock(&ps->ps_lock); - - if (ps->ps_node_fini) - ps->ps_node_fini(pool, node); - - LASSERT(pool->po_allocated > 0); - list_add(node, &pool->po_free_list); - pool->po_allocated--; - - list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { - /* the first pool is persistent */ - if (ps->ps_pool_list.next == &pool->po_list) - continue; - - if (kiblnd_pool_is_idle(pool, now)) - list_move(&pool->po_list, &zombies); - } - spin_unlock(&ps->ps_lock); - - if (!list_empty(&zombies)) - kiblnd_destroy_pool_list(&zombies); -} - -struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps) -{ - struct list_head *node; - struct kib_pool *pool; - unsigned int interval = 1; - unsigned long time_before; - unsigned int trips = 0; - int rc; - - again: - spin_lock(&ps->ps_lock); - list_for_each_entry(pool, &ps->ps_pool_list, po_list) { - if (list_empty(&pool->po_free_list)) - continue; - - pool->po_allocated++; - pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - node = pool->po_free_list.next; - list_del(node); - - if (ps->ps_node_init) { - /* still hold the lock */ - ps->ps_node_init(pool, node); - } - spin_unlock(&ps->ps_lock); - return node; - } - - /* no available tx pool and ... */ - if (ps->ps_increasing) { - /* another thread is allocating a new pool */ - spin_unlock(&ps->ps_lock); - trips++; - CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d HZs for her to complete. trips = %d\n", - ps->ps_name, interval, trips); - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(interval); - if (interval < HZ) - interval *= 2; - - goto again; - } - - if (time_before(jiffies, ps->ps_next_retry)) { - /* someone failed recently */ - spin_unlock(&ps->ps_lock); - return NULL; - } - - ps->ps_increasing = 1; - spin_unlock(&ps->ps_lock); - - CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); - time_before = jiffies; - rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); - CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", - jiffies - time_before); - - spin_lock(&ps->ps_lock); - ps->ps_increasing = 0; - if (!rc) { - list_add_tail(&pool->po_list, &ps->ps_pool_list); - } else { - ps->ps_next_retry = jiffies + IBLND_POOL_RETRY * HZ; - CERROR("Can't allocate new %s pool because out of memory\n", - ps->ps_name); - } - spin_unlock(&ps->ps_lock); - - goto again; -} - -static void kiblnd_destroy_tx_pool(struct kib_pool *pool) -{ - struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool); - int i; - - LASSERT(!pool->po_allocated); - - if (tpo->tpo_tx_pages) { - kiblnd_unmap_tx_pool(tpo); - kiblnd_free_pages(tpo->tpo_tx_pages); - } - - if (!tpo->tpo_tx_descs) - goto out; - - for (i = 0; i < pool->po_size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; - - list_del(&tx->tx_list); - kfree(tx->tx_pages); - kfree(tx->tx_frags); - kfree(tx->tx_wrq); - kfree(tx->tx_sge); - kfree(tx->tx_rd); - } - - kfree(tpo->tpo_tx_descs); -out: - kiblnd_fini_pool(pool); - kfree(tpo); -} - -static int kiblnd_tx_pool_size(int ncpts) -{ - int ntx = *kiblnd_tunables.kib_ntx / ncpts; - - return max(IBLND_TX_POOL, ntx); -} - -static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size, - struct kib_pool **pp_po) -{ - int i; - int npg; - struct kib_pool *pool; - struct kib_tx_pool *tpo; - - tpo = kzalloc_cpt(sizeof(*tpo), GFP_NOFS, ps->ps_cpt); - if (!tpo) { - CERROR("Failed to allocate TX pool\n"); - return -ENOMEM; - } - - pool = &tpo->tpo_pool; - kiblnd_init_pool(ps, pool, size); - tpo->tpo_tx_descs = NULL; - tpo->tpo_tx_pages = NULL; - - npg = DIV_ROUND_UP(size * IBLND_MSG_SIZE, PAGE_SIZE); - if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) { - CERROR("Can't allocate tx pages: %d\n", npg); - kfree(tpo); - return -ENOMEM; - } - - tpo->tpo_tx_descs = kzalloc_cpt(size * sizeof(struct kib_tx), - GFP_NOFS, ps->ps_cpt); - if (!tpo->tpo_tx_descs) { - CERROR("Can't allocate %d tx descriptors\n", size); - ps->ps_pool_destroy(pool); - return -ENOMEM; - } - - memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); - - for (i = 0; i < size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; - - tx->tx_pool = tpo; - if (ps->ps_net->ibn_fmr_ps) { - tx->tx_pages = kzalloc_cpt(LNET_MAX_IOV * sizeof(*tx->tx_pages), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_pages) - break; - } - - tx->tx_frags = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_frags), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_frags) - break; - - sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1); - - tx->tx_wrq = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_wrq) - break; - - tx->tx_sge = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_sge), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_sge) - break; - - tx->tx_rd = kzalloc_cpt(offsetof(struct kib_rdma_desc, - rd_frags[IBLND_MAX_RDMA_FRAGS]), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_rd) - break; - } - - if (i == size) { - kiblnd_map_tx_pool(tpo); - *pp_po = pool; - return 0; - } - - ps->ps_pool_destroy(pool); - return -ENOMEM; -} - -static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) -{ - struct kib_tx_poolset *tps = container_of(pool->po_owner, - struct kib_tx_poolset, - tps_poolset); - struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); - - tx->tx_cookie = tps->tps_next_tx_cookie++; -} - -static void kiblnd_net_fini_pools(struct kib_net *net) -{ - int i; - - cfs_cpt_for_each(i, lnet_cpt_table()) { - struct kib_tx_poolset *tps; - struct kib_fmr_poolset *fps; - - if (net->ibn_tx_ps) { - tps = net->ibn_tx_ps[i]; - kiblnd_fini_poolset(&tps->tps_poolset); - } - - if (net->ibn_fmr_ps) { - fps = net->ibn_fmr_ps[i]; - kiblnd_fini_fmr_poolset(fps); - } - } - - if (net->ibn_tx_ps) { - cfs_percpt_free(net->ibn_tx_ps); - net->ibn_tx_ps = NULL; - } - - if (net->ibn_fmr_ps) { - cfs_percpt_free(net->ibn_fmr_ps); - net->ibn_fmr_ps = NULL; - } -} - -static int kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, - __u32 *cpts, int ncpts) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int cpt; - int rc; - int i; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) { - CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", - tunables->lnd_fmr_pool_size, - *kiblnd_tunables.kib_ntx / 4); - rc = -EINVAL; - goto failed; - } - - /* - * TX pool must be created later than FMR, see LU-2268 - * for details - */ - LASSERT(!net->ibn_tx_ps); - - /* - * premapping can fail if ibd_nmr > 1, so we always create - * FMR pool and map-on-demand if premapping failed - * - * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset - * The number of struct kib_fmr_poolsets create is equal to the - * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt]. - */ - net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_fmr_poolset)); - if (!net->ibn_fmr_ps) { - CERROR("Failed to allocate FMR pool array\n"); - rc = -ENOMEM; - goto failed; - } - - for (i = 0; i < ncpts; i++) { - cpt = !cpts ? i : cpts[i]; - rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, - net, tunables); - if (rc) { - CERROR("Can't initialize FMR pool for CPT %d: %d\n", - cpt, rc); - goto failed; - } - } - - if (i > 0) - LASSERT(i == ncpts); - - /* - * cfs_precpt_alloc is creating an array of struct kib_tx_poolset - * The number of struct kib_tx_poolsets create is equal to the - * number of CPTs that exist, i.e net->ibn_tx_ps[cpt]. - */ - net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_tx_poolset)); - if (!net->ibn_tx_ps) { - CERROR("Failed to allocate tx pool array\n"); - rc = -ENOMEM; - goto failed; - } - - for (i = 0; i < ncpts; i++) { - cpt = !cpts ? i : cpts[i]; - rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, - cpt, net, "TX", - kiblnd_tx_pool_size(ncpts), - kiblnd_create_tx_pool, - kiblnd_destroy_tx_pool, - kiblnd_tx_init, NULL); - if (rc) { - CERROR("Can't initialize TX pool for CPT %d: %d\n", - cpt, rc); - goto failed; - } - } - - return 0; - failed: - kiblnd_net_fini_pools(net); - LASSERT(rc); - return rc; -} - -static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) -{ - /* - * It's safe to assume a HCA can handle a page size - * matching that of the native system - */ - hdev->ibh_page_shift = PAGE_SHIFT; - hdev->ibh_page_size = 1 << PAGE_SHIFT; - hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - - hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; - if (hdev->ibh_mr_size == ~0ULL) { - hdev->ibh_mr_shift = 64; - return 0; - } - - CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); - return -EINVAL; -} - -void kiblnd_hdev_destroy(struct kib_hca_dev *hdev) -{ - if (hdev->ibh_pd) - ib_dealloc_pd(hdev->ibh_pd); - - if (hdev->ibh_cmid) - rdma_destroy_id(hdev->ibh_cmid); - - kfree(hdev); -} - -/* DUMMY */ -static int kiblnd_dummy_callback(struct rdma_cm_id *cmid, - struct rdma_cm_event *event) -{ - return 0; -} - -static int kiblnd_dev_need_failover(struct kib_dev *dev) -{ - struct rdma_cm_id *cmid; - struct sockaddr_in srcaddr; - struct sockaddr_in dstaddr; - int rc; - - if (!dev->ibd_hdev || /* initializing */ - !dev->ibd_hdev->ibh_cmid || /* listener is dead */ - *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ - return 1; - - /* - * XXX: it's UGLY, but I don't have better way to find - * ib-bonding HCA failover because: - * - * a. no reliable CM event for HCA failover... - * b. no OFED API to get ib_device for current net_device... - * - * We have only two choices at this point: - * - * a. rdma_bind_addr(), it will conflict with listener cmid - * b. rdma_resolve_addr() to zero addr - */ - cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); - if (IS_ERR(cmid)) { - rc = PTR_ERR(cmid); - CERROR("Failed to create cmid for failover: %d\n", rc); - return rc; - } - - memset(&srcaddr, 0, sizeof(srcaddr)); - srcaddr.sin_family = AF_INET; - srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); - - memset(&dstaddr, 0, sizeof(dstaddr)); - dstaddr.sin_family = AF_INET; - rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, 1); - if (rc || !cmid->device) { - CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", - dev->ibd_ifname, &dev->ibd_ifip, - cmid->device, rc); - rdma_destroy_id(cmid); - return rc; - } - - rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ - rdma_destroy_id(cmid); - - return rc; -} - -int kiblnd_dev_failover(struct kib_dev *dev) -{ - LIST_HEAD(zombie_tpo); - LIST_HEAD(zombie_ppo); - LIST_HEAD(zombie_fpo); - struct rdma_cm_id *cmid = NULL; - struct kib_hca_dev *hdev = NULL; - struct ib_pd *pd; - struct kib_net *net; - struct sockaddr_in addr; - unsigned long flags; - int rc = 0; - int i; - - LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || - dev->ibd_can_failover || !dev->ibd_hdev); - - rc = kiblnd_dev_need_failover(dev); - if (rc <= 0) - goto out; - - if (dev->ibd_hdev && - dev->ibd_hdev->ibh_cmid) { - /* - * XXX it's not good to close old listener at here, - * because we can fail to create new listener. - * But we have to close it now, otherwise rdma_bind_addr - * will return EADDRINUSE... How crap! - */ - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - cmid = dev->ibd_hdev->ibh_cmid; - /* - * make next schedule of kiblnd_dev_need_failover() - * return 1 for me - */ - dev->ibd_hdev->ibh_cmid = NULL; - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - rdma_destroy_id(cmid); - } - - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); - if (IS_ERR(cmid)) { - rc = PTR_ERR(cmid); - CERROR("Failed to create cmid for failover: %d\n", rc); - goto out; - } - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(dev->ibd_ifip); - addr.sin_port = htons(*kiblnd_tunables.kib_service); - - /* Bind to failover device or port */ - rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); - if (rc || !cmid->device) { - CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", - dev->ibd_ifname, &dev->ibd_ifip, - cmid->device, rc); - rdma_destroy_id(cmid); - goto out; - } - - hdev = kzalloc(sizeof(*hdev), GFP_NOFS); - if (!hdev) { - CERROR("Failed to allocate kib_hca_dev\n"); - rdma_destroy_id(cmid); - rc = -ENOMEM; - goto out; - } - - atomic_set(&hdev->ibh_ref, 1); - hdev->ibh_dev = dev; - hdev->ibh_cmid = cmid; - hdev->ibh_ibdev = cmid->device; - - pd = ib_alloc_pd(cmid->device, 0); - if (IS_ERR(pd)) { - rc = PTR_ERR(pd); - CERROR("Can't allocate PD: %d\n", rc); - goto out; - } - - hdev->ibh_pd = pd; - - rc = rdma_listen(cmid, 0); - if (rc) { - CERROR("Can't start new listener: %d\n", rc); - goto out; - } - - rc = kiblnd_hdev_get_attr(hdev); - if (rc) { - CERROR("Can't get device attributes: %d\n", rc); - goto out; - } - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - swap(dev->ibd_hdev, hdev); /* take over the refcount */ - - list_for_each_entry(net, &dev->ibd_nets, ibn_list) { - cfs_cpt_for_each(i, lnet_cpt_table()) { - kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, - &zombie_tpo); - - if (net->ibn_fmr_ps) - kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], - &zombie_fpo); - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - out: - if (!list_empty(&zombie_tpo)) - kiblnd_destroy_pool_list(&zombie_tpo); - if (!list_empty(&zombie_ppo)) - kiblnd_destroy_pool_list(&zombie_ppo); - if (!list_empty(&zombie_fpo)) - kiblnd_destroy_fmr_pool_list(&zombie_fpo); - if (hdev) - kiblnd_hdev_decref(hdev); - - if (rc) - dev->ibd_failed_failover++; - else - dev->ibd_failed_failover = 0; - - return rc; -} - -void kiblnd_destroy_dev(struct kib_dev *dev) -{ - LASSERT(!dev->ibd_nnets); - LASSERT(list_empty(&dev->ibd_nets)); - - list_del(&dev->ibd_fail_list); - list_del(&dev->ibd_list); - - if (dev->ibd_hdev) - kiblnd_hdev_decref(dev->ibd_hdev); - - kfree(dev); -} - -static struct kib_dev *kiblnd_create_dev(char *ifname) -{ - struct net_device *netdev; - struct kib_dev *dev; - __u32 netmask; - __u32 ip; - int up; - int rc; - - rc = lnet_ipif_query(ifname, &up, &ip, &netmask); - if (rc) { - CERROR("Can't query IPoIB interface %s: %d\n", - ifname, rc); - return NULL; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ifname); - return NULL; - } - - dev = kzalloc(sizeof(*dev), GFP_NOFS); - if (!dev) - return NULL; - - netdev = dev_get_by_name(&init_net, ifname); - if (!netdev) { - dev->ibd_can_failover = 0; - } else { - dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); - dev_put(netdev); - } - - INIT_LIST_HEAD(&dev->ibd_nets); - INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&dev->ibd_fail_list); - dev->ibd_ifip = ip; - strcpy(&dev->ibd_ifname[0], ifname); - - /* initialize the device */ - rc = kiblnd_dev_failover(dev); - if (rc) { - CERROR("Can't initialize device: %d\n", rc); - kfree(dev); - return NULL; - } - - list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs); - return dev; -} - -static void kiblnd_base_shutdown(void) -{ - struct kib_sched_info *sched; - int i; - - LASSERT(list_empty(&kiblnd_data.kib_devs)); - - switch (kiblnd_data.kib_init) { - default: - LBUG(); - - case IBLND_INIT_ALL: - case IBLND_INIT_DATA: - LASSERT(kiblnd_data.kib_peers); - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) - LASSERT(list_empty(&kiblnd_data.kib_peers[i])); - LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); - LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); - LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); - LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); - - /* flag threads to terminate; wake and wait for them to die */ - kiblnd_data.kib_shutdown = 1; - - /* - * NB: we really want to stop scheduler threads net by net - * instead of the whole module, this should be improved - * with dynamic configuration LNet - */ - cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) - wake_up_all(&sched->ibs_waitq); - - wake_up_all(&kiblnd_data.kib_connd_waitq); - wake_up_all(&kiblnd_data.kib_failover_waitq); - - i = 2; - while (atomic_read(&kiblnd_data.kib_nthreads)) { - i++; - /* power of 2 ? */ - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, - "Waiting for %d threads to terminate\n", - atomic_read(&kiblnd_data.kib_nthreads)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - /* fall through */ - - case IBLND_INIT_NOTHING: - break; - } - - kvfree(kiblnd_data.kib_peers); - - if (kiblnd_data.kib_scheds) - cfs_percpt_free(kiblnd_data.kib_scheds); - - kiblnd_data.kib_init = IBLND_INIT_NOTHING; - module_put(THIS_MODULE); -} - -static void kiblnd_shutdown(struct lnet_ni *ni) -{ - struct kib_net *net = ni->ni_data; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - int i; - unsigned long flags; - - LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); - - if (!net) - goto out; - - write_lock_irqsave(g_lock, flags); - net->ibn_shutdown = 1; - write_unlock_irqrestore(g_lock, flags); - - switch (net->ibn_init) { - default: - LBUG(); - - case IBLND_INIT_ALL: - /* nuke all existing peers within this net */ - kiblnd_del_peer(ni, LNET_NID_ANY); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read(&net->ibn_npeers)) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ - "%s: waiting for %d peers to disconnect\n", - libcfs_nid2str(ni->ni_nid), - atomic_read(&net->ibn_npeers)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - kiblnd_net_fini_pools(net); - - write_lock_irqsave(g_lock, flags); - LASSERT(net->ibn_dev->ibd_nnets > 0); - net->ibn_dev->ibd_nnets--; - list_del(&net->ibn_list); - write_unlock_irqrestore(g_lock, flags); - - /* fall through */ - - case IBLND_INIT_NOTHING: - LASSERT(!atomic_read(&net->ibn_nconns)); - - if (net->ibn_dev && !net->ibn_dev->ibd_nnets) - kiblnd_destroy_dev(net->ibn_dev); - - break; - } - - net->ibn_init = IBLND_INIT_NOTHING; - ni->ni_data = NULL; - - kfree(net); - -out: - if (list_empty(&kiblnd_data.kib_devs)) - kiblnd_base_shutdown(); -} - -static int kiblnd_base_startup(void) -{ - struct kib_sched_info *sched; - int rc; - int i; - - LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); - - try_module_get(THIS_MODULE); - /* zero pointers, flags etc */ - memset(&kiblnd_data, 0, sizeof(kiblnd_data)); - - rwlock_init(&kiblnd_data.kib_global_lock); - - INIT_LIST_HEAD(&kiblnd_data.kib_devs); - INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); - - kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; - kiblnd_data.kib_peers = kvmalloc_array(kiblnd_data.kib_peer_hash_size, - sizeof(struct list_head), - GFP_KERNEL); - if (!kiblnd_data.kib_peers) - goto failed; - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); - - spin_lock_init(&kiblnd_data.kib_connd_lock); - INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); - INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); - INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); - INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); - - init_waitqueue_head(&kiblnd_data.kib_connd_waitq); - init_waitqueue_head(&kiblnd_data.kib_failover_waitq); - - kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*sched)); - if (!kiblnd_data.kib_scheds) - goto failed; - - cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { - int nthrs; - - spin_lock_init(&sched->ibs_lock); - INIT_LIST_HEAD(&sched->ibs_conns); - init_waitqueue_head(&sched->ibs_waitq); - - nthrs = cfs_cpt_weight(lnet_cpt_table(), i); - if (*kiblnd_tunables.kib_nscheds > 0) { - nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); - } else { - /* - * max to half of CPUs, another half is reserved for - * upper layer modules - */ - nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); - } - - sched->ibs_nthreads_max = nthrs; - sched->ibs_cpt = i; - } - - kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; - - /* lists/ptrs/locks initialised */ - kiblnd_data.kib_init = IBLND_INIT_DATA; - /*****************************************************/ - - rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); - if (rc) { - CERROR("Can't spawn o2iblnd connd: %d\n", rc); - goto failed; - } - - if (*kiblnd_tunables.kib_dev_failover) - rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, - "kiblnd_failover"); - - if (rc) { - CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - kiblnd_data.kib_init = IBLND_INIT_ALL; - /*****************************************************/ - - return 0; - - failed: - kiblnd_base_shutdown(); - return -ENETDOWN; -} - -static int kiblnd_start_schedulers(struct kib_sched_info *sched) -{ - int rc = 0; - int nthrs; - int i; - - if (!sched->ibs_nthreads) { - if (*kiblnd_tunables.kib_nscheds > 0) { - nthrs = sched->ibs_nthreads_max; - } else { - nthrs = cfs_cpt_weight(lnet_cpt_table(), - sched->ibs_cpt); - nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); - nthrs = min(IBLND_N_SCHED_HIGH, nthrs); - } - } else { - LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); - /* increase one thread if there is new interface */ - nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; - } - - for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - - id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); - snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", - KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); - rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); - if (!rc) - continue; - - CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - sched->ibs_cpt, sched->ibs_nthreads + i, rc); - break; - } - - sched->ibs_nthreads += i; - return rc; -} - -static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts, - int ncpts) -{ - int cpt; - int rc; - int i; - - for (i = 0; i < ncpts; i++) { - struct kib_sched_info *sched; - - cpt = !cpts ? i : cpts[i]; - sched = kiblnd_data.kib_scheds[cpt]; - - if (!newdev && sched->ibs_nthreads > 0) - continue; - - rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); - if (rc) { - CERROR("Failed to start scheduler threads for %s\n", - dev->ibd_ifname); - return rc; - } - } - return 0; -} - -static struct kib_dev *kiblnd_dev_search(char *ifname) -{ - struct kib_dev *alias = NULL; - struct kib_dev *dev; - char *colon; - char *colon2; - - colon = strchr(ifname, ':'); - list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (!strcmp(&dev->ibd_ifname[0], ifname)) - return dev; - - if (alias) - continue; - - colon2 = strchr(dev->ibd_ifname, ':'); - if (colon) - *colon = 0; - if (colon2) - *colon2 = 0; - - if (!strcmp(&dev->ibd_ifname[0], ifname)) - alias = dev; - - if (colon) - *colon = ':'; - if (colon2) - *colon2 = ':'; - } - return alias; -} - -static int kiblnd_startup(struct lnet_ni *ni) -{ - char *ifname; - struct kib_dev *ibdev = NULL; - struct kib_net *net; - struct timespec64 tv; - unsigned long flags; - int rc; - int newdev; - - LASSERT(ni->ni_lnd == &the_o2iblnd); - - if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { - rc = kiblnd_base_startup(); - if (rc) - return rc; - } - - net = kzalloc(sizeof(*net), GFP_NOFS); - ni->ni_data = net; - if (!net) - goto net_failed; - - ktime_get_real_ts64(&tv); - net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC + - tv.tv_nsec / NSEC_PER_USEC; - - rc = kiblnd_tunables_setup(ni); - if (rc) - goto net_failed; - - if (ni->ni_interfaces[0]) { - /* Use the IPoIB interface specified in 'networks=' */ - - BUILD_BUG_ON(LNET_MAX_INTERFACES <= 1); - if (ni->ni_interfaces[1]) { - CERROR("Multiple interfaces not supported\n"); - goto failed; - } - - ifname = ni->ni_interfaces[0]; - } else { - ifname = *kiblnd_tunables.kib_default_ipif; - } - - if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { - CERROR("IPoIB interface name too long: %s\n", ifname); - goto failed; - } - - ibdev = kiblnd_dev_search(ifname); - - newdev = !ibdev; - /* hmm...create kib_dev even for alias */ - if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname)) - ibdev = kiblnd_create_dev(ifname); - - if (!ibdev) - goto failed; - - net->ibn_dev = ibdev; - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); - - rc = kiblnd_dev_start_threads(ibdev, newdev, - ni->ni_cpts, ni->ni_ncpts); - if (rc) - goto failed; - - rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); - if (rc) { - CERROR("Failed to initialize NI pools: %d\n", rc); - goto failed; - } - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - ibdev->ibd_nnets++; - list_add_tail(&net->ibn_list, &ibdev->ibd_nets); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - net->ibn_init = IBLND_INIT_ALL; - - return 0; - -failed: - if (!net->ibn_dev && ibdev) - kiblnd_destroy_dev(ibdev); - -net_failed: - kiblnd_shutdown(ni); - - CDEBUG(D_NET, "%s failed\n", __func__); - return -ENETDOWN; -} - -static struct lnet_lnd the_o2iblnd = { - .lnd_type = O2IBLND, - .lnd_startup = kiblnd_startup, - .lnd_shutdown = kiblnd_shutdown, - .lnd_ctl = kiblnd_ctl, - .lnd_query = kiblnd_query, - .lnd_send = kiblnd_send, - .lnd_recv = kiblnd_recv, -}; - -static void __exit ko2iblnd_exit(void) -{ - lnet_unregister_lnd(&the_o2iblnd); -} - -static int __init ko2iblnd_init(void) -{ - int rc; - - BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE); - BUILD_BUG_ON(offsetof(struct kib_msg, - ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) - > IBLND_MSG_SIZE); - BUILD_BUG_ON(offsetof(struct kib_msg, - ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) - > IBLND_MSG_SIZE); - - kiblnd_tunables_init(); - - rc = libcfs_setup(); - if (rc) - return rc; - - lnet_register_lnd(&the_o2iblnd); - - return 0; -} - -MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>"); -MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); -MODULE_VERSION("2.7.0"); -MODULE_LICENSE("GPL"); - -module_init(ko2iblnd_init); -module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h deleted file mode 100644 index 217503f..0000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h +++ /dev/null @@ -1,1048 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd.h - * - * Author: Eric Barton <eric@bartonsoftware.com> - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/unistd.h> -#include <linux/uio.h> -#include <linux/uaccess.h> - -#include <linux/io.h> - -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/list.h> -#include <linux/kmod.h> -#include <linux/sysctl.h> -#include <linux/pci.h> - -#include <net/sock.h> -#include <linux/in.h> - -#include <rdma/rdma_cm.h> -#include <rdma/ib_cm.h> -#include <rdma/ib_verbs.h> -#include <rdma/ib_fmr_pool.h> - -#define DEBUG_SUBSYSTEM S_LND - -#include <linux/lnet/lib-lnet.h> - -#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ -/* # scheduler loops before reschedule */ -#define IBLND_RESCHED 100 - -#define IBLND_N_SCHED 2 -#define IBLND_N_SCHED_HIGH 4 - -struct kib_tunables { - int *kib_dev_failover; /* HCA failover */ - unsigned int *kib_service; /* IB service number */ - int *kib_min_reconnect_interval; /* first failed connection retry... */ - int *kib_max_reconnect_interval; /* exponentially increasing to this */ - int *kib_cksum; /* checksum struct kib_msg? */ - int *kib_timeout; /* comms timeout (seconds) */ - int *kib_keepalive; /* keepalive timeout (seconds) */ - int *kib_ntx; /* # tx descs */ - char **kib_default_ipif; /* default IPoIB interface */ - int *kib_retry_count; - int *kib_rnr_retry_count; - int *kib_ib_mtu; /* IB MTU */ - int *kib_require_priv_port; /* accept only privileged ports */ - int *kib_use_priv_port; /* use privileged port for active connect */ - int *kib_nscheds; /* # threads on each CPT */ -}; - -extern struct kib_tunables kiblnd_tunables; - -#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ -#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ - -#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ -#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *)0)->ibm_credits)) - 1) /* Max # of peer credits */ - -/* when eagerly to return credits */ -#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ - IBLND_CREDIT_HIGHWATER_V1 : \ - t->lnd_peercredits_hiw) - -#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \ - cb, dev, \ - ps, qpt) - -/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ -#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) -#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) - -#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */ -#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */ -#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */ - -/************************/ -/* derived constants... */ -/* Pools (shared by connections on each CPT) */ -/* These pools can grow at runtime, so don't need give a very large value */ -#define IBLND_TX_POOL 256 -#define IBLND_FMR_POOL 256 -#define IBLND_FMR_POOL_FLUSH 192 - -#define IBLND_RX_MSGS(c) \ - ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) -#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) -#define IBLND_RX_MSG_PAGES(c) \ - ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) - -/* WRs and CQEs (per connection) */ -#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) -#define IBLND_SEND_WRS(c) \ - (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \ - kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni)) -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) - -struct kib_hca_dev; - -/* o2iblnd can run over aliased interface */ -#ifdef IFALIASZ -#define KIB_IFNAME_SIZE IFALIASZ -#else -#define KIB_IFNAME_SIZE 256 -#endif - -struct kib_dev { - struct list_head ibd_list; /* chain on kib_devs */ - struct list_head ibd_fail_list; /* chain on kib_failed_devs */ - __u32 ibd_ifip; /* IPoIB interface IP */ - - /* IPoIB interface name */ - char ibd_ifname[KIB_IFNAME_SIZE]; - int ibd_nnets; /* # nets extant */ - - unsigned long ibd_next_failover; - int ibd_failed_failover; /* # failover failures */ - unsigned int ibd_failover; /* failover in progress */ - unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */ - struct list_head ibd_nets; - struct kib_hca_dev *ibd_hdev; -}; - -struct kib_hca_dev { - struct rdma_cm_id *ibh_cmid; /* listener cmid */ - struct ib_device *ibh_ibdev; /* IB device */ - int ibh_page_shift; /* page shift of current HCA */ - int ibh_page_size; /* page size of current HCA */ - __u64 ibh_page_mask; /* page mask of current HCA */ - int ibh_mr_shift; /* bits shift of max MR size */ - __u64 ibh_mr_size; /* size of MR */ - struct ib_pd *ibh_pd; /* PD */ - struct kib_dev *ibh_dev; /* owner */ - atomic_t ibh_ref; /* refcount */ -}; - -/** # of seconds to keep pool alive */ -#define IBLND_POOL_DEADLINE 300 -/** # of seconds to retry if allocation failed */ -#define IBLND_POOL_RETRY 1 - -struct kib_pages { - int ibp_npages; /* # pages */ - struct page *ibp_pages[0]; /* page array */ -}; - -struct kib_pool; -struct kib_poolset; - -typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, - int inc, struct kib_pool **pp_po); -typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); -typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); -typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); - -struct kib_net; - -#define IBLND_POOL_NAME_LEN 32 - -struct kib_poolset { - spinlock_t ps_lock; /* serialize */ - struct kib_net *ps_net; /* network it belongs to */ - char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ - struct list_head ps_pool_list; /* list of pools */ - struct list_head ps_failed_pool_list;/* failed pool list */ - unsigned long ps_next_retry; /* time stamp for retry if */ - /* failed to allocate */ - int ps_increasing; /* is allocating new pool */ - int ps_pool_size; /* new pool size */ - int ps_cpt; /* CPT id */ - - kib_ps_pool_create_t ps_pool_create; /* create a new pool */ - kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ - kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ - kib_ps_node_fini_t ps_node_fini; /* finalize node */ -}; - -struct kib_pool { - struct list_head po_list; /* chain on pool list */ - struct list_head po_free_list; /* pre-allocated node */ - struct kib_poolset *po_owner; /* pool_set of this pool */ - unsigned long po_deadline; /* deadline of this pool */ - int po_allocated; /* # of elements in use */ - int po_failed; /* pool is created on failed HCA */ - int po_size; /* # of pre-allocated elements */ -}; - -struct kib_tx_poolset { - struct kib_poolset tps_poolset; /* pool-set */ - __u64 tps_next_tx_cookie; /* cookie of TX */ -}; - -struct kib_tx_pool { - struct kib_pool tpo_pool; /* pool */ - struct kib_hca_dev *tpo_hdev; /* device for this pool */ - struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ - struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ -}; - -struct kib_fmr_poolset { - spinlock_t fps_lock; /* serialize */ - struct kib_net *fps_net; /* IB network */ - struct list_head fps_pool_list; /* FMR pool list */ - struct list_head fps_failed_pool_list;/* FMR pool list */ - __u64 fps_version; /* validity stamp */ - int fps_cpt; /* CPT id */ - int fps_pool_size; - int fps_flush_trigger; - int fps_cache; - int fps_increasing; /* is allocating new pool */ - unsigned long fps_next_retry; /* time stamp for retry if*/ - /* failed to allocate */ -}; - -struct kib_fast_reg_descriptor { /* For fast registration */ - struct list_head frd_list; - struct ib_send_wr frd_inv_wr; - struct ib_reg_wr frd_fastreg_wr; - struct ib_mr *frd_mr; - bool frd_valid; -}; - -struct kib_fmr_pool { - struct list_head fpo_list; /* chain on pool list */ - struct kib_hca_dev *fpo_hdev; /* device for this pool */ - struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ - union { - struct { - struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ - } fmr; - struct { /* For fast registration */ - struct list_head fpo_pool_list; - int fpo_pool_size; - } fast_reg; - }; - unsigned long fpo_deadline; /* deadline of this pool */ - int fpo_failed; /* fmr pool is failed */ - int fpo_map_count; /* # of mapped FMR */ - int fpo_is_fmr; -}; - -struct kib_fmr { - struct kib_fmr_pool *fmr_pool; /* pool of FMR */ - struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ - struct kib_fast_reg_descriptor *fmr_frd; - u32 fmr_key; -}; - -struct kib_net { - struct list_head ibn_list; /* chain on struct kib_dev::ibd_nets */ - __u64 ibn_incarnation;/* my epoch */ - int ibn_init; /* initialisation state */ - int ibn_shutdown; /* shutting down? */ - - atomic_t ibn_npeers; /* # peers extant */ - atomic_t ibn_nconns; /* # connections extant */ - - struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ - struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ - - struct kib_dev *ibn_dev; /* underlying IB device */ -}; - -#define KIB_THREAD_SHIFT 16 -#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) -#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) -#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) - -struct kib_sched_info { - spinlock_t ibs_lock; /* serialise */ - wait_queue_head_t ibs_waitq; /* schedulers sleep here */ - struct list_head ibs_conns; /* conns to check for rx completions */ - int ibs_nthreads; /* number of scheduler threads */ - int ibs_nthreads_max; /* max allowed scheduler threads */ - int ibs_cpt; /* CPT id */ -}; - -struct kib_data { - int kib_init; /* initialisation state */ - int kib_shutdown; /* shut down? */ - struct list_head kib_devs; /* IB devices extant */ - struct list_head kib_failed_devs; /* list head of failed devices */ - wait_queue_head_t kib_failover_waitq; /* schedulers sleep here */ - atomic_t kib_nthreads; /* # live threads */ - rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */ - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - void *kib_connd; /* the connd task (serialisation assertions) */ - struct list_head kib_connd_conns; /* connections to setup/teardown */ - struct list_head kib_connd_zombies; /* connections with zero refcount */ - /* connections to reconnect */ - struct list_head kib_reconn_list; - /* peers wait for reconnection */ - struct list_head kib_reconn_wait; - /** - * The second that peers are pulled out from \a kib_reconn_wait - * for reconnection. - */ - time64_t kib_reconn_sec; - - wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */ - spinlock_t kib_connd_lock; /* serialise */ - struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ - struct kib_sched_info **kib_scheds; /* percpt data for schedulers */ -}; - -#define IBLND_INIT_NOTHING 0 -#define IBLND_INIT_DATA 1 -#define IBLND_INIT_ALL 2 - -/************************************************************************ - * IB Wire message format. - * These are sent in sender's byte order (i.e. receiver flips). - */ - -struct kib_connparams { - __u16 ibcp_queue_depth; - __u16 ibcp_max_frags; - __u32 ibcp_max_msg_size; -} WIRE_ATTR; - -struct kib_immediate_msg { - struct lnet_hdr ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR; - -struct kib_rdma_frag { - __u32 rf_nob; /* # bytes this frag */ - __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */ -} WIRE_ATTR; - -struct kib_rdma_desc { - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrags; /* # fragments */ - struct kib_rdma_frag rd_frags[0]; /* buffer frags */ -} WIRE_ATTR; - -struct kib_putreq_msg { - struct lnet_hdr ibprm_hdr; /* portals header */ - __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR; - -struct kib_putack_msg { - __u64 ibpam_src_cookie; /* reflected completion cookie */ - __u64 ibpam_dst_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR; - -struct kib_get_msg { - struct lnet_hdr ibgm_hdr; /* portals header */ - __u64 ibgm_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR; - -struct kib_completion_msg { - __u64 ibcm_cookie; /* opaque completion cookie */ - __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR; - -struct kib_msg { - /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an ibnal message */ - __u16 ibm_version; /* this is my version number */ - - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ - __u32 ibm_nob; /* # bytes in whole message */ - __u32 ibm_cksum; /* checksum (0 == no checksum) */ - __u64 ibm_srcnid; /* sender's NID */ - __u64 ibm_srcstamp; /* sender's incarnation */ - __u64 ibm_dstnid; /* destination's NID */ - __u64 ibm_dststamp; /* destination's incarnation */ - - union { - struct kib_connparams connparams; - struct kib_immediate_msg immediate; - struct kib_putreq_msg putreq; - struct kib_putack_msg putack; - struct kib_get_msg get; - struct kib_completion_msg completion; - } WIRE_ATTR ibm_u; -} WIRE_ATTR; - -#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ - -#define IBLND_MSG_VERSION_1 0x11 -#define IBLND_MSG_VERSION_2 0x12 -#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 - -#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ -#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ -#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ -#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ -#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ -#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ -#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ -#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ -#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ - -struct kib_rej { - __u32 ibr_magic; /* sender's magic */ - __u16 ibr_version; /* sender's version */ - __u8 ibr_why; /* reject reason */ - __u8 ibr_padding; /* padding */ - __u64 ibr_incarnation; /* incarnation of peer */ - struct kib_connparams ibr_cp; /* connection parameters */ -} WIRE_ATTR; - -/* connection rejection reasons */ -#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ -#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ -#define IBLND_REJECT_FATAL 3 /* Anything else */ -#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ -#define IBLND_REJECT_CONN_STALE 5 /* stale peer */ -/* peer's rdma frags doesn't match mine */ -#define IBLND_REJECT_RDMA_FRAGS 6 -/* peer's msg queue size doesn't match mine */ -#define IBLND_REJECT_MSG_QUEUE_SIZE 7 - -/***********************************************************************/ - -struct kib_rx { /* receive message */ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_nob; /* # bytes received (-1 while posted) */ - enum ib_wc_status rx_status; /* completion status */ - struct kib_msg *rx_msg; /* message buffer (host vaddr) */ - __u64 rx_msgaddr; /* message buffer (I/O addr) */ - DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); /* for dma_unmap_single() */ - struct ib_recv_wr rx_wrq; /* receive work item... */ - struct ib_sge rx_sge; /* ...and its memory */ -}; - -#define IBLND_POSTRX_DONT_POST 0 /* don't post */ -#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ -#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ -#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give self back 1 reserved credit */ - -struct kib_tx { /* transmit message */ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - struct kib_tx_pool *tx_pool; /* pool I'm from */ - struct kib_conn *tx_conn; /* owning conn */ - short tx_sending; /* # tx callbacks outstanding */ - short tx_queued; /* queued for sending */ - short tx_waiting; /* waiting for peer */ - int tx_status; /* LNET completion status */ - unsigned long tx_deadline; /* completion deadline */ - __u64 tx_cookie; /* completion cookie */ - struct lnet_msg *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ - struct kib_msg *tx_msg; /* message buffer (host vaddr) */ - __u64 tx_msgaddr; /* message buffer (I/O addr) */ - DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); /* for dma_unmap_single() */ - int tx_nwrq; /* # send work items */ - struct ib_rdma_wr *tx_wrq; /* send work items... */ - struct ib_sge *tx_sge; /* ...and their memory */ - struct kib_rdma_desc *tx_rd; /* rdma descriptor */ - int tx_nfrags; /* # entries in... */ - struct scatterlist *tx_frags; /* dma_map_sg descriptor */ - __u64 *tx_pages; /* rdma phys page addrs */ - struct kib_fmr fmr; /* FMR */ - int tx_dmadir; /* dma direction */ -}; - -struct kib_connvars { - struct kib_msg cv_msg; /* connection-in-progress variables */ -}; - -struct kib_conn { - struct kib_sched_info *ibc_sched; /* scheduler information */ - struct kib_peer *ibc_peer; /* owning peer */ - struct kib_hca_dev *ibc_hdev; /* HCA bound on */ - struct list_head ibc_list; /* stash on peer's conn list */ - struct list_head ibc_sched_list; /* schedule for attention */ - __u16 ibc_version; /* version of connection */ - /* reconnect later */ - __u16 ibc_reconnect:1; - __u64 ibc_incarnation; /* which instance of the peer */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_noops_posted; /* # uncompleted NOOPs */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_reserved_credits; /* # ACK/DONE msg credits */ - int ibc_comms_error; /* set on comms error */ - /* connections queue depth */ - __u16 ibc_queue_depth; - /* connections max frags */ - __u16 ibc_max_frags; - unsigned int ibc_nrx:16; /* receive buffers owned */ - unsigned int ibc_scheduled:1; /* scheduled for attention */ - unsigned int ibc_ready:1; /* CQ callback fired */ - unsigned long ibc_last_send; /* time of last send */ - struct list_head ibc_connd_list; /* link chain for */ - /* kiblnd_check_conns only */ - struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ - struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs for */ - /* IBLND_MSG_VERSION_1 */ - struct list_head ibc_tx_queue; /* sends that need a credit */ - struct list_head ibc_tx_queue_nocred; /* sends that don't need a */ - /* credit */ - struct list_head ibc_tx_queue_rsrvd; /* sends that need to */ - /* reserve an ACK/DONE msg */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - struct kib_rx *ibc_rxs; /* the rx descs */ - struct kib_pages *ibc_rx_pages; /* premapped rx msg pages */ - - struct rdma_cm_id *ibc_cmid; /* CM id */ - struct ib_cq *ibc_cq; /* completion queue */ - - struct kib_connvars *ibc_connvars; /* in-progress connection state */ -}; - -#define IBLND_CONN_INIT 0 /* being initialised */ -#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ -#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ -#define IBLND_CONN_ESTABLISHED 3 /* connection established */ -#define IBLND_CONN_CLOSING 4 /* being closed */ -#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ - -struct kib_peer { - struct list_head ibp_list; /* stash on global peer list */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ - struct lnet_ni *ibp_ni; /* LNet interface */ - struct list_head ibp_conns; /* all active connections */ - struct kib_conn *ibp_next_conn; /* next connection to send on for - * round robin */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - __u64 ibp_incarnation; /* incarnation of peer */ - /* when (in jiffies) I was last alive */ - unsigned long ibp_last_alive; - /* # users */ - atomic_t ibp_refcount; - /* version of peer */ - __u16 ibp_version; - /* current passive connection attempts */ - unsigned short ibp_accepting; - /* current active connection attempts */ - unsigned short ibp_connecting; - /* reconnect this peer later */ - unsigned char ibp_reconnecting; - /* counter of how many times we triggered a conn race */ - unsigned char ibp_races; - /* # consecutive reconnection attempts to this peer */ - unsigned int ibp_reconnected; - /* errno on closing this peer */ - int ibp_error; - /* max map_on_demand */ - __u16 ibp_max_frags; - /* max_peer_credits */ - __u16 ibp_queue_depth; -}; - -extern struct kib_data kiblnd_data; - -void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); - -int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); - -/* max # of fragments configured by user */ -static inline int -kiblnd_cfg_rdma_frags(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - mod = tunables->lnd_map_on_demand; - return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT; -} - -static inline int -kiblnd_rdma_frags(int version, struct lnet_ni *ni) -{ - return version == IBLND_MSG_VERSION_1 ? - (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) : - kiblnd_cfg_rdma_frags(ni); -} - -static inline int -kiblnd_concurrent_sends(int version, struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int concurrent_sends; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - concurrent_sends = tunables->lnd_concurrent_sends; - - if (version == IBLND_MSG_VERSION_1) { - if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) - return IBLND_MSG_QUEUE_SIZE_V1 * 2; - - if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) - return IBLND_MSG_QUEUE_SIZE_V1 / 2; - } - - return concurrent_sends; -} - -static inline void -kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) -{ - LASSERT(atomic_read(&hdev->ibh_ref) > 0); - atomic_inc(&hdev->ibh_ref); -} - -static inline void -kiblnd_hdev_decref(struct kib_hca_dev *hdev) -{ - LASSERT(atomic_read(&hdev->ibh_ref) > 0); - if (atomic_dec_and_test(&hdev->ibh_ref)) - kiblnd_hdev_destroy(hdev); -} - -static inline int -kiblnd_dev_can_failover(struct kib_dev *dev) -{ - if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ - return 0; - - if (!*kiblnd_tunables.kib_dev_failover) /* disabled */ - return 0; - - if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ - return 1; - - return dev->ibd_can_failover; -} - -#define kiblnd_conn_addref(conn) \ -do { \ - CDEBUG(D_NET, "conn[%p] (%d)++\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - atomic_inc(&(conn)->ibc_refcount); \ -} while (0) - -#define kiblnd_conn_decref(conn) \ -do { \ - unsigned long flags; \ - \ - CDEBUG(D_NET, "conn[%p] (%d)--\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ - if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ - spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ - list_add_tail(&(conn)->ibc_list, \ - &kiblnd_data.kib_connd_zombies); \ - wake_up(&kiblnd_data.kib_connd_waitq); \ - spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ - } \ -} while (0) - -#define kiblnd_peer_addref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read(&(peer)->ibp_refcount)); \ - atomic_inc(&(peer)->ibp_refcount); \ -} while (0) - -#define kiblnd_peer_decref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read(&(peer)->ibp_refcount)); \ - LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \ - if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ - kiblnd_destroy_peer(peer); \ -} while (0) - -static inline bool -kiblnd_peer_connecting(struct kib_peer *peer) -{ - return peer->ibp_connecting || - peer->ibp_reconnecting || - peer->ibp_accepting; -} - -static inline bool -kiblnd_peer_idle(struct kib_peer *peer) -{ - return !kiblnd_peer_connecting(peer) && list_empty(&peer->ibp_conns); -} - -static inline struct list_head * -kiblnd_nid2peerlist(lnet_nid_t nid) -{ - unsigned int hash = - ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; - - return &kiblnd_data.kib_peers[hash]; -} - -static inline int -kiblnd_peer_active(struct kib_peer *peer) -{ - /* Am I in the peer hash table? */ - return !list_empty(&peer->ibp_list); -} - -static inline struct kib_conn * -kiblnd_get_conn_locked(struct kib_peer *peer) -{ - struct list_head *next; - - LASSERT(!list_empty(&peer->ibp_conns)); - - /* Advance to next connection, be sure to skip the head node */ - if (!peer->ibp_next_conn || - peer->ibp_next_conn->ibc_list.next == &peer->ibp_conns) - next = peer->ibp_conns.next; - else - next = peer->ibp_next_conn->ibc_list.next; - peer->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); - - return peer->ibp_next_conn; -} - -static inline int -kiblnd_send_keepalive(struct kib_conn *conn) -{ - return (*kiblnd_tunables.kib_keepalive > 0) && - time_after(jiffies, conn->ibc_last_send + - msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * - MSEC_PER_SEC)); -} - -static inline int -kiblnd_need_noop(struct kib_conn *conn) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - if (conn->ibc_outstanding_credits < - IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && - !kiblnd_send_keepalive(conn)) - return 0; /* No need to send NOOP */ - - if (IBLND_OOB_CAPABLE(conn->ibc_version)) { - if (!list_empty(&conn->ibc_tx_queue_nocred)) - return 0; /* NOOP can be piggybacked */ - - /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (list_empty(&conn->ibc_tx_queue) || - !conn->ibc_credits); - } - - if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ - !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ - !conn->ibc_credits) /* no credit */ - return 0; - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - !conn->ibc_outstanding_credits) /* giving back credits */ - return 0; - - /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); -} - -static inline void -kiblnd_abort_receives(struct kib_conn *conn) -{ - ib_modify_qp(conn->ibc_cmid->qp, - &kiblnd_data.kib_error_qpa, IB_QP_STATE); -} - -static inline const char * -kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) -{ - if (q == &conn->ibc_tx_queue) - return "tx_queue"; - - if (q == &conn->ibc_tx_queue_rsrvd) - return "tx_queue_rsrvd"; - - if (q == &conn->ibc_tx_queue_nocred) - return "tx_queue_nocred"; - - if (q == &conn->ibc_active_txs) - return "active_txs"; - - LBUG(); - return NULL; -} - -/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */ -/* lowest bits of the work request id to stash the work item type. */ - -#define IBLND_WID_INVAL 0 -#define IBLND_WID_TX 1 -#define IBLND_WID_RX 2 -#define IBLND_WID_RDMA 3 -#define IBLND_WID_MR 4 -#define IBLND_WID_MASK 7UL - -static inline __u64 -kiblnd_ptr2wreqid(void *ptr, int type) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT(!(lptr & IBLND_WID_MASK)); - LASSERT(!(type & ~IBLND_WID_MASK)); - return (__u64)(lptr | type); -} - -static inline void * -kiblnd_wreqid2ptr(__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); -} - -static inline int -kiblnd_wreqid2type(__u64 wreqid) -{ - return wreqid & IBLND_WID_MASK; -} - -static inline void -kiblnd_set_conn_state(struct kib_conn *conn, int state) -{ - conn->ibc_state = state; - mb(); -} - -static inline void -kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) -{ - msg->ibm_type = type; - msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; -} - -static inline int -kiblnd_rd_size(struct kib_rdma_desc *rd) -{ - int i; - int size; - - for (i = size = 0; i < rd->rd_nfrags; i++) - size += rd->rd_frags[i].rf_nob; - - return size; -} - -static inline __u64 -kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_frags[index].rf_addr; -} - -static inline __u32 -kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_frags[index].rf_nob; -} - -static inline __u32 -kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_key; -} - -static inline int -kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) -{ - if (nob < rd->rd_frags[index].rf_nob) { - rd->rd_frags[index].rf_addr += nob; - rd->rd_frags[index].rf_nob -= nob; - } else { - index++; - } - - return index; -} - -static inline int -kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) -{ - LASSERT(msgtype == IBLND_MSG_GET_REQ || - msgtype == IBLND_MSG_PUT_ACK); - - return msgtype == IBLND_MSG_GET_REQ ? - offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : - offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); -} - -static inline __u64 -kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) -{ - return ib_dma_mapping_error(dev, dma_addr); -} - -static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, - void *msg, size_t size, - enum dma_data_direction direction) -{ - return ib_dma_map_single(dev, msg, size, direction); -} - -static inline void kiblnd_dma_unmap_single(struct ib_device *dev, - __u64 addr, size_t size, - enum dma_data_direction direction) -{ - ib_dma_unmap_single(dev, addr, size, direction); -} - -#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) -#define KIBLND_UNMAP_ADDR(p, m, a) (a) - -static inline int kiblnd_dma_map_sg(struct ib_device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - return ib_dma_map_sg(dev, sg, nents, direction); -} - -static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - ib_dma_unmap_sg(dev, sg, nents, direction); -} - -static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, - struct scatterlist *sg) -{ - return ib_sg_dma_address(dev, sg); -} - -static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return ib_sg_dma_len(dev, sg); -} - -/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly */ -/* right because OFED1.2 defines it as const, to use it we have to add */ -/* (void *) cast to overcome "const" */ - -#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) -#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) - -void kiblnd_map_rx_descs(struct kib_conn *conn); -void kiblnd_unmap_rx_descs(struct kib_conn *conn); -void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); -struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); - -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, __u32 nob, __u64 iov, - struct kib_fmr *fmr); -void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); - -int kiblnd_tunables_setup(struct lnet_ni *ni); -void kiblnd_tunables_init(void); - -int kiblnd_connd(void *arg); -int kiblnd_scheduler(void *arg); -int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); -int kiblnd_failover_thread(void *arg); - -int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); - -int kiblnd_cm_callback(struct rdma_cm_id *cmid, - struct rdma_cm_event *event); -int kiblnd_translate_mtu(int value); - -int kiblnd_dev_failover(struct kib_dev *dev); -int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp, - lnet_nid_t nid); -void kiblnd_destroy_peer(struct kib_peer *peer); -bool kiblnd_reconnect_peer(struct kib_peer *peer); -void kiblnd_destroy_dev(struct kib_dev *dev); -void kiblnd_unlink_peer_locked(struct kib_peer *peer); -struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid); -int kiblnd_close_stale_conns_locked(struct kib_peer *peer, - int version, __u64 incarnation); -int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why); - -struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, - struct rdma_cm_id *cmid, - int state, int version); -void kiblnd_destroy_conn(struct kib_conn *conn); -void kiblnd_close_conn(struct kib_conn *conn, int error); -void kiblnd_close_conn_locked(struct kib_conn *conn, int error); - -void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid); -void kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, - int status); - -void kiblnd_qp_event(struct ib_event *event, void *arg); -void kiblnd_cq_event(struct ib_event *event, void *arg); -void kiblnd_cq_completion(struct ib_cq *cq, void *arg); - -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp); -int kiblnd_unpack_msg(struct kib_msg *msg, int nob); -int kiblnd_post_rx(struct kib_rx *rx, int credit); - -int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); -int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c deleted file mode 100644 index 65b7a62..0000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ /dev/null @@ -1,3763 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd_cb.c - * - * Author: Eric Barton <eric@bartonsoftware.com> - */ - -#include <linux/highmem.h> -#include "o2iblnd.h" - -#define MAX_CONN_RACES_BEFORE_ABORT 20 - -static void kiblnd_peer_alive(struct kib_peer *peer); -static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error); -static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, - int type, int body_nob); -static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, - __u64 dstcookie); -static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_unmap_tx(struct kib_tx *tx); -static void kiblnd_check_sends_locked(struct kib_conn *conn); - -static void -kiblnd_tx_done(struct lnet_ni *ni, struct kib_tx *tx) -{ - struct lnet_msg *lntmsg[2]; - struct kib_net *net = ni->ni_data; - int rc; - int i; - - LASSERT(net); - LASSERT(!in_interrupt()); - LASSERT(!tx->tx_queued); /* mustn't be queued for sending */ - LASSERT(!tx->tx_sending); /* mustn't be awaiting sent callback */ - LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */ - LASSERT(tx->tx_pool); - - kiblnd_unmap_tx(tx); - - /* tx may have up to 2 lnet msgs to finalise */ - lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - rc = tx->tx_status; - - if (tx->tx_conn) { - LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni); - - kiblnd_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nwrq = 0; - tx->tx_status = 0; - - kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); - - /* delay finalize until my descs have been freed */ - for (i = 0; i < 2; i++) { - if (!lntmsg[i]) - continue; - - lnet_finalize(ni, lntmsg[i], rc); - } -} - -void -kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int status) -{ - struct kib_tx *tx; - - while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct kib_tx, tx_list); - - list_del(&tx->tx_list); - /* complete now */ - tx->tx_waiting = 0; - tx->tx_status = status; - kiblnd_tx_done(ni, tx); - } -} - -static struct kib_tx * -kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) -{ - struct kib_net *net = (struct kib_net *)ni->ni_data; - struct list_head *node; - struct kib_tx *tx; - struct kib_tx_poolset *tps; - - tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; - node = kiblnd_pool_alloc_node(&tps->tps_poolset); - if (!node) - return NULL; - tx = list_entry(node, struct kib_tx, tx_list); - - LASSERT(!tx->tx_nwrq); - LASSERT(!tx->tx_queued); - LASSERT(!tx->tx_sending); - LASSERT(!tx->tx_waiting); - LASSERT(!tx->tx_status); - LASSERT(!tx->tx_conn); - LASSERT(!tx->tx_lntmsg[0]); - LASSERT(!tx->tx_lntmsg[1]); - LASSERT(!tx->tx_nfrags); - - return tx; -} - -static void -kiblnd_drop_rx(struct kib_rx *rx) -{ - struct kib_conn *conn = rx->rx_conn; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; - - spin_lock_irqsave(&sched->ibs_lock, flags); - LASSERT(conn->ibc_nrx > 0); - conn->ibc_nrx--; - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - kiblnd_conn_decref(conn); -} - -int -kiblnd_post_rx(struct kib_rx *rx, int credit) -{ - struct kib_conn *conn = rx->rx_conn; - struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; - struct ib_recv_wr *bad_wrq = NULL; - int rc; - - LASSERT(net); - LASSERT(!in_interrupt()); - LASSERT(credit == IBLND_POSTRX_NO_CREDIT || - credit == IBLND_POSTRX_PEER_CREDIT || - credit == IBLND_POSTRX_RSRVD_CREDIT); - - rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; - rx->rx_sge.addr = rx->rx_msgaddr; - rx->rx_sge.length = IBLND_MSG_SIZE; - - rx->rx_wrq.next = NULL; - rx->rx_wrq.sg_list = &rx->rx_sge; - rx->rx_wrq.num_sge = 1; - rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); - - LASSERT(conn->ibc_state >= IBLND_CONN_INIT); - LASSERT(rx->rx_nob >= 0); /* not posted */ - - if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { - kiblnd_drop_rx(rx); /* No more posts for this rx */ - return 0; - } - - rx->rx_nob = -1; /* flag posted */ - - /* NB: need an extra reference after ib_post_recv because we don't - * own this rx (and rx::rx_conn) anymore, LU-5678. - */ - kiblnd_conn_addref(conn); - rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); - if (unlikely(rc)) { - CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); - rx->rx_nob = 0; - } - - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ - goto out; - - if (unlikely(rc)) { - kiblnd_close_conn(conn, rc); - kiblnd_drop_rx(rx); /* No more posts for this rx */ - goto out; - } - - if (credit == IBLND_POSTRX_NO_CREDIT) - goto out; - - spin_lock(&conn->ibc_lock); - if (credit == IBLND_POSTRX_PEER_CREDIT) - conn->ibc_outstanding_credits++; - else - conn->ibc_reserved_credits++; - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - -out: - kiblnd_conn_decref(conn); - return rc; -} - -static struct kib_tx * -kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, __u64 cookie) -{ - struct list_head *tmp; - - list_for_each(tmp, &conn->ibc_active_txs) { - struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); - - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_sending || tx->tx_waiting); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_waiting && - tx->tx_msg->ibm_type == txtype) - return tx; - - CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", - tx->tx_waiting ? "" : "NOT ", - tx->tx_msg->ibm_type, txtype); - } - return NULL; -} - -static void -kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, __u64 cookie) -{ - struct kib_tx *tx; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int idle; - - spin_lock(&conn->ibc_lock); - - tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); - if (!tx) { - spin_unlock(&conn->ibc_lock); - - CWARN("Unmatched completion type %x cookie %#llx from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_close_conn(conn, -EPROTO); - return; - } - - if (!tx->tx_status) { /* success so far */ - if (status < 0) /* failed? */ - tx->tx_status = status; - else if (txtype == IBLND_MSG_GET_REQ) - lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); - } - - tx->tx_waiting = 0; - - idle = !tx->tx_queued && !tx->tx_sending; - if (idle) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (idle) - kiblnd_tx_done(ni, tx); -} - -static void -kiblnd_send_completion(struct kib_conn *conn, int type, int status, __u64 cookie) -{ - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - - if (!tx) { - CERROR("Can't get tx for completion %x for %s\n", - type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - tx->tx_msg->ibm_u.completion.ibcm_status = status; - tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); - - kiblnd_queue_tx(tx, conn); -} - -static void -kiblnd_handle_rx(struct kib_rx *rx) -{ - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int credits = msg->ibm_credits; - struct kib_tx *tx; - int rc = 0; - int rc2; - int post_credit; - - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - CDEBUG(D_NET, "Received %x[%d] from %s\n", - msg->ibm_type, credits, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - if (credits) { - /* Have I received credits that will let me send? */ - spin_lock(&conn->ibc_lock); - - if (conn->ibc_credits + credits > - conn->ibc_queue_depth) { - rc2 = conn->ibc_credits; - spin_unlock(&conn->ibc_lock); - - CERROR("Bad credits from %s: %d + %d > %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - rc2, credits, conn->ibc_queue_depth); - - kiblnd_close_conn(conn, -EPROTO); - kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); - return; - } - - conn->ibc_credits += credits; - - /* This ensures the credit taken by NOOP can be returned */ - if (msg->ibm_type == IBLND_MSG_NOOP && - !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ - conn->ibc_outstanding_credits++; - - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - } - - switch (msg->ibm_type) { - default: - CERROR("Bad IBLND message type %x from %s\n", - msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - post_credit = IBLND_POSTRX_NO_CREDIT; - rc = -EPROTO; - break; - - case IBLND_MSG_NOOP: - if (IBLND_OOB_CAPABLE(conn->ibc_version)) { - post_credit = IBLND_POSTRX_NO_CREDIT; - break; - } - - if (credits) /* credit already posted */ - post_credit = IBLND_POSTRX_NO_CREDIT; - else /* a keepalive NOOP */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_IMMEDIATE: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, - msg->ibm_srcnid, rx, 0); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_PUT_REQ: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, - msg->ibm_srcnid, rx, 1); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_PUT_NAK: - CWARN("PUT_NACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBLND_MSG_PUT_ACK: - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - - spin_lock(&conn->ibc_lock); - tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, - msg->ibm_u.putack.ibpam_src_cookie); - if (tx) - list_del(&tx->tx_list); - spin_unlock(&conn->ibc_lock); - - if (!tx) { - CERROR("Unmatched PUT_ACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - } - - LASSERT(tx->tx_waiting); - /* - * CAVEAT EMPTOR: I could be racing with tx_complete, but... - * (a) I can overwrite tx_msg since my peer has received it! - * (b) tx_waiting set tells tx_complete() it's not done. - */ - tx->tx_nwrq = 0; /* overwrite PUT_REQ */ - - rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, - kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), - &msg->ibm_u.putack.ibpam_rd, - msg->ibm_u.putack.ibpam_dst_cookie); - if (rc2 < 0) - CERROR("Can't setup rdma for PUT to %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); - - spin_lock(&conn->ibc_lock); - tx->tx_waiting = 0; /* clear waiting and queue atomically */ - kiblnd_queue_tx_locked(tx, conn); - spin_unlock(&conn->ibc_lock); - break; - - case IBLND_MSG_PUT_DONE: - post_credit = IBLND_POSTRX_PEER_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBLND_MSG_GET_REQ: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, - msg->ibm_srcnid, rx, 1); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_GET_DONE: - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - } - - if (rc < 0) /* protocol error */ - kiblnd_close_conn(conn, rc); - - if (post_credit != IBLND_POSTRX_DONT_POST) - kiblnd_post_rx(rx, post_credit); -} - -static void -kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) -{ - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_net *net = ni->ni_data; - int rc; - int err = -EIO; - - LASSERT(net); - LASSERT(rx->rx_nob < 0); /* was posted */ - rx->rx_nob = 0; /* isn't now */ - - if (conn->ibc_state > IBLND_CONN_ESTABLISHED) - goto ignore; - - if (status != IB_WC_SUCCESS) { - CNETERR("Rx from %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), status); - goto failed; - } - - LASSERT(nob >= 0); - rx->rx_nob = nob; - - rc = kiblnd_unpack_msg(msg, rx->rx_nob); - if (rc) { - CERROR("Error %d unpacking rx from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || - msg->ibm_dstnid != ni->ni_nid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != net->ibn_incarnation) { - CERROR("Stale rx from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - err = -ESTALE; - goto failed; - } - - /* set time last known alive */ - kiblnd_peer_alive(conn->ibc_peer); - - /* racing with connection establishment/teardown! */ - - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - unsigned long flags; - - write_lock_irqsave(g_lock, flags); - /* must check holding global lock to eliminate race */ - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); - write_unlock_irqrestore(g_lock, flags); - return; - } - write_unlock_irqrestore(g_lock, flags); - } - kiblnd_handle_rx(rx); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kiblnd_close_conn(conn, err); - ignore: - kiblnd_drop_rx(rx); /* Don't re-post rx. */ -} - -static struct page * -kiblnd_kvaddr_to_page(unsigned long vaddr) -{ - struct page *page; - - if (is_vmalloc_addr((void *)vaddr)) { - page = vmalloc_to_page((void *)vaddr); - LASSERT(page); - return page; - } -#ifdef CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page(vaddr); - LASSERT(page); - return page; -} - -static int -kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, struct kib_rdma_desc *rd, __u32 nob) -{ - struct kib_hca_dev *hdev; - struct kib_fmr_poolset *fps; - int cpt; - int rc; - - LASSERT(tx->tx_pool); - LASSERT(tx->tx_pool->tpo_pool.po_owner); - - hdev = tx->tx_pool->tpo_hdev; - cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; - - fps = net->ibn_fmr_ps[cpt]; - rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr); - if (rc) { - CERROR("Can't map %u bytes: %d\n", nob, rc); - return rc; - } - - /* - * If rd is not tx_rd, it's going to get sent to a peer, who will need - * the rkey - */ - rd->rd_key = tx->fmr.fmr_key; - rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; - rd->rd_frags[0].rf_nob = nob; - rd->rd_nfrags = 1; - - return 0; -} - -static void kiblnd_unmap_tx(struct kib_tx *tx) -{ - if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd) - kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); - - if (tx->tx_nfrags) { - kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, - tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); - tx->tx_nfrags = 0; - } -} - -static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nfrags) -{ - struct kib_net *net = ni->ni_data; - struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; - __u32 nob; - int i; - - /* - * If rd is not tx_rd, it's going to get sent to a peer and I'm the - * RDMA sink - */ - tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - tx->tx_nfrags = nfrags; - - rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags, - tx->tx_nfrags, tx->tx_dmadir); - - for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { - rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( - hdev->ibh_ibdev, &tx->tx_frags[i]); - rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( - hdev->ibh_ibdev, &tx->tx_frags[i]); - nob += rd->rd_frags[i].rf_nob; - } - - if (net->ibn_fmr_ps) - return kiblnd_fmr_map_tx(net, tx, rd, nob); - - return -EINVAL; -} - -static int -kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, unsigned int niov, - const struct kvec *iov, int offset, int nob) -{ - struct kib_net *net = ni->ni_data; - struct page *page; - struct scatterlist *sg; - unsigned long vaddr; - int fragnob; - int page_offset; - - LASSERT(nob > 0); - LASSERT(niov > 0); - LASSERT(net); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT(niov > 0); - } - - sg = tx->tx_frags; - do { - LASSERT(niov > 0); - - vaddr = ((unsigned long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = kiblnd_kvaddr_to_page(vaddr); - if (!page) { - CERROR("Can't find page\n"); - return -EFAULT; - } - - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - - sg_set_page(sg, page, fragnob, page_offset); - sg = sg_next(sg); - if (!sg) { - CERROR("lacking enough sg entries to map tx\n"); - return -EFAULT; - } - - if (offset + fragnob < iov->iov_len) { - offset += fragnob; - } else { - offset = 0; - iov++; - niov--; - } - nob -= fragnob; - } while (nob > 0); - - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); -} - -static int -kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nkiov, - const struct bio_vec *kiov, int offset, int nob) -{ - struct kib_net *net = ni->ni_data; - struct scatterlist *sg; - int fragnob; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT(nob > 0); - LASSERT(nkiov > 0); - LASSERT(net); - - while (offset >= kiov->bv_len) { - offset -= kiov->bv_len; - nkiov--; - kiov++; - LASSERT(nkiov > 0); - } - - sg = tx->tx_frags; - do { - LASSERT(nkiov > 0); - - fragnob = min((int)(kiov->bv_len - offset), nob); - - sg_set_page(sg, kiov->bv_page, fragnob, - kiov->bv_offset + offset); - sg = sg_next(sg); - if (!sg) { - CERROR("lacking enough sg entries to map tx\n"); - return -EFAULT; - } - - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); - - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); -} - -static int -kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) - __must_hold(&conn->ibc_lock) -{ - struct kib_msg *msg = tx->tx_msg; - struct kib_peer *peer = conn->ibc_peer; - struct lnet_ni *ni = peer->ibp_ni; - int ver = conn->ibc_version; - int rc; - int done; - - LASSERT(tx->tx_queued); - /* We rely on this for QP sizing */ - LASSERT(tx->tx_nwrq > 0); - - LASSERT(!credit || credit == 1); - LASSERT(conn->ibc_outstanding_credits >= 0); - LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); - LASSERT(conn->ibc_credits >= 0); - LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); - - if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) { - /* tx completions outstanding... */ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - if (credit && !conn->ibc_credits) { /* no credits */ - CDEBUG(D_NET, "%s: no credits\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - if (credit && !IBLND_OOB_CAPABLE(ver) && - conn->ibc_credits == 1 && /* last credit reserved */ - msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - /* NB don't drop ibc_lock before bumping tx_sending */ - list_del(&tx->tx_list); - tx->tx_queued = 0; - - if (msg->ibm_type == IBLND_MSG_NOOP && - (!kiblnd_need_noop(conn) || /* redundant NOOP */ - (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ - conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { - /* - * OK to drop when posted enough NOOPs, since - * kiblnd_check_sends_locked will queue NOOP again when - * posted NOOPs complete - */ - spin_unlock(&conn->ibc_lock); - kiblnd_tx_done(peer->ibp_ni, tx); - spin_lock(&conn->ibc_lock); - CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_noops_posted); - return 0; - } - - kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits, - peer->ibp_nid, conn->ibc_incarnation); - - conn->ibc_credits -= credit; - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted++; - - /* - * CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA - * PUT. If so, it was first queued here as a PUT_REQ, sent and - * stashed on ibc_active_txs, matched by an incoming PUT_ACK, - * and then re-queued here. It's (just) possible that - * tx_sending is non-zero if we've not done the tx_complete() - * from the first send; hence the ++ rather than = below. - */ - tx->tx_sending++; - list_add(&tx->tx_list, &conn->ibc_active_txs); - - /* I'm still holding ibc_lock! */ - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { - rc = -ECONNABORTED; - } else if (tx->tx_pool->tpo_pool.po_failed || - conn->ibc_hdev != tx->tx_pool->tpo_hdev) { - /* close_conn will launch failover */ - rc = -ENETDOWN; - } else { - struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; - struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; - struct ib_send_wr *wrq = &tx->tx_wrq[0].wr; - - if (frd) { - if (!frd->frd_valid) { - wrq = &frd->frd_inv_wr; - wrq->next = &frd->frd_fastreg_wr.wr; - } else { - wrq = &frd->frd_fastreg_wr.wr; - } - frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; - } - - LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), - "bad wr_id %llx, opc %d, flags %d, peer: %s\n", - bad->wr_id, bad->opcode, bad->send_flags, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - bad = NULL; - rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad); - } - - conn->ibc_last_send = jiffies; - - if (!rc) - return 0; - - /* - * NB credits are transferred in the actual - * message, which can only be the last work item - */ - conn->ibc_credits += credit; - conn->ibc_outstanding_credits += msg->ibm_credits; - conn->ibc_nsends_posted--; - if (msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted--; - - tx->tx_status = rc; - tx->tx_waiting = 0; - tx->tx_sending--; - - done = !tx->tx_sending; - if (done) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) - CERROR("Error %d posting transmit to %s\n", - rc, libcfs_nid2str(peer->ibp_nid)); - else - CDEBUG(D_NET, "Error %d posting transmit to %s\n", - rc, libcfs_nid2str(peer->ibp_nid)); - - kiblnd_close_conn(conn, rc); - - if (done) - kiblnd_tx_done(peer->ibp_ni, tx); - - spin_lock(&conn->ibc_lock); - - return -EIO; -} - -static void -kiblnd_check_sends_locked(struct kib_conn *conn) -{ - int ver = conn->ibc_version; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx; - - /* Don't send anything until after the connection is established */ - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - CDEBUG(D_NET, "%s too soon\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni)); - LASSERT(!IBLND_OOB_CAPABLE(ver) || - conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); - LASSERT(conn->ibc_reserved_credits >= 0); - - while (conn->ibc_reserved_credits > 0 && - !list_empty(&conn->ibc_tx_queue_rsrvd)) { - tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - struct kib_tx, tx_list); - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); - conn->ibc_reserved_credits--; - } - - if (kiblnd_need_noop(conn)) { - spin_unlock(&conn->ibc_lock); - - tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - if (tx) - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); - - spin_lock(&conn->ibc_lock); - if (tx) - kiblnd_queue_tx_locked(tx, conn); - } - - for (;;) { - int credit; - - if (!list_empty(&conn->ibc_tx_queue_nocred)) { - credit = 0; - tx = list_entry(conn->ibc_tx_queue_nocred.next, - struct kib_tx, tx_list); - } else if (!list_empty(&conn->ibc_tx_noops)) { - LASSERT(!IBLND_OOB_CAPABLE(ver)); - credit = 1; - tx = list_entry(conn->ibc_tx_noops.next, - struct kib_tx, tx_list); - } else if (!list_empty(&conn->ibc_tx_queue)) { - credit = 1; - tx = list_entry(conn->ibc_tx_queue.next, - struct kib_tx, tx_list); - } else { - break; - } - - if (kiblnd_post_tx_locked(conn, tx, credit)) - break; - } -} - -static void -kiblnd_tx_complete(struct kib_tx *tx, int status) -{ - int failed = (status != IB_WC_SUCCESS); - struct kib_conn *conn = tx->tx_conn; - int idle; - - LASSERT(tx->tx_sending > 0); - - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) - CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); - - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } - - spin_lock(&conn->ibc_lock); - - /* - * I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. - */ - tx->tx_sending--; - conn->ibc_nsends_posted--; - if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted--; - - if (failed) { - tx->tx_waiting = 0; /* don't wait for peer */ - tx->tx_status = -EIO; - } - - idle = !tx->tx_sending && /* This is the final callback */ - !tx->tx_waiting && /* Not waiting for peer */ - !tx->tx_queued; /* Not re-queued (PUT_DONE) */ - if (idle) - list_del(&tx->tx_list); - - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - if (idle) - kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); -} - -static void -kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, - int body_nob) -{ - struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; - struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; - struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; - int nob = offsetof(struct kib_msg, ibm_u) + body_nob; - - LASSERT(tx->tx_nwrq >= 0); - LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); - LASSERT(nob <= IBLND_MSG_SIZE); - - kiblnd_init_msg(tx->tx_msg, type, body_nob); - - sge->lkey = hdev->ibh_pd->local_dma_lkey; - sge->addr = tx->tx_msgaddr; - sge->length = nob; - - memset(wrq, 0, sizeof(*wrq)); - - wrq->wr.next = NULL; - wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); - wrq->wr.sg_list = sge; - wrq->wr.num_sge = 1; - wrq->wr.opcode = IB_WR_SEND; - wrq->wr.send_flags = IB_SEND_SIGNALED; - - tx->tx_nwrq++; -} - -static int -kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie) -{ - struct kib_msg *ibmsg = tx->tx_msg; - struct kib_rdma_desc *srcrd = tx->tx_rd; - struct ib_sge *sge = &tx->tx_sge[0]; - struct ib_rdma_wr *wrq, *next; - int rc = resid; - int srcidx = 0; - int dstidx = 0; - int wrknob; - - LASSERT(!in_interrupt()); - LASSERT(!tx->tx_nwrq); - LASSERT(type == IBLND_MSG_GET_DONE || - type == IBLND_MSG_PUT_DONE); - - if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) { - CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_max_frags << PAGE_SHIFT, - kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd)); - rc = -EMSGSIZE; - goto too_big; - } - - while (resid > 0) { - if (srcidx >= srcrd->rd_nfrags) { - CERROR("Src buffer exhausted: %d frags\n", srcidx); - rc = -EPROTO; - break; - } - - if (dstidx == dstrd->rd_nfrags) { - CERROR("Dst buffer exhausted: %d frags\n", dstidx); - rc = -EPROTO; - break; - } - - if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) { - CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - IBLND_MAX_RDMA_FRAGS, - srcidx, srcrd->rd_nfrags, - dstidx, dstrd->rd_nfrags); - rc = -EMSGSIZE; - break; - } - - wrknob = min3(kiblnd_rd_frag_size(srcrd, srcidx), - kiblnd_rd_frag_size(dstrd, dstidx), - (__u32)resid); - - sge = &tx->tx_sge[tx->tx_nwrq]; - sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); - sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); - sge->length = wrknob; - - wrq = &tx->tx_wrq[tx->tx_nwrq]; - next = wrq + 1; - - wrq->wr.next = &next->wr; - wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); - wrq->wr.sg_list = sge; - wrq->wr.num_sge = 1; - wrq->wr.opcode = IB_WR_RDMA_WRITE; - wrq->wr.send_flags = 0; - - wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx); - wrq->rkey = kiblnd_rd_frag_key(dstrd, dstidx); - - srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob); - dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob); - - resid -= wrknob; - - tx->tx_nwrq++; - wrq++; - sge++; - } -too_big: - if (rc < 0) /* no RDMA if completing with failure */ - tx->tx_nwrq = 0; - - ibmsg->ibm_u.completion.ibcm_status = rc; - ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; - kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, - type, sizeof(struct kib_completion_msg)); - - return rc; -} - -static void -kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) -{ - struct list_head *q; - - LASSERT(tx->tx_nwrq > 0); /* work items set up */ - LASSERT(!tx->tx_queued); /* not queued for sending already */ - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - tx->tx_queued = 1; - tx->tx_deadline = jiffies + - msecs_to_jiffies(*kiblnd_tunables.kib_timeout * - MSEC_PER_SEC); - - if (!tx->tx_conn) { - kiblnd_conn_addref(conn); - tx->tx_conn = conn; - LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); - } else { - /* PUT_DONE first attached to conn as a PUT_REQ */ - LASSERT(tx->tx_conn == conn); - LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); - } - - switch (tx->tx_msg->ibm_type) { - default: - LBUG(); - - case IBLND_MSG_PUT_REQ: - case IBLND_MSG_GET_REQ: - q = &conn->ibc_tx_queue_rsrvd; - break; - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_ACK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - q = &conn->ibc_tx_queue_nocred; - break; - - case IBLND_MSG_NOOP: - if (IBLND_OOB_CAPABLE(conn->ibc_version)) - q = &conn->ibc_tx_queue_nocred; - else - q = &conn->ibc_tx_noops; - break; - - case IBLND_MSG_IMMEDIATE: - q = &conn->ibc_tx_queue; - break; - } - - list_add_tail(&tx->tx_list, q); -} - -static void -kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) -{ - spin_lock(&conn->ibc_lock); - kiblnd_queue_tx_locked(tx, conn); - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); -} - -static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, - struct sockaddr_in *srcaddr, - struct sockaddr_in *dstaddr, - int timeout_ms) -{ - unsigned short port; - int rc; - - /* allow the port to be reused */ - rc = rdma_set_reuseaddr(cmid, 1); - if (rc) { - CERROR("Unable to set reuse on cmid: %d\n", rc); - return rc; - } - - /* look for a free privileged port */ - for (port = PROT_SOCK - 1; port > 0; port--) { - srcaddr->sin_port = htons(port); - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)srcaddr, - (struct sockaddr *)dstaddr, - timeout_ms); - if (!rc) { - CDEBUG(D_NET, "bound to port %hu\n", port); - return 0; - } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { - CDEBUG(D_NET, "bind to port %hu failed: %d\n", - port, rc); - } else { - return rc; - } - } - - CERROR("Failed to bind to a free privileged port\n"); - return rc; -} - -static void -kiblnd_connect_peer(struct kib_peer *peer) -{ - struct rdma_cm_id *cmid; - struct kib_dev *dev; - struct kib_net *net = peer->ibp_ni->ni_data; - struct sockaddr_in srcaddr; - struct sockaddr_in dstaddr; - int rc; - - LASSERT(net); - LASSERT(peer->ibp_connecting > 0); - - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP, - IB_QPT_RC); - - if (IS_ERR(cmid)) { - CERROR("Can't create CMID for %s: %ld\n", - libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); - rc = PTR_ERR(cmid); - goto failed; - } - - dev = net->ibn_dev; - memset(&srcaddr, 0, sizeof(srcaddr)); - srcaddr.sin_family = AF_INET; - srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); - - memset(&dstaddr, 0, sizeof(dstaddr)); - dstaddr.sin_family = AF_INET; - dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); - dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); - - kiblnd_peer_addref(peer); /* cmid's ref */ - - if (*kiblnd_tunables.kib_use_priv_port) { - rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } else { - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } - if (rc) { - /* Can't initiate address resolution: */ - CERROR("Can't resolve addr for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - goto failed2; - } - - return; - - failed2: - kiblnd_peer_connect_failed(peer, 1, rc); - kiblnd_peer_decref(peer); /* cmid's ref */ - rdma_destroy_id(cmid); - return; - failed: - kiblnd_peer_connect_failed(peer, 1, rc); -} - -bool -kiblnd_reconnect_peer(struct kib_peer *peer) -{ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - char *reason = NULL; - struct list_head txs; - unsigned long flags; - - INIT_LIST_HEAD(&txs); - - write_lock_irqsave(glock, flags); - if (!peer->ibp_reconnecting) { - if (peer->ibp_accepting) - reason = "accepting"; - else if (peer->ibp_connecting) - reason = "connecting"; - else if (!list_empty(&peer->ibp_conns)) - reason = "connected"; - else /* connected then closed */ - reason = "closed"; - - goto no_reconnect; - } - - LASSERT(!peer->ibp_accepting && !peer->ibp_connecting && - list_empty(&peer->ibp_conns)); - peer->ibp_reconnecting--; - - if (!kiblnd_peer_active(peer)) { - list_splice_init(&peer->ibp_tx_queue, &txs); - reason = "unlinked"; - goto no_reconnect; - } - - peer->ibp_connecting++; - peer->ibp_reconnected++; - write_unlock_irqrestore(glock, flags); - - kiblnd_connect_peer(peer); - return true; - -no_reconnect: - write_unlock_irqrestore(glock, flags); - - CWARN("Abort reconnection of %s: %s\n", - libcfs_nid2str(peer->ibp_nid), reason); - kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED); - return false; -} - -void -kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) -{ - struct kib_peer *peer; - struct kib_peer *peer2; - struct kib_conn *conn; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - unsigned long flags; - int rc; - int i; - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - - /* - * If I get here, I've committed to send, so I complete the tx with - * failure on any problems - */ - LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */ - LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */ - - /* - * First time, just use a read lock since I expect to find my peer - * connected - */ - read_lock_irqsave(g_lock, flags); - - peer = kiblnd_find_peer_locked(nid); - if (peer && !list_empty(&peer->ibp_conns)) { - /* Found a peer with an established connection */ - conn = kiblnd_get_conn_locked(peer); - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - read_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - return; - } - - read_unlock(g_lock); - /* Re-try with a write lock */ - write_lock(g_lock); - - peer = kiblnd_find_peer_locked(nid); - if (peer) { - if (list_empty(&peer->ibp_conns)) { - /* found a peer, but it's still connecting... */ - LASSERT(kiblnd_peer_connecting(peer)); - if (tx) - list_add_tail(&tx->tx_list, - &peer->ibp_tx_queue); - write_unlock_irqrestore(g_lock, flags); - } else { - conn = kiblnd_get_conn_locked(peer); - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - write_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } - return; - } - - write_unlock_irqrestore(g_lock, flags); - - /* Allocate a peer ready to add to the peer table and retry */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc) { - CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); - if (tx) { - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kiblnd_tx_done(ni, tx); - } - return; - } - - write_lock_irqsave(g_lock, flags); - - peer2 = kiblnd_find_peer_locked(nid); - if (peer2) { - if (list_empty(&peer2->ibp_conns)) { - /* found a peer, but it's still connecting... */ - LASSERT(kiblnd_peer_connecting(peer2)); - if (tx) - list_add_tail(&tx->tx_list, - &peer2->ibp_tx_queue); - write_unlock_irqrestore(g_lock, flags); - } else { - conn = kiblnd_get_conn_locked(peer2); - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - write_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } - - kiblnd_peer_decref(peer); - return; - } - - /* Brand new peer */ - LASSERT(!peer->ibp_connecting); - tunables = &peer->ibp_ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - peer->ibp_connecting = tunables->lnd_conns_per_peer; - - /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(!((struct kib_net *)ni->ni_data)->ibn_shutdown); - - if (tx) - list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); - - kiblnd_peer_addref(peer); - list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); - - write_unlock_irqrestore(g_lock, flags); - - for (i = 0; i < tunables->lnd_conns_per_peer; i++) - kiblnd_connect_peer(peer); - kiblnd_peer_decref(peer); -} - -int -kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) -{ - struct lnet_hdr *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - struct lnet_process_id target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct kvec *payload_iov = lntmsg->msg_iov; - struct bio_vec *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - struct iov_iter from; - struct kib_msg *ibmsg; - struct kib_rdma_desc *rd; - struct kib_tx *tx; - int nob; - int rc; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT(!payload_nob || payload_niov > 0); - LASSERT(payload_niov <= LNET_MAX_IOV); - - /* Thread context */ - LASSERT(!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT(!(payload_kiov && payload_iov)); - - if (payload_kiov) - iov_iter_bvec(&from, ITER_BVEC | WRITE, - payload_kiov, payload_niov, - payload_nob + payload_offset); - else - iov_iter_kvec(&from, ITER_KVEC | WRITE, - payload_iov, payload_niov, - payload_nob + payload_offset); - - iov_iter_advance(&from, payload_offset); - - switch (type) { - default: - LBUG(); - return -EIO; - - case LNET_MSG_ACK: - LASSERT(!payload_nob); - break; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - /* is the REPLY message too small for RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); - if (nob <= IBLND_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't allocate txd for GET to %s\n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - rd = &ibmsg->ibm_u.get.ibgm_rd; - if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV)) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); - if (rc) { - CERROR("Can't setup GET sink for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); - ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; - ibmsg->ibm_u.get.ibgm_hdr = *hdr; - - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); - - tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); - if (!tx->tx_lntmsg[1]) { - CERROR("Can't create reply for GET -> %s\n", - libcfs_nid2str(target.nid)); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ - tx->tx_waiting = 1; /* waiting for GET_DONE */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob <= IBLND_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't allocate %s txd for %s\n", - type == LNET_MSG_PUT ? "PUT" : "REPLY", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - if (!payload_kiov) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc) { - CERROR("Can't setup PUT src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; - ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(struct kib_putreq_msg)); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; - } - - /* send IMMEDIATE */ - - LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBLND_MSG_SIZE); - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't send %d to %s: tx descs exhausted\n", - type, libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - rc = copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, payload_nob, - &from); - if (rc != payload_nob) { - kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); - return -EFAULT; - } - - nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; -} - -static void -kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) -{ - struct lnet_process_id target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct kvec *iov = lntmsg->msg_iov; - struct bio_vec *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - struct kib_tx *tx; - int rc; - - tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); - if (!tx) { - CERROR("Can't get tx for REPLY to %s\n", - libcfs_nid2str(target.nid)); - goto failed_0; - } - - if (!nob) - rc = 0; - else if (!kiov) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - niov, iov, offset, nob); - else - rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, - niov, kiov, offset, nob); - - if (rc) { - CERROR("Can't setup GET src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - rc = kiblnd_init_rdma(rx->rx_conn, tx, - IBLND_MSG_GET_DONE, nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - if (!nob) { - /* No RDMA: local completion may happen now! */ - lnet_finalize(ni, lntmsg, 0); - } else { - /* RDMA: lnet_finalize(lntmsg) when it completes */ - tx->tx_lntmsg[0] = lntmsg; - } - - kiblnd_queue_tx(tx, rx->rx_conn); - return; - - failed_1: - kiblnd_tx_done(ni, tx); - failed_0: - lnet_finalize(ni, lntmsg, -EIO); -} - -int -kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen) -{ - struct kib_rx *rx = private; - struct kib_msg *rxmsg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct kib_tx *tx; - int nob; - int post_credit = IBLND_POSTRX_PEER_CREDIT; - int rc = 0; - - LASSERT(iov_iter_count(to) <= rlen); - LASSERT(!in_interrupt()); - /* Either all pages or all vaddrs */ - - switch (rxmsg->ibm_type) { - default: - LBUG(); - - case IBLND_MSG_IMMEDIATE: - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); - if (nob > rx->rx_nob) { - CERROR("Immediate message from %s too big: %d(%d)\n", - libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), - nob, rx->rx_nob); - rc = -EPROTO; - break; - } - - rc = copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, rlen, - to); - if (rc != rlen) { - rc = -EFAULT; - break; - } - - rc = 0; - lnet_finalize(ni, lntmsg, 0); - break; - - case IBLND_MSG_PUT_REQ: { - struct kib_msg *txmsg; - struct kib_rdma_desc *rd; - - if (!iov_iter_count(to)) { - lnet_finalize(ni, lntmsg, 0); - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - if (!tx) { - CERROR("Can't allocate tx for %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* Not replying will break the connection */ - rc = -ENOMEM; - break; - } - - txmsg = tx->tx_msg; - rd = &txmsg->ibm_u.putack.ibpam_rd; - if (!(to->type & ITER_BVEC)) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - to->nr_segs, to->kvec, - to->iov_offset, - iov_iter_count(to)); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - to->nr_segs, to->bvec, - to->iov_offset, - iov_iter_count(to)); - if (rc) { - CERROR("Can't setup PUT sink for %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kiblnd_tx_done(ni, tx); - /* tell peer it's over */ - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); - txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; - txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; - - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_DONE */ - kiblnd_queue_tx(tx, conn); - - /* reposted buffer reserved for PUT_DONE */ - post_credit = IBLND_POSTRX_NO_CREDIT; - break; - } - - case IBLND_MSG_GET_REQ: - if (lntmsg) { - /* Optimized GET; RDMA lntmsg's payload */ - kiblnd_reply(ni, rx, lntmsg); - } else { - /* GET didn't match anything */ - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, - -ENODATA, - rxmsg->ibm_u.get.ibgm_cookie); - } - break; - } - - kiblnd_post_rx(rx, post_credit); - return rc; -} - -int -kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = kthread_run(fn, arg, "%s", name); - - if (IS_ERR(task)) - return PTR_ERR(task); - - atomic_inc(&kiblnd_data.kib_nthreads); - return 0; -} - -static void -kiblnd_thread_fini(void) -{ - atomic_dec(&kiblnd_data.kib_nthreads); -} - -static void -kiblnd_peer_alive(struct kib_peer *peer) -{ - /* This is racy, but everyone's only writing jiffies */ - peer->ibp_last_alive = jiffies; - mb(); -} - -static void -kiblnd_peer_notify(struct kib_peer *peer) -{ - int error = 0; - unsigned long last_alive = 0; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (kiblnd_peer_idle(peer) && peer->ibp_error) { - error = peer->ibp_error; - peer->ibp_error = 0; - - last_alive = peer->ibp_last_alive; - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - if (error) - lnet_notify(peer->ibp_ni, - peer->ibp_nid, 0, last_alive); -} - -void -kiblnd_close_conn_locked(struct kib_conn *conn, int error) -{ - /* - * This just does the immediate housekeeping. 'error' is zero for a - * normal shutdown which can happen only after the connection has been - * established. If the connection is established, schedule the - * connection to be finished off by the connd. Otherwise the connd is - * already dealing with it (either to set it up or tear it down). - * Caller holds kib_global_lock exclusively in irq context - */ - struct kib_peer *peer = conn->ibc_peer; - struct kib_dev *dev; - unsigned long flags; - - LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - if (error && !conn->ibc_comms_error) - conn->ibc_comms_error = error; - - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) - return; /* already being handled */ - - if (!error && - list_empty(&conn->ibc_tx_noops) && - list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_rsrvd) && - list_empty(&conn->ibc_tx_queue_nocred) && - list_empty(&conn->ibc_active_txs)) { - CDEBUG(D_NET, "closing conn to %s\n", - libcfs_nid2str(peer->ibp_nid)); - } else { - CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", - libcfs_nid2str(peer->ibp_nid), error, - list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", - list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", - list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", - list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", - list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); - } - - dev = ((struct kib_net *)peer->ibp_ni->ni_data)->ibn_dev; - if (peer->ibp_next_conn == conn) - /* clear next_conn so it won't be used */ - peer->ibp_next_conn = NULL; - list_del(&conn->ibc_list); - /* connd (see below) takes over ibc_list's ref */ - - if (list_empty(&peer->ibp_conns) && /* no more conns */ - kiblnd_peer_active(peer)) { /* still in peer table */ - kiblnd_unlink_peer_locked(peer); - - /* set/clear error on last conn */ - peer->ibp_error = conn->ibc_comms_error; - } - - kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); - - if (error && - kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - wake_up(&kiblnd_data.kib_failover_waitq); - } - - spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); - - list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); - wake_up(&kiblnd_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); -} - -void -kiblnd_close_conn(struct kib_conn *conn, int error) -{ - unsigned long flags; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - kiblnd_close_conn_locked(conn, error); - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); -} - -static void -kiblnd_handle_early_rxs(struct kib_conn *conn) -{ - unsigned long flags; - struct kib_rx *rx; - - LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - while (!list_empty(&conn->ibc_early_rxs)) { - rx = list_entry(conn->ibc_early_rxs.next, - struct kib_rx, rx_list); - list_del(&rx->rx_list); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_handle_rx(rx); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - } - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); -} - -static void -kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) -{ - LIST_HEAD(zombies); - struct list_head *tmp; - struct list_head *nxt; - struct kib_tx *tx; - - spin_lock(&conn->ibc_lock); - - list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, struct kib_tx, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_waiting || tx->tx_sending); - } else { - LASSERT(tx->tx_queued); - } - - tx->tx_status = -ECONNABORTED; - tx->tx_waiting = 0; - - if (!tx->tx_sending) { - tx->tx_queued = 0; - list_del(&tx->tx_list); - list_add(&tx->tx_list, &zombies); - } - } - - spin_unlock(&conn->ibc_lock); - - kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED); -} - -static void -kiblnd_finalise_conn(struct kib_conn *conn) -{ - LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state > IBLND_CONN_INIT); - - kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); - - /* - * abort_receives moves QP state to IB_QPS_ERR. This is only required - * for connections that didn't get as far as being connected, because - * rdma_disconnect() does this for free. - */ - kiblnd_abort_receives(conn); - - /* - * Complete all tx descs not waiting for sends to complete. - * NB we should be safe from RDMA now that the QP has changed state - */ - kiblnd_abort_txs(conn, &conn->ibc_tx_noops); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); - kiblnd_abort_txs(conn, &conn->ibc_active_txs); - - kiblnd_handle_early_rxs(conn); -} - -static void -kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error) -{ - LIST_HEAD(zombies); - unsigned long flags; - - LASSERT(error); - LASSERT(!in_interrupt()); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (active) { - LASSERT(peer->ibp_connecting > 0); - peer->ibp_connecting--; - } else { - LASSERT(peer->ibp_accepting > 0); - peer->ibp_accepting--; - } - - if (kiblnd_peer_connecting(peer)) { - /* another connection attempt under way... */ - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, - flags); - return; - } - - peer->ibp_reconnected = 0; - if (list_empty(&peer->ibp_conns)) { - /* Take peer's blocked transmits to complete with error */ - list_add(&zombies, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (kiblnd_peer_active(peer)) - kiblnd_unlink_peer_locked(peer); - - peer->ibp_error = error; - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT(list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_peer_notify(peer); - - if (list_empty(&zombies)) - return; - - CNETERR("Deleting messages for %s: connection failed\n", - libcfs_nid2str(peer->ibp_nid)); - - kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); -} - -static void -kiblnd_connreq_done(struct kib_conn *conn, int status) -{ - struct kib_peer *peer = conn->ibc_peer; - struct kib_tx *tx; - struct kib_tx *tmp; - struct list_head txs; - unsigned long flags; - int active; - - active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); - - CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n", - libcfs_nid2str(peer->ibp_nid), active, - conn->ibc_version, status); - - LASSERT(!in_interrupt()); - LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && - peer->ibp_connecting > 0) || - (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && - peer->ibp_accepting > 0)); - - kfree(conn->ibc_connvars); - conn->ibc_connvars = NULL; - - if (status) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer, active, status); - kiblnd_finalise_conn(conn); - return; - } - - /* connection established */ - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - conn->ibc_last_send = jiffies; - kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); - kiblnd_peer_alive(peer); - - /* - * Add conn to peer's list and nuke any dangling conns from a different - * peer instance... - */ - kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ - list_add(&conn->ibc_list, &peer->ibp_conns); - peer->ibp_reconnected = 0; - if (active) - peer->ibp_connecting--; - else - peer->ibp_accepting--; - - if (!peer->ibp_version) { - peer->ibp_version = conn->ibc_version; - peer->ibp_incarnation = conn->ibc_incarnation; - } - - if (peer->ibp_version != conn->ibc_version || - peer->ibp_incarnation != conn->ibc_incarnation) { - kiblnd_close_stale_conns_locked(peer, conn->ibc_version, - conn->ibc_incarnation); - peer->ibp_version = conn->ibc_version; - peer->ibp_incarnation = conn->ibc_incarnation; - } - - /* grab pending txs while I have the lock */ - list_add(&txs, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (!kiblnd_peer_active(peer) || /* peer has been deleted */ - conn->ibc_comms_error) { /* error has happened already */ - struct lnet_ni *ni = peer->ibp_ni; - - /* start to shut down connection */ - kiblnd_close_conn_locked(conn, -ECONNABORTED); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_txlist_done(ni, &txs, -ECONNABORTED); - - return; - } - - /* - * +1 ref for myself, this connection is visible to other threads - * now, refcount of peer:ibp_conns can be released by connection - * close from either a different thread, or the calling of - * kiblnd_check_sends_locked() below. See bz21911 for details. - */ - kiblnd_conn_addref(conn); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* Schedule blocked txs - * Note: if we are running with conns_per_peer > 1, these blocked - * txs will all get scheduled to the first connection which gets - * scheduled. We won't be using round robin on this first batch. - */ - spin_lock(&conn->ibc_lock); - list_for_each_entry_safe(tx, tmp, &txs, tx_list) { - list_del(&tx->tx_list); - - kiblnd_queue_tx_locked(tx, conn); - } - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - /* schedule blocked rxs */ - kiblnd_handle_early_rxs(conn); - - kiblnd_conn_decref(conn); -} - -static void -kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) -{ - int rc; - - rc = rdma_reject(cmid, rej, sizeof(*rej)); - - if (rc) - CWARN("Error %d sending reject\n", rc); -} - -static int -kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) -{ - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - struct kib_msg *reqmsg = priv; - struct kib_msg *ackmsg; - struct kib_dev *ibdev; - struct kib_peer *peer; - struct kib_peer *peer2; - struct kib_conn *conn; - struct lnet_ni *ni = NULL; - struct kib_net *net = NULL; - lnet_nid_t nid; - struct rdma_conn_param cp; - struct kib_rej rej; - int version = IBLND_MSG_VERSION; - unsigned long flags; - int max_frags; - int rc; - struct sockaddr_in *peer_addr; - - LASSERT(!in_interrupt()); - - /* cmid inherits 'context' from the corresponding listener id */ - ibdev = (struct kib_dev *)cmid->context; - LASSERT(ibdev); - - memset(&rej, 0, sizeof(rej)); - rej.ibr_magic = IBLND_MSG_MAGIC; - rej.ibr_why = IBLND_REJECT_FATAL; - rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; - - peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr; - if (*kiblnd_tunables.kib_require_priv_port && - ntohs(peer_addr->sin_port) >= PROT_SOCK) { - __u32 ip = ntohl(peer_addr->sin_addr.s_addr); - - CERROR("Peer's port (%pI4h:%hu) is not privileged\n", - &ip, ntohs(peer_addr->sin_port)); - goto failed; - } - - if (priv_nob < offsetof(struct kib_msg, ibm_type)) { - CERROR("Short connection request\n"); - goto failed; - } - - /* - * Future protocol version compatibility support! If the - * o2iblnd-specific protocol changes, or when LNET unifies - * protocols over all LNDs, the initial connection will - * negotiate a protocol version. I trap this here to avoid - * console errors; the reject tells the peer which protocol I - * speak. - */ - if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || - reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) - goto failed; - if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && - reqmsg->ibm_version != IBLND_MSG_VERSION && - reqmsg->ibm_version != IBLND_MSG_VERSION_1) - goto failed; - if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && - reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && - reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) - goto failed; - - rc = kiblnd_unpack_msg(reqmsg, priv_nob); - if (rc) { - CERROR("Can't parse connection request: %d\n", rc); - goto failed; - } - - nid = reqmsg->ibm_srcnid; - ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid)); - - if (ni) { - net = (struct kib_net *)ni->ni_data; - rej.ibr_incarnation = net->ibn_incarnation; - } - - if (!ni || /* no matching net */ - ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ - net->ibn_dev != ibdev) { /* wrong device */ - CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n", - libcfs_nid2str(nid), - !ni ? "NA" : libcfs_nid2str(ni->ni_nid), - ibdev->ibd_ifname, ibdev->ibd_nnets, - &ibdev->ibd_ifip, - libcfs_nid2str(reqmsg->ibm_dstnid)); - - goto failed; - } - - /* check time stamp as soon as possible */ - if (reqmsg->ibm_dststamp && - reqmsg->ibm_dststamp != net->ibn_incarnation) { - CWARN("Stale connection request\n"); - rej.ibr_why = IBLND_REJECT_CONN_STALE; - goto failed; - } - - /* I can accept peer's version */ - version = reqmsg->ibm_version; - - if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from %s\n", - reqmsg->ibm_type, libcfs_nid2str(nid)); - goto failed; - } - - if (reqmsg->ibm_u.connparams.ibcp_queue_depth > - kiblnd_msg_queue_size(version, ni)) { - CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n", - libcfs_nid2str(nid), - reqmsg->ibm_u.connparams.ibcp_queue_depth, - kiblnd_msg_queue_size(version, ni)); - - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; - - goto failed; - } - - max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; - if (max_frags > kiblnd_rdma_frags(version, ni)) { - CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n", - libcfs_nid2str(nid), version, max_frags, - kiblnd_rdma_frags(version, ni)); - - if (version >= IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; - - goto failed; - } else if (max_frags < kiblnd_rdma_frags(version, ni) && - !net->ibn_fmr_ps) { - CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n", - libcfs_nid2str(nid), version, max_frags, - kiblnd_rdma_frags(version, ni)); - - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; - - goto failed; - } - - if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { - CERROR("Can't accept %s: message size %d too big (%d max)\n", - libcfs_nid2str(nid), - reqmsg->ibm_u.connparams.ibcp_max_msg_size, - IBLND_MSG_SIZE); - goto failed; - } - - /* assume 'nid' is a new peer; create */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc) { - CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); - rej.ibr_why = IBLND_REJECT_NO_RESOURCES; - goto failed; - } - - /* We have validated the peer's parameters so use those */ - peer->ibp_max_frags = max_frags; - peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth; - - write_lock_irqsave(g_lock, flags); - - peer2 = kiblnd_find_peer_locked(nid); - if (peer2) { - if (!peer2->ibp_version) { - peer2->ibp_version = version; - peer2->ibp_incarnation = reqmsg->ibm_srcstamp; - } - - /* not the guy I've talked with */ - if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || - peer2->ibp_version != version) { - kiblnd_close_peer_conns_locked(peer2, -ESTALE); - - if (kiblnd_peer_active(peer2)) { - peer2->ibp_incarnation = reqmsg->ibm_srcstamp; - peer2->ibp_version = version; - } - write_unlock_irqrestore(g_lock, flags); - - CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n", - libcfs_nid2str(nid), peer2->ibp_version, version, - peer2->ibp_incarnation, reqmsg->ibm_srcstamp); - - kiblnd_peer_decref(peer); - rej.ibr_why = IBLND_REJECT_CONN_STALE; - goto failed; - } - - /* - * Tie-break connection race in favour of the higher NID. - * If we keep running into a race condition multiple times, - * we have to assume that the connection attempt with the - * higher NID is stuck in a connecting state and will never - * recover. As such, we pass through this if-block and let - * the lower NID connection win so we can move forward. - */ - if (peer2->ibp_connecting && - nid < ni->ni_nid && peer2->ibp_races < - MAX_CONN_RACES_BEFORE_ABORT) { - peer2->ibp_races++; - write_unlock_irqrestore(g_lock, flags); - - CDEBUG(D_NET, "Conn race %s\n", - libcfs_nid2str(peer2->ibp_nid)); - - kiblnd_peer_decref(peer); - rej.ibr_why = IBLND_REJECT_CONN_RACE; - goto failed; - } - if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT) - CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n", - libcfs_nid2str(peer2->ibp_nid), - MAX_CONN_RACES_BEFORE_ABORT); - /** - * passive connection is allowed even this peer is waiting for - * reconnection. - */ - peer2->ibp_reconnecting = 0; - peer2->ibp_races = 0; - peer2->ibp_accepting++; - kiblnd_peer_addref(peer2); - - /** - * Race with kiblnd_launch_tx (active connect) to create peer - * so copy validated parameters since we now know what the - * peer's limits are - */ - peer2->ibp_max_frags = peer->ibp_max_frags; - peer2->ibp_queue_depth = peer->ibp_queue_depth; - - write_unlock_irqrestore(g_lock, flags); - kiblnd_peer_decref(peer); - peer = peer2; - } else { - /* Brand new peer */ - LASSERT(!peer->ibp_accepting); - LASSERT(!peer->ibp_version && - !peer->ibp_incarnation); - - peer->ibp_accepting = 1; - peer->ibp_version = version; - peer->ibp_incarnation = reqmsg->ibm_srcstamp; - - /* I have a ref on ni that prevents it being shutdown */ - LASSERT(!net->ibn_shutdown); - - kiblnd_peer_addref(peer); - list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); - - write_unlock_irqrestore(g_lock, flags); - } - - conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, - version); - if (!conn) { - kiblnd_peer_connect_failed(peer, 0, -ENOMEM); - kiblnd_peer_decref(peer); - rej.ibr_why = IBLND_REJECT_NO_RESOURCES; - goto failed; - } - - /* - * conn now "owns" cmid, so I return success from here on to ensure the - * CM callback doesn't destroy cmid. - */ - conn->ibc_incarnation = reqmsg->ibm_srcstamp; - conn->ibc_credits = conn->ibc_queue_depth; - conn->ibc_reserved_credits = conn->ibc_queue_depth; - LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + - IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); - - ackmsg = &conn->ibc_connvars->cv_msg; - memset(ackmsg, 0, sizeof(*ackmsg)); - - kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, - sizeof(ackmsg->ibm_u.connparams)); - ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; - ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - - kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); - - memset(&cp, 0, sizeof(cp)); - cp.private_data = ackmsg; - cp.private_data_len = ackmsg->ibm_nob; - cp.responder_resources = 0; /* No atomic ops or RDMA reads */ - cp.initiator_depth = 0; - cp.flow_control = 1; - cp.retry_count = *kiblnd_tunables.kib_retry_count; - cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; - - CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); - - rc = rdma_accept(cmid, &cp); - if (rc) { - CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); - rej.ibr_version = version; - rej.ibr_why = IBLND_REJECT_FATAL; - - kiblnd_reject(cmid, &rej); - kiblnd_connreq_done(conn, rc); - kiblnd_conn_decref(conn); - } - - lnet_ni_decref(ni); - return 0; - - failed: - if (ni) { - rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni); - rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); - lnet_ni_decref(ni); - } - - rej.ibr_version = version; - kiblnd_reject(cmid, &rej); - - return -ECONNREFUSED; -} - -static void -kiblnd_check_reconnect(struct kib_conn *conn, int version, - __u64 incarnation, int why, struct kib_connparams *cp) -{ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer *peer = conn->ibc_peer; - char *reason; - int msg_size = IBLND_MSG_SIZE; - int frag_num = -1; - int queue_dep = -1; - bool reconnect; - unsigned long flags; - - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); - LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */ - - if (cp) { - msg_size = cp->ibcp_max_msg_size; - frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT; - queue_dep = cp->ibcp_queue_depth; - } - - write_lock_irqsave(glock, flags); - /** - * retry connection if it's still needed and no other connection - * attempts (active or passive) are in progress - * NB: reconnect is still needed even when ibp_tx_queue is - * empty if ibp_version != version because reconnect may be - * initiated by kiblnd_query() - */ - reconnect = (!list_empty(&peer->ibp_tx_queue) || - peer->ibp_version != version) && - peer->ibp_connecting && - !peer->ibp_accepting; - if (!reconnect) { - reason = "no need"; - goto out; - } - - switch (why) { - default: - reason = "Unknown"; - break; - - case IBLND_REJECT_RDMA_FRAGS: { - struct lnet_ioctl_config_lnd_tunables *tunables; - - if (!cp) { - reason = "can't negotiate max frags"; - goto out; - } - tunables = peer->ibp_ni->ni_lnd_tunables; - if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) { - reason = "map_on_demand must be enabled"; - goto out; - } - if (conn->ibc_max_frags <= frag_num) { - reason = "unsupported max frags"; - goto out; - } - - peer->ibp_max_frags = frag_num; - reason = "rdma fragments"; - break; - } - case IBLND_REJECT_MSG_QUEUE_SIZE: - if (!cp) { - reason = "can't negotiate queue depth"; - goto out; - } - if (conn->ibc_queue_depth <= queue_dep) { - reason = "unsupported queue depth"; - goto out; - } - - peer->ibp_queue_depth = queue_dep; - reason = "queue depth"; - break; - - case IBLND_REJECT_CONN_STALE: - reason = "stale"; - break; - - case IBLND_REJECT_CONN_RACE: - reason = "conn race"; - break; - - case IBLND_REJECT_CONN_UNCOMPAT: - reason = "version negotiation"; - break; - } - - conn->ibc_reconnect = 1; - peer->ibp_reconnecting++; - peer->ibp_version = version; - if (incarnation) - peer->ibp_incarnation = incarnation; -out: - write_unlock_irqrestore(glock, flags); - - CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n", - libcfs_nid2str(peer->ibp_nid), - reconnect ? "reconnect" : "don't reconnect", - reason, IBLND_MSG_VERSION, version, msg_size, - conn->ibc_queue_depth, queue_dep, - conn->ibc_max_frags, frag_num); - /** - * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer - * while destroying the zombie - */ -} - -static void -kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) -{ - struct kib_peer *peer = conn->ibc_peer; - - LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); - - switch (reason) { - case IB_CM_REJ_STALE_CONN: - kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, - IBLND_REJECT_CONN_STALE, NULL); - break; - - case IB_CM_REJ_INVALID_SERVICE_ID: - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer->ibp_nid), - *kiblnd_tunables.kib_service); - break; - - case IB_CM_REJ_CONSUMER_DEFINED: - if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { - struct kib_rej *rej = priv; - struct kib_connparams *cp = NULL; - int flip = 0; - __u64 incarnation = -1; - - /* NB. default incarnation is -1 because: - * a) V1 will ignore dst incarnation in connreq. - * b) V2 will provide incarnation while rejecting me, - * -1 will be overwrote. - * - * if I try to connect to a V1 peer with V2 protocol, - * it rejected me then upgrade to V2, I have no idea - * about the upgrading and try to reconnect with V1, - * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject me(incarnation is -1). - */ - - if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || - rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { - __swab32s(&rej->ibr_magic); - __swab16s(&rej->ibr_version); - flip = 1; - } - - if (priv_nob >= sizeof(struct kib_rej) && - rej->ibr_version > IBLND_MSG_VERSION_1) { - /* - * priv_nob is always 148 in current version - * of OFED, so we still need to check version. - * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) - */ - cp = &rej->ibr_cp; - - if (flip) { - __swab64s(&rej->ibr_incarnation); - __swab16s(&cp->ibcp_queue_depth); - __swab16s(&cp->ibcp_max_frags); - __swab32s(&cp->ibcp_max_msg_size); - } - - incarnation = rej->ibr_incarnation; - } - - if (rej->ibr_magic != IBLND_MSG_MAGIC && - rej->ibr_magic != LNET_PROTO_MAGIC) { - CERROR("%s rejected: consumer defined fatal error\n", - libcfs_nid2str(peer->ibp_nid)); - break; - } - - if (rej->ibr_version != IBLND_MSG_VERSION && - rej->ibr_version != IBLND_MSG_VERSION_1) { - CERROR("%s rejected: o2iblnd version %x error\n", - libcfs_nid2str(peer->ibp_nid), - rej->ibr_version); - break; - } - - if (rej->ibr_why == IBLND_REJECT_FATAL && - rej->ibr_version == IBLND_MSG_VERSION_1) { - CDEBUG(D_NET, "rejected by old version peer %s: %x\n", - libcfs_nid2str(peer->ibp_nid), rej->ibr_version); - - if (conn->ibc_version != IBLND_MSG_VERSION_1) - rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; - } - - switch (rej->ibr_why) { - case IBLND_REJECT_CONN_RACE: - case IBLND_REJECT_CONN_STALE: - case IBLND_REJECT_CONN_UNCOMPAT: - case IBLND_REJECT_MSG_QUEUE_SIZE: - case IBLND_REJECT_RDMA_FRAGS: - kiblnd_check_reconnect(conn, rej->ibr_version, - incarnation, - rej->ibr_why, cp); - break; - - case IBLND_REJECT_NO_RESOURCES: - CERROR("%s rejected: o2iblnd no resources\n", - libcfs_nid2str(peer->ibp_nid)); - break; - - case IBLND_REJECT_FATAL: - CERROR("%s rejected: o2iblnd fatal error\n", - libcfs_nid2str(peer->ibp_nid)); - break; - - default: - CERROR("%s rejected: o2iblnd reason %d\n", - libcfs_nid2str(peer->ibp_nid), - rej->ibr_why); - break; - } - break; - } - /* fall through */ - default: - CNETERR("%s rejected: reason %d, size %d\n", - libcfs_nid2str(peer->ibp_nid), reason, priv_nob); - break; - } - - kiblnd_connreq_done(conn, -ECONNREFUSED); -} - -static void -kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob) -{ - struct kib_peer *peer = conn->ibc_peer; - struct lnet_ni *ni = peer->ibp_ni; - struct kib_net *net = ni->ni_data; - struct kib_msg *msg = priv; - int ver = conn->ibc_version; - int rc = kiblnd_unpack_msg(msg, priv_nob); - unsigned long flags; - - LASSERT(net); - - if (rc) { - CERROR("Can't unpack connack from %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - goto failed; - } - - if (msg->ibm_type != IBLND_MSG_CONNACK) { - CERROR("Unexpected message %d from %s\n", - msg->ibm_type, libcfs_nid2str(peer->ibp_nid)); - rc = -EPROTO; - goto failed; - } - - if (ver != msg->ibm_version) { - CERROR("%s replied version %x is different with requested version %x\n", - libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver); - rc = -EPROTO; - goto failed; - } - - if (msg->ibm_u.connparams.ibcp_queue_depth > - conn->ibc_queue_depth) { - CERROR("%s has incompatible queue depth %d (<=%d wanted)\n", - libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_queue_depth, - conn->ibc_queue_depth); - rc = -EPROTO; - goto failed; - } - - if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) > - conn->ibc_max_frags) { - CERROR("%s has incompatible max_frags %d (<=%d wanted)\n", - libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT, - conn->ibc_max_frags); - rc = -EPROTO; - goto failed; - } - - if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { - CERROR("%s max message size %d too big (%d max)\n", - libcfs_nid2str(peer->ibp_nid), - msg->ibm_u.connparams.ibcp_max_msg_size, - IBLND_MSG_SIZE); - rc = -EPROTO; - goto failed; - } - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - if (msg->ibm_dstnid == ni->ni_nid && - msg->ibm_dststamp == net->ibn_incarnation) - rc = 0; - else - rc = -ESTALE; - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - if (rc) { - CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n", - libcfs_nid2str(peer->ibp_nid), rc, - msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); - goto failed; - } - - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; - LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + - IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); - - kiblnd_connreq_done(conn, 0); - return; - - failed: - /* - * NB My QP has already established itself, so I handle anything going - * wrong here by setting ibc_comms_error. - * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then - * immediately tears it down. - */ - LASSERT(rc); - conn->ibc_comms_error = rc; - kiblnd_connreq_done(conn, 0); -} - -static int -kiblnd_active_connect(struct rdma_cm_id *cmid) -{ - struct kib_peer *peer = (struct kib_peer *)cmid->context; - struct kib_conn *conn; - struct kib_msg *msg; - struct rdma_conn_param cp; - int version; - __u64 incarnation; - unsigned long flags; - int rc; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - incarnation = peer->ibp_incarnation; - version = !peer->ibp_version ? IBLND_MSG_VERSION : - peer->ibp_version; - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, - version); - if (!conn) { - kiblnd_peer_connect_failed(peer, 1, -ENOMEM); - kiblnd_peer_decref(peer); /* lose cmid's ref */ - return -ENOMEM; - } - - /* - * conn "owns" cmid now, so I return success from here on to ensure the - * CM callback doesn't destroy cmid. conn also takes over cmid's ref - * on peer - */ - msg = &conn->ibc_connvars->cv_msg; - - memset(msg, 0, sizeof(*msg)); - kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); - msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; - msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - - kiblnd_pack_msg(peer->ibp_ni, msg, version, - 0, peer->ibp_nid, incarnation); - - memset(&cp, 0, sizeof(cp)); - cp.private_data = msg; - cp.private_data_len = msg->ibm_nob; - cp.responder_resources = 0; /* No atomic ops or RDMA reads */ - cp.initiator_depth = 0; - cp.flow_control = 1; - cp.retry_count = *kiblnd_tunables.kib_retry_count; - cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; - - LASSERT(cmid->context == (void *)conn); - LASSERT(conn->ibc_cmid == cmid); - - rc = rdma_connect(cmid, &cp); - if (rc) { - CERROR("Can't connect to %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - kiblnd_connreq_done(conn, rc); - kiblnd_conn_decref(conn); - } - - return 0; -} - -int -kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) -{ - struct kib_peer *peer; - struct kib_conn *conn; - int rc; - - switch (event->event) { - default: - CERROR("Unexpected event: %d, status: %d\n", - event->event, event->status); - LBUG(); - - case RDMA_CM_EVENT_CONNECT_REQUEST: - /* destroy cmid on failure */ - rc = kiblnd_passive_connect(cmid, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - CDEBUG(D_NET, "connreq: %d\n", rc); - return rc; - - case RDMA_CM_EVENT_ADDR_ERROR: - peer = (struct kib_peer *)cmid->context; - CNETERR("%s: ADDR ERROR %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); - kiblnd_peer_decref(peer); - return -EHOSTUNREACH; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ADDR_RESOLVED: - peer = (struct kib_peer *)cmid->context; - - CDEBUG(D_NET, "%s Addr resolved: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - - if (event->status) { - CNETERR("Can't resolve address for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - rc = event->status; - } else { - rc = rdma_resolve_route( - cmid, *kiblnd_tunables.kib_timeout * 1000); - if (!rc) { - struct kib_net *net = peer->ibp_ni->ni_data; - struct kib_dev *dev = net->ibn_dev; - - CDEBUG(D_NET, "%s: connection bound to "\ - "%s:%pI4h:%s\n", - libcfs_nid2str(peer->ibp_nid), - dev->ibd_ifname, - &dev->ibd_ifip, cmid->device->name); - - return 0; - } - - /* Can't initiate route resolution */ - CERROR("Can't resolve route for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - } - kiblnd_peer_connect_failed(peer, 1, rc); - kiblnd_peer_decref(peer); - return rc; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ROUTE_ERROR: - peer = (struct kib_peer *)cmid->context; - CNETERR("%s: ROUTE ERROR %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); - kiblnd_peer_decref(peer); - return -EHOSTUNREACH; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ROUTE_RESOLVED: - peer = (struct kib_peer *)cmid->context; - CDEBUG(D_NET, "%s Route resolved: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - - if (!event->status) - return kiblnd_active_connect(cmid); - - CNETERR("Can't resolve route for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, event->status); - kiblnd_peer_decref(peer); - return event->status; /* rc destroys cmid */ - - case RDMA_CM_EVENT_UNREACHABLE: - conn = (struct kib_conn *)cmid->context; - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || - conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); - CNETERR("%s: UNREACHABLE %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); - kiblnd_connreq_done(conn, -ENETDOWN); - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_CONNECT_ERROR: - conn = (struct kib_conn *)cmid->context; - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || - conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); - CNETERR("%s: CONNECT ERROR %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); - kiblnd_connreq_done(conn, -ENOTCONN); - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_REJECTED: - conn = (struct kib_conn *)cmid->context; - switch (conn->ibc_state) { - default: - LBUG(); - - case IBLND_CONN_PASSIVE_WAIT: - CERROR("%s: REJECTED %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - event->status); - kiblnd_connreq_done(conn, -ECONNRESET); - break; - - case IBLND_CONN_ACTIVE_CONNECT: - kiblnd_rejected(conn, event->status, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - break; - } - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_ESTABLISHED: - conn = (struct kib_conn *)cmid->context; - switch (conn->ibc_state) { - default: - LBUG(); - - case IBLND_CONN_PASSIVE_WAIT: - CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_connreq_done(conn, 0); - break; - - case IBLND_CONN_ACTIVE_CONNECT: - CDEBUG(D_NET, "ESTABLISHED(active): %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_check_connreply(conn, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - break; - } - /* net keeps its ref on conn! */ - return 0; - - case RDMA_CM_EVENT_TIMEWAIT_EXIT: - CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); - return 0; - case RDMA_CM_EVENT_DISCONNECTED: - conn = (struct kib_conn *)cmid->context; - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - CERROR("%s DISCONNECTED\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_connreq_done(conn, -ECONNRESET); - } else { - kiblnd_close_conn(conn, 0); - } - kiblnd_conn_decref(conn); - cmid->context = NULL; - return 0; - - case RDMA_CM_EVENT_DEVICE_REMOVAL: - LCONSOLE_ERROR_MSG(0x131, - "Received notification of device removal\n" - "Please shutdown LNET to allow this to proceed\n"); - /* - * Can't remove network from underneath LNET for now, so I have - * to ignore this - */ - return 0; - - case RDMA_CM_EVENT_ADDR_CHANGE: - LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); - return 0; - } -} - -static int -kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) -{ - struct kib_tx *tx; - struct list_head *ttmp; - - list_for_each(ttmp, txs) { - tx = list_entry(ttmp, struct kib_tx, tx_list); - - if (txs != &conn->ibc_active_txs) { - LASSERT(tx->tx_queued); - } else { - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_waiting || tx->tx_sending); - } - - if (time_after_eq(jiffies, tx->tx_deadline)) { - CERROR("Timed out tx: %s, %lu seconds\n", - kiblnd_queue2str(conn, txs), - (jiffies - tx->tx_deadline) / HZ); - return 1; - } - } - - return 0; -} - -static int -kiblnd_conn_timed_out_locked(struct kib_conn *conn) -{ - return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || - kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); -} - -static void -kiblnd_check_conns(int idx) -{ - LIST_HEAD(closes); - LIST_HEAD(checksends); - struct list_head *peers = &kiblnd_data.kib_peers[idx]; - struct list_head *ptmp; - struct kib_peer *peer; - struct kib_conn *conn; - struct kib_conn *temp; - struct kib_conn *tmp; - struct list_head *ctmp; - unsigned long flags; - - /* - * NB. We expect to have a look at all the peers and not find any - * RDMAs to time out, so we just use a shared lock while we - * take a look... - */ - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - list_for_each(ptmp, peers) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - - list_for_each(ctmp, &peer->ibp_conns) { - int timedout; - int sendnoop; - - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); - - spin_lock(&conn->ibc_lock); - - sendnoop = kiblnd_need_noop(conn); - timedout = kiblnd_conn_timed_out_locked(conn); - if (!sendnoop && !timedout) { - spin_unlock(&conn->ibc_lock); - continue; - } - - if (timedout) { - CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n", - libcfs_nid2str(peer->ibp_nid), - (jiffies - peer->ibp_last_alive) / HZ, - conn->ibc_credits, - conn->ibc_outstanding_credits, - conn->ibc_reserved_credits); - list_add(&conn->ibc_connd_list, &closes); - } else { - list_add(&conn->ibc_connd_list, &checksends); - } - /* +ref for 'closes' or 'checksends' */ - kiblnd_conn_addref(conn); - - spin_unlock(&conn->ibc_lock); - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* - * Handle timeout by closing the whole - * connection. We can only be sure RDMA activity - * has ceased once the QP has been modified. - */ - list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) { - list_del(&conn->ibc_connd_list); - kiblnd_close_conn(conn, -ETIMEDOUT); - kiblnd_conn_decref(conn); - } - - /* - * In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... - */ - list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) { - list_del(&conn->ibc_connd_list); - - spin_lock(&conn->ibc_lock); - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - kiblnd_conn_decref(conn); - } -} - -static void -kiblnd_disconnect_conn(struct kib_conn *conn) -{ - LASSERT(!in_interrupt()); - LASSERT(current == kiblnd_data.kib_connd); - LASSERT(conn->ibc_state == IBLND_CONN_CLOSING); - - rdma_disconnect(conn->ibc_cmid); - kiblnd_finalise_conn(conn); - - kiblnd_peer_notify(conn->ibc_peer); -} - -/** - * High-water for reconnection to the same peer, reconnection attempt should - * be delayed after trying more than KIB_RECONN_HIGH_RACE. - */ -#define KIB_RECONN_HIGH_RACE 10 -/** - * Allow connd to take a break and handle other things after consecutive - * reconnection attempts. - */ -#define KIB_RECONN_BREAK 100 - -int -kiblnd_connd(void *arg) -{ - spinlock_t *lock = &kiblnd_data.kib_connd_lock; - wait_queue_entry_t wait; - unsigned long flags; - struct kib_conn *conn; - int timeout; - int i; - int dropped_lock; - int peer_index = 0; - unsigned long deadline = jiffies; - - init_waitqueue_entry(&wait, current); - kiblnd_data.kib_connd = current; - - spin_lock_irqsave(lock, flags); - - while (!kiblnd_data.kib_shutdown) { - int reconn = 0; - - dropped_lock = 0; - - if (!list_empty(&kiblnd_data.kib_connd_zombies)) { - struct kib_peer *peer = NULL; - - conn = list_entry(kiblnd_data.kib_connd_zombies.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - if (conn->ibc_reconnect) { - peer = conn->ibc_peer; - kiblnd_peer_addref(peer); - } - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - kiblnd_destroy_conn(conn); - - spin_lock_irqsave(lock, flags); - if (!peer) { - kfree(conn); - continue; - } - - conn->ibc_peer = peer; - if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_reconn_list); - else - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_reconn_wait); - } - - if (!list_empty(&kiblnd_data.kib_connd_conns)) { - conn = list_entry(kiblnd_data.kib_connd_conns.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - kiblnd_disconnect_conn(conn); - kiblnd_conn_decref(conn); - - spin_lock_irqsave(lock, flags); - } - - while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != - ktime_get_real_seconds()) { - kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); - list_splice_init(&kiblnd_data.kib_reconn_wait, - &kiblnd_data.kib_reconn_list); - } - - if (list_empty(&kiblnd_data.kib_reconn_list)) - break; - - conn = list_entry(kiblnd_data.kib_reconn_list.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - reconn += kiblnd_reconnect_peer(conn->ibc_peer); - kiblnd_peer_decref(conn->ibc_peer); - kfree(conn); - - spin_lock_irqsave(lock, flags); - } - - /* careful with the jiffy wrap... */ - timeout = (int)(deadline - jiffies); - if (timeout <= 0) { - const int n = 4; - const int p = 1; - int chunk = kiblnd_data.kib_peer_hash_size; - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - /* - * Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. - */ - if (*kiblnd_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kiblnd_tunables.kib_timeout; - if (!chunk) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kiblnd_check_conns(peer_index); - peer_index = (peer_index + 1) % - kiblnd_data.kib_peer_hash_size; - } - - deadline += msecs_to_jiffies(p * MSEC_PER_SEC); - spin_lock_irqsave(lock, flags); - } - - if (dropped_lock) - continue; - - /* Nothing to do for 'timeout' */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); - spin_unlock_irqrestore(lock, flags); - - schedule_timeout(timeout); - - remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); - spin_lock_irqsave(lock, flags); - } - - spin_unlock_irqrestore(lock, flags); - - kiblnd_thread_fini(); - return 0; -} - -void -kiblnd_qp_event(struct ib_event *event, void *arg) -{ - struct kib_conn *conn = arg; - - switch (event->event) { - case IB_EVENT_COMM_EST: - CDEBUG(D_NET, "%s established\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* - * We received a packet but connection isn't established - * probably handshake packet was lost, so free to - * force make connection established - */ - rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST); - return; - - default: - CERROR("%s: Async QP event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); - return; - } -} - -static void -kiblnd_complete(struct ib_wc *wc) -{ - switch (kiblnd_wreqid2type(wc->wr_id)) { - default: - LBUG(); - - case IBLND_WID_MR: - if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) - CNETERR("FastReg failed: %d\n", wc->status); - break; - - case IBLND_WID_RDMA: - /* - * We only get RDMA completion notification if it fails. All - * subsequent work items, including the final SEND will fail - * too. However we can't print out any more info about the - * failing RDMA because 'tx' might be back on the idle list or - * even reused already if we didn't manage to post all our work - * items - */ - CNETERR("RDMA (tx: %p) failed: %d\n", - kiblnd_wreqid2ptr(wc->wr_id), wc->status); - return; - - case IBLND_WID_TX: - kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); - return; - - case IBLND_WID_RX: - kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, - wc->byte_len); - return; - } -} - -void -kiblnd_cq_completion(struct ib_cq *cq, void *arg) -{ - /* - * NB I'm not allowed to schedule this conn once its refcount has - * reached 0. Since fundamentally I'm racing with scheduler threads - * consuming my CQ I could be called after all completions have - * occurred. But in this case, !ibc_nrx && !ibc_nsends_posted - * and this CQ is about to be destroyed so I NOOP. - */ - struct kib_conn *conn = arg; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; - - LASSERT(cq == conn->ibc_cq); - - spin_lock_irqsave(&sched->ibs_lock, flags); - - conn->ibc_ready = 1; - - if (!conn->ibc_scheduled && - (conn->ibc_nrx > 0 || - conn->ibc_nsends_posted > 0)) { - kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ - conn->ibc_scheduled = 1; - list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); - - if (waitqueue_active(&sched->ibs_waitq)) - wake_up(&sched->ibs_waitq); - } - - spin_unlock_irqrestore(&sched->ibs_lock, flags); -} - -void -kiblnd_cq_event(struct ib_event *event, void *arg) -{ - struct kib_conn *conn = arg; - - CERROR("%s: async CQ event type %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); -} - -int -kiblnd_scheduler(void *arg) -{ - long id = (long)arg; - struct kib_sched_info *sched; - struct kib_conn *conn; - wait_queue_entry_t wait; - unsigned long flags; - struct ib_wc wc; - int did_something; - int busy_loops = 0; - int rc; - - init_waitqueue_entry(&wait, current); - - sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; - - rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); - if (rc) { - CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n", - sched->ibs_cpt); - } - - spin_lock_irqsave(&sched->ibs_lock, flags); - - while (!kiblnd_data.kib_shutdown) { - if (busy_loops++ >= IBLND_RESCHED) { - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - cond_resched(); - busy_loops = 0; - - spin_lock_irqsave(&sched->ibs_lock, flags); - } - - did_something = 0; - - if (!list_empty(&sched->ibs_conns)) { - conn = list_entry(sched->ibs_conns.next, struct kib_conn, - ibc_sched_list); - /* take over kib_sched_conns' ref on conn... */ - LASSERT(conn->ibc_scheduled); - list_del(&conn->ibc_sched_list); - conn->ibc_ready = 0; - - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - wc.wr_id = IBLND_WID_INVAL; - - rc = ib_poll_cq(conn->ibc_cq, 1, &wc); - if (!rc) { - rc = ib_req_notify_cq(conn->ibc_cq, - IB_CQ_NEXT_COMP); - if (rc < 0) { - CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kiblnd_close_conn(conn, -EIO); - kiblnd_conn_decref(conn); - spin_lock_irqsave(&sched->ibs_lock, - flags); - continue; - } - - rc = ib_poll_cq(conn->ibc_cq, 1, &wc); - } - - if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) { - LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n", - rc, wc.opcode, wc.status, - wc.vendor_err, - libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_state); - rc = -EINVAL; - } - - if (rc < 0) { - CWARN("%s: ib_poll_cq failed: %d, closing connection\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - rc); - kiblnd_close_conn(conn, -EIO); - kiblnd_conn_decref(conn); - spin_lock_irqsave(&sched->ibs_lock, flags); - continue; - } - - spin_lock_irqsave(&sched->ibs_lock, flags); - - if (rc || conn->ibc_ready) { - /* - * There may be another completion waiting; get - * another scheduler to check while I handle - * this one... - */ - /* +1 ref for sched_conns */ - kiblnd_conn_addref(conn); - list_add_tail(&conn->ibc_sched_list, - &sched->ibs_conns); - if (waitqueue_active(&sched->ibs_waitq)) - wake_up(&sched->ibs_waitq); - } else { - conn->ibc_scheduled = 0; - } - - if (rc) { - spin_unlock_irqrestore(&sched->ibs_lock, flags); - kiblnd_complete(&wc); - - spin_lock_irqsave(&sched->ibs_lock, flags); - } - - kiblnd_conn_decref(conn); /* ...drop my ref from above */ - did_something = 1; - } - - if (did_something) - continue; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&sched->ibs_waitq, &wait); - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - schedule(); - busy_loops = 0; - - remove_wait_queue(&sched->ibs_waitq, &wait); - spin_lock_irqsave(&sched->ibs_lock, flags); - } - - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - kiblnd_thread_fini(); - return 0; -} - -int -kiblnd_failover_thread(void *arg) -{ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_dev *dev; - wait_queue_entry_t wait; - unsigned long flags; - int rc; - - LASSERT(*kiblnd_tunables.kib_dev_failover); - - init_waitqueue_entry(&wait, current); - write_lock_irqsave(glock, flags); - - while (!kiblnd_data.kib_shutdown) { - int do_failover = 0; - int long_sleep; - - list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, - ibd_fail_list) { - if (time_before(jiffies, - dev->ibd_next_failover)) - continue; - do_failover = 1; - break; - } - - if (do_failover) { - list_del_init(&dev->ibd_fail_list); - dev->ibd_failover = 1; - write_unlock_irqrestore(glock, flags); - - rc = kiblnd_dev_failover(dev); - - write_lock_irqsave(glock, flags); - - LASSERT(dev->ibd_failover); - dev->ibd_failover = 0; - if (rc >= 0) { /* Device is OK or failover succeed */ - dev->ibd_next_failover = jiffies + 3 * HZ; - continue; - } - - /* failed to failover, retry later */ - dev->ibd_next_failover = - jiffies + min(dev->ibd_failed_failover, 10) * HZ; - if (kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - } - - continue; - } - - /* long sleep if no more pending failover */ - long_sleep = list_empty(&kiblnd_data.kib_failed_devs); - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); - write_unlock_irqrestore(glock, flags); - - rc = schedule_timeout(long_sleep ? 10 * HZ : - HZ); - remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); - write_lock_irqsave(glock, flags); - - if (!long_sleep || rc) - continue; - - /* - * have a long sleep, routine check all active devices, - * we need checking like this because if there is not active - * connection on the dev and no SEND from local, we may listen - * on wrong HCA for ever while there is a bonding failover - */ - list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - } - } - } - - write_unlock_irqrestore(glock, flags); - - kiblnd_thread_fini(); - return 0; -} diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c deleted file mode 100644 index 39d0792..0000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd_modparams.c - * - * Author: Eric Barton <eric@bartonsoftware.com> - */ - -#include "o2iblnd.h" - -static int service = 987; -module_param(service, int, 0444); -MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); - -static int cksum; -module_param(cksum, int, 0644); -MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); - -static int timeout = 50; -module_param(timeout, int, 0644); -MODULE_PARM_DESC(timeout, "timeout (seconds)"); - -/* - * Number of threads in each scheduler pool which is percpt, - * we will estimate reasonable value based on CPUs if it's set to zero. - */ -static int nscheds; -module_param(nscheds, int, 0444); -MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); - -static unsigned int conns_per_peer = 1; -module_param(conns_per_peer, uint, 0444); -MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int ntx = 512; -module_param(ntx, int, 0444); -MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); - -/* NB: this value is shared by all CPTs */ -static int credits = 256; -module_param(credits, int, 0444); -MODULE_PARM_DESC(credits, "# concurrent sends"); - -static int peer_credits = 8; -module_param(peer_credits, int, 0444); -MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); - -static int peer_credits_hiw; -module_param(peer_credits_hiw, int, 0444); -MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); - -static int peer_buffer_credits; -module_param(peer_buffer_credits, int, 0444); -MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); - -static int peer_timeout = 180; -module_param(peer_timeout, int, 0444); -MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); - -static char *ipif_name = "ib0"; -module_param(ipif_name, charp, 0444); -MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); - -static int retry_count = 5; -module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); - -static int rnr_retry_count = 6; -module_param(rnr_retry_count, int, 0644); -MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); - -static int keepalive = 100; -module_param(keepalive, int, 0644); -MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); - -static int ib_mtu; -module_param(ib_mtu, int, 0444); -MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096"); - -static int concurrent_sends; -module_param(concurrent_sends, int, 0444); -MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); - -#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS -static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; -module_param(map_on_demand, int, 0444); -MODULE_PARM_DESC(map_on_demand, "map on demand"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int fmr_pool_size = 512; -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int fmr_flush_trigger = 384; -module_param(fmr_flush_trigger, int, 0444); -MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); - -static int fmr_cache = 1; -module_param(fmr_cache, int, 0444); -MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); - -/* - * 0: disable failover - * 1: enable failover if necessary - * 2: force to failover (for debug) - */ -static int dev_failover; -module_param(dev_failover, int, 0444); -MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); - -static int require_privileged_port; -module_param(require_privileged_port, int, 0644); -MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); - -static int use_privileged_port = 1; -module_param(use_privileged_port, int, 0644); -MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); - -struct kib_tunables kiblnd_tunables = { - .kib_dev_failover = &dev_failover, - .kib_service = &service, - .kib_cksum = &cksum, - .kib_timeout = &timeout, - .kib_keepalive = &keepalive, - .kib_ntx = &ntx, - .kib_default_ipif = &ipif_name, - .kib_retry_count = &retry_count, - .kib_rnr_retry_count = &rnr_retry_count, - .kib_ib_mtu = &ib_mtu, - .kib_require_priv_port = &require_privileged_port, - .kib_use_priv_port = &use_privileged_port, - .kib_nscheds = &nscheds -}; - -static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; - -/* # messages/RDMAs in-flight */ -int kiblnd_msg_queue_size(int version, struct lnet_ni *ni) -{ - if (version == IBLND_MSG_VERSION_1) - return IBLND_MSG_QUEUE_SIZE_V1; - else if (ni) - return ni->ni_peertxcredits; - else - return peer_credits; -} - -int kiblnd_tunables_setup(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - - /* - * if there was no tunables specified, setup the tunables to be - * defaulted - */ - if (!ni->ni_lnd_tunables) { - ni->ni_lnd_tunables = kzalloc(sizeof(*ni->ni_lnd_tunables), - GFP_NOFS); - if (!ni->ni_lnd_tunables) - return -ENOMEM; - - memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib, - &default_tunables, sizeof(*tunables)); - } - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - /* Current API version */ - tunables->lnd_version = 0; - - if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { - CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", - *kiblnd_tunables.kib_ib_mtu); - return -EINVAL; - } - - if (!ni->ni_peertimeout) - ni->ni_peertimeout = peer_timeout; - - if (!ni->ni_maxtxcredits) - ni->ni_maxtxcredits = credits; - - if (!ni->ni_peertxcredits) - ni->ni_peertxcredits = peer_credits; - - if (!ni->ni_peerrtrcredits) - ni->ni_peerrtrcredits = peer_buffer_credits; - - if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT) - ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT; - - if (ni->ni_peertxcredits > IBLND_CREDITS_MAX) - ni->ni_peertxcredits = IBLND_CREDITS_MAX; - - if (ni->ni_peertxcredits > credits) - ni->ni_peertxcredits = credits; - - if (!tunables->lnd_peercredits_hiw) - tunables->lnd_peercredits_hiw = peer_credits_hiw; - - if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2) - tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2; - - if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits) - tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1; - - if (tunables->lnd_map_on_demand <= 0 || - tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { - /* Use the default */ - CWARN("Invalid map_on_demand (%d), expects 1 - %d. Using default of %d\n", - tunables->lnd_map_on_demand, - IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); - tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; - } - - if (tunables->lnd_map_on_demand == 1) { - /* don't make sense to create map if only one fragment */ - tunables->lnd_map_on_demand = 2; - } - - if (!tunables->lnd_concurrent_sends) { - if (tunables->lnd_map_on_demand > 0 && - tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { - tunables->lnd_concurrent_sends = - ni->ni_peertxcredits * 2; - } else { - tunables->lnd_concurrent_sends = ni->ni_peertxcredits; - } - } - - if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2) - tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2; - - if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2) - tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2; - - if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) { - CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n", - tunables->lnd_concurrent_sends, ni->ni_peertxcredits); - } - - if (!tunables->lnd_fmr_pool_size) - tunables->lnd_fmr_pool_size = fmr_pool_size; - if (!tunables->lnd_fmr_flush_trigger) - tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; - if (!tunables->lnd_fmr_cache) - tunables->lnd_fmr_cache = fmr_cache; - if (!tunables->lnd_conns_per_peer) { - tunables->lnd_conns_per_peer = (conns_per_peer) ? - conns_per_peer : 1; - } - - return 0; -} - -void kiblnd_tunables_init(void) -{ - default_tunables.lnd_version = 0; - default_tunables.lnd_peercredits_hiw = peer_credits_hiw, - default_tunables.lnd_map_on_demand = map_on_demand; - default_tunables.lnd_concurrent_sends = concurrent_sends; - default_tunables.lnd_fmr_pool_size = fmr_pool_size; - default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger; - default_tunables.lnd_fmr_cache = fmr_cache; - default_tunables.lnd_conns_per_peer = conns_per_peer; -} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile deleted file mode 100644 index a7da1ab..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET) += ksocklnd.o - -ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib.o diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c deleted file mode 100644 index f01b34a..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c +++ /dev/null @@ -1,2921 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/socklnd/socklnd.c - * - * Author: Zach Brown <zab@zabbo.net> - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Eric Barton <eric@bartonsoftware.com> - */ - -#include "socklnd.h" - -static struct lnet_lnd the_ksocklnd; -struct ksock_nal_data ksocknal_data; - -static struct ksock_interface * -ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) -{ - struct ksock_net *net = ni->ni_data; - int i; - struct ksock_interface *iface; - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(i < LNET_MAX_INTERFACES); - iface = &net->ksnn_interfaces[i]; - - if (iface->ksni_ipaddr == ip) - return iface; - } - - return NULL; -} - -static struct ksock_route * -ksocknal_create_route(__u32 ipaddr, int port) -{ - struct ksock_route *route; - - route = kzalloc(sizeof(*route), GFP_NOFS); - if (!route) - return NULL; - - atomic_set(&route->ksnr_refcount, 1); - route->ksnr_peer = NULL; - route->ksnr_retry_interval = 0; /* OK to connect at any time */ - route->ksnr_ipaddr = ipaddr; - route->ksnr_port = port; - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - route->ksnr_connected = 0; - route->ksnr_deleted = 0; - route->ksnr_conn_count = 0; - route->ksnr_share_count = 0; - - return route; -} - -void -ksocknal_destroy_route(struct ksock_route *route) -{ - LASSERT(!atomic_read(&route->ksnr_refcount)); - - if (route->ksnr_peer) - ksocknal_peer_decref(route->ksnr_peer); - - kfree(route); -} - -static int -ksocknal_create_peer(struct ksock_peer **peerp, struct lnet_ni *ni, - struct lnet_process_id id) -{ - int cpt = lnet_cpt_of_nid(id.nid); - struct ksock_net *net = ni->ni_data; - struct ksock_peer *peer; - - LASSERT(id.nid != LNET_NID_ANY); - LASSERT(id.pid != LNET_PID_ANY); - LASSERT(!in_interrupt()); - - peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt); - if (!peer) - return -ENOMEM; - - peer->ksnp_ni = ni; - peer->ksnp_id = id; - atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */ - peer->ksnp_closing = 0; - peer->ksnp_accepting = 0; - peer->ksnp_proto = NULL; - peer->ksnp_last_alive = 0; - peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; - - INIT_LIST_HEAD(&peer->ksnp_conns); - INIT_LIST_HEAD(&peer->ksnp_routes); - INIT_LIST_HEAD(&peer->ksnp_tx_queue); - INIT_LIST_HEAD(&peer->ksnp_zc_req_list); - spin_lock_init(&peer->ksnp_lock); - - spin_lock_bh(&net->ksnn_lock); - - if (net->ksnn_shutdown) { - spin_unlock_bh(&net->ksnn_lock); - - kfree(peer); - CERROR("Can't create peer: network shutdown\n"); - return -ESHUTDOWN; - } - - net->ksnn_npeers++; - - spin_unlock_bh(&net->ksnn_lock); - - *peerp = peer; - return 0; -} - -void -ksocknal_destroy_peer(struct ksock_peer *peer) -{ - struct ksock_net *net = peer->ksnp_ni->ni_data; - - CDEBUG(D_NET, "peer %s %p deleted\n", - libcfs_id2str(peer->ksnp_id), peer); - - LASSERT(!atomic_read(&peer->ksnp_refcount)); - LASSERT(!peer->ksnp_accepting); - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - LASSERT(list_empty(&peer->ksnp_tx_queue)); - LASSERT(list_empty(&peer->ksnp_zc_req_list)); - - kfree(peer); - - /* - * NB a peer's connections and routes keep a reference on their peer - * until they are destroyed, so we can be assured that _all_ state to - * do with this peer has been cleaned up when its refcount drops to - * zero. - */ - spin_lock_bh(&net->ksnn_lock); - net->ksnn_npeers--; - spin_unlock_bh(&net->ksnn_lock); -} - -struct ksock_peer * -ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); - struct ksock_peer *peer; - - list_for_each_entry(peer, peer_list, ksnp_list) { - LASSERT(!peer->ksnp_closing); - - if (peer->ksnp_ni != ni) - continue; - - if (peer->ksnp_id.nid != id.nid || - peer->ksnp_id.pid != id.pid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", - peer, libcfs_id2str(id), - atomic_read(&peer->ksnp_refcount)); - return peer; - } - return NULL; -} - -struct ksock_peer * -ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct ksock_peer *peer; - - read_lock(&ksocknal_data.ksnd_global_lock); - peer = ksocknal_find_peer_locked(ni, id); - if (peer) /* +1 ref for caller? */ - ksocknal_peer_addref(peer); - read_unlock(&ksocknal_data.ksnd_global_lock); - - return peer; -} - -static void -ksocknal_unlink_peer_locked(struct ksock_peer *peer) -{ - int i; - __u32 ip; - struct ksock_interface *iface; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) { - LASSERT(i < LNET_MAX_INTERFACES); - ip = peer->ksnp_passive_ips[i]; - - iface = ksocknal_ip2iface(peer->ksnp_ni, ip); - /* - * All IPs in peer->ksnp_passive_ips[] come from the - * interface list, therefore the call must succeed. - */ - LASSERT(iface); - - CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", - peer, iface, iface->ksni_nroutes); - iface->ksni_npeers--; - } - - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - LASSERT(!peer->ksnp_closing); - peer->ksnp_closing = 1; - list_del(&peer->ksnp_list); - /* lose peerlist's ref */ - ksocknal_peer_decref(peer); -} - -static int -ksocknal_get_peer_info(struct lnet_ni *ni, int index, - struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, - int *port, int *conn_count, int *share_count) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct ksock_route *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - if (!peer->ksnp_n_passive_ips && - list_empty(&peer->ksnp_routes)) { - if (index-- > 0) - continue; - - *id = peer->ksnp_id; - *myip = 0; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - for (j = 0; j < peer->ksnp_n_passive_ips; j++) { - if (index-- > 0) - continue; - - *id = peer->ksnp_id; - *myip = peer->ksnp_passive_ips[j]; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - list_for_each(rtmp, &peer->ksnp_routes) { - if (index-- > 0) - continue; - - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - *id = peer->ksnp_id; - *myip = route->ksnr_myipaddr; - *peer_ip = route->ksnr_ipaddr; - *port = route->ksnr_port; - *conn_count = route->ksnr_conn_count; - *share_count = route->ksnr_share_count; - rc = 0; - goto out; - } - } - } - out: - read_unlock(&ksocknal_data.ksnd_global_lock); - return rc; -} - -static void -ksocknal_associate_route_conn_locked(struct ksock_route *route, - struct ksock_conn *conn) -{ - struct ksock_peer *peer = route->ksnr_peer; - int type = conn->ksnc_type; - struct ksock_interface *iface; - - conn->ksnc_route = route; - ksocknal_route_addref(route); - - if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { - if (!route->ksnr_myipaddr) { - /* route wasn't bound locally yet (the initial route) */ - CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &conn->ksnc_myipaddr); - } else { - CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &route->ksnr_myipaddr, - &conn->ksnc_myipaddr); - - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes--; - } - route->ksnr_myipaddr = conn->ksnc_myipaddr; - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes++; - } - - route->ksnr_connected |= (1 << type); - route->ksnr_conn_count++; - - /* - * Successful connection => further attempts can - * proceed immediately - */ - route->ksnr_retry_interval = 0; -} - -static void -ksocknal_add_route_locked(struct ksock_peer *peer, struct ksock_route *route) -{ - struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_route *route2; - - LASSERT(!peer->ksnp_closing); - LASSERT(!route->ksnr_peer); - LASSERT(!route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - LASSERT(!route->ksnr_connected); - - /* LASSERT(unique) */ - list_for_each(tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { - CERROR("Duplicate route %s %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr); - LBUG(); - } - } - - route->ksnr_peer = peer; - ksocknal_peer_addref(peer); - /* peer's routelist takes over my ref on 'route' */ - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_ipaddr != route->ksnr_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - /* keep going (typed routes) */ - } -} - -static void -ksocknal_del_route_locked(struct ksock_route *route) -{ - struct ksock_peer *peer = route->ksnr_peer; - struct ksock_interface *iface; - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - - LASSERT(!route->ksnr_deleted); - - /* Close associated conns */ - list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_route != route) - continue; - - ksocknal_close_conn_locked(conn, 0); - } - - if (route->ksnr_myipaddr) { - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes--; - } - - route->ksnr_deleted = 1; - list_del(&route->ksnr_list); - ksocknal_route_decref(route); /* drop peer's ref */ - - if (list_empty(&peer->ksnp_routes) && - list_empty(&peer->ksnp_conns)) { - /* - * I've just removed the last route to a peer with no active - * connections - */ - ksocknal_unlink_peer_locked(peer); - } -} - -int -ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, - int port) -{ - struct ksock_peer *peer; - struct ksock_peer *peer2; - struct ksock_route *route; - struct ksock_route *route2; - int rc; - - if (id.nid == LNET_NID_ANY || - id.pid == LNET_PID_ANY) - return -EINVAL; - - /* Have a brand new peer ready... */ - rc = ksocknal_create_peer(&peer, ni, id); - if (rc) - return rc; - - route = ksocknal_create_route(ipaddr, port); - if (!route) { - ksocknal_peer_decref(peer); - return -ENOMEM; - } - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - /* always called with a ref on ni, so shutdown can't have started */ - LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown); - - peer2 = ksocknal_find_peer_locked(ni, id); - if (peer2) { - ksocknal_peer_decref(peer); - peer = peer2; - } else { - /* peer table takes my ref on peer */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(id.nid)); - } - - list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) { - if (route2->ksnr_ipaddr == ipaddr) { - /* Route already exists, use the old one */ - ksocknal_route_decref(route); - route2->ksnr_share_count++; - goto out; - } - } - /* Route doesn't already exist, add the new one */ - ksocknal_add_route_locked(peer, route); - route->ksnr_share_count++; -out: - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return 0; -} - -static void -ksocknal_del_peer_locked(struct ksock_peer *peer, __u32 ip) -{ - struct ksock_conn *conn; - struct ksock_route *route; - struct list_head *tmp; - struct list_head *nxt; - int nshared; - - LASSERT(!peer->ksnp_closing); - - /* Extra ref prevents peer disappearing until I'm done with it */ - ksocknal_peer_addref(peer); - - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - /* no match */ - if (!(!ip || route->ksnr_ipaddr == ip)) - continue; - - route->ksnr_share_count = 0; - /* This deletes associated conns too */ - ksocknal_del_route_locked(route); - } - - nshared = 0; - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - nshared += route->ksnr_share_count; - } - - if (!nshared) { - /* - * remove everything else if there are no explicit entries - * left - */ - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - /* we should only be removing auto-entries */ - LASSERT(!route->ksnr_share_count); - ksocknal_del_route_locked(route); - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - ksocknal_close_conn_locked(conn, 0); - } - } - - ksocknal_peer_decref(peer); - /* NB peer unlinks itself when last conn/route is removed */ -} - -static int -ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) -{ - LIST_HEAD(zombies); - struct list_head *ptmp; - struct list_head *pnxt; - struct ksock_peer *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - if (id.nid != LNET_NID_ANY) { - lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - } else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && - (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) - continue; - - ksocknal_peer_addref(peer); /* a ref for me... */ - - ksocknal_del_peer_locked(peer, ip); - - if (peer->ksnp_closing && - !list_empty(&peer->ksnp_tx_queue)) { - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - - list_splice_init(&peer->ksnp_tx_queue, - &zombies); - } - - ksocknal_peer_decref(peer); /* ...till here */ - - rc = 0; /* matched! */ - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_txlist_done(ni, &zombies, 1); - - return rc; -} - -static struct ksock_conn * -ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct ksock_conn *conn; - struct list_head *ctmp; - int i; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - LASSERT(!peer->ksnp_closing); - - if (peer->ksnp_ni != ni) - continue; - - list_for_each(ctmp, &peer->ksnp_conns) { - if (index-- > 0) - continue; - - conn = list_entry(ctmp, struct ksock_conn, - ksnc_list); - ksocknal_conn_addref(conn); - read_unlock(&ksocknal_data.ksnd_global_lock); - return conn; - } - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - return NULL; -} - -static struct ksock_sched * -ksocknal_choose_scheduler_locked(unsigned int cpt) -{ - struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; - struct ksock_sched *sched; - int i; - - LASSERT(info->ksi_nthreads > 0); - - sched = &info->ksi_scheds[0]; - /* - * NB: it's safe so far, but info->ksi_nthreads could be changed - * at runtime when we have dynamic LNet configuration, then we - * need to take care of this. - */ - for (i = 1; i < info->ksi_nthreads; i++) { - if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) - sched = &info->ksi_scheds[i]; - } - - return sched; -} - -static int -ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) -{ - struct ksock_net *net = ni->ni_data; - int i; - int nip; - - read_lock(&ksocknal_data.ksnd_global_lock); - - nip = net->ksnn_ninterfaces; - LASSERT(nip <= LNET_MAX_INTERFACES); - - /* - * Only offer interfaces for additional connections if I have - * more than one. - */ - if (nip < 2) { - read_unlock(&ksocknal_data.ksnd_global_lock); - return 0; - } - - for (i = 0; i < nip; i++) { - ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; - LASSERT(ipaddrs[i]); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - return nip; -} - -static int -ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) -{ - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; - - for (i = 0; i < nips; i++) { - if (!ips[i]) - continue; - - this_xor = ips[i] ^ iface->ksni_ipaddr; - this_netmatch = !(this_xor & iface->ksni_netmask) ? 1 : 0; - - if (!(best < 0 || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_xor > this_xor))) - continue; - - best = i; - best_netmatch = this_netmatch; - best_xor = this_xor; - } - - LASSERT(best >= 0); - return best; -} - -static int -ksocknal_select_ips(struct ksock_peer *peer, __u32 *peerips, int n_peerips) -{ - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct ksock_net *net = peer->ksnp_ni->ni_data; - struct ksock_interface *iface; - struct ksock_interface *best_iface; - int n_ips; - int i; - int j; - int k; - __u32 ip; - __u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; - - /* - * CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness shouldn't matter - */ - /* - * Also note that I'm not going to return more than n_peerips - * interfaces, even if I have more myself - */ - write_lock_bh(global_lock); - - LASSERT(n_peerips <= LNET_MAX_INTERFACES); - LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); - - /* - * Only match interfaces for additional connections - * if I have > 1 interface - */ - n_ips = (net->ksnn_ninterfaces < 2) ? 0 : - min(n_peerips, net->ksnn_ninterfaces); - - for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { - /* ^ yes really... */ - - /* - * If we have any new interfaces, first tick off all the - * peer IPs that match old interfaces, then choose new - * interfaces to match the remaining peer IPS. - * We don't forget interfaces we've stopped using; we might - * start using them again... - */ - if (i < peer->ksnp_n_passive_ips) { - /* Old interface. */ - ip = peer->ksnp_passive_ips[i]; - best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); - - /* peer passive ips are kept up to date */ - LASSERT(best_iface); - } else { - /* choose a new interface */ - LASSERT(i == peer->ksnp_n_passive_ips); - - best_iface = NULL; - best_netmatch = 0; - best_npeers = 0; - - for (j = 0; j < net->ksnn_ninterfaces; j++) { - iface = &net->ksnn_interfaces[j]; - ip = iface->ksni_ipaddr; - - for (k = 0; k < peer->ksnp_n_passive_ips; k++) - if (peer->ksnp_passive_ips[k] == ip) - break; - - if (k < peer->ksnp_n_passive_ips) /* using it already */ - continue; - - k = ksocknal_match_peerip(iface, peerips, - n_peerips); - xor = ip ^ peerips[k]; - this_netmatch = !(xor & iface->ksni_netmask) ? 1 : 0; - - if (!(!best_iface || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_npeers > iface->ksni_npeers))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_npeers = iface->ksni_npeers; - } - - LASSERT(best_iface); - - best_iface->ksni_npeers++; - ip = best_iface->ksni_ipaddr; - peer->ksnp_passive_ips[i] = ip; - peer->ksnp_n_passive_ips = i + 1; - } - - /* mark the best matching peer IP used */ - j = ksocknal_match_peerip(best_iface, peerips, n_peerips); - peerips[j] = 0; - } - - /* Overwrite input peer IP addresses */ - memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); - - write_unlock_bh(global_lock); - - return n_ips; -} - -static void -ksocknal_create_routes(struct ksock_peer *peer, int port, - __u32 *peer_ipaddrs, int npeer_ipaddrs) -{ - struct ksock_route *newroute = NULL; - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct lnet_ni *ni = peer->ksnp_ni; - struct ksock_net *net = ni->ni_data; - struct list_head *rtmp; - struct ksock_route *route; - struct ksock_interface *iface; - struct ksock_interface *best_iface; - int best_netmatch; - int this_netmatch; - int best_nroutes; - int i; - int j; - - /* - * CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness here shouldn't matter - */ - write_lock_bh(global_lock); - - if (net->ksnn_ninterfaces < 2) { - /* - * Only create additional connections - * if I have > 1 interface - */ - write_unlock_bh(global_lock); - return; - } - - LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES); - - for (i = 0; i < npeer_ipaddrs; i++) { - if (newroute) { - newroute->ksnr_ipaddr = peer_ipaddrs[i]; - } else { - write_unlock_bh(global_lock); - - newroute = ksocknal_create_route(peer_ipaddrs[i], port); - if (!newroute) - return; - - write_lock_bh(global_lock); - } - - if (peer->ksnp_closing) { - /* peer got closed under me */ - break; - } - - /* Already got a route? */ - route = NULL; - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, ksnr_list); - - if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) - break; - - route = NULL; - } - if (route) - continue; - - best_iface = NULL; - best_nroutes = 0; - best_netmatch = 0; - - LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); - - /* Select interface to connect from */ - for (j = 0; j < net->ksnn_ninterfaces; j++) { - iface = &net->ksnn_interfaces[j]; - - /* Using this interface already? */ - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - if (route->ksnr_myipaddr == iface->ksni_ipaddr) - break; - - route = NULL; - } - if (route) - continue; - - this_netmatch = (!((iface->ksni_ipaddr ^ - newroute->ksnr_ipaddr) & - iface->ksni_netmask)) ? 1 : 0; - - if (!(!best_iface || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_nroutes > iface->ksni_nroutes))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_nroutes = iface->ksni_nroutes; - } - - if (!best_iface) - continue; - - newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; - best_iface->ksni_nroutes++; - - ksocknal_add_route_locked(peer, newroute); - newroute = NULL; - } - - write_unlock_bh(global_lock); - if (newroute) - ksocknal_route_decref(newroute); -} - -int -ksocknal_accept(struct lnet_ni *ni, struct socket *sock) -{ - struct ksock_connreq *cr; - int rc; - __u32 peer_ip; - int peer_port; - - rc = lnet_sock_getaddr(sock, 1, &peer_ip, &peer_port); - LASSERT(!rc); /* we succeeded before */ - - cr = kzalloc(sizeof(*cr), GFP_NOFS); - if (!cr) { - LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n", - &peer_ip); - return -ENOMEM; - } - - lnet_ni_addref(ni); - cr->ksncr_ni = ni; - cr->ksncr_sock = sock; - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - - list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); - wake_up(&ksocknal_data.ksnd_connd_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - return 0; -} - -static int -ksocknal_connecting(struct ksock_peer *peer, __u32 ipaddr) -{ - struct ksock_route *route; - - list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) { - if (route->ksnr_ipaddr == ipaddr) - return route->ksnr_connecting; - } - return 0; -} - -int -ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, - struct socket *sock, int type) -{ - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - LIST_HEAD(zombies); - struct lnet_process_id peerid; - struct list_head *tmp; - __u64 incarnation; - struct ksock_conn *conn; - struct ksock_conn *conn2; - struct ksock_peer *peer = NULL; - struct ksock_peer *peer2; - struct ksock_sched *sched; - struct ksock_hello_msg *hello; - int cpt; - struct ksock_tx *tx; - struct ksock_tx *txtmp; - int rc; - int active; - char *warn = NULL; - - active = !!route; - - LASSERT(active == (type != SOCKLND_CONN_NONE)); - - conn = kzalloc(sizeof(*conn), GFP_NOFS); - if (!conn) { - rc = -ENOMEM; - goto failed_0; - } - - conn->ksnc_peer = NULL; - conn->ksnc_route = NULL; - conn->ksnc_sock = sock; - /* - * 2 ref, 1 for conn, another extra ref prevents socket - * being closed before establishment of connection - */ - atomic_set(&conn->ksnc_sock_refcount, 2); - conn->ksnc_type = type; - ksocknal_lib_save_callback(sock, conn); - atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ - - conn->ksnc_rx_ready = 0; - conn->ksnc_rx_scheduled = 0; - - INIT_LIST_HEAD(&conn->ksnc_tx_queue); - conn->ksnc_tx_ready = 0; - conn->ksnc_tx_scheduled = 0; - conn->ksnc_tx_carrier = NULL; - atomic_set(&conn->ksnc_tx_nob, 0); - - hello = kvzalloc(offsetof(struct ksock_hello_msg, - kshm_ips[LNET_MAX_INTERFACES]), - GFP_KERNEL); - if (!hello) { - rc = -ENOMEM; - goto failed_1; - } - - /* stash conn's local and remote addrs */ - rc = ksocknal_lib_get_conn_addrs(conn); - if (rc) - goto failed_1; - - /* - * Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to. - * Passive connections use the listener timeout since the peer sends - * eagerly - */ - if (active) { - peer = route->ksnr_peer; - LASSERT(ni == peer->ksnp_ni); - - /* Active connection sends HELLO eagerly */ - hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); - peerid = peer->ksnp_id; - - write_lock_bh(global_lock); - conn->ksnc_proto = peer->ksnp_proto; - write_unlock_bh(global_lock); - - if (!conn->ksnc_proto) { - conn->ksnc_proto = &ksocknal_protocol_v3x; -#if SOCKNAL_VERSION_DEBUG - if (*ksocknal_tunables.ksnd_protocol == 2) - conn->ksnc_proto = &ksocknal_protocol_v2x; - else if (*ksocknal_tunables.ksnd_protocol == 1) - conn->ksnc_proto = &ksocknal_protocol_v1x; -#endif - } - - rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); - if (rc) - goto failed_1; - } else { - peerid.nid = LNET_NID_ANY; - peerid.pid = LNET_PID_ANY; - - /* Passive, get protocol from peer */ - conn->ksnc_proto = NULL; - } - - rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); - if (rc < 0) - goto failed_1; - - LASSERT(!rc || active); - LASSERT(conn->ksnc_proto); - LASSERT(peerid.nid != LNET_NID_ANY); - - cpt = lnet_cpt_of_nid(peerid.nid); - - if (active) { - ksocknal_peer_addref(peer); - write_lock_bh(global_lock); - } else { - rc = ksocknal_create_peer(&peer, ni, peerid); - if (rc) - goto failed_1; - - write_lock_bh(global_lock); - - /* called with a ref on ni, so shutdown can't have started */ - LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown); - - peer2 = ksocknal_find_peer_locked(ni, peerid); - if (!peer2) { - /* - * NB this puts an "empty" peer in the peer - * table (which takes my ref) - */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(peerid.nid)); - } else { - ksocknal_peer_decref(peer); - peer = peer2; - } - - /* +1 ref for me */ - ksocknal_peer_addref(peer); - peer->ksnp_accepting++; - - /* - * Am I already connecting to this guy? Resolve in - * favour of higher NID... - */ - if (peerid.nid < ni->ni_nid && - ksocknal_connecting(peer, conn->ksnc_ipaddr)) { - rc = EALREADY; - warn = "connection race resolution"; - goto failed_2; - } - } - - if (peer->ksnp_closing || - (active && route->ksnr_deleted)) { - /* peer/route got closed under me */ - rc = -ESTALE; - warn = "peer/route removed"; - goto failed_2; - } - - if (!peer->ksnp_proto) { - /* - * Never connected before. - * NB recv_hello may have returned EPROTO to signal my peer - * wants a different protocol than the one I asked for. - */ - LASSERT(list_empty(&peer->ksnp_conns)); - - peer->ksnp_proto = conn->ksnc_proto; - peer->ksnp_incarnation = incarnation; - } - - if (peer->ksnp_proto != conn->ksnc_proto || - peer->ksnp_incarnation != incarnation) { - /* Peer rebooted or I've got the wrong protocol version */ - ksocknal_close_peer_conns_locked(peer, 0, 0); - - peer->ksnp_proto = NULL; - rc = ESTALE; - warn = peer->ksnp_incarnation != incarnation ? - "peer rebooted" : - "wrong proto version"; - goto failed_2; - } - - switch (rc) { - default: - LBUG(); - case 0: - break; - case EALREADY: - warn = "lost conn race"; - goto failed_2; - case EPROTO: - warn = "retry with different protocol version"; - goto failed_2; - } - - /* - * Refuse to duplicate an existing connection, unless this is a - * loopback connection - */ - if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || - conn2->ksnc_myipaddr != conn->ksnc_myipaddr || - conn2->ksnc_type != conn->ksnc_type) - continue; - - /* - * Reply on a passive connection attempt so the peer - * realises we're connected. - */ - LASSERT(!rc); - if (!active) - rc = EALREADY; - - warn = "duplicate"; - goto failed_2; - } - } - - /* - * If the connection created by this route didn't bind to the IP - * address the route connected to, the connection/route matching - * code below probably isn't going to work. - */ - if (active && - route->ksnr_ipaddr != conn->ksnc_ipaddr) { - CERROR("Route %s %pI4h connected to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &conn->ksnc_ipaddr); - } - - /* - * Search for a route corresponding to the new connection and - * create an association. This allows incoming connections created - * by routes in my peer to match my own route entries so I don't - * continually create duplicate routes. - */ - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route->ksnr_ipaddr != conn->ksnc_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - break; - } - - conn->ksnc_peer = peer; /* conn takes my ref on peer */ - peer->ksnp_last_alive = jiffies; - peer->ksnp_send_keepalive = 0; - peer->ksnp_error = 0; - - sched = ksocknal_choose_scheduler_locked(cpt); - sched->kss_nconns++; - conn->ksnc_scheduler = sched; - - conn->ksnc_tx_last_post = jiffies; - /* Set the deadline for the outgoing HELLO to drain */ - conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - mb(); /* order with adding to peer's conn list */ - - list_add(&conn->ksnc_list, &peer->ksnp_conns); - ksocknal_conn_addref(conn); - - ksocknal_new_packet(conn, 0); - - conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); - - /* Take packets blocking for this connection. */ - list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { - int match = conn->ksnc_proto->pro_match_tx(conn, tx, - tx->tx_nonblk); - - if (match == SOCKNAL_MATCH_NO) - continue; - - list_del(&tx->tx_list); - ksocknal_queue_tx_locked(tx, conn); - } - - write_unlock_bh(global_lock); - - /* - * We've now got a new connection. Any errors from here on are just - * like "normal" comms errors and we close the connection normally. - * NB (a) we still have to send the reply HELLO for passive - * connections, - * (b) normal I/O on the conn is blocked until I setup and call the - * socket callbacks. - */ - CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n", - libcfs_id2str(peerid), conn->ksnc_proto->pro_version, - &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, - conn->ksnc_port, incarnation, cpt, - (int)(sched - &sched->kss_info->ksi_scheds[0])); - - if (active) { - /* additional routes after interface exchange? */ - ksocknal_create_routes(peer, conn->ksnc_port, - hello->kshm_ips, hello->kshm_nips); - } else { - hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, - hello->kshm_nips); - rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); - } - - kvfree(hello); - - /* - * setup the socket AFTER I've received hello (it disables - * SO_LINGER). I might call back to the acceptor who may want - * to send a protocol version response and then close the - * socket; this ensures the socket only tears down after the - * response has been sent. - */ - if (!rc) - rc = ksocknal_lib_setup_sock(sock); - - write_lock_bh(global_lock); - - /* NB my callbacks block while I hold ksnd_global_lock */ - ksocknal_lib_set_callback(sock, conn); - - if (!active) - peer->ksnp_accepting--; - - write_unlock_bh(global_lock); - - if (rc) { - write_lock_bh(global_lock); - if (!conn->ksnc_closing) { - /* could be closed by another thread */ - ksocknal_close_conn_locked(conn, rc); - } - write_unlock_bh(global_lock); - } else if (!ksocknal_connsock_addref(conn)) { - /* Allow I/O to proceed. */ - ksocknal_read_callback(conn); - ksocknal_write_callback(conn); - ksocknal_connsock_decref(conn); - } - - ksocknal_connsock_decref(conn); - ksocknal_conn_decref(conn); - return rc; - - failed_2: - if (!peer->ksnp_closing && - list_empty(&peer->ksnp_conns) && - list_empty(&peer->ksnp_routes)) { - list_add(&zombies, &peer->ksnp_tx_queue); - list_del_init(&peer->ksnp_tx_queue); - ksocknal_unlink_peer_locked(peer); - } - - write_unlock_bh(global_lock); - - if (warn) { - if (rc < 0) - CERROR("Not creating conn %s type %d: %s\n", - libcfs_id2str(peerid), conn->ksnc_type, warn); - else - CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", - libcfs_id2str(peerid), conn->ksnc_type, warn); - } - - if (!active) { - if (rc > 0) { - /* - * Request retry by replying with CONN_NONE - * ksnc_proto has been set already - */ - conn->ksnc_type = SOCKLND_CONN_NONE; - hello->kshm_nips = 0; - ksocknal_send_hello(ni, conn, peerid.nid, hello); - } - - write_lock_bh(global_lock); - peer->ksnp_accepting--; - write_unlock_bh(global_lock); - } - - ksocknal_txlist_done(ni, &zombies, 1); - ksocknal_peer_decref(peer); - -failed_1: - kvfree(hello); - - kfree(conn); - -failed_0: - sock_release(sock); - return rc; -} - -void -ksocknal_close_conn_locked(struct ksock_conn *conn, int error) -{ - /* - * This just does the immmediate housekeeping, and queues the - * connection for the reaper to terminate. - * Caller holds ksnd_global_lock exclusively in irq context - */ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_route *route; - struct ksock_conn *conn2; - struct list_head *tmp; - - LASSERT(!peer->ksnp_error); - LASSERT(!conn->ksnc_closing); - conn->ksnc_closing = 1; - - /* ksnd_deathrow_conns takes over peer's ref */ - list_del(&conn->ksnc_list); - - route = conn->ksnc_route; - if (route) { - /* dissociate conn from route... */ - LASSERT(!route->ksnr_deleted); - LASSERT(route->ksnr_connected & (1 << conn->ksnc_type)); - - conn2 = NULL; - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn2->ksnc_route == route && - conn2->ksnc_type == conn->ksnc_type) - break; - - conn2 = NULL; - } - if (!conn2) - route->ksnr_connected &= ~(1 << conn->ksnc_type); - - conn->ksnc_route = NULL; - - ksocknal_route_decref(route); /* drop conn's ref on route */ - } - - if (list_empty(&peer->ksnp_conns)) { - /* No more connections to this peer */ - - if (!list_empty(&peer->ksnp_tx_queue)) { - struct ksock_tx *tx; - - LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); - - /* - * throw them to the last connection..., - * these TXs will be send to /dev/null by scheduler - */ - list_for_each_entry(tx, &peer->ksnp_tx_queue, - tx_list) - ksocknal_tx_prep(conn, tx); - - spin_lock_bh(&conn->ksnc_scheduler->kss_lock); - list_splice_init(&peer->ksnp_tx_queue, - &conn->ksnc_tx_queue); - spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); - } - - peer->ksnp_proto = NULL; /* renegotiate protocol version */ - peer->ksnp_error = error; /* stash last conn close reason */ - - if (list_empty(&peer->ksnp_routes)) { - /* - * I've just closed last conn belonging to a - * peer with no routes to it - */ - ksocknal_unlink_peer_locked(peer); - } - } - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - list_add_tail(&conn->ksnc_list, - &ksocknal_data.ksnd_deathrow_conns); - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); -} - -void -ksocknal_peer_failed(struct ksock_peer *peer) -{ - int notify = 0; - unsigned long last_alive = 0; - - /* - * There has been a connection failure or comms error; but I'll only - * tell LNET I think the peer is dead if it's to another kernel and - * there are no connections or connection attempts in existence. - */ - read_lock(&ksocknal_data.ksnd_global_lock); - - if (!(peer->ksnp_id.pid & LNET_PID_USERFLAG) && - list_empty(&peer->ksnp_conns) && - !peer->ksnp_accepting && - !ksocknal_find_connecting_route_locked(peer)) { - notify = 1; - last_alive = peer->ksnp_last_alive; - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (notify) - lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, - last_alive); -} - -void -ksocknal_finalize_zcreq(struct ksock_conn *conn) -{ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *temp; - struct ksock_tx *tmp; - LIST_HEAD(zlist); - - /* - * NB safe to finalize TXs because closing of socket will - * abort all buffered data - */ - LASSERT(!conn->ksnc_sock); - - spin_lock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { - if (tx->tx_conn != conn) - continue; - - LASSERT(tx->tx_msg.ksm_zc_cookies[0]); - - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_zc_aborted = 1; /* mark it as not-acked */ - list_del(&tx->tx_zc_list); - list_add(&tx->tx_zc_list, &zlist); - } - - spin_unlock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) { - list_del(&tx->tx_zc_list); - ksocknal_tx_decref(tx); - } -} - -void -ksocknal_terminate_conn(struct ksock_conn *conn) -{ - /* - * This gets called by the reaper (guaranteed thread context) to - * disengage the socket from its callbacks and close it. - * ksnc_refcount will eventually hit zero, and then the reaper will - * destroy it. - */ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_sched *sched = conn->ksnc_scheduler; - int failed = 0; - - LASSERT(conn->ksnc_closing); - - /* wake up the scheduler to "send" all remaining packets to /dev/null */ - spin_lock_bh(&sched->kss_lock); - - /* a closing conn is always ready to tx */ - conn->ksnc_tx_ready = 1; - - if (!conn->ksnc_tx_scheduled && - !list_empty(&conn->ksnc_tx_queue)) { - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); - - /* serialise with callbacks */ - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_lib_reset_callback(conn->ksnc_sock, conn); - - /* - * OK, so this conn may not be completely disengaged from its - * scheduler yet, but it _has_ committed to terminate... - */ - conn->ksnc_scheduler->kss_nconns--; - - if (peer->ksnp_error) { - /* peer's last conn closed in error */ - LASSERT(list_empty(&peer->ksnp_conns)); - failed = 1; - peer->ksnp_error = 0; /* avoid multiple notifications */ - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - if (failed) - ksocknal_peer_failed(peer); - - /* - * The socket is closed on the final put; either here, or in - * ksocknal_{send,recv}msg(). Since we set up the linger2 option - * when the connection was established, this will close the socket - * immediately, aborting anything buffered in it. Any hung - * zero-copy transmits will therefore complete in finite time. - */ - ksocknal_connsock_decref(conn); -} - -void -ksocknal_queue_zombie_conn(struct ksock_conn *conn) -{ - /* Queue the conn for the reaper to destroy */ - - LASSERT(!atomic_read(&conn->ksnc_conn_refcount)); - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); -} - -void -ksocknal_destroy_conn(struct ksock_conn *conn) -{ - unsigned long last_rcv; - - /* Final coup-de-grace of the reaper */ - CDEBUG(D_NET, "connection %p\n", conn); - - LASSERT(!atomic_read(&conn->ksnc_conn_refcount)); - LASSERT(!atomic_read(&conn->ksnc_sock_refcount)); - LASSERT(!conn->ksnc_sock); - LASSERT(!conn->ksnc_route); - LASSERT(!conn->ksnc_tx_scheduled); - LASSERT(!conn->ksnc_rx_scheduled); - LASSERT(list_empty(&conn->ksnc_tx_queue)); - - /* complete current receive if any */ - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_LNET_PAYLOAD: - last_rcv = conn->ksnc_rx_deadline - - *ksocknal_tunables.ksnd_timeout * HZ; - CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %zd, left: %d, last alive is %ld secs ago\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, - &conn->ksnc_ipaddr, conn->ksnc_port, - iov_iter_count(&conn->ksnc_rx_to), conn->ksnc_rx_nob_left, - (jiffies - last_rcv) / HZ); - lnet_finalize(conn->ksnc_peer->ksnp_ni, - conn->ksnc_cookie, -EIO); - break; - case SOCKNAL_RX_LNET_HEADER: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_proto->pro_version); - break; - case SOCKNAL_RX_KSM_HEADER: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of ksock message from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_proto->pro_version); - break; - case SOCKNAL_RX_SLOP: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port); - break; - default: - LBUG(); - break; - } - - ksocknal_peer_decref(conn->ksnc_peer); - - kfree(conn); -} - -int -ksocknal_close_peer_conns_locked(struct ksock_peer *peer, __u32 ipaddr, int why) -{ - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); - - if (!ipaddr || conn->ksnc_ipaddr == ipaddr) { - count++; - ksocknal_close_conn_locked(conn, why); - } - } - - return count; -} - -int -ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) -{ - struct ksock_peer *peer = conn->ksnc_peer; - __u32 ipaddr = conn->ksnc_ipaddr; - int count; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - count = ksocknal_close_peer_conns_locked(peer, ipaddr, why); - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return count; -} - -int -ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - if (id.nid != LNET_NID_ANY) { - lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - } else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, - &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && - (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) - continue; - - count += ksocknal_close_peer_conns_locked(peer, ipaddr, - 0); - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - /* wildcards always succeed */ - if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || !ipaddr) - return 0; - - if (!count) - return -ENOENT; - else - return 0; -} - -void -ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) -{ - /* - * The router is telling me she's been notified of a change in - * gateway state.... - */ - struct lnet_process_id id = {0}; - - id.nid = gw_nid; - id.pid = LNET_PID_ANY; - - CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), - alive ? "up" : "down"); - - if (!alive) { - /* If the gateway crashed, close all open connections... */ - ksocknal_close_matching_conns(id, 0); - return; - } - - /* - * ...otherwise do nothing. We can only establish new connections - * if we have autroutes, and these connect on demand. - */ -} - -void -ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when) -{ - int connect = 1; - unsigned long last_alive = 0; - unsigned long now = jiffies; - struct ksock_peer *peer = NULL; - rwlock_t *glock = &ksocknal_data.ksnd_global_lock; - struct lnet_process_id id = { - .nid = nid, - .pid = LNET_PID_LUSTRE, - }; - - read_lock(glock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) { - struct ksock_conn *conn; - int bufnob; - - list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) { - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - - if (bufnob < conn->ksnc_tx_bufnob) { - /* something got ACKed */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - peer->ksnp_last_alive = now; - conn->ksnc_tx_bufnob = bufnob; - } - } - - last_alive = peer->ksnp_last_alive; - if (!ksocknal_find_connectable_route_locked(peer)) - connect = 0; - } - - read_unlock(glock); - - if (last_alive) - *when = last_alive; - - CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", - libcfs_nid2str(nid), peer, - last_alive ? (now - last_alive) / HZ : -1, - connect); - - if (!connect) - return; - - ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); - - write_lock_bh(glock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) - ksocknal_launch_all_connections_locked(peer); - - write_unlock_bh(glock); -} - -static void -ksocknal_push_peer(struct ksock_peer *peer) -{ - int index; - int i; - struct list_head *tmp; - struct ksock_conn *conn; - - for (index = 0; ; index++) { - read_lock(&ksocknal_data.ksnd_global_lock); - - i = 0; - conn = NULL; - - list_for_each(tmp, &peer->ksnp_conns) { - if (i++ == index) { - conn = list_entry(tmp, struct ksock_conn, - ksnc_list); - ksocknal_conn_addref(conn); - break; - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (!conn) - break; - - ksocknal_lib_push_conn(conn); - ksocknal_conn_decref(conn); - } -} - -static int ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct list_head *start; - struct list_head *end; - struct list_head *tmp; - int rc = -ENOENT; - unsigned int hsize = ksocknal_data.ksnd_peer_hash_size; - - if (id.nid == LNET_NID_ANY) { - start = &ksocknal_data.ksnd_peers[0]; - end = &ksocknal_data.ksnd_peers[hsize - 1]; - } else { - start = ksocknal_nid2peerlist(id.nid); - end = ksocknal_nid2peerlist(id.nid); - } - - for (tmp = start; tmp <= end; tmp++) { - int peer_off; /* searching offset in peer hash table */ - - for (peer_off = 0; ; peer_off++) { - struct ksock_peer *peer; - int i = 0; - - read_lock(&ksocknal_data.ksnd_global_lock); - list_for_each_entry(peer, tmp, ksnp_list) { - if (!((id.nid == LNET_NID_ANY || - id.nid == peer->ksnp_id.nid) && - (id.pid == LNET_PID_ANY || - id.pid == peer->ksnp_id.pid))) - continue; - - if (i++ == peer_off) { - ksocknal_peer_addref(peer); - break; - } - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (!i) /* no match */ - break; - - rc = 0; - ksocknal_push_peer(peer); - ksocknal_peer_decref(peer); - } - } - return rc; -} - -static int -ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) -{ - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; - int rc; - int i; - int j; - struct list_head *ptmp; - struct ksock_peer *peer; - struct list_head *rtmp; - struct ksock_route *route; - - if (!ipaddress || !netmask) - return -EINVAL; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - iface = ksocknal_ip2iface(ni, ipaddress); - if (iface) { - /* silently ignore dups */ - rc = 0; - } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { - rc = -ENOSPC; - } else { - iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; - - iface->ksni_ipaddr = ipaddress; - iface->ksni_netmask = netmask; - iface->ksni_nroutes = 0; - iface->ksni_npeers = 0; - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, - ksnp_list); - - for (j = 0; j < peer->ksnp_n_passive_ips; j++) - if (peer->ksnp_passive_ips[j] == ipaddress) - iface->ksni_npeers++; - - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - if (route->ksnr_myipaddr == ipaddress) - iface->ksni_nroutes++; - } - } - } - - rc = 0; - /* - * NB only new connections will pay attention to the - * new interface! - */ - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -static void -ksocknal_peer_del_interface_locked(struct ksock_peer *peer, __u32 ipaddr) -{ - struct list_head *tmp; - struct list_head *nxt; - struct ksock_route *route; - struct ksock_conn *conn; - int i; - int j; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) - if (peer->ksnp_passive_ips[i] == ipaddr) { - for (j = i + 1; j < peer->ksnp_n_passive_ips; j++) - peer->ksnp_passive_ips[j - 1] = - peer->ksnp_passive_ips[j]; - peer->ksnp_n_passive_ips--; - break; - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route->ksnr_myipaddr != ipaddr) - continue; - - if (route->ksnr_share_count) { - /* Manually created; keep, but unbind */ - route->ksnr_myipaddr = 0; - } else { - ksocknal_del_route_locked(route); - } - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_myipaddr == ipaddr) - ksocknal_close_conn_locked(conn, 0); - } -} - -static int -ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) -{ - struct ksock_net *net = ni->ni_data; - int rc = -ENOENT; - struct list_head *tmp; - struct list_head *nxt; - struct ksock_peer *peer; - __u32 this_ip; - int i; - int j; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - this_ip = net->ksnn_interfaces[i].ksni_ipaddr; - - if (!(!ipaddress || ipaddress == this_ip)) - continue; - - rc = 0; - - for (j = i + 1; j < net->ksnn_ninterfaces; j++) - net->ksnn_interfaces[j - 1] = - net->ksnn_interfaces[j]; - - net->ksnn_ninterfaces--; - - for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { - list_for_each_safe(tmp, nxt, - &ksocknal_data.ksnd_peers[j]) { - peer = list_entry(tmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - ksocknal_peer_del_interface_locked(peer, this_ip); - } - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -int -ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) -{ - struct lnet_process_id id = {0}; - struct libcfs_ioctl_data *data = arg; - int rc; - - switch (cmd) { - case IOC_LIBCFS_GET_INTERFACE: { - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; - - read_lock(&ksocknal_data.ksnd_global_lock); - - if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { - rc = -ENOENT; - } else { - rc = 0; - iface = &net->ksnn_interfaces[data->ioc_count]; - - data->ioc_u32[0] = iface->ksni_ipaddr; - data->ioc_u32[1] = iface->ksni_netmask; - data->ioc_u32[2] = iface->ksni_npeers; - data->ioc_u32[3] = iface->ksni_nroutes; - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - return rc; - } - - case IOC_LIBCFS_ADD_INTERFACE: - return ksocknal_add_interface(ni, - data->ioc_u32[0], /* IP address */ - data->ioc_u32[1]); /* net mask */ - - case IOC_LIBCFS_DEL_INTERFACE: - return ksocknal_del_interface(ni, - data->ioc_u32[0]); /* IP address */ - - case IOC_LIBCFS_GET_PEER: { - __u32 myip = 0; - __u32 ip = 0; - int port = 0; - int conn_count = 0; - int share_count = 0; - - rc = ksocknal_get_peer_info(ni, data->ioc_count, - &id, &myip, &ip, &port, - &conn_count, &share_count); - if (rc) - return rc; - - data->ioc_nid = id.nid; - data->ioc_count = share_count; - data->ioc_u32[0] = ip; - data->ioc_u32[1] = port; - data->ioc_u32[2] = myip; - data->ioc_u32[3] = conn_count; - data->ioc_u32[4] = id.pid; - return 0; - } - - case IOC_LIBCFS_ADD_PEER: - id.nid = data->ioc_nid; - id.pid = LNET_PID_LUSTRE; - return ksocknal_add_peer(ni, id, - data->ioc_u32[0], /* IP */ - data->ioc_u32[1]); /* port */ - - case IOC_LIBCFS_DEL_PEER: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_del_peer(ni, id, - data->ioc_u32[0]); /* IP */ - - case IOC_LIBCFS_GET_CONN: { - int txmem; - int rxmem; - int nagle; - struct ksock_conn *conn; - - conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); - if (!conn) - return -ENOENT; - - ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); - - data->ioc_count = txmem; - data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; - data->ioc_flags = nagle; - data->ioc_u32[0] = conn->ksnc_ipaddr; - data->ioc_u32[1] = conn->ksnc_port; - data->ioc_u32[2] = conn->ksnc_myipaddr; - data->ioc_u32[3] = conn->ksnc_type; - data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; - data->ioc_u32[5] = rxmem; - data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; - ksocknal_conn_decref(conn); - return 0; - } - - case IOC_LIBCFS_CLOSE_CONNECTION: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_close_matching_conns(id, - data->ioc_u32[0]); - - case IOC_LIBCFS_REGISTER_MYNID: - /* Ignore if this is a noop */ - if (data->ioc_nid == ni->ni_nid) - return 0; - - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - return -EINVAL; - - case IOC_LIBCFS_PUSH_CONNECTION: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_push(ni, id); - - default: - return -EINVAL; - } - /* not reached */ -} - -static void -ksocknal_free_buffers(void) -{ - LASSERT(!atomic_read(&ksocknal_data.ksnd_nactive_txs)); - - if (ksocknal_data.ksnd_sched_info) { - struct ksock_sched_info *info; - int i; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) - kfree(info->ksi_scheds); - cfs_percpt_free(ksocknal_data.ksnd_sched_info); - } - - kvfree(ksocknal_data.ksnd_peers); - - spin_lock(&ksocknal_data.ksnd_tx_lock); - - if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - struct list_head zlist; - struct ksock_tx *tx; - struct ksock_tx *temp; - - list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); - list_del_init(&ksocknal_data.ksnd_idle_noop_txs); - spin_unlock(&ksocknal_data.ksnd_tx_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_list) { - list_del(&tx->tx_list); - kfree(tx); - } - } else { - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } -} - -static void -ksocknal_base_shutdown(void) -{ - struct ksock_sched_info *info; - struct ksock_sched *sched; - int i; - int j; - - LASSERT(!ksocknal_data.ksnd_nnets); - - switch (ksocknal_data.ksnd_init) { - default: - LASSERT(0); - /* fall through */ - case SOCKNAL_INIT_ALL: - case SOCKNAL_INIT_DATA: - LASSERT(ksocknal_data.ksnd_peers); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) - LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); - - LASSERT(list_empty(&ksocknal_data.ksnd_nets)); - LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); - LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); - LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); - LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); - - if (ksocknal_data.ksnd_sched_info) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (!info->ksi_scheds) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; - LASSERT(list_empty( - &sched->kss_tx_conns)); - LASSERT(list_empty( - &sched->kss_rx_conns)); - LASSERT(list_empty( - &sched->kss_zombie_noop_txs)); - LASSERT(!sched->kss_nconns); - } - } - } - - /* flag threads to terminate; wake and wait for them to die */ - ksocknal_data.ksnd_shuttingdown = 1; - wake_up_all(&ksocknal_data.ksnd_connd_waitq); - wake_up_all(&ksocknal_data.ksnd_reaper_waitq); - - if (ksocknal_data.ksnd_sched_info) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (!info->ksi_scheds) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; - wake_up_all(&sched->kss_waitq); - } - } - } - - i = 4; - read_lock(&ksocknal_data.ksnd_global_lock); - while (ksocknal_data.ksnd_nthreads) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d threads to terminate\n", - ksocknal_data.ksnd_nthreads); - read_unlock(&ksocknal_data.ksnd_global_lock); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - read_lock(&ksocknal_data.ksnd_global_lock); - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - ksocknal_free_buffers(); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; - break; - } - - module_put(THIS_MODULE); -} - -static __u64 -ksocknal_new_incarnation(void) -{ - /* The incarnation number is the time this module loaded and it - * identifies this particular instance of the socknal. - */ - return ktime_get_ns(); -} - -static int -ksocknal_base_startup(void) -{ - struct ksock_sched_info *info; - int rc; - int i; - - LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); - LASSERT(!ksocknal_data.ksnd_nnets); - - memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ - - ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; - ksocknal_data.ksnd_peers = kvmalloc_array(ksocknal_data.ksnd_peer_hash_size, - sizeof(struct list_head), - GFP_KERNEL); - if (!ksocknal_data.ksnd_peers) - return -ENOMEM; - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) - INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); - - rwlock_init(&ksocknal_data.ksnd_global_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); - - spin_lock_init(&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); - INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); - INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); - init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - - spin_lock_init(&ksocknal_data.ksnd_connd_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); - init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); - - spin_lock_init(&ksocknal_data.ksnd_tx_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); - - /* NB memset above zeros whole of ksocknal_data */ - - /* flag lists/ptrs/locks initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - try_module_get(THIS_MODULE); - - ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*info)); - if (!ksocknal_data.ksnd_sched_info) - goto failed; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - struct ksock_sched *sched; - int nthrs; - - nthrs = cfs_cpt_weight(lnet_cpt_table(), i); - if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); - } else { - /* - * max to half of CPUs, assume another half should be - * reserved for upper layer modules - */ - nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); - } - - info->ksi_nthreads_max = nthrs; - info->ksi_cpt = i; - - info->ksi_scheds = kzalloc_cpt(info->ksi_nthreads_max * sizeof(*sched), - GFP_NOFS, i); - if (!info->ksi_scheds) - goto failed; - - for (; nthrs > 0; nthrs--) { - sched = &info->ksi_scheds[nthrs - 1]; - - sched->kss_info = info; - spin_lock_init(&sched->kss_lock); - INIT_LIST_HEAD(&sched->kss_rx_conns); - INIT_LIST_HEAD(&sched->kss_tx_conns); - INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); - init_waitqueue_head(&sched->kss_waitq); - } - } - - ksocknal_data.ksnd_connd_starting = 0; - ksocknal_data.ksnd_connd_failed_stamp = 0; - ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); - /* - * must have at least 2 connds to remain responsive to accepts while - * connecting - */ - if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) - *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; - - if (*ksocknal_tunables.ksnd_nconnds_max < - *ksocknal_tunables.ksnd_nconnds) { - ksocknal_tunables.ksnd_nconnds_max = - ksocknal_tunables.ksnd_nconnds; - } - - for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { - char name[16]; - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - ksocknal_data.ksnd_connd_starting++; - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - - snprintf(name, sizeof(name), "socknal_cd%02d", i); - rc = ksocknal_thread_start(ksocknal_connd, - (void *)((uintptr_t)i), name); - if (rc) { - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - ksocknal_data.ksnd_connd_starting--; - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - CERROR("Can't spawn socknal connd: %d\n", rc); - goto failed; - } - } - - rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); - if (rc) { - CERROR("Can't spawn socknal reaper: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - - return 0; - - failed: - ksocknal_base_shutdown(); - return -ENETDOWN; -} - -static void -ksocknal_debug_peerhash(struct lnet_ni *ni) -{ - struct ksock_peer *peer = NULL; - struct list_head *tmp; - int i; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(tmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni == ni) - break; - - peer = NULL; - } - } - - if (peer) { - struct ksock_route *route; - struct ksock_conn *conn; - - CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", - libcfs_id2str(peer->ksnp_id), - atomic_read(&peer->ksnp_refcount), - peer->ksnp_sharecount, peer->ksnp_closing, - peer->ksnp_accepting, peer->ksnp_error, - peer->ksnp_zc_next_cookie, - !list_empty(&peer->ksnp_tx_queue), - !list_empty(&peer->ksnp_zc_req_list)); - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n", - atomic_read(&route->ksnr_refcount), - route->ksnr_scheduled, route->ksnr_connecting, - route->ksnr_connected, route->ksnr_deleted); - } - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - CWARN("Conn: ref %d, sref %d, t %d, c %d\n", - atomic_read(&conn->ksnc_conn_refcount), - atomic_read(&conn->ksnc_sock_refcount), - conn->ksnc_type, conn->ksnc_closing); - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -void -ksocknal_shutdown(struct lnet_ni *ni) -{ - struct ksock_net *net = ni->ni_data; - int i; - struct lnet_process_id anyid = {0}; - - anyid.nid = LNET_NID_ANY; - anyid.pid = LNET_PID_ANY; - - LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); - LASSERT(ksocknal_data.ksnd_nnets > 0); - - spin_lock_bh(&net->ksnn_lock); - net->ksnn_shutdown = 1; /* prevent new peers */ - spin_unlock_bh(&net->ksnn_lock); - - /* Delete all peers */ - ksocknal_del_peer(ni, anyid, 0); - - /* Wait for all peer state to clean up */ - i = 2; - spin_lock_bh(&net->ksnn_lock); - while (net->ksnn_npeers) { - spin_unlock_bh(&net->ksnn_lock); - - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect\n", - net->ksnn_npeers); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - - ksocknal_debug_peerhash(ni); - - spin_lock_bh(&net->ksnn_lock); - } - spin_unlock_bh(&net->ksnn_lock); - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(!net->ksnn_interfaces[i].ksni_npeers); - LASSERT(!net->ksnn_interfaces[i].ksni_nroutes); - } - - list_del(&net->ksnn_list); - kfree(net); - - ksocknal_data.ksnd_nnets--; - if (!ksocknal_data.ksnd_nnets) - ksocknal_base_shutdown(); -} - -static int -ksocknal_enumerate_interfaces(struct ksock_net *net) -{ - char **names; - int i; - int j; - int rc; - int n; - - n = lnet_ipif_enumerate(&names); - if (n <= 0) { - CERROR("Can't enumerate interfaces: %d\n", n); - return n; - } - - for (i = j = 0; i < n; i++) { - int up; - __u32 ip; - __u32 mask; - - if (!strcmp(names[i], "lo")) /* skip the loopback IF */ - continue; - - rc = lnet_ipif_query(names[i], &up, &ip, &mask); - if (rc) { - CWARN("Can't get interface %s info: %d\n", - names[i], rc); - continue; - } - - if (!up) { - CWARN("Ignoring interface %s (down)\n", - names[i]); - continue; - } - - if (j == LNET_MAX_INTERFACES) { - CWARN("Ignoring interface %s (too many interfaces)\n", - names[i]); - continue; - } - - net->ksnn_interfaces[j].ksni_ipaddr = ip; - net->ksnn_interfaces[j].ksni_netmask = mask; - strlcpy(net->ksnn_interfaces[j].ksni_name, - names[i], sizeof(net->ksnn_interfaces[j].ksni_name)); - j++; - } - - lnet_ipif_free_enumeration(names, n); - - if (!j) - CERROR("Can't find any usable interfaces\n"); - - return j; -} - -static int -ksocknal_search_new_ipif(struct ksock_net *net) -{ - int new_ipif = 0; - int i; - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; - char *colon = strchr(ifnam, ':'); - int found = 0; - struct ksock_net *tmp; - int j; - - if (colon) /* ignore alias device */ - *colon = 0; - - list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) { - for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { - char *ifnam2 = - &tmp->ksnn_interfaces[j].ksni_name[0]; - char *colon2 = strchr(ifnam2, ':'); - - if (colon2) - *colon2 = 0; - - found = !strcmp(ifnam, ifnam2); - if (colon2) - *colon2 = ':'; - } - if (found) - break; - } - - new_ipif += !found; - if (colon) - *colon = ':'; - } - - return new_ipif; -} - -static int -ksocknal_start_schedulers(struct ksock_sched_info *info) -{ - int nthrs; - int rc = 0; - int i; - - if (!info->ksi_nthreads) { - if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = info->ksi_nthreads_max; - } else { - nthrs = cfs_cpt_weight(lnet_cpt_table(), - info->ksi_cpt); - nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); - nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); - } - nthrs = min(nthrs, info->ksi_nthreads_max); - } else { - LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); - /* increase two threads if there is new interface */ - nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); - } - - for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - struct ksock_sched *sched; - - id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; - snprintf(name, sizeof(name), "socknal_sd%02d_%02d", - info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); - - rc = ksocknal_thread_start(ksocknal_scheduler, - (void *)id, name); - if (!rc) - continue; - - CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - info->ksi_cpt, info->ksi_nthreads + i, rc); - break; - } - - info->ksi_nthreads += i; - return rc; -} - -static int -ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) -{ - int newif = ksocknal_search_new_ipif(net); - int rc; - int i; - - LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); - - for (i = 0; i < ncpts; i++) { - struct ksock_sched_info *info; - int cpt = !cpts ? i : cpts[i]; - - LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); - info = ksocknal_data.ksnd_sched_info[cpt]; - - if (!newif && info->ksi_nthreads > 0) - continue; - - rc = ksocknal_start_schedulers(info); - if (rc) - return rc; - } - return 0; -} - -int -ksocknal_startup(struct lnet_ni *ni) -{ - struct ksock_net *net; - int rc; - int i; - - LASSERT(ni->ni_lnd == &the_ksocklnd); - - if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { - rc = ksocknal_base_startup(); - if (rc) - return rc; - } - - net = kzalloc(sizeof(*net), GFP_NOFS); - if (!net) - goto fail_0; - - spin_lock_init(&net->ksnn_lock); - net->ksnn_incarnation = ksocknal_new_incarnation(); - ni->ni_data = net; - ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; - ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; - ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; - ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; - - if (!ni->ni_interfaces[0]) { - rc = ksocknal_enumerate_interfaces(net); - if (rc <= 0) - goto fail_1; - - net->ksnn_ninterfaces = 1; - } else { - for (i = 0; i < LNET_MAX_INTERFACES; i++) { - int up; - - if (!ni->ni_interfaces[i]) - break; - - rc = lnet_ipif_query(ni->ni_interfaces[i], &up, - &net->ksnn_interfaces[i].ksni_ipaddr, - &net->ksnn_interfaces[i].ksni_netmask); - - if (rc) { - CERROR("Can't get interface %s info: %d\n", - ni->ni_interfaces[i], rc); - goto fail_1; - } - - if (!up) { - CERROR("Interface %s is down\n", - ni->ni_interfaces[i]); - goto fail_1; - } - - strlcpy(net->ksnn_interfaces[i].ksni_name, - ni->ni_interfaces[i], - sizeof(net->ksnn_interfaces[i].ksni_name)); - } - net->ksnn_ninterfaces = i; - } - - /* call it before add it to ksocknal_data.ksnd_nets */ - rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); - if (rc) - goto fail_1; - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), - net->ksnn_interfaces[0].ksni_ipaddr); - list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); - - ksocknal_data.ksnd_nnets++; - - return 0; - - fail_1: - kfree(net); - fail_0: - if (!ksocknal_data.ksnd_nnets) - ksocknal_base_shutdown(); - - return -ENETDOWN; -} - -static void __exit ksocklnd_exit(void) -{ - lnet_unregister_lnd(&the_ksocklnd); -} - -static int __init ksocklnd_init(void) -{ - int rc; - - /* check ksnr_connected/connecting field large enough */ - BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4); - BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN); - - /* initialize the_ksocklnd */ - the_ksocklnd.lnd_type = SOCKLND; - the_ksocklnd.lnd_startup = ksocknal_startup; - the_ksocklnd.lnd_shutdown = ksocknal_shutdown; - the_ksocklnd.lnd_ctl = ksocknal_ctl; - the_ksocklnd.lnd_send = ksocknal_send; - the_ksocklnd.lnd_recv = ksocknal_recv; - the_ksocklnd.lnd_notify = ksocknal_notify; - the_ksocklnd.lnd_query = ksocknal_query; - the_ksocklnd.lnd_accept = ksocknal_accept; - - rc = ksocknal_tunables_init(); - if (rc) - return rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - lnet_register_lnd(&the_ksocklnd); - - return 0; -} - -MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>"); -MODULE_DESCRIPTION("TCP Socket LNet Network Driver"); -MODULE_VERSION("2.7.0"); -MODULE_LICENSE("GPL"); - -module_init(ksocklnd_init); -module_exit(ksocklnd_exit); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h deleted file mode 100644 index 4e5c89a..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h +++ /dev/null @@ -1,704 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Zach Brown <zab@zabbo.net> - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Eric Barton <eric@bartonsoftware.com> - * - * This file is part of Lustre, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _SOCKLND_SOCKLND_H_ -#define _SOCKLND_SOCKLND_H_ - -#define DEBUG_PORTAL_ALLOC -#define DEBUG_SUBSYSTEM S_LND - -#include <linux/crc32.h> -#include <linux/errno.h> -#include <linux/if.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/sysctl.h> -#include <linux/uio.h> -#include <linux/unistd.h> -#include <asm/irq.h> -#include <net/sock.h> -#include <net/tcp.h> - -#include <linux/lnet/lib-lnet.h> -#include <linux/lnet/socklnd.h> - -/* assume one thread for each connection type */ -#define SOCKNAL_NSCHEDS 3 -#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) - -#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ -#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */ - -#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ -#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ - -#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ - -/* - * risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). - * no risk if we're not running on a CONFIG_HIGHMEM platform. - */ -#ifdef CONFIG_HIGHMEM -# define SOCKNAL_RISK_KMAP_DEADLOCK 0 -#else -# define SOCKNAL_RISK_KMAP_DEADLOCK 1 -#endif - -struct ksock_sched_info; - -struct ksock_sched { /* per scheduler state */ - spinlock_t kss_lock; /* serialise */ - struct list_head kss_rx_conns; /* conn waiting to be read */ - struct list_head kss_tx_conns; /* conn waiting to be written */ - struct list_head kss_zombie_noop_txs; /* zombie noop tx list */ - wait_queue_head_t kss_waitq; /* where scheduler sleeps */ - int kss_nconns; /* # connections assigned to - * this scheduler - */ - struct ksock_sched_info *kss_info; /* owner of it */ -}; - -struct ksock_sched_info { - int ksi_nthreads_max; /* max allowed threads */ - int ksi_nthreads; /* number of threads */ - int ksi_cpt; /* CPT id */ - struct ksock_sched *ksi_scheds; /* array of schedulers */ -}; - -#define KSOCK_CPT_SHIFT 16 -#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) -#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) -#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) - -struct ksock_interface { /* in-use interface */ - __u32 ksni_ipaddr; /* interface's IP address */ - __u32 ksni_netmask; /* interface's network mask */ - int ksni_nroutes; /* # routes using (active) */ - int ksni_npeers; /* # peers using (passive) */ - char ksni_name[IFNAMSIZ]; /* interface name */ -}; - -struct ksock_tunables { - int *ksnd_timeout; /* "stuck" socket timeout - * (seconds) - */ - int *ksnd_nscheds; /* # scheduler threads in each - * pool while starting - */ - int *ksnd_nconnds; /* # connection daemons */ - int *ksnd_nconnds_max; /* max # connection daemons */ - int *ksnd_min_reconnectms; /* first connection retry after - * (ms)... - */ - int *ksnd_max_reconnectms; /* ...exponentially increasing to - * this - */ - int *ksnd_eager_ack; /* make TCP ack eagerly? */ - int *ksnd_typed_conns; /* drive sockets by type? */ - int *ksnd_min_bulk; /* smallest "large" message */ - int *ksnd_tx_buffer_size; /* socket tx buffer size */ - int *ksnd_rx_buffer_size; /* socket rx buffer size */ - int *ksnd_nagle; /* enable NAGLE? */ - int *ksnd_round_robin; /* round robin for multiple - * interfaces - */ - int *ksnd_keepalive; /* # secs for sending keepalive - * NOOP - */ - int *ksnd_keepalive_idle; /* # idle secs before 1st probe - */ - int *ksnd_keepalive_count; /* # probes */ - int *ksnd_keepalive_intvl; /* time between probes */ - int *ksnd_credits; /* # concurrent sends */ - int *ksnd_peertxcredits; /* # concurrent sends to 1 peer - */ - int *ksnd_peerrtrcredits; /* # per-peer router buffer - * credits - */ - int *ksnd_peertimeout; /* seconds to consider peer dead - */ - int *ksnd_enable_csum; /* enable check sum */ - int *ksnd_inject_csum_error; /* set non-zero to inject - * checksum error - */ - int *ksnd_nonblk_zcack; /* always send zc-ack on - * non-blocking connection - */ - unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload - * size - */ - int *ksnd_zc_recv; /* enable ZC receive (for - * Chelsio TOE) - */ - int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to - * enable ZC receive - */ -}; - -struct ksock_net { - __u64 ksnn_incarnation; /* my epoch */ - spinlock_t ksnn_lock; /* serialise */ - struct list_head ksnn_list; /* chain on global list */ - int ksnn_npeers; /* # peers */ - int ksnn_shutdown; /* shutting down? */ - int ksnn_ninterfaces; /* IP interfaces */ - struct ksock_interface ksnn_interfaces[LNET_MAX_INTERFACES]; -}; - -/** connd timeout */ -#define SOCKNAL_CONND_TIMEOUT 120 -/** reserved thread for accepting & creating new connd */ -#define SOCKNAL_CONND_RESV 1 - -struct ksock_nal_data { - int ksnd_init; /* initialisation state - */ - int ksnd_nnets; /* # networks set up */ - struct list_head ksnd_nets; /* list of nets */ - rwlock_t ksnd_global_lock; /* stabilize peer/conn - * ops - */ - struct list_head *ksnd_peers; /* hash table of all my - * known peers - */ - int ksnd_peer_hash_size; /* size of ksnd_peers */ - - int ksnd_nthreads; /* # live threads */ - int ksnd_shuttingdown; /* tell threads to exit - */ - struct ksock_sched_info **ksnd_sched_info; /* schedulers info */ - - atomic_t ksnd_nactive_txs; /* #active txs */ - - struct list_head ksnd_deathrow_conns; /* conns to close: - * reaper_lock - */ - struct list_head ksnd_zombie_conns; /* conns to free: - * reaper_lock - */ - struct list_head ksnd_enomem_conns; /* conns to retry: - * reaper_lock - */ - wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ - unsigned long ksnd_reaper_waketime; /* when reaper will wake - */ - spinlock_t ksnd_reaper_lock; /* serialise */ - - int ksnd_enomem_tx; /* test ENOMEM sender */ - int ksnd_stall_tx; /* test sluggish sender - */ - int ksnd_stall_rx; /* test sluggish - * receiver - */ - struct list_head ksnd_connd_connreqs; /* incoming connection - * requests - */ - struct list_head ksnd_connd_routes; /* routes waiting to be - * connected - */ - wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ - int ksnd_connd_connecting; /* # connds connecting - */ - time64_t ksnd_connd_failed_stamp;/* time stamp of the - * last failed - * connecting attempt - */ - time64_t ksnd_connd_starting_stamp;/* time stamp of the - * last starting connd - */ - unsigned int ksnd_connd_starting; /* # starting connd */ - unsigned int ksnd_connd_running; /* # running connd */ - spinlock_t ksnd_connd_lock; /* serialise */ - - struct list_head ksnd_idle_noop_txs; /* list head for freed - * noop tx - */ - spinlock_t ksnd_tx_lock; /* serialise, g_lock - * unsafe - */ -}; - -#define SOCKNAL_INIT_NOTHING 0 -#define SOCKNAL_INIT_DATA 1 -#define SOCKNAL_INIT_ALL 2 - -/* - * A packet just assembled for transmission is represented by 1 or more - * struct iovec fragments (the first frag contains the portals header), - * followed by 0 or more struct bio_vec fragments. - * - * On the receive side, initially 1 struct iovec fragment is posted for - * receive (the header). Once the header has been received, the payload is - * received into either struct iovec or struct bio_vec fragments, depending on - * what the header matched or whether the message needs forwarding. - */ -struct ksock_conn; /* forward ref */ -struct ksock_peer; /* forward ref */ -struct ksock_route; /* forward ref */ -struct ksock_proto; /* forward ref */ - -struct ksock_tx { /* transmit packet */ - struct list_head tx_list; /* queue on conn for transmission etc - */ - struct list_head tx_zc_list; /* queue on peer for ZC request */ - atomic_t tx_refcount; /* tx reference count */ - int tx_nob; /* # packet bytes */ - int tx_resid; /* residual bytes */ - int tx_niov; /* # packet iovec frags */ - struct kvec *tx_iov; /* packet iovec frags */ - int tx_nkiov; /* # packet page frags */ - unsigned short tx_zc_aborted; /* aborted ZC request */ - unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ - unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */ - unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ - struct bio_vec *tx_kiov; /* packet page frags */ - struct ksock_conn *tx_conn; /* owning conn */ - struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() - */ - unsigned long tx_deadline; /* when (in jiffies) tx times out */ - struct ksock_msg tx_msg; /* socklnd message buffer */ - int tx_desc_size; /* size of this descriptor */ - union { - struct { - struct kvec iov; /* virt hdr */ - struct bio_vec kiov[0]; /* paged payload */ - } paged; - struct { - struct kvec iov[1]; /* virt hdr + payload */ - } virt; - } tx_frags; -}; - -#define KSOCK_NOOP_TX_SIZE (offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) - -/* network zero copy callback descriptor embedded in struct ksock_tx */ - -#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ -#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ -#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ -#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ -#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ -#define SOCKNAL_RX_SLOP 6 /* skipping body */ - -struct ksock_conn { - struct ksock_peer *ksnc_peer; /* owning peer */ - struct ksock_route *ksnc_route; /* owning route */ - struct list_head ksnc_list; /* stash on peer's conn list */ - struct socket *ksnc_sock; /* actual socket */ - void *ksnc_saved_data_ready; /* socket's original - * data_ready() callback - */ - void *ksnc_saved_write_space; /* socket's original - * write_space() callback - */ - atomic_t ksnc_conn_refcount;/* conn refcount */ - atomic_t ksnc_sock_refcount;/* sock refcount */ - struct ksock_sched *ksnc_scheduler; /* who schedules this connection - */ - __u32 ksnc_myipaddr; /* my IP */ - __u32 ksnc_ipaddr; /* peer's IP */ - int ksnc_port; /* peer's port */ - signed int ksnc_type:3; /* type of connection, should be - * signed value - */ - unsigned int ksnc_closing:1; /* being shut down */ - unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ - unsigned int ksnc_zc_capable:1; /* enable to ZC */ - struct ksock_proto *ksnc_proto; /* protocol for the connection */ - - /* reader */ - struct list_head ksnc_rx_list; /* where I enq waiting input or a - * forwarding descriptor - */ - unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times - * out - */ - __u8 ksnc_rx_started; /* started receiving a message */ - __u8 ksnc_rx_ready; /* data ready to read */ - __u8 ksnc_rx_scheduled; /* being progressed */ - __u8 ksnc_rx_state; /* what is being read */ - int ksnc_rx_nob_left; /* # bytes to next hdr/body */ - struct iov_iter ksnc_rx_to; /* copy destination */ - struct kvec ksnc_rx_iov_space[LNET_MAX_IOV]; /* space for frag descriptors */ - __u32 ksnc_rx_csum; /* partial checksum for incoming - * data - */ - void *ksnc_cookie; /* rx lnet_finalize passthru arg - */ - struct ksock_msg ksnc_msg; /* incoming message buffer: - * V2.x message takes the - * whole struct - * V1.x message is a bare - * struct lnet_hdr, it's stored in - * ksnc_msg.ksm_u.lnetmsg - */ - /* WRITER */ - struct list_head ksnc_tx_list; /* where I enq waiting for output - * space - */ - struct list_head ksnc_tx_queue; /* packets waiting to be sent */ - struct ksock_tx *ksnc_tx_carrier; /* next TX that can carry a LNet - * message or ZC-ACK - */ - unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out - */ - int ksnc_tx_bufnob; /* send buffer marker */ - atomic_t ksnc_tx_nob; /* # bytes queued */ - int ksnc_tx_ready; /* write space */ - int ksnc_tx_scheduled; /* being progressed */ - unsigned long ksnc_tx_last_post; /* time stamp of the last posted - * TX - */ -}; - -struct ksock_route { - struct list_head ksnr_list; /* chain on peer route list */ - struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ - struct ksock_peer *ksnr_peer; /* owning peer */ - atomic_t ksnr_refcount; /* # users */ - unsigned long ksnr_timeout; /* when (in jiffies) reconnection - * can happen next - */ - long ksnr_retry_interval; /* how long between retries */ - __u32 ksnr_myipaddr; /* my IP */ - __u32 ksnr_ipaddr; /* IP address to connect to */ - int ksnr_port; /* port to connect to */ - unsigned int ksnr_scheduled:1; /* scheduled for attention */ - unsigned int ksnr_connecting:1; /* connection establishment in - * progress - */ - unsigned int ksnr_connected:4; /* connections established by - * type - */ - unsigned int ksnr_deleted:1; /* been removed from peer? */ - unsigned int ksnr_share_count; /* created explicitly? */ - int ksnr_conn_count; /* # conns established by this - * route - */ -}; - -#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ - -struct ksock_peer { - struct list_head ksnp_list; /* stash on global peer list */ - unsigned long ksnp_last_alive; /* when (in jiffies) I was last - * alive - */ - struct lnet_process_id ksnp_id; /* who's on the other end(s) */ - atomic_t ksnp_refcount; /* # users */ - int ksnp_sharecount; /* lconf usage counter */ - int ksnp_closing; /* being closed */ - int ksnp_accepting; /* # passive connections pending - */ - int ksnp_error; /* errno on closing last conn */ - __u64 ksnp_zc_next_cookie; /* ZC completion cookie */ - __u64 ksnp_incarnation; /* latest known peer incarnation - */ - struct ksock_proto *ksnp_proto; /* latest known peer protocol */ - struct list_head ksnp_conns; /* all active connections */ - struct list_head ksnp_routes; /* routes */ - struct list_head ksnp_tx_queue; /* waiting packets */ - spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ - struct list_head ksnp_zc_req_list; /* zero copy requests wait for - * ACK - */ - unsigned long ksnp_send_keepalive; /* time to send keepalive */ - struct lnet_ni *ksnp_ni; /* which network */ - int ksnp_n_passive_ips; /* # of... */ - - /* preferred local interfaces */ - __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; -}; - -struct ksock_connreq { - struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ - struct lnet_ni *ksncr_ni; /* chosen NI */ - struct socket *ksncr_sock; /* accepted socket */ -}; - -extern struct ksock_nal_data ksocknal_data; -extern struct ksock_tunables ksocknal_tunables; - -#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ -#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ -#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not - * preferred - */ - -struct ksock_proto { - /* version number of protocol */ - int pro_version; - - /* handshake function */ - int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); - - /* handshake function */ - int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int); - - /* message pack */ - void (*pro_pack)(struct ksock_tx *); - - /* message unpack */ - void (*pro_unpack)(struct ksock_msg *); - - /* queue tx on the connection */ - struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); - - /* queue ZC ack on the connection */ - int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); - - /* handle ZC request */ - int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); - - /* handle ZC ACK */ - int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); - - /* - * msg type matches the connection type: - * return value: - * return MATCH_NO : no - * return MATCH_YES : matching type - * return MATCH_MAY : can be backup - */ - int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); -}; - -extern struct ksock_proto ksocknal_protocol_v1x; -extern struct ksock_proto ksocknal_protocol_v2x; -extern struct ksock_proto ksocknal_protocol_v3x; - -#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR -#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR -#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR - -#ifndef CPU_MASK_NONE -#define CPU_MASK_NONE 0UL -#endif - -static inline int -ksocknal_route_mask(void) -{ - if (!*ksocknal_tunables.ksnd_typed_conns) - return (1 << SOCKLND_CONN_ANY); - - return ((1 << SOCKLND_CONN_CONTROL) | - (1 << SOCKLND_CONN_BULK_IN) | - (1 << SOCKLND_CONN_BULK_OUT)); -} - -static inline struct list_head * -ksocknal_nid2peerlist(lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; - - return &ksocknal_data.ksnd_peers[hash]; -} - -static inline void -ksocknal_conn_addref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - atomic_inc(&conn->ksnc_conn_refcount); -} - -void ksocknal_queue_zombie_conn(struct ksock_conn *conn); -void ksocknal_finalize_zcreq(struct ksock_conn *conn); - -static inline void -ksocknal_conn_decref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) - ksocknal_queue_zombie_conn(conn); -} - -static inline int -ksocknal_connsock_addref(struct ksock_conn *conn) -{ - int rc = -ESHUTDOWN; - - read_lock(&ksocknal_data.ksnd_global_lock); - if (!conn->ksnc_closing) { - LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); - atomic_inc(&conn->ksnc_sock_refcount); - rc = 0; - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -static inline void -ksocknal_connsock_decref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); - if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { - LASSERT(conn->ksnc_closing); - sock_release(conn->ksnc_sock); - conn->ksnc_sock = NULL; - ksocknal_finalize_zcreq(conn); - } -} - -static inline void -ksocknal_tx_addref(struct ksock_tx *tx) -{ - LASSERT(atomic_read(&tx->tx_refcount) > 0); - atomic_inc(&tx->tx_refcount); -} - -void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); -void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx); - -static inline void -ksocknal_tx_decref(struct ksock_tx *tx) -{ - LASSERT(atomic_read(&tx->tx_refcount) > 0); - if (atomic_dec_and_test(&tx->tx_refcount)) - ksocknal_tx_done(NULL, tx); -} - -static inline void -ksocknal_route_addref(struct ksock_route *route) -{ - LASSERT(atomic_read(&route->ksnr_refcount) > 0); - atomic_inc(&route->ksnr_refcount); -} - -void ksocknal_destroy_route(struct ksock_route *route); - -static inline void -ksocknal_route_decref(struct ksock_route *route) -{ - LASSERT(atomic_read(&route->ksnr_refcount) > 0); - if (atomic_dec_and_test(&route->ksnr_refcount)) - ksocknal_destroy_route(route); -} - -static inline void -ksocknal_peer_addref(struct ksock_peer *peer) -{ - LASSERT(atomic_read(&peer->ksnp_refcount) > 0); - atomic_inc(&peer->ksnp_refcount); -} - -void ksocknal_destroy_peer(struct ksock_peer *peer); - -static inline void -ksocknal_peer_decref(struct ksock_peer *peer) -{ - LASSERT(atomic_read(&peer->ksnp_refcount) > 0); - if (atomic_dec_and_test(&peer->ksnp_refcount)) - ksocknal_destroy_peer(peer); -} - -int ksocknal_startup(struct lnet_ni *ni); -void ksocknal_shutdown(struct lnet_ni *ni); -int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); -int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); -int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen); -int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); - -int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, - int port); -struct ksock_peer *ksocknal_find_peer_locked(struct lnet_ni *ni, - struct lnet_process_id id); -struct ksock_peer *ksocknal_find_peer(struct lnet_ni *ni, - struct lnet_process_id id); -void ksocknal_peer_failed(struct ksock_peer *peer); -int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, - struct socket *sock, int type); -void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); -void ksocknal_terminate_conn(struct ksock_conn *conn); -void ksocknal_destroy_conn(struct ksock_conn *conn); -int ksocknal_close_peer_conns_locked(struct ksock_peer *peer, - __u32 ipaddr, int why); -int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); -int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); -struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer, - struct ksock_tx *tx, int nonblk); - -int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, - struct lnet_process_id id); -struct ksock_tx *ksocknal_alloc_tx(int type, int size); -void ksocknal_free_tx(struct ksock_tx *tx); -struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); -void ksocknal_next_tx_carrier(struct ksock_conn *conn); -void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); -void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error); -void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); -void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); -int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); -void ksocknal_thread_fini(void); -void ksocknal_launch_all_connections_locked(struct ksock_peer *peer); -struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer *peer); -struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer *peer); -int ksocknal_new_packet(struct ksock_conn *conn, int skip); -int ksocknal_scheduler(void *arg); -int ksocknal_connd(void *arg); -int ksocknal_reaper(void *arg); -int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, - lnet_nid_t peer_nid, struct ksock_hello_msg *hello); -int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, - struct ksock_hello_msg *hello, - struct lnet_process_id *id, - __u64 *incarnation); -void ksocknal_read_callback(struct ksock_conn *conn); -void ksocknal_write_callback(struct ksock_conn *conn); - -int ksocknal_lib_zc_capable(struct ksock_conn *conn); -void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_push_conn(struct ksock_conn *conn); -int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); -int ksocknal_lib_setup_sock(struct socket *so); -int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx); -int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx); -void ksocknal_lib_eager_ack(struct ksock_conn *conn); -int ksocknal_lib_recv(struct ksock_conn *conn); -int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, - int *rxmem, int *nagle); - -void ksocknal_read_callback(struct ksock_conn *conn); -void ksocknal_write_callback(struct ksock_conn *conn); - -int ksocknal_tunables_init(void); - -void ksocknal_lib_csum_tx(struct ksock_tx *tx); - -int ksocknal_lib_memory_pressure(struct ksock_conn *conn); -int ksocknal_lib_bind_thread_to_cpu(int id); - -#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c deleted file mode 100644 index 01b31a6..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ /dev/null @@ -1,2586 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Zach Brown <zab@zabbo.net> - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Eric Barton <eric@bartonsoftware.com> - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include <linux/sched/mm.h> -#include "socklnd.h" - -struct ksock_tx * -ksocknal_alloc_tx(int type, int size) -{ - struct ksock_tx *tx = NULL; - - if (type == KSOCK_MSG_NOOP) { - LASSERT(size == KSOCK_NOOP_TX_SIZE); - - /* searching for a noop tx in free list */ - spin_lock(&ksocknal_data.ksnd_tx_lock); - - if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, - struct ksock_tx, tx_list); - LASSERT(tx->tx_desc_size == size); - list_del(&tx->tx_list); - } - - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } - - if (!tx) - tx = kzalloc(size, GFP_NOFS); - - if (!tx) - return NULL; - - atomic_set(&tx->tx_refcount, 1); - tx->tx_zc_aborted = 0; - tx->tx_zc_capable = 0; - tx->tx_zc_checked = 0; - tx->tx_desc_size = size; - - atomic_inc(&ksocknal_data.ksnd_nactive_txs); - - return tx; -} - -struct ksock_tx * -ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) -{ - struct ksock_tx *tx; - - tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); - if (!tx) { - CERROR("Can't allocate noop tx desc\n"); - return NULL; - } - - tx->tx_conn = NULL; - tx->tx_lnetmsg = NULL; - tx->tx_kiov = NULL; - tx->tx_nkiov = 0; - tx->tx_iov = tx->tx_frags.virt.iov; - tx->tx_niov = 1; - tx->tx_nonblk = nonblk; - - tx->tx_msg.ksm_csum = 0; - tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_msg.ksm_zc_cookies[1] = cookie; - - return tx; -} - -void -ksocknal_free_tx(struct ksock_tx *tx) -{ - atomic_dec(&ksocknal_data.ksnd_nactive_txs); - - if (!tx->tx_lnetmsg && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { - /* it's a noop tx */ - spin_lock(&ksocknal_data.ksnd_tx_lock); - - list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); - - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } else { - kfree(tx); - } -} - -static int -ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct kvec *iov = tx->tx_iov; - int nob; - int rc; - - LASSERT(tx->tx_niov > 0); - - /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ - rc = ksocknal_lib_send_iov(conn, tx); - - if (rc <= 0) /* sent nothing? */ - return rc; - - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; - - /* "consume" iov */ - do { - LASSERT(tx->tx_niov > 0); - - if (nob < (int)iov->iov_len) { - iov->iov_base = (void *)((char *)iov->iov_base + nob); - iov->iov_len -= nob; - return rc; - } - - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob); - - return rc; -} - -static int -ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct bio_vec *kiov = tx->tx_kiov; - int nob; - int rc; - - LASSERT(!tx->tx_niov); - LASSERT(tx->tx_nkiov > 0); - - /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ - rc = ksocknal_lib_send_kiov(conn, tx); - - if (rc <= 0) /* sent nothing? */ - return rc; - - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; - - /* "consume" kiov */ - do { - LASSERT(tx->tx_nkiov > 0); - - if (nob < (int)kiov->bv_len) { - kiov->bv_offset += nob; - kiov->bv_len -= nob; - return rc; - } - - nob -= (int)kiov->bv_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob); - - return rc; -} - -static int -ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx) -{ - int rc; - int bufnob; - - if (ksocknal_data.ksnd_stall_tx) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ksocknal_data.ksnd_stall_tx * HZ); - } - - LASSERT(tx->tx_resid); - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } - - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov) { - rc = ksocknal_send_iov(conn, tx); - } else { - rc = ksocknal_send_kiov(conn, tx); - } - - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? */ - conn->ksnc_tx_bufnob += rc; /* account it */ - - if (bufnob < conn->ksnc_tx_bufnob) { - /* - * allocated send buffer bytes < computed; infer - * something got ACKed - */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_tx_bufnob = bufnob; - mb(); - } - - if (rc <= 0) { /* Didn't write anything? */ - - if (!rc) /* some stacks return 0 instead of -EAGAIN */ - rc = -EAGAIN; - - /* Check if EAGAIN is due to memory pressure */ - if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) - rc = -ENOMEM; - - break; - } - - /* socket's wmem_queued now includes 'rc' bytes */ - atomic_sub(rc, &conn->ksnc_tx_nob); - rc = 0; - - } while (tx->tx_resid); - - ksocknal_connsock_decref(conn); - return rc; -} - -static int -ksocknal_recv_iter(struct ksock_conn *conn) -{ - int nob; - int rc; - - /* - * Never touch conn->ksnc_rx_to or change connection - * status inside ksocknal_lib_recv - */ - rc = ksocknal_lib_recv(conn); - - if (rc <= 0) - return rc; - - /* received something... */ - nob = rc; - - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_rx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - mb(); /* order with setting rx_started */ - conn->ksnc_rx_started = 1; - - conn->ksnc_rx_nob_left -= nob; - - iov_iter_advance(&conn->ksnc_rx_to, nob); - if (iov_iter_count(&conn->ksnc_rx_to)) - return -EAGAIN; - - return 1; -} - -static int -ksocknal_receive(struct ksock_conn *conn) -{ - /* - * Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_to to determine - * progress/completion. - */ - int rc; - - if (ksocknal_data.ksnd_stall_rx) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ksocknal_data.ksnd_stall_rx * HZ); - } - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } - - for (;;) { - rc = ksocknal_recv_iter(conn); - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (!rc && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } - - /* Completed a fragment */ - - if (!iov_iter_count(&conn->ksnc_rx_to)) { - rc = 1; - break; - } - } - - ksocknal_connsock_decref(conn); - return rc; -} - -void -ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx) -{ - struct lnet_msg *lnetmsg = tx->tx_lnetmsg; - int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 0 : -EIO; - - LASSERT(ni || tx->tx_conn); - - if (tx->tx_conn) - ksocknal_conn_decref(tx->tx_conn); - - if (!ni && tx->tx_conn) - ni = tx->tx_conn->ksnc_peer->ksnp_ni; - - ksocknal_free_tx(tx); - if (lnetmsg) /* KSOCK_MSG_NOOP go without lnetmsg */ - lnet_finalize(ni, lnetmsg, rc); -} - -void -ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) -{ - struct ksock_tx *tx; - - while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct ksock_tx, tx_list); - - if (error && tx->tx_lnetmsg) { - CNETERR("Deleting packet type %d len %d %s->%s\n", - le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), - le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); - } else if (error) { - CNETERR("Deleting noop packet\n"); - } - - list_del(&tx->tx_list); - - LASSERT(atomic_read(&tx->tx_refcount) == 1); - ksocknal_tx_done(ni, tx); - } -} - -static void -ksocknal_check_zc_req(struct ksock_tx *tx) -{ - struct ksock_conn *conn = tx->tx_conn; - struct ksock_peer *peer = conn->ksnc_peer; - - /* - * Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx - * to ksnp_zc_req_list if some fragment of this message should be sent - * zero-copy. Our peer will send an ACK containing this cookie when - * she has received this message to tell us we can signal completion. - * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on - * ksnp_zc_req_list. - */ - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_zc_capable); - - tx->tx_zc_checked = 1; - - if (conn->ksnc_proto == &ksocknal_protocol_v1x || - !conn->ksnc_zc_capable) - return; - - /* - * assign cookie and queue tx to pending list, it will be released when - * a matching ack is received. See ksocknal_handle_zcack() - */ - ksocknal_tx_addref(tx); - - spin_lock(&peer->ksnp_lock); - - /* ZC_REQ is going to be pinned to the peer */ - tx->tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - LASSERT(!tx->tx_msg.ksm_zc_cookies[0]); - - tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; - - if (!peer->ksnp_zc_next_cookie) - peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; - - list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); - - spin_unlock(&peer->ksnp_lock); -} - -static void -ksocknal_uncheck_zc_req(struct ksock_tx *tx) -{ - struct ksock_peer *peer = tx->tx_conn->ksnc_peer; - - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_zc_capable); - - tx->tx_zc_checked = 0; - - spin_lock(&peer->ksnp_lock); - - if (!tx->tx_msg.ksm_zc_cookies[0]) { - /* Not waiting for an ACK */ - spin_unlock(&peer->ksnp_lock); - return; - } - - tx->tx_msg.ksm_zc_cookies[0] = 0; - list_del(&tx->tx_zc_list); - - spin_unlock(&peer->ksnp_lock); - - ksocknal_tx_decref(tx); -} - -static int -ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx) -{ - int rc; - - if (tx->tx_zc_capable && !tx->tx_zc_checked) - ksocknal_check_zc_req(tx); - - rc = ksocknal_transmit(conn, tx); - - CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); - - if (!tx->tx_resid) { - /* Sent everything OK */ - LASSERT(!rc); - - return 0; - } - - if (rc == -EAGAIN) - return rc; - - if (rc == -ENOMEM) { - static int counter; - - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%u ENOMEM tx %p\n", counter, conn); - - /* Queue on ksnd_enomem_conns for retry after a timeout */ - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - /* enomem list takes over scheduler's ref... */ - LASSERT(conn->ksnc_tx_scheduled); - list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY, - ksocknal_data.ksnd_reaper_waketime)) - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - return rc; - } - - /* Actual error */ - LASSERT(rc < 0); - - if (!conn->ksnc_closing) { - switch (rc) { - case -ECONNRESET: - LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", - &conn->ksnc_ipaddr); - break; - default: - LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", - &conn->ksnc_ipaddr, rc); - break; - } - CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", - conn, rc, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - } - - if (tx->tx_zc_checked) - ksocknal_uncheck_zc_req(tx); - - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings(conn, (conn->ksnc_closing) ? 0 : rc); - - return rc; -} - -static void -ksocknal_launch_connection_locked(struct ksock_route *route) -{ - /* called holding write lock on ksnd_global_lock */ - - LASSERT(!route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - LASSERT(ksocknal_route_mask() & ~route->ksnr_connected); - - route->ksnr_scheduled = 1; /* scheduling conn for connd */ - ksocknal_route_addref(route); /* extra ref for connd */ - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - - list_add_tail(&route->ksnr_connd_list, - &ksocknal_data.ksnd_connd_routes); - wake_up(&ksocknal_data.ksnd_connd_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); -} - -void -ksocknal_launch_all_connections_locked(struct ksock_peer *peer) -{ - struct ksock_route *route; - - /* called holding write lock on ksnd_global_lock */ - for (;;) { - /* launch any/all connections that need it */ - route = ksocknal_find_connectable_route_locked(peer); - if (!route) - return; - - ksocknal_launch_connection_locked(route); - } -} - -struct ksock_conn * -ksocknal_find_conn_locked(struct ksock_peer *peer, struct ksock_tx *tx, - int nonblk) -{ - struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_conn *typed = NULL; - struct ksock_conn *fallback = NULL; - int tnob = 0; - int fnob = 0; - - list_for_each(tmp, &peer->ksnp_conns) { - struct ksock_conn *c; - int nob, rc; - - c = list_entry(tmp, struct ksock_conn, ksnc_list); - nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; - - LASSERT(!c->ksnc_closing); - LASSERT(c->ksnc_proto && - c->ksnc_proto->pro_match_tx); - - rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); - - switch (rc) { - default: - LBUG(); - case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ - continue; - - case SOCKNAL_MATCH_YES: /* typed connection */ - if (!typed || tnob > nob || - (tnob == nob && *ksocknal_tunables.ksnd_round_robin && - time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { - typed = c; - tnob = nob; - } - break; - - case SOCKNAL_MATCH_MAY: /* fallback connection */ - if (!fallback || fnob > nob || - (fnob == nob && *ksocknal_tunables.ksnd_round_robin && - time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { - fallback = c; - fnob = nob; - } - break; - } - } - - /* prefer the typed selection */ - conn = (typed) ? typed : fallback; - - if (conn) - conn->ksnc_tx_last_post = jiffies; - - return conn; -} - -void -ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) -{ - conn->ksnc_proto->pro_pack(tx); - - atomic_add(tx->tx_nob, &conn->ksnc_tx_nob); - ksocknal_conn_addref(conn); /* +1 ref for tx */ - tx->tx_conn = conn; -} - -void -ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) -{ - struct ksock_sched *sched = conn->ksnc_scheduler; - struct ksock_msg *msg = &tx->tx_msg; - struct ksock_tx *ztx = NULL; - int bufnob = 0; - - /* - * called holding global lock (read or irq-write) and caller may - * not have dropped this lock between finding conn and calling me, - * so we don't need the {get,put}connsock dance to deref - * ksnc_sock... - */ - LASSERT(!conn->ksnc_closing); - - CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port); - - ksocknal_tx_prep(conn, tx); - - /* - * Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. - * - * We always expect at least 1 mapped fragment containing the - * complete ksocknal message header. - */ - LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) + - lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == - (unsigned int)tx->tx_nob); - LASSERT(tx->tx_niov >= 1); - LASSERT(tx->tx_resid == tx->tx_nob); - - CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", - tx, (tx->tx_lnetmsg) ? tx->tx_lnetmsg->msg_hdr.type : - KSOCK_MSG_NOOP, - tx->tx_nob, tx->tx_niov, tx->tx_nkiov); - - /* - * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ - * but they're used inside spinlocks a lot. - */ - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - spin_lock_bh(&sched->kss_lock); - - if (list_empty(&conn->ksnc_tx_queue) && !bufnob) { - /* First packet starts the timeout */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_tx_bufnob = 0; - mb(); /* order with adding to tx_queue */ - } - - if (msg->ksm_type == KSOCK_MSG_NOOP) { - /* - * The packet is noop ZC ACK, try to piggyback the ack_cookie - * on a normal packet so I don't need to send it - */ - LASSERT(msg->ksm_zc_cookies[1]); - LASSERT(conn->ksnc_proto->pro_queue_tx_zcack); - - /* ZC ACK piggybacked on ztx release tx later */ - if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) - ztx = tx; - } else { - /* - * It's a normal packet - can it piggback a noop zc-ack that - * has been queued already? - */ - LASSERT(!msg->ksm_zc_cookies[1]); - LASSERT(conn->ksnc_proto->pro_queue_tx_msg); - - ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); - /* ztx will be released later */ - } - - if (ztx) { - atomic_sub(ztx->tx_nob, &conn->ksnc_tx_nob); - list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); - } - - if (conn->ksnc_tx_ready && /* able to send */ - !conn->ksnc_tx_scheduled) { /* not scheduled to send */ - /* +1 ref for scheduler */ - ksocknal_conn_addref(conn); - list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); -} - -struct ksock_route * -ksocknal_find_connectable_route_locked(struct ksock_peer *peer) -{ - unsigned long now = jiffies; - struct list_head *tmp; - struct ksock_route *route; - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); - - /* connections being established */ - if (route->ksnr_scheduled) - continue; - - /* all route types connected ? */ - if (!(ksocknal_route_mask() & ~route->ksnr_connected)) - continue; - - if (!(!route->ksnr_retry_interval || /* first attempt */ - time_after_eq(now, route->ksnr_timeout))) { - CDEBUG(D_NET, - "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", - &route->ksnr_ipaddr, - route->ksnr_connected, - route->ksnr_retry_interval, - (route->ksnr_timeout - now) / HZ); - continue; - } - - return route; - } - - return NULL; -} - -struct ksock_route * -ksocknal_find_connecting_route_locked(struct ksock_peer *peer) -{ - struct list_head *tmp; - struct ksock_route *route; - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); - - if (route->ksnr_scheduled) - return route; - } - - return NULL; -} - -int -ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, - struct lnet_process_id id) -{ - struct ksock_peer *peer; - struct ksock_conn *conn; - rwlock_t *g_lock; - int retry; - int rc; - - LASSERT(!tx->tx_conn); - - g_lock = &ksocknal_data.ksnd_global_lock; - - for (retry = 0;; retry = 1) { - read_lock(g_lock); - peer = ksocknal_find_peer_locked(ni, id); - if (peer) { - if (!ksocknal_find_connectable_route_locked(peer)) { - conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); - if (conn) { - /* - * I've got no routes that need to be - * connecting and I do have an actual - * connection... - */ - ksocknal_queue_tx_locked(tx, conn); - read_unlock(g_lock); - return 0; - } - } - } - - /* I'll need a write lock... */ - read_unlock(g_lock); - - write_lock_bh(g_lock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) - break; - - write_unlock_bh(g_lock); - - if (id.pid & LNET_PID_USERFLAG) { - CERROR("Refusing to create a connection to userspace process %s\n", - libcfs_id2str(id)); - return -EHOSTUNREACH; - } - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_id2str(id)); - return -EHOSTUNREACH; - } - - rc = ksocknal_add_peer(ni, id, - LNET_NIDADDR(id.nid), - lnet_acceptor_port()); - if (rc) { - CERROR("Can't add peer %s: %d\n", - libcfs_id2str(id), rc); - return rc; - } - } - - ksocknal_launch_all_connections_locked(peer); - - conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); - if (conn) { - /* Connection exists; queue message on it */ - ksocknal_queue_tx_locked(tx, conn); - write_unlock_bh(g_lock); - return 0; - } - - if (peer->ksnp_accepting > 0 || - ksocknal_find_connecting_route_locked(peer)) { - /* the message is going to be pinned to the peer */ - tx->tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - /* Queue the message until a connection is established */ - list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue); - write_unlock_bh(g_lock); - return 0; - } - - write_unlock_bh(g_lock); - - /* NB Routes may be ignored if connections to them failed recently */ - CNETERR("No usable routes to %s\n", libcfs_id2str(id)); - return -EHOSTUNREACH; -} - -int -ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) -{ - unsigned int mpflag = 0; - int type = lntmsg->msg_type; - struct lnet_process_id target = lntmsg->msg_target; - unsigned int payload_niov = lntmsg->msg_niov; - struct kvec *payload_iov = lntmsg->msg_iov; - struct bio_vec *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - struct ksock_tx *tx; - int desc_size; - int rc; - - /* - * NB 'private' is different depending on what we're sending. - * Just ignore it... - */ - CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT(!payload_nob || payload_niov > 0); - LASSERT(payload_niov <= LNET_MAX_IOV); - /* payload is either all vaddrs or all pages */ - LASSERT(!(payload_kiov && payload_iov)); - LASSERT(!in_interrupt()); - - if (payload_iov) - desc_size = offsetof(struct ksock_tx, - tx_frags.virt.iov[1 + payload_niov]); - else - desc_size = offsetof(struct ksock_tx, - tx_frags.paged.kiov[payload_niov]); - - if (lntmsg->msg_vmflush) - mpflag = memalloc_noreclaim_save(); - tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); - if (!tx) { - CERROR("Can't allocate tx desc type %d size %d\n", - type, desc_size); - if (lntmsg->msg_vmflush) - memalloc_noreclaim_restore(mpflag); - return -ENOMEM; - } - - tx->tx_conn = NULL; /* set when assigned a conn */ - tx->tx_lnetmsg = lntmsg; - - if (payload_iov) { - tx->tx_kiov = NULL; - tx->tx_nkiov = 0; - tx->tx_iov = tx->tx_frags.virt.iov; - tx->tx_niov = 1 + - lnet_extract_iov(payload_niov, &tx->tx_iov[1], - payload_niov, payload_iov, - payload_offset, payload_nob); - } else { - tx->tx_niov = 1; - tx->tx_iov = &tx->tx_frags.paged.iov; - tx->tx_kiov = tx->tx_frags.paged.kiov; - tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, - payload_niov, payload_kiov, - payload_offset, payload_nob); - - if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) - tx->tx_zc_capable = 1; - } - - tx->tx_msg.ksm_csum = 0; - tx->tx_msg.ksm_type = KSOCK_MSG_LNET; - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_msg.ksm_zc_cookies[1] = 0; - - /* The first fragment will be set later in pro_pack */ - rc = ksocknal_launch_packet(ni, tx, target); - if (mpflag) - memalloc_noreclaim_restore(mpflag); - - if (!rc) - return 0; - - ksocknal_free_tx(tx); - return -EIO; -} - -int -ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = kthread_run(fn, arg, "%s", name); - - if (IS_ERR(task)) - return PTR_ERR(task); - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nthreads++; - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - return 0; -} - -void -ksocknal_thread_fini(void) -{ - write_lock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nthreads--; - write_unlock_bh(&ksocknal_data.ksnd_global_lock); -} - -int -ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) -{ - static char ksocknal_slop_buffer[4096]; - struct kvec *kvec = conn->ksnc_rx_iov_space; - - int nob; - unsigned int niov; - int skipped; - - LASSERT(conn->ksnc_proto); - - if (*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) { - /* Remind the socket to ack eagerly... */ - ksocknal_lib_eager_ack(conn); - } - - if (!nob_to_skip) { /* right at next packet boundary now */ - conn->ksnc_rx_started = 0; - mb(); /* racing with timeout thread */ - - switch (conn->ksnc_proto->pro_version) { - case KSOCK_PROTO_V2: - case KSOCK_PROTO_V3: - conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; - kvec->iov_base = &conn->ksnc_msg; - kvec->iov_len = offsetof(struct ksock_msg, ksm_u); - conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u); - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, offsetof(struct ksock_msg, ksm_u)); - break; - - case KSOCK_PROTO_V1: - /* Receiving bare struct lnet_hdr */ - conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; - kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; - kvec->iov_len = sizeof(struct lnet_hdr); - conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr); - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, sizeof(struct lnet_hdr)); - break; - - default: - LBUG(); - } - conn->ksnc_rx_csum = ~0; - return 1; - } - - /* - * Set up to skip as much as possible now. If there's more left - * (ran out of iov entries) we'll get called again - */ - conn->ksnc_rx_state = SOCKNAL_RX_SLOP; - conn->ksnc_rx_nob_left = nob_to_skip; - skipped = 0; - niov = 0; - - do { - nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); - - kvec[niov].iov_base = ksocknal_slop_buffer; - kvec[niov].iov_len = nob; - niov++; - skipped += nob; - nob_to_skip -= nob; - - } while (nob_to_skip && /* mustn't overflow conn's rx iov */ - niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct iovec)); - - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, niov, skipped); - return 0; -} - -static int -ksocknal_process_receive(struct ksock_conn *conn) -{ - struct kvec *kvec = conn->ksnc_rx_iov_space; - struct lnet_hdr *lhdr; - struct lnet_process_id *id; - int rc; - - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - - /* NB: sched lock NOT held */ - /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ - LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); - again: - if (iov_iter_count(&conn->ksnc_rx_to)) { - rc = ksocknal_receive(conn); - - if (rc <= 0) { - LASSERT(rc != -EAGAIN); - - if (!rc) - CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", - conn, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - else if (!conn->ksnc_closing) - CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", - conn, rc, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings(conn, - (conn->ksnc_closing) ? 0 : rc); - return (!rc ? -ESHUTDOWN : rc); - } - - if (iov_iter_count(&conn->ksnc_rx_to)) { - /* short read */ - return -EAGAIN; - } - } - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_KSM_HEADER: - if (conn->ksnc_flip) { - __swab32s(&conn->ksnc_msg.ksm_type); - __swab32s(&conn->ksnc_msg.ksm_csum); - __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); - __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); - } - - if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && - conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { - CERROR("%s: Unknown message type: %x\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_type); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return -EPROTO; - } - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && - conn->ksnc_msg.ksm_csum && /* has checksum */ - conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { - /* NOOP Checksum error */ - CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return -EIO; - } - - if (conn->ksnc_msg.ksm_zc_cookies[1]) { - __u64 cookie = 0; - - LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) - cookie = conn->ksnc_msg.ksm_zc_cookies[0]; - - rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, - conn->ksnc_msg.ksm_zc_cookies[1]); - - if (rc) { - CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - cookie, conn->ksnc_msg.ksm_zc_cookies[1]); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return rc; - } - } - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { - ksocknal_new_packet(conn, 0); - return 0; /* NOOP is done and just return */ - } - - conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; - conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg); - - kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; - kvec->iov_len = sizeof(struct ksock_lnet_msg); - - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, sizeof(struct ksock_lnet_msg)); - - goto again; /* read lnet header now */ - - case SOCKNAL_RX_LNET_HEADER: - /* unpack message header */ - conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); - - if (conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) { - /* Userspace peer */ - lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; - id = &conn->ksnc_peer->ksnp_id; - - /* Substitute process ID assigned at connection time */ - lhdr->src_pid = cpu_to_le32(id->pid); - lhdr->src_nid = cpu_to_le64(id->nid); - } - - conn->ksnc_rx_state = SOCKNAL_RX_PARSE; - ksocknal_conn_addref(conn); /* ++ref while parsing */ - - rc = lnet_parse(conn->ksnc_peer->ksnp_ni, - &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, - conn->ksnc_peer->ksnp_id.nid, conn, 0); - if (rc < 0) { - /* I just received garbage: give up on this conn */ - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, rc); - ksocknal_conn_decref(conn); - return -EPROTO; - } - - /* I'm racing with ksocknal_recv() */ - LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); - - if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) - return 0; - - /* ksocknal_recv() got called */ - goto again; - - case SOCKNAL_RX_LNET_PAYLOAD: - /* payload all received */ - rc = 0; - - if (!conn->ksnc_rx_nob_left && /* not truncating */ - conn->ksnc_msg.ksm_csum && /* has checksum */ - conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { - CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); - rc = -EIO; - } - - if (!rc && conn->ksnc_msg.ksm_zc_cookies[0]) { - LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); - - lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; - id = &conn->ksnc_peer->ksnp_id; - - rc = conn->ksnc_proto->pro_handle_zcreq(conn, - conn->ksnc_msg.ksm_zc_cookies[0], - *ksocknal_tunables.ksnd_nonblk_zcack || - le64_to_cpu(lhdr->src_nid) != id->nid); - } - - lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); - - if (rc) { - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, rc); - return -EPROTO; - } - /* Fall through */ - - case SOCKNAL_RX_SLOP: - /* starting new packet? */ - if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left)) - return 0; /* come back later */ - goto again; /* try to finish reading slop now */ - - default: - break; - } - - /* Not Reached */ - LBUG(); - return -EINVAL; /* keep gcc happy */ -} - -int -ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, - int delayed, struct iov_iter *to, unsigned int rlen) -{ - struct ksock_conn *conn = private; - struct ksock_sched *sched = conn->ksnc_scheduler; - - LASSERT(iov_iter_count(to) <= rlen); - LASSERT(to->nr_segs <= LNET_MAX_IOV); - - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_left = rlen; - - conn->ksnc_rx_to = *to; - - LASSERT(conn->ksnc_rx_scheduled); - - spin_lock_bh(&sched->kss_lock); - - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_PARSE_WAIT: - list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); - wake_up(&sched->kss_waitq); - LASSERT(conn->ksnc_rx_ready); - break; - - case SOCKNAL_RX_PARSE: - /* scheduler hasn't noticed I'm parsing yet */ - break; - } - - conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; - - spin_unlock_bh(&sched->kss_lock); - ksocknal_conn_decref(conn); - return 0; -} - -static inline int -ksocknal_sched_cansleep(struct ksock_sched *sched) -{ - int rc; - - spin_lock_bh(&sched->kss_lock); - - rc = !ksocknal_data.ksnd_shuttingdown && - list_empty(&sched->kss_rx_conns) && - list_empty(&sched->kss_tx_conns); - - spin_unlock_bh(&sched->kss_lock); - return rc; -} - -int ksocknal_scheduler(void *arg) -{ - struct ksock_sched_info *info; - struct ksock_sched *sched; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; - int nloops = 0; - long id = (long)arg; - - info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; - - rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); - if (rc) { - CWARN("Can't set CPU partition affinity to %d: %d\n", - info->ksi_cpt, rc); - } - - spin_lock_bh(&sched->kss_lock); - - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; - - /* Ensure I progress everything semi-fairly */ - - if (!list_empty(&sched->kss_rx_conns)) { - conn = list_entry(sched->kss_rx_conns.next, - struct ksock_conn, ksnc_rx_list); - list_del(&conn->ksnc_rx_list); - - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); - - /* - * clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. - */ - conn->ksnc_rx_ready = 0; - spin_unlock_bh(&sched->kss_lock); - - rc = ksocknal_process_receive(conn); - - spin_lock_bh(&sched->kss_lock); - - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); - - /* Did process_receive get everything it wanted? */ - if (!rc) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { - /* - * Conn blocked waiting for ksocknal_recv() - * I change its state (under lock) to signal - * it can be rescheduled - */ - conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ - list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } - - did_something = 1; - } - - if (!list_empty(&sched->kss_tx_conns)) { - LIST_HEAD(zlist); - - if (!list_empty(&sched->kss_zombie_noop_txs)) { - list_add(&zlist, &sched->kss_zombie_noop_txs); - list_del_init(&sched->kss_zombie_noop_txs); - } - - conn = list_entry(sched->kss_tx_conns.next, - struct ksock_conn, ksnc_tx_list); - list_del(&conn->ksnc_tx_list); - - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); - LASSERT(!list_empty(&conn->ksnc_tx_queue)); - - tx = list_entry(conn->ksnc_tx_queue.next, - struct ksock_tx, tx_list); - - if (conn->ksnc_tx_carrier == tx) - ksocknal_next_tx_carrier(conn); - - /* dequeue now so empty list => more to send */ - list_del(&tx->tx_list); - - /* - * Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. - */ - conn->ksnc_tx_ready = 0; - spin_unlock_bh(&sched->kss_lock); - - if (!list_empty(&zlist)) { - /* - * free zombie noop txs, it's fast because - * noop txs are just put in freelist - */ - ksocknal_txlist_done(NULL, &zlist, 0); - } - - rc = ksocknal_process_transmit(conn, tx); - - if (rc == -ENOMEM || rc == -EAGAIN) { - /* - * Incomplete send: replace tx on HEAD of - * tx_queue - */ - spin_lock_bh(&sched->kss_lock); - list_add(&tx->tx_list, &conn->ksnc_tx_queue); - } else { - /* Complete send; tx -ref */ - ksocknal_tx_decref(tx); - - spin_lock_bh(&sched->kss_lock); - /* assume space for more */ - conn->ksnc_tx_ready = 1; - } - - if (rc == -ENOMEM) { - /* - * Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. - */ - } else if (conn->ksnc_tx_ready && - !list_empty(&conn->ksnc_tx_queue)) { - /* reschedule for tx */ - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } - - did_something = 1; - } - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ - spin_unlock_bh(&sched->kss_lock); - - nloops = 0; - - if (!did_something) { /* wait for something to do */ - rc = wait_event_interruptible_exclusive( - sched->kss_waitq, - !ksocknal_sched_cansleep(sched)); - LASSERT(!rc); - } else { - cond_resched(); - } - - spin_lock_bh(&sched->kss_lock); - } - } - - spin_unlock_bh(&sched->kss_lock); - ksocknal_thread_fini(); - return 0; -} - -/* - * Add connection to kss_rx_conns of scheduler - * and wakeup the scheduler. - */ -void ksocknal_read_callback(struct ksock_conn *conn) -{ - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - conn->ksnc_rx_ready = 1; - - if (!conn->ksnc_rx_scheduled) { /* not being progressed */ - list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); - conn->ksnc_rx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - spin_unlock_bh(&sched->kss_lock); -} - -/* - * Add connection to kss_tx_conns of scheduler - * and wakeup the scheduler. - */ -void ksocknal_write_callback(struct ksock_conn *conn) -{ - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - conn->ksnc_tx_ready = 1; - - if (!conn->ksnc_tx_scheduled && /* not being progressed */ - !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ - list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); -} - -static struct ksock_proto * -ksocknal_parse_proto_version(struct ksock_hello_msg *hello) -{ - __u32 version = 0; - - if (hello->kshm_magic == LNET_PROTO_MAGIC) - version = hello->kshm_version; - else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) - version = __swab32(hello->kshm_version); - - if (version) { -#if SOCKNAL_VERSION_DEBUG - if (*ksocknal_tunables.ksnd_protocol == 1) - return NULL; - - if (*ksocknal_tunables.ksnd_protocol == 2 && - version == KSOCK_PROTO_V3) - return NULL; -#endif - if (version == KSOCK_PROTO_V2) - return &ksocknal_protocol_v2x; - - if (version == KSOCK_PROTO_V3) - return &ksocknal_protocol_v3x; - - return NULL; - } - - if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { - struct lnet_magicversion *hmv = (struct lnet_magicversion *)hello; - - BUILD_BUG_ON(sizeof(struct lnet_magicversion) != - offsetof(struct ksock_hello_msg, kshm_src_nid)); - - if (hmv->version_major == cpu_to_le16(KSOCK_PROTO_V1_MAJOR) && - hmv->version_minor == cpu_to_le16(KSOCK_PROTO_V1_MINOR)) - return &ksocknal_protocol_v1x; - } - - return NULL; -} - -int -ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, - lnet_nid_t peer_nid, struct ksock_hello_msg *hello) -{ - /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - struct ksock_net *net = (struct ksock_net *)ni->ni_data; - - LASSERT(hello->kshm_nips <= LNET_MAX_INTERFACES); - - /* rely on caller to hold a ref on socket so it wouldn't disappear */ - LASSERT(conn->ksnc_proto); - - hello->kshm_src_nid = ni->ni_nid; - hello->kshm_dst_nid = peer_nid; - hello->kshm_src_pid = the_lnet.ln_pid; - - hello->kshm_src_incarnation = net->ksnn_incarnation; - hello->kshm_ctype = conn->ksnc_type; - - return conn->ksnc_proto->pro_send_hello(conn, hello); -} - -static int -ksocknal_invert_type(int type) -{ - switch (type) { - case SOCKLND_CONN_ANY: - case SOCKLND_CONN_CONTROL: - return type; - case SOCKLND_CONN_BULK_IN: - return SOCKLND_CONN_BULK_OUT; - case SOCKLND_CONN_BULK_OUT: - return SOCKLND_CONN_BULK_IN; - default: - return SOCKLND_CONN_NONE; - } -} - -int -ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, - struct ksock_hello_msg *hello, - struct lnet_process_id *peerid, - __u64 *incarnation) -{ - /* Return < 0 fatal error - * 0 success - * EALREADY lost connection race - * EPROTO protocol version mismatch - */ - struct socket *sock = conn->ksnc_sock; - int active = !!conn->ksnc_proto; - int timeout; - int proto_match; - int rc; - struct ksock_proto *proto; - struct lnet_process_id recv_id; - - /* socket type set on active connections - not set on passive */ - LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); - - timeout = active ? *ksocknal_tunables.ksnd_timeout : - lnet_acceptor_timeout(); - - rc = lnet_sock_read(sock, &hello->kshm_magic, - sizeof(hello->kshm_magic), timeout); - if (rc) { - CERROR("Error %d reading HELLO from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0); - return rc; - } - - if (hello->kshm_magic != LNET_PROTO_MAGIC && - hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && - hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { - /* Unexpected magic! */ - CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n", - __cpu_to_le32(hello->kshm_magic), - LNET_PROTO_TCP_MAGIC, - &conn->ksnc_ipaddr); - return -EPROTO; - } - - rc = lnet_sock_read(sock, &hello->kshm_version, - sizeof(hello->kshm_version), timeout); - if (rc) { - CERROR("Error %d reading HELLO from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0); - return rc; - } - - proto = ksocknal_parse_proto_version(hello); - if (!proto) { - if (!active) { - /* unknown protocol from peer, tell peer my protocol */ - conn->ksnc_proto = &ksocknal_protocol_v3x; -#if SOCKNAL_VERSION_DEBUG - if (*ksocknal_tunables.ksnd_protocol == 2) - conn->ksnc_proto = &ksocknal_protocol_v2x; - else if (*ksocknal_tunables.ksnd_protocol == 1) - conn->ksnc_proto = &ksocknal_protocol_v1x; -#endif - hello->kshm_nips = 0; - ksocknal_send_hello(ni, conn, ni->ni_nid, hello); - } - - CERROR("Unknown protocol version (%d.x expected) from %pI4h\n", - conn->ksnc_proto->pro_version, - &conn->ksnc_ipaddr); - - return -EPROTO; - } - - proto_match = (conn->ksnc_proto == proto); - conn->ksnc_proto = proto; - - /* receive the rest of hello message anyway */ - rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); - if (rc) { - CERROR("Error %d reading or checking hello from from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0); - return rc; - } - - *incarnation = hello->kshm_src_incarnation; - - if (hello->kshm_src_nid == LNET_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n", - &conn->ksnc_ipaddr); - return -EPROTO; - } - - if (!active && - conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { - /* Userspace NAL assigns peer process ID from socket */ - recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; - recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), - conn->ksnc_ipaddr); - } else { - recv_id.nid = hello->kshm_src_nid; - recv_id.pid = hello->kshm_src_pid; - } - - if (!active) { - *peerid = recv_id; - - /* peer determines type */ - conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); - if (conn->ksnc_type == SOCKLND_CONN_NONE) { - CERROR("Unexpected type %d from %s ip %pI4h\n", - hello->kshm_ctype, libcfs_id2str(*peerid), - &conn->ksnc_ipaddr); - return -EPROTO; - } - - return 0; - } - - if (peerid->pid != recv_id.pid || - peerid->nid != recv_id.nid) { - LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n", - libcfs_id2str(*peerid), - &conn->ksnc_ipaddr, - libcfs_id2str(recv_id)); - return -EPROTO; - } - - if (hello->kshm_ctype == SOCKLND_CONN_NONE) { - /* Possible protocol mismatch or I lost the connection race */ - return proto_match ? EALREADY : EPROTO; - } - - if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { - CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", - conn->ksnc_type, libcfs_id2str(*peerid), - &conn->ksnc_ipaddr, hello->kshm_ctype); - return -EPROTO; - } - - return 0; -} - -static int -ksocknal_connect(struct ksock_route *route) -{ - LIST_HEAD(zombies); - struct ksock_peer *peer = route->ksnr_peer; - int type; - int wanted; - struct socket *sock; - unsigned long deadline; - int retry_later = 0; - int rc = 0; - - deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - LASSERT(route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - - route->ksnr_connecting = 1; - - for (;;) { - wanted = ksocknal_route_mask() & ~route->ksnr_connected; - - /* - * stop connecting if peer/route got closed under me, or - * route got connected while queued - */ - if (peer->ksnp_closing || route->ksnr_deleted || - !wanted) { - retry_later = 0; - break; - } - - /* reschedule if peer is connecting to me */ - if (peer->ksnp_accepting > 0) { - CDEBUG(D_NET, - "peer %s(%d) already connecting to me, retry later.\n", - libcfs_nid2str(peer->ksnp_id.nid), - peer->ksnp_accepting); - retry_later = 1; - } - - if (retry_later) /* needs reschedule */ - break; - - if (wanted & BIT(SOCKLND_CONN_ANY)) { - type = SOCKLND_CONN_ANY; - } else if (wanted & BIT(SOCKLND_CONN_CONTROL)) { - type = SOCKLND_CONN_CONTROL; - } else if (wanted & BIT(SOCKLND_CONN_BULK_IN)) { - type = SOCKLND_CONN_BULK_IN; - } else { - LASSERT(wanted & BIT(SOCKLND_CONN_BULK_OUT)); - type = SOCKLND_CONN_BULK_OUT; - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - if (time_after_eq(jiffies, deadline)) { - rc = -ETIMEDOUT; - lnet_connect_console_error(rc, peer->ksnp_id.nid, - route->ksnr_ipaddr, - route->ksnr_port); - goto failed; - } - - rc = lnet_connect(&sock, peer->ksnp_id.nid, - route->ksnr_myipaddr, - route->ksnr_ipaddr, route->ksnr_port); - if (rc) - goto failed; - - rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); - if (rc < 0) { - lnet_connect_console_error(rc, peer->ksnp_id.nid, - route->ksnr_ipaddr, - route->ksnr_port); - goto failed; - } - - /* - * A +ve RC means I have to retry because I lost the connection - * race or I have to renegotiate protocol version - */ - retry_later = (rc); - if (retry_later) - CDEBUG(D_NET, "peer %s: conn race, retry later.\n", - libcfs_nid2str(peer->ksnp_id.nid)); - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - } - - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - - if (retry_later) { - /* - * re-queue for attention; this frees me up to handle - * the peer's incoming connection request - */ - if (rc == EALREADY || - (!rc && peer->ksnp_accepting > 0)) { - /* - * We want to introduce a delay before next - * attempt to connect if we lost conn race, - * but the race is resolved quickly usually, - * so min_reconnectms should be good heuristic - */ - route->ksnr_retry_interval = - *ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000; - route->ksnr_timeout = jiffies + route->ksnr_retry_interval; - } - - ksocknal_launch_connection_locked(route); - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - return retry_later; - - failed: - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - - /* This is a retry rather than a new connection */ - route->ksnr_retry_interval *= 2; - route->ksnr_retry_interval = - max(route->ksnr_retry_interval, - (long)*ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000); - route->ksnr_retry_interval = - min(route->ksnr_retry_interval, - (long)*ksocknal_tunables.ksnd_max_reconnectms * HZ / 1000); - - LASSERT(route->ksnr_retry_interval); - route->ksnr_timeout = jiffies + route->ksnr_retry_interval; - - if (!list_empty(&peer->ksnp_tx_queue) && - !peer->ksnp_accepting && - !ksocknal_find_connecting_route_locked(peer)) { - struct ksock_conn *conn; - - /* - * ksnp_tx_queue is queued on a conn on successful - * connection for V1.x and V2.x - */ - if (!list_empty(&peer->ksnp_conns)) { - conn = list_entry(peer->ksnp_conns.next, - struct ksock_conn, ksnc_list); - LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); - } - - /* - * take all the blocked packets while I've got the lock and - * complete below... - */ - list_splice_init(&peer->ksnp_tx_queue, &zombies); - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_peer_failed(peer); - ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); - return 0; -} - -/* - * check whether we need to create more connds. - * It will try to create new thread if it's necessary, @timeout can - * be updated if failed to create, so caller wouldn't keep try while - * running out of resource. - */ -static int -ksocknal_connd_check_start(time64_t sec, long *timeout) -{ - char name[16]; - int rc; - int total = ksocknal_data.ksnd_connd_starting + - ksocknal_data.ksnd_connd_running; - - if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { - /* still in initializing */ - return 0; - } - - if (total >= *ksocknal_tunables.ksnd_nconnds_max || - total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { - /* - * can't create more connd, or still have enough - * threads to handle more connecting - */ - return 0; - } - - if (list_empty(&ksocknal_data.ksnd_connd_routes)) { - /* no pending connecting request */ - return 0; - } - - if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { - /* may run out of resource, retry later */ - *timeout = HZ; - return 0; - } - - if (ksocknal_data.ksnd_connd_starting > 0) { - /* serialize starting to avoid flood */ - return 0; - } - - ksocknal_data.ksnd_connd_starting_stamp = sec; - ksocknal_data.ksnd_connd_starting++; - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - - /* NB: total is the next id */ - snprintf(name, sizeof(name), "socknal_cd%02d", total); - rc = ksocknal_thread_start(ksocknal_connd, NULL, name); - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - if (!rc) - return 1; - - /* we tried ... */ - LASSERT(ksocknal_data.ksnd_connd_starting > 0); - ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds(); - - return 1; -} - -/* - * check whether current thread can exit, it will return 1 if there are too - * many threads and no creating in past 120 seconds. - * Also, this function may update @timeout to make caller come back - * again to recheck these conditions. - */ -static int -ksocknal_connd_check_stop(time64_t sec, long *timeout) -{ - int val; - - if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { - /* still in initializing */ - return 0; - } - - if (ksocknal_data.ksnd_connd_starting > 0) { - /* in progress of starting new thread */ - return 0; - } - - if (ksocknal_data.ksnd_connd_running <= - *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ - return 0; - } - - /* created thread in past 120 seconds? */ - val = (int)(ksocknal_data.ksnd_connd_starting_stamp + - SOCKNAL_CONND_TIMEOUT - sec); - - *timeout = (val > 0) ? val * HZ : - SOCKNAL_CONND_TIMEOUT * HZ; - if (val > 0) - return 0; - - /* no creating in past 120 seconds */ - - return ksocknal_data.ksnd_connd_running > - ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; -} - -/* - * Go through connd_routes queue looking for a route that we can process - * right now, @timeout_p can be updated if we need to come back later - */ -static struct ksock_route * -ksocknal_connd_get_route_locked(signed long *timeout_p) -{ - struct ksock_route *route; - unsigned long now; - - now = jiffies; - - /* connd_routes can contain both pending and ordinary routes */ - list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, - ksnr_connd_list) { - if (!route->ksnr_retry_interval || - time_after_eq(now, route->ksnr_timeout)) - return route; - - if (*timeout_p == MAX_SCHEDULE_TIMEOUT || - (int)*timeout_p > (int)(route->ksnr_timeout - now)) - *timeout_p = (int)(route->ksnr_timeout - now); - } - - return NULL; -} - -int -ksocknal_connd(void *arg) -{ - spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; - struct ksock_connreq *cr; - wait_queue_entry_t wait; - int nloops = 0; - int cons_retry = 0; - - init_waitqueue_entry(&wait, current); - - spin_lock_bh(connd_lock); - - LASSERT(ksocknal_data.ksnd_connd_starting > 0); - ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_running++; - - while (!ksocknal_data.ksnd_shuttingdown) { - struct ksock_route *route = NULL; - time64_t sec = ktime_get_real_seconds(); - long timeout = MAX_SCHEDULE_TIMEOUT; - int dropped_lock = 0; - - if (ksocknal_connd_check_stop(sec, &timeout)) { - /* wakeup another one to check stop */ - wake_up(&ksocknal_data.ksnd_connd_waitq); - break; - } - - if (ksocknal_connd_check_start(sec, &timeout)) { - /* created new thread */ - dropped_lock = 1; - } - - if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { - /* Connection accepted by the listener */ - cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, - struct ksock_connreq, ksncr_list); - - list_del(&cr->ksncr_list); - spin_unlock_bh(connd_lock); - dropped_lock = 1; - - ksocknal_create_conn(cr->ksncr_ni, NULL, - cr->ksncr_sock, SOCKLND_CONN_NONE); - lnet_ni_decref(cr->ksncr_ni); - kfree(cr); - - spin_lock_bh(connd_lock); - } - - /* - * Only handle an outgoing connection request if there - * is a thread left to handle incoming connections and - * create new connd - */ - if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < - ksocknal_data.ksnd_connd_running) { - route = ksocknal_connd_get_route_locked(&timeout); - } - if (route) { - list_del(&route->ksnr_connd_list); - ksocknal_data.ksnd_connd_connecting++; - spin_unlock_bh(connd_lock); - dropped_lock = 1; - - if (ksocknal_connect(route)) { - /* consecutive retry */ - if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { - CWARN("massive consecutive re-connecting to %pI4h\n", - &route->ksnr_ipaddr); - cons_retry = 0; - } - } else { - cons_retry = 0; - } - - ksocknal_route_decref(route); - - spin_lock_bh(connd_lock); - ksocknal_data.ksnd_connd_connecting--; - } - - if (dropped_lock) { - if (++nloops < SOCKNAL_RESCHED) - continue; - spin_unlock_bh(connd_lock); - nloops = 0; - cond_resched(); - spin_lock_bh(connd_lock); - continue; - } - - /* Nothing to do for 'timeout' */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, - &wait); - spin_unlock_bh(connd_lock); - - nloops = 0; - schedule_timeout(timeout); - - remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); - spin_lock_bh(connd_lock); - } - ksocknal_data.ksnd_connd_running--; - spin_unlock_bh(connd_lock); - - ksocknal_thread_fini(); - return 0; -} - -static struct ksock_conn * -ksocknal_find_timed_out_conn(struct ksock_peer *peer) -{ - /* We're called with a shared lock on ksnd_global_lock */ - struct ksock_conn *conn; - struct list_head *ctmp; - - list_for_each(ctmp, &peer->ksnp_conns) { - int error; - - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); - - /* Don't need the {get,put}connsock dance to deref ksnc_sock */ - LASSERT(!conn->ksnc_closing); - - /* - * SOCK_ERROR will reset error code of socket in - * some platform (like Darwin8.x) - */ - error = conn->ksnc_sock->sk->sk_err; - if (error) { - ksocknal_conn_addref(conn); - - switch (error) { - case ECONNRESET: - CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n", - libcfs_id2str(peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - break; - case ETIMEDOUT: - CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n", - libcfs_id2str(peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - break; - default: - CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n", - error, - libcfs_id2str(peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - break; - } - - return conn; - } - - if (conn->ksnc_rx_started && - time_after_eq(jiffies, - conn->ksnc_rx_deadline)) { - /* Timed out incomplete incoming message */ - ksocknal_conn_addref(conn); - CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %zd left %d\n", - libcfs_id2str(peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port, - conn->ksnc_rx_state, - iov_iter_count(&conn->ksnc_rx_to), - conn->ksnc_rx_nob_left); - return conn; - } - - if ((!list_empty(&conn->ksnc_tx_queue) || - conn->ksnc_sock->sk->sk_wmem_queued) && - time_after_eq(jiffies, - conn->ksnc_tx_deadline)) { - /* - * Timed out messages queued for sending or - * buffered in the socket's send buffer - */ - ksocknal_conn_addref(conn); - CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n", - libcfs_id2str(peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - return conn; - } - } - - return NULL; -} - -static inline void -ksocknal_flush_stale_txs(struct ksock_peer *peer) -{ - struct ksock_tx *tx; - struct ksock_tx *tmp; - LIST_HEAD(stale_txs); - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - list_for_each_entry_safe(tx, tmp, &peer->ksnp_tx_queue, tx_list) { - if (!time_after_eq(jiffies, - tx->tx_deadline)) - break; - - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &stale_txs); - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); -} - -static int -ksocknal_send_keepalive_locked(struct ksock_peer *peer) - __must_hold(&ksocknal_data.ksnd_global_lock) -{ - struct ksock_sched *sched; - struct ksock_conn *conn; - struct ksock_tx *tx; - - /* last_alive will be updated by create_conn */ - if (list_empty(&peer->ksnp_conns)) - return 0; - - if (peer->ksnp_proto != &ksocknal_protocol_v3x) - return 0; - - if (*ksocknal_tunables.ksnd_keepalive <= 0 || - time_before(jiffies, - peer->ksnp_last_alive + *ksocknal_tunables.ksnd_keepalive * HZ)) - return 0; - - if (time_before(jiffies, peer->ksnp_send_keepalive)) - return 0; - - /* - * retry 10 secs later, so we wouldn't put pressure - * on this peer if we failed to send keepalive this time - */ - peer->ksnp_send_keepalive = jiffies + 10 * HZ; - - conn = ksocknal_find_conn_locked(peer, NULL, 1); - if (conn) { - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - if (!list_empty(&conn->ksnc_tx_queue)) { - spin_unlock_bh(&sched->kss_lock); - /* there is an queued ACK, don't need keepalive */ - return 0; - } - - spin_unlock_bh(&sched->kss_lock); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - /* cookie = 1 is reserved for keepalive PING */ - tx = ksocknal_alloc_tx_noop(1, 1); - if (!tx) { - read_lock(&ksocknal_data.ksnd_global_lock); - return -ENOMEM; - } - - if (!ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) { - read_lock(&ksocknal_data.ksnd_global_lock); - return 1; - } - - ksocknal_free_tx(tx); - read_lock(&ksocknal_data.ksnd_global_lock); - - return -EIO; -} - -static void -ksocknal_check_peer_timeouts(int idx) -{ - struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; - struct ksock_peer *peer; - struct ksock_conn *conn; - struct ksock_tx *tx; - - again: - /* - * NB. We expect to have a look at all the peers and not find any - * connections to time out, so we just use a shared lock while we - * take a look... - */ - read_lock(&ksocknal_data.ksnd_global_lock); - - list_for_each_entry(peer, peers, ksnp_list) { - unsigned long deadline = 0; - struct ksock_tx *tx_stale; - int resid = 0; - int n = 0; - - if (ksocknal_send_keepalive_locked(peer)) { - read_unlock(&ksocknal_data.ksnd_global_lock); - goto again; - } - - conn = ksocknal_find_timed_out_conn(peer); - - if (conn) { - read_unlock(&ksocknal_data.ksnd_global_lock); - - ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT); - - /* - * NB we won't find this one again, but we can't - * just proceed with the next peer, since we dropped - * ksnd_global_lock and it might be dead already! - */ - ksocknal_conn_decref(conn); - goto again; - } - - /* - * we can't process stale txs right here because we're - * holding only shared lock - */ - if (!list_empty(&peer->ksnp_tx_queue)) { - tx = list_entry(peer->ksnp_tx_queue.next, - struct ksock_tx, tx_list); - - if (time_after_eq(jiffies, - tx->tx_deadline)) { - ksocknal_peer_addref(peer); - read_unlock(&ksocknal_data.ksnd_global_lock); - - ksocknal_flush_stale_txs(peer); - - ksocknal_peer_decref(peer); - goto again; - } - } - - if (list_empty(&peer->ksnp_zc_req_list)) - continue; - - tx_stale = NULL; - spin_lock(&peer->ksnp_lock); - list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { - if (!time_after_eq(jiffies, - tx->tx_deadline)) - break; - /* ignore the TX if connection is being closed */ - if (tx->tx_conn->ksnc_closing) - continue; - if (!tx_stale) - tx_stale = tx; - n++; - } - - if (!tx_stale) { - spin_unlock(&peer->ksnp_lock); - continue; - } - - deadline = tx_stale->tx_deadline; - resid = tx_stale->tx_resid; - conn = tx_stale->tx_conn; - ksocknal_conn_addref(conn); - - spin_unlock(&peer->ksnp_lock); - read_unlock(&ksocknal_data.ksnd_global_lock); - - CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n", - n, libcfs_nid2str(peer->ksnp_id.nid), tx_stale, - (jiffies - deadline) / HZ, - resid, conn->ksnc_sock->sk->sk_wmem_queued); - - ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT); - ksocknal_conn_decref(conn); - goto again; - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -int -ksocknal_reaper(void *arg) -{ - wait_queue_entry_t wait; - struct ksock_conn *conn; - struct ksock_sched *sched; - struct list_head enomem_conns; - int nenomem_conns; - long timeout; - int i; - int peer_index = 0; - unsigned long deadline = jiffies; - - INIT_LIST_HEAD(&enomem_conns); - init_waitqueue_entry(&wait, current); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - while (!ksocknal_data.ksnd_shuttingdown) { - if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { - conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, - struct ksock_conn, ksnc_list); - list_del(&conn->ksnc_list); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_terminate_conn(conn); - ksocknal_conn_decref(conn); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - continue; - } - - if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, - struct ksock_conn, ksnc_list); - list_del(&conn->ksnc_list); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_destroy_conn(conn); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - continue; - } - - if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) { - list_add(&enomem_conns, - &ksocknal_data.ksnd_enomem_conns); - list_del_init(&ksocknal_data.ksnd_enomem_conns); - } - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - /* reschedule all the connections that stalled with ENOMEM... */ - nenomem_conns = 0; - while (!list_empty(&enomem_conns)) { - conn = list_entry(enomem_conns.next, struct ksock_conn, - ksnc_tx_list); - list_del(&conn->ksnc_tx_list); - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - LASSERT(conn->ksnc_tx_scheduled); - conn->ksnc_tx_ready = 1; - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - wake_up(&sched->kss_waitq); - - spin_unlock_bh(&sched->kss_lock); - nenomem_conns++; - } - - /* careful with the jiffy wrap... */ - while ((timeout = deadline - jiffies) <= 0) { - const int n = 4; - const int p = 1; - int chunk = ksocknal_data.ksnd_peer_hash_size; - - /* - * Time to check for timeouts on a few more peers: I do - * checks every 'p' seconds on a proportion of the peer - * table and I need to check every connection 'n' times - * within a timeout interval, to ensure I detect a - * timeout on any connection within (n+1)/n times the - * timeout interval. - */ - if (*ksocknal_tunables.ksnd_timeout > n * p) - chunk = (chunk * n * p) / - *ksocknal_tunables.ksnd_timeout; - if (!chunk) - chunk = 1; - - for (i = 0; i < chunk; i++) { - ksocknal_check_peer_timeouts(peer_index); - peer_index = (peer_index + 1) % - ksocknal_data.ksnd_peer_hash_size; - } - - deadline = deadline + p * HZ; - } - - if (nenomem_conns) { - /* - * Reduce my timeout if I rescheduled ENOMEM conns. - * This also prevents me getting woken immediately - * if any go back on my enomem list. - */ - timeout = SOCKNAL_ENOMEM_RETRY; - } - ksocknal_data.ksnd_reaper_waketime = jiffies + timeout; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); - - if (!ksocknal_data.ksnd_shuttingdown && - list_empty(&ksocknal_data.ksnd_deathrow_conns) && - list_empty(&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout(timeout); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - } - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_thread_fini(); - return 0; -} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c deleted file mode 100644 index 93a02cd..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c +++ /dev/null @@ -1,534 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include <linux/highmem.h> -#include "socklnd.h" - -int -ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) -{ - int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr, - &conn->ksnc_port); - - /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ - LASSERT(!conn->ksnc_closing); - - if (rc) { - CERROR("Error %d getting sock peer IP\n", rc); - return rc; - } - - rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL); - if (rc) { - CERROR("Error %d getting sock local IP\n", rc); - return rc; - } - - return 0; -} - -int -ksocknal_lib_zc_capable(struct ksock_conn *conn) -{ - int caps = conn->ksnc_sock->sk->sk_route_caps; - - if (conn->ksnc_proto == &ksocknal_protocol_v1x) - return 0; - - /* - * ZC if the socket supports scatter/gather and doesn't need software - * checksums - */ - return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK)); -} - -int -ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; - struct socket *sock = conn->ksnc_sock; - int nob, i; - - if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ - conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ - tx->tx_nob == tx->tx_resid && /* frist sending */ - !tx->tx_msg.ksm_csum) /* not checksummed */ - ksocknal_lib_csum_tx(tx); - - for (nob = i = 0; i < tx->tx_niov; i++) - nob += tx->tx_iov[i].iov_len; - - if (!list_empty(&conn->ksnc_tx_queue) || - nob < tx->tx_resid) - msg.msg_flags |= MSG_MORE; - - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, - tx->tx_iov, tx->tx_niov, nob); - return sock_sendmsg(sock, &msg); -} - -int -ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct socket *sock = conn->ksnc_sock; - struct bio_vec *kiov = tx->tx_kiov; - int rc; - int nob; - - /* Not NOOP message */ - LASSERT(tx->tx_lnetmsg); - - if (tx->tx_msg.ksm_zc_cookies[0]) { - /* Zero copy is enabled */ - struct sock *sk = sock->sk; - struct page *page = kiov->bv_page; - int offset = kiov->bv_offset; - int fragsize = kiov->bv_len; - int msgflg = MSG_DONTWAIT; - - CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->bv_len); - - if (!list_empty(&conn->ksnc_tx_queue) || - fragsize < tx->tx_resid) - msgflg |= MSG_MORE; - - if (sk->sk_prot->sendpage) { - rc = sk->sk_prot->sendpage(sk, page, - offset, fragsize, msgflg); - } else { - rc = tcp_sendpage(sk, page, offset, fragsize, msgflg); - } - } else { - struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; - int i; - - for (nob = i = 0; i < tx->tx_nkiov; i++) - nob += kiov[i].bv_len; - - if (!list_empty(&conn->ksnc_tx_queue) || - nob < tx->tx_resid) - msg.msg_flags |= MSG_MORE; - - iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, - kiov, tx->tx_nkiov, nob); - rc = sock_sendmsg(sock, &msg); - } - return rc; -} - -void -ksocknal_lib_eager_ack(struct ksock_conn *conn) -{ - int opt = 1; - struct socket *sock = conn->ksnc_sock; - - /* - * Remind the socket to ACK eagerly. If I don't, the socket might - * think I'm about to send something it could piggy-back the ACK - * on, introducing delay in completing zero-copy sends in my - * peer. - */ - kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt, - sizeof(opt)); -} - -static int lustre_csum(struct kvec *v, void *context) -{ - struct ksock_conn *conn = context; - conn->ksnc_rx_csum = crc32_le(conn->ksnc_rx_csum, - v->iov_base, v->iov_len); - return 0; -} - -int -ksocknal_lib_recv(struct ksock_conn *conn) -{ - struct msghdr msg = { .msg_iter = conn->ksnc_rx_to }; - __u32 saved_csum; - int rc; - - rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT); - if (rc <= 0) - return rc; - - saved_csum = conn->ksnc_msg.ksm_csum; - if (!saved_csum) - return rc; - - /* header is included only in V2 - V3 checksums only the bulk data */ - if (!(conn->ksnc_rx_to.type & ITER_BVEC) && - conn->ksnc_proto != &ksocknal_protocol_v2x) - return rc; - - /* accumulate checksum */ - conn->ksnc_msg.ksm_csum = 0; - iov_iter_for_each_range(&conn->ksnc_rx_to, rc, lustre_csum, conn); - conn->ksnc_msg.ksm_csum = saved_csum; - - return rc; -} - -void -ksocknal_lib_csum_tx(struct ksock_tx *tx) -{ - int i; - __u32 csum; - void *base; - - LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); - LASSERT(tx->tx_conn); - LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); - - tx->tx_msg.ksm_csum = 0; - - csum = crc32_le(~0, tx->tx_iov[0].iov_base, - tx->tx_iov[0].iov_len); - - if (tx->tx_kiov) { - for (i = 0; i < tx->tx_nkiov; i++) { - base = kmap(tx->tx_kiov[i].bv_page) + - tx->tx_kiov[i].bv_offset; - - csum = crc32_le(csum, base, tx->tx_kiov[i].bv_len); - - kunmap(tx->tx_kiov[i].bv_page); - } - } else { - for (i = 1; i < tx->tx_niov; i++) - csum = crc32_le(csum, tx->tx_iov[i].iov_base, - tx->tx_iov[i].iov_len); - } - - if (*ksocknal_tunables.ksnd_inject_csum_error) { - csum++; - *ksocknal_tunables.ksnd_inject_csum_error = 0; - } - - tx->tx_msg.ksm_csum = csum; -} - -int -ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, - int *rxmem, int *nagle) -{ - struct socket *sock = conn->ksnc_sock; - int len; - int rc; - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - *txmem = *rxmem = *nagle = 0; - return -ESHUTDOWN; - } - - rc = lnet_sock_getbuf(sock, txmem, rxmem); - if (!rc) { - len = sizeof(*nagle); - rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)nagle, &len); - } - - ksocknal_connsock_decref(conn); - - if (!rc) - *nagle = !*nagle; - else - *txmem = *rxmem = *nagle = 0; - - return rc; -} - -int -ksocknal_lib_setup_sock(struct socket *sock) -{ - int rc; - int option; - int keep_idle; - int keep_intvl; - int keep_count; - int do_keepalive; - struct linger linger; - - sock->sk->sk_allocation = GFP_NOFS; - - /* - * Ensure this socket aborts active sends immediately when we close - * it. - */ - linger.l_onoff = 0; - linger.l_linger = 0; - - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger, - sizeof(linger)); - if (rc) { - CERROR("Can't set SO_LINGER: %d\n", rc); - return rc; - } - - option = -1; - rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option, - sizeof(option)); - if (rc) { - CERROR("Can't set SO_LINGER2: %d\n", rc); - return rc; - } - - if (!*ksocknal_tunables.ksnd_nagle) { - option = 1; - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&option, sizeof(option)); - if (rc) { - CERROR("Can't disable nagle: %d\n", rc); - return rc; - } - } - - rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size); - if (rc) { - CERROR("Can't set buffer tx %d, rx %d buffers: %d\n", - *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size, rc); - return rc; - } - -/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ - - /* snapshot tunables */ - keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; - keep_count = *ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; - - do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); - - option = (do_keepalive ? 1 : 0); - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option, - sizeof(option)); - if (rc) { - CERROR("Can't set SO_KEEPALIVE: %d\n", rc); - return rc; - } - - if (!do_keepalive) - return 0; - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle, - sizeof(keep_idle)); - if (rc) { - CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); - return rc; - } - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keep_intvl, sizeof(keep_intvl)); - if (rc) { - CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); - return rc; - } - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count, - sizeof(keep_count)); - if (rc) { - CERROR("Can't set TCP_KEEPCNT: %d\n", rc); - return rc; - } - - return 0; -} - -void -ksocknal_lib_push_conn(struct ksock_conn *conn) -{ - struct sock *sk; - struct tcp_sock *tp; - int nonagle; - int val = 1; - int rc; - - rc = ksocknal_connsock_addref(conn); - if (rc) /* being shut down */ - return; - - sk = conn->ksnc_sock->sk; - tp = tcp_sk(sk); - - lock_sock(sk); - nonagle = tp->nonagle; - tp->nonagle = 1; - release_sock(sk); - - rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - LASSERT(!rc); - - lock_sock(sk); - tp->nonagle = nonagle; - release_sock(sk); - - ksocknal_connsock_decref(conn); -} - -/* - * socket call back in Linux - */ -static void -ksocknal_data_ready(struct sock *sk) -{ - struct ksock_conn *conn; - - /* interleave correctly with closing sockets... */ - LASSERT(!in_irq()); - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - if (!conn) { /* raced with ksocknal_terminate_conn */ - LASSERT(sk->sk_data_ready != &ksocknal_data_ready); - sk->sk_data_ready(sk); - } else { - ksocknal_read_callback(conn); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -static void -ksocknal_write_space(struct sock *sk) -{ - struct ksock_conn *conn; - int wspace; - int min_wpace; - - /* interleave correctly with closing sockets... */ - LASSERT(!in_irq()); - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - wspace = sk_stream_wspace(sk); - min_wpace = sk_stream_min_wspace(sk); - - CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", - sk, wspace, min_wpace, conn, - !conn ? "" : (conn->ksnc_tx_ready ? - " ready" : " blocked"), - !conn ? "" : (conn->ksnc_tx_scheduled ? - " scheduled" : " idle"), - !conn ? "" : (list_empty(&conn->ksnc_tx_queue) ? - " empty" : " queued")); - - if (!conn) { /* raced with ksocknal_terminate_conn */ - LASSERT(sk->sk_write_space != &ksocknal_write_space); - sk->sk_write_space(sk); - - read_unlock(&ksocknal_data.ksnd_global_lock); - return; - } - - if (wspace >= min_wpace) { /* got enough space */ - ksocknal_write_callback(conn); - - /* - * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the - * ENOMEM check in ksocknal_transmit is race-free (think about - * it). - */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -void -ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) -{ - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; -} - -void -ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) -{ - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; -} - -void -ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) -{ - /* - * Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! - */ - sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - sock->sk->sk_write_space = conn->ksnc_saved_write_space; - - /* - * A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. - */ - sock->sk->sk_user_data = NULL; -} - -int -ksocknal_lib_memory_pressure(struct ksock_conn *conn) -{ - int rc = 0; - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - spin_lock_bh(&sched->kss_lock); - - if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && - !conn->ksnc_tx_ready) { - /* - * SOCK_NOSPACE is set when the socket fills - * and cleared in the write_space callback - * (which also sets ksnc_tx_ready). If - * SOCK_NOSPACE and ksnc_tx_ready are BOTH - * zero, I didn't fill the socket and - * write_space won't reschedule me, so I - * return -ENOMEM to get my caller to retry - * after a timeout - */ - rc = -ENOMEM; - } - - spin_unlock_bh(&sched->kss_lock); - - return rc; -} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c deleted file mode 100644 index 5663a4c..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Eric Barton <eric@bartonsoftware.com> - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include "socklnd.h" - -static int sock_timeout = 50; -module_param(sock_timeout, int, 0644); -MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); - -static int credits = 256; -module_param(credits, int, 0444); -MODULE_PARM_DESC(credits, "# concurrent sends"); - -static int peer_credits = 8; -module_param(peer_credits, int, 0444); -MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); - -static int peer_buffer_credits; -module_param(peer_buffer_credits, int, 0444); -MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); - -static int peer_timeout = 180; -module_param(peer_timeout, int, 0444); -MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); - -/* - * Number of daemons in each thread pool which is percpt, - * we will estimate reasonable value based on CPUs if it's not set. - */ -static unsigned int nscheds; -module_param(nscheds, int, 0444); -MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); - -static int nconnds = 4; -module_param(nconnds, int, 0444); -MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); - -static int nconnds_max = 64; -module_param(nconnds_max, int, 0444); -MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); - -static int min_reconnectms = 1000; -module_param(min_reconnectms, int, 0644); -MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); - -static int max_reconnectms = 60000; -module_param(max_reconnectms, int, 0644); -MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); - -# define DEFAULT_EAGER_ACK 0 -static int eager_ack = DEFAULT_EAGER_ACK; -module_param(eager_ack, int, 0644); -MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); - -static int typed_conns = 1; -module_param(typed_conns, int, 0444); -MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); - -static int min_bulk = 1 << 10; -module_param(min_bulk, int, 0644); -MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); - -# define DEFAULT_BUFFER_SIZE 0 -static int tx_buffer_size = DEFAULT_BUFFER_SIZE; -module_param(tx_buffer_size, int, 0644); -MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); - -static int rx_buffer_size = DEFAULT_BUFFER_SIZE; -module_param(rx_buffer_size, int, 0644); -MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); - -static int nagle; -module_param(nagle, int, 0644); -MODULE_PARM_DESC(nagle, "enable NAGLE?"); - -static int round_robin = 1; -module_param(round_robin, int, 0644); -MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); - -static int keepalive = 30; -module_param(keepalive, int, 0644); -MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); - -static int keepalive_idle = 30; -module_param(keepalive_idle, int, 0644); -MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); - -#define DEFAULT_KEEPALIVE_COUNT 5 -static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; -module_param(keepalive_count, int, 0644); -MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); - -static int keepalive_intvl = 5; -module_param(keepalive_intvl, int, 0644); -MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); - -static int enable_csum; -module_param(enable_csum, int, 0644); -MODULE_PARM_DESC(enable_csum, "enable check sum"); - -static int inject_csum_error; -module_param(inject_csum_error, int, 0644); -MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); - -static int nonblk_zcack = 1; -module_param(nonblk_zcack, int, 0644); -MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); - -static unsigned int zc_min_payload = 16 << 10; -module_param(zc_min_payload, int, 0644); -MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); - -static unsigned int zc_recv; -module_param(zc_recv, int, 0644); -MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); - -static unsigned int zc_recv_min_nfrags = 16; -module_param(zc_recv_min_nfrags, int, 0644); -MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); - -#if SOCKNAL_VERSION_DEBUG -static int protocol = 3; -module_param(protocol, int, 0644); -MODULE_PARM_DESC(protocol, "protocol version"); -#endif - -struct ksock_tunables ksocknal_tunables; - -int ksocknal_tunables_init(void) -{ - /* initialize ksocknal_tunables structure */ - ksocknal_tunables.ksnd_timeout = &sock_timeout; - ksocknal_tunables.ksnd_nscheds = &nscheds; - ksocknal_tunables.ksnd_nconnds = &nconnds; - ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; - ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; - ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; - ksocknal_tunables.ksnd_eager_ack = &eager_ack; - ksocknal_tunables.ksnd_typed_conns = &typed_conns; - ksocknal_tunables.ksnd_min_bulk = &min_bulk; - ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; - ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; - ksocknal_tunables.ksnd_nagle = &nagle; - ksocknal_tunables.ksnd_round_robin = &round_robin; - ksocknal_tunables.ksnd_keepalive = &keepalive; - ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; - ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; - ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; - ksocknal_tunables.ksnd_credits = &credits; - ksocknal_tunables.ksnd_peertxcredits = &peer_credits; - ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; - ksocknal_tunables.ksnd_peertimeout = &peer_timeout; - ksocknal_tunables.ksnd_enable_csum = &enable_csum; - ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; - ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; - ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; - ksocknal_tunables.ksnd_zc_recv = &zc_recv; - ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; - -#if SOCKNAL_VERSION_DEBUG - ksocknal_tunables.ksnd_protocol = &protocol; -#endif - - if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) - *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10; - - return 0; -}; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c deleted file mode 100644 index 05982da..0000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c +++ /dev/null @@ -1,810 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2012, Intel Corporation. - * - * Author: Zach Brown <zab@zabbo.net> - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Eric Barton <eric@bartonsoftware.com> - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include "socklnd.h" - -/* - * Protocol entries : - * pro_send_hello : send hello message - * pro_recv_hello : receive hello message - * pro_pack : pack message header - * pro_unpack : unpack message header - * pro_queue_tx_zcack() : Called holding BH lock: kss_lock - * return 1 if ACK is piggybacked, otherwise return 0 - * pro_queue_tx_msg() : Called holding BH lock: kss_lock - * return the ACK that piggybacked by my message, or NULL - * pro_handle_zcreq() : handler of incoming ZC-REQ - * pro_handle_zcack() : handler of incoming ZC-ACK - * pro_match_tx() : Called holding glock - */ - -static struct ksock_tx * -ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) -{ - /* V1.x, just enqueue it */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - return NULL; -} - -void -ksocknal_next_tx_carrier(struct ksock_conn *conn) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ - LASSERT(!list_empty(&conn->ksnc_tx_queue)); - LASSERT(tx); - - /* Next TX that can carry ZC-ACK or LNet message */ - if (tx->tx_list.next == &conn->ksnc_tx_queue) { - /* no more packets queued */ - conn->ksnc_tx_carrier = NULL; - } else { - conn->ksnc_tx_carrier = list_next_entry(tx, tx_list); - LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); - } -} - -static int -ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - LASSERT(!tx_ack || - tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - /* - * Enqueue or piggyback tx_ack / cookie - * . no tx can piggyback cookie of tx_ack (or cookie), just - * enqueue the tx_ack (if tx_ack != NUL) and return NULL. - * . There is tx can piggyback cookie of tx_ack (or cookie), - * piggyback the cookie and return the tx. - */ - if (!tx) { - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_ack; - } - return 0; - } - - if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { - /* tx is noop zc-ack, can't piggyback zc-ack cookie */ - if (tx_ack) - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - return 0; - } - - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); - LASSERT(!tx->tx_msg.ksm_zc_cookies[1]); - - if (tx_ack) - cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; - - /* piggyback the zc-ack cookie */ - tx->tx_msg.ksm_zc_cookies[1] = cookie; - /* move on to the next TX which can carry cookie */ - ksocknal_next_tx_carrier(conn); - - return 1; -} - -static struct ksock_tx * -ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - /* - * Enqueue tx_msg: - * . If there is no NOOP on the connection, just enqueue - * tx_msg and return NULL - * . If there is NOOP on the connection, piggyback the cookie - * and replace the NOOP tx, and return the NOOP tx. - */ - if (!tx) { /* nothing on queue */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_msg; - return NULL; - } - - if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - return NULL; - } - - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - /* There is a noop zc-ack can be piggybacked */ - tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; - ksocknal_next_tx_carrier(conn); - - /* use new_tx to replace the noop zc-ack packet */ - list_add(&tx_msg->tx_list, &tx->tx_list); - list_del(&tx->tx_list); - - return tx; -} - -static int -ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) -{ - struct ksock_tx *tx; - - if (conn->ksnc_type != SOCKLND_CONN_ACK) - return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); - - /* non-blocking ZC-ACK (to router) */ - LASSERT(!tx_ack || - tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - tx = conn->ksnc_tx_carrier; - if (!tx) { - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_ack; - } - return 0; - } - - /* conn->ksnc_tx_carrier */ - - if (tx_ack) - cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; - - if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ - return 1; - - if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { - /* replace the keepalive PING with a real ACK */ - LASSERT(!tx->tx_msg.ksm_zc_cookies[0]); - tx->tx_msg.ksm_zc_cookies[1] = cookie; - return 1; - } - - if (cookie == tx->tx_msg.ksm_zc_cookies[0] || - cookie == tx->tx_msg.ksm_zc_cookies[1]) { - CWARN("%s: duplicated ZC cookie: %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); - return 1; /* XXX return error in the future */ - } - - if (!tx->tx_msg.ksm_zc_cookies[0]) { - /* - * NOOP tx has only one ZC-ACK cookie, - * can carry at least one more - */ - if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { - tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; - tx->tx_msg.ksm_zc_cookies[1] = cookie; - } else { - tx->tx_msg.ksm_zc_cookies[0] = cookie; - } - - if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { - /* - * not likely to carry more ACKs, skip it - * to simplify logic - */ - ksocknal_next_tx_carrier(conn); - } - - return 1; - } - - /* takes two or more cookies already */ - - if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { - __u64 tmp = 0; - - /* two separated cookies: (a+2, a) or (a+1, a) */ - LASSERT(tx->tx_msg.ksm_zc_cookies[0] - - tx->tx_msg.ksm_zc_cookies[1] <= 2); - - if (tx->tx_msg.ksm_zc_cookies[0] - - tx->tx_msg.ksm_zc_cookies[1] == 2) { - if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) - tmp = cookie; - } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { - tmp = tx->tx_msg.ksm_zc_cookies[1]; - } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { - tmp = tx->tx_msg.ksm_zc_cookies[0]; - } - - if (tmp) { - /* range of cookies */ - tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; - tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; - return 1; - } - - } else { - /* - * ksm_zc_cookies[0] < ksm_zc_cookies[1], - * it is range of cookies - */ - if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && - cookie <= tx->tx_msg.ksm_zc_cookies[1]) { - CWARN("%s: duplicated ZC cookie: %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); - return 1; /* XXX: return error in the future */ - } - - if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { - tx->tx_msg.ksm_zc_cookies[1] = cookie; - return 1; - } - - if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { - tx->tx_msg.ksm_zc_cookies[0] = cookie; - return 1; - } - } - - /* failed to piggyback ZC-ACK */ - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); - /* the next tx can piggyback at least 1 ACK */ - ksocknal_next_tx_carrier(conn); - } - - return 0; -} - -static int -ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) -{ - int nob; - -#if SOCKNAL_VERSION_DEBUG - if (!*ksocknal_tunables.ksnd_typed_conns) - return SOCKNAL_MATCH_YES; -#endif - - if (!tx || !tx->tx_lnetmsg) { - /* noop packet */ - nob = offsetof(struct ksock_msg, ksm_u); - } else { - nob = tx->tx_lnetmsg->msg_len + - ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? - sizeof(struct lnet_hdr) : sizeof(struct ksock_msg)); - } - - /* default checking for typed connection */ - switch (conn->ksnc_type) { - default: - CERROR("ksnc_type bad: %u\n", conn->ksnc_type); - LBUG(); - case SOCKLND_CONN_ANY: - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_BULK_IN: - return SOCKNAL_MATCH_MAY; - - case SOCKLND_CONN_BULK_OUT: - if (nob < *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_CONTROL: - if (nob >= *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - } -} - -static int -ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) -{ - int nob; - - if (!tx || !tx->tx_lnetmsg) - nob = offsetof(struct ksock_msg, ksm_u); - else - nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg); - - switch (conn->ksnc_type) { - default: - CERROR("ksnc_type bad: %u\n", conn->ksnc_type); - LBUG(); - case SOCKLND_CONN_ANY: - return SOCKNAL_MATCH_NO; - - case SOCKLND_CONN_ACK: - if (nonblk) - return SOCKNAL_MATCH_YES; - else if (!tx || !tx->tx_lnetmsg) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_NO; - - case SOCKLND_CONN_BULK_OUT: - if (nonblk) - return SOCKNAL_MATCH_NO; - else if (nob < *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_CONTROL: - if (nonblk) - return SOCKNAL_MATCH_NO; - else if (nob >= *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - } -} - -/* (Sink) handle incoming ZC request from sender */ -static int -ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) -{ - struct ksock_peer *peer = c->ksnc_peer; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; - - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = ksocknal_find_conn_locked(peer, NULL, !!remote); - if (conn) { - struct ksock_sched *sched = conn->ksnc_scheduler; - - LASSERT(conn->ksnc_proto->pro_queue_tx_zcack); - - spin_lock_bh(&sched->kss_lock); - - rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); - - spin_unlock_bh(&sched->kss_lock); - - if (rc) { /* piggybacked */ - read_unlock(&ksocknal_data.ksnd_global_lock); - return 0; - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - /* ACK connection is not ready, or can't piggyback the ACK */ - tx = ksocknal_alloc_tx_noop(cookie, !!remote); - if (!tx) - return -ENOMEM; - - rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id); - if (!rc) - return 0; - - ksocknal_free_tx(tx); - return rc; -} - -/* (Sender) handle ZC_ACK from sink */ -static int -ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) -{ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *temp; - struct ksock_tx *tmp; - LIST_HEAD(zlist); - int count; - - if (!cookie1) - cookie1 = cookie2; - - count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); - - if (cookie2 == SOCKNAL_KEEPALIVE_PING && - conn->ksnc_proto == &ksocknal_protocol_v3x) { - /* keepalive PING for V3.x, just ignore it */ - return count == 1 ? 0 : -EPROTO; - } - - spin_lock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, - tx_zc_list) { - __u64 c = tx->tx_msg.ksm_zc_cookies[0]; - - if (c == cookie1 || c == cookie2 || - (cookie1 < c && c < cookie2)) { - tx->tx_msg.ksm_zc_cookies[0] = 0; - list_del(&tx->tx_zc_list); - list_add(&tx->tx_zc_list, &zlist); - - if (!--count) - break; - } - } - - spin_unlock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) { - list_del(&tx->tx_zc_list); - ksocknal_tx_decref(tx); - } - - return !count ? 0 : -EPROTO; -} - -static int -ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) -{ - struct socket *sock = conn->ksnc_sock; - struct lnet_hdr *hdr; - struct lnet_magicversion *hmv; - int rc; - int i; - - BUILD_BUG_ON(sizeof(struct lnet_magicversion) != offsetof(struct lnet_hdr, src_nid)); - - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); - if (!hdr) { - CERROR("Can't allocate struct lnet_hdr\n"); - return -ENOMEM; - } - - hmv = (struct lnet_magicversion *)&hdr->dest_nid; - - /* - * Re-organize V2.x message header to V1.x (struct lnet_hdr) - * header and send out - */ - hmv->magic = cpu_to_le32(LNET_PROTO_TCP_MAGIC); - hmv->version_major = cpu_to_le16(KSOCK_PROTO_V1_MAJOR); - hmv->version_minor = cpu_to_le16(KSOCK_PROTO_V1_MINOR); - - if (the_lnet.ln_testprotocompat) { - /* single-shot proto check */ - LNET_LOCK(); - if (the_lnet.ln_testprotocompat & 1) { - hmv->version_major++; /* just different! */ - the_lnet.ln_testprotocompat &= ~1; - } - if (the_lnet.ln_testprotocompat & 2) { - hmv->magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - hdr->src_nid = cpu_to_le64(hello->kshm_src_nid); - hdr->src_pid = cpu_to_le32(hello->kshm_src_pid); - hdr->type = cpu_to_le32(LNET_MSG_HELLO); - hdr->payload_length = cpu_to_le32(hello->kshm_nips * sizeof(__u32)); - hdr->msg.hello.type = cpu_to_le32(hello->kshm_ctype); - hdr->msg.hello.incarnation = cpu_to_le64(hello->kshm_src_incarnation); - - rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", - rc, &conn->ksnc_ipaddr, conn->ksnc_port); - goto out; - } - - if (!hello->kshm_nips) - goto out; - - for (i = 0; i < (int)hello->kshm_nips; i++) - hello->kshm_ips[i] = __cpu_to_le32(hello->kshm_ips[i]); - - rc = lnet_sock_write(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", - rc, hello->kshm_nips, - &conn->ksnc_ipaddr, conn->ksnc_port); - } -out: - kfree(hdr); - - return rc; -} - -static int -ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - - hello->kshm_magic = LNET_PROTO_MAGIC; - hello->kshm_version = conn->ksnc_proto->pro_version; - - if (the_lnet.ln_testprotocompat) { - /* single-shot proto check */ - LNET_LOCK(); - if (the_lnet.ln_testprotocompat & 1) { - hello->kshm_version++; /* just different! */ - the_lnet.ln_testprotocompat &= ~1; - } - LNET_UNLOCK(); - } - - rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", - rc, &conn->ksnc_ipaddr, conn->ksnc_port); - return rc; - } - - if (!hello->kshm_nips) - return 0; - - rc = lnet_sock_write(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", - rc, hello->kshm_nips, - &conn->ksnc_ipaddr, conn->ksnc_port); - } - - return rc; -} - -static int -ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, - int timeout) -{ - struct socket *sock = conn->ksnc_sock; - struct lnet_hdr *hdr; - int rc; - int i; - - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); - if (!hdr) { - CERROR("Can't allocate struct lnet_hdr\n"); - return -ENOMEM; - } - - rc = lnet_sock_read(sock, &hdr->src_nid, - sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid), - timeout); - if (rc) { - CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - goto out; - } - - /* ...and check we got what we expected */ - if (hdr->type != cpu_to_le32(LNET_MSG_HELLO)) { - CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n", - le32_to_cpu(hdr->type), - &conn->ksnc_ipaddr); - rc = -EPROTO; - goto out; - } - - hello->kshm_src_nid = le64_to_cpu(hdr->src_nid); - hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); - hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); - hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); - hello->kshm_nips = le32_to_cpu(hdr->payload_length) / - sizeof(__u32); - - if (hello->kshm_nips > LNET_MAX_INTERFACES) { - CERROR("Bad nips %d from ip %pI4h\n", - hello->kshm_nips, &conn->ksnc_ipaddr); - rc = -EPROTO; - goto out; - } - - if (!hello->kshm_nips) - goto out; - - rc = lnet_sock_read(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), timeout); - if (rc) { - CERROR("Error %d reading IPs from ip %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - goto out; - } - - for (i = 0; i < (int)hello->kshm_nips; i++) { - hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); - - if (!hello->kshm_ips[i]) { - CERROR("Zero IP[%d] from ip %pI4h\n", - i, &conn->ksnc_ipaddr); - rc = -EPROTO; - break; - } - } -out: - kfree(hdr); - - return rc; -} - -static int -ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, - int timeout) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - int i; - - if (hello->kshm_magic == LNET_PROTO_MAGIC) - conn->ksnc_flip = 0; - else - conn->ksnc_flip = 1; - - rc = lnet_sock_read(sock, &hello->kshm_src_nid, - offsetof(struct ksock_hello_msg, kshm_ips) - - offsetof(struct ksock_hello_msg, kshm_src_nid), - timeout); - if (rc) { - CERROR("Error %d reading HELLO from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - return rc; - } - - if (conn->ksnc_flip) { - __swab32s(&hello->kshm_src_pid); - __swab64s(&hello->kshm_src_nid); - __swab32s(&hello->kshm_dst_pid); - __swab64s(&hello->kshm_dst_nid); - __swab64s(&hello->kshm_src_incarnation); - __swab64s(&hello->kshm_dst_incarnation); - __swab32s(&hello->kshm_ctype); - __swab32s(&hello->kshm_nips); - } - - if (hello->kshm_nips > LNET_MAX_INTERFACES) { - CERROR("Bad nips %d from ip %pI4h\n", - hello->kshm_nips, &conn->ksnc_ipaddr); - return -EPROTO; - } - - if (!hello->kshm_nips) - return 0; - - rc = lnet_sock_read(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), timeout); - if (rc) { - CERROR("Error %d reading IPs from ip %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - return rc; - } - - for (i = 0; i < (int)hello->kshm_nips; i++) { - if (conn->ksnc_flip) - __swab32s(&hello->kshm_ips[i]); - - if (!hello->kshm_ips[i]) { - CERROR("Zero IP[%d] from ip %pI4h\n", - i, &conn->ksnc_ipaddr); - return -EPROTO; - } - } - - return 0; -} - -static void -ksocknal_pack_msg_v1(struct ksock_tx *tx) -{ - /* V1.x has no KSOCK_MSG_NOOP */ - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_lnetmsg); - - tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr; - tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr); - - tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); - tx->tx_resid = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); -} - -static void -ksocknal_pack_msg_v2(struct ksock_tx *tx) -{ - tx->tx_iov[0].iov_base = &tx->tx_msg; - - if (tx->tx_lnetmsg) { - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - - tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; - tx->tx_iov[0].iov_len = sizeof(struct ksock_msg); - tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; - tx->tx_resid = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; - } else { - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - tx->tx_resid = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - } - /* - * Don't checksum before start sending, because packet can be - * piggybacked with ACK - */ -} - -static void -ksocknal_unpack_msg_v1(struct ksock_msg *msg) -{ - msg->ksm_csum = 0; - msg->ksm_type = KSOCK_MSG_LNET; - msg->ksm_zc_cookies[0] = 0; - msg->ksm_zc_cookies[1] = 0; -} - -static void -ksocknal_unpack_msg_v2(struct ksock_msg *msg) -{ - return; /* Do nothing */ -} - -struct ksock_proto ksocknal_protocol_v1x = { - .pro_version = KSOCK_PROTO_V1, - .pro_send_hello = ksocknal_send_hello_v1, - .pro_recv_hello = ksocknal_recv_hello_v1, - .pro_pack = ksocknal_pack_msg_v1, - .pro_unpack = ksocknal_unpack_msg_v1, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, - .pro_handle_zcreq = NULL, - .pro_handle_zcack = NULL, - .pro_queue_tx_zcack = NULL, - .pro_match_tx = ksocknal_match_tx -}; - -struct ksock_proto ksocknal_protocol_v2x = { - .pro_version = KSOCK_PROTO_V2, - .pro_send_hello = ksocknal_send_hello_v2, - .pro_recv_hello = ksocknal_recv_hello_v2, - .pro_pack = ksocknal_pack_msg_v2, - .pro_unpack = ksocknal_unpack_msg_v2, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, - .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, - .pro_handle_zcreq = ksocknal_handle_zcreq, - .pro_handle_zcack = ksocknal_handle_zcack, - .pro_match_tx = ksocknal_match_tx -}; - -struct ksock_proto ksocknal_protocol_v3x = { - .pro_version = KSOCK_PROTO_V3, - .pro_send_hello = ksocknal_send_hello_v2, - .pro_recv_hello = ksocknal_recv_hello_v2, - .pro_pack = ksocknal_pack_msg_v2, - .pro_unpack = ksocknal_unpack_msg_v2, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, - .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, - .pro_handle_zcreq = ksocknal_handle_zcreq, - .pro_handle_zcack = ksocknal_handle_zcack, - .pro_match_tx = ksocknal_match_tx_v3 -}; |