author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2018-06-01 10:59:48 +0200
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2018-06-05 19:22:35 +0200
commit    be65f9ed267fd7d8b3146b7c4be9ecdd3e0aa3ed (patch)
tree      f9fddf1a58b26a1f2eaf2ed7fa350c1622abbdbb /drivers/staging/lustre/lnet/klnds
parent    3b93c0f4b6accb8105152900d7e414593a8b0c79 (diff)
staging: lustre: delete the filesystem from the tree.
The Lustre filesystem has been in the kernel tree for over 5 years now. While it has been an endless source of enjoyment for new kernel developers learning how to do basic coding-style cleanups, as well as a semi-entertaining source of bewilderment for the vfs developers any time they have looked into the codebase to try to figure out how to port their latest api changes to this filesystem, it has not really moved forward into the "this is in shape to get out of staging" state despite many half-completed attempts.

And getting code out of staging is the main goal of that portion of the kernel tree. Code should not stagnate, and it feels like having this code in staging is only causing the development cycle of the filesystem to take longer than it should. There is a whole separate out-of-tree copy of this codebase where the developers work on it, and then random changes are thrown over the wall at staging at some later point in time. This dual-tree development model has never worked, and the state of this codebase is proof of that.

So, let's just delete the whole mess.

Now the lustre developers can go off and work in their out-of-tree codebase and not have to worry about providing valid changelog entries and breaking their patches up into logical pieces. They can take the time they have spent doing those types of housekeeping chores and get the codebase into a much better shape, and it can be submitted for inclusion into the real part of the kernel tree when ready.

Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/staging/lustre/lnet/klnds')
-rw-r--r--  drivers/staging/lustre/lnet/klnds/Makefile                            1
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile                    5
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c                2958
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h                1048
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c             3763
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c       296
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/Makefile                    6
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c                2921
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h                 704
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c             2586
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c             534
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c       184
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c           810
13 files changed, 0 insertions(+), 15816 deletions(-)
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
deleted file mode 100644
index c23e4f6..0000000
--- a/drivers/staging/lustre/lnet/klnds/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_LNET) += o2iblnd/ socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
deleted file mode 100644
index 4affe1d..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
-ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
deleted file mode 100644
index f0b4eb42..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ /dev/null
@@ -1,2958 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <asm/div64.h>
-#include <asm/page.h>
-#include "o2iblnd.h"
-
-static struct lnet_lnd the_o2iblnd;
-
-struct kib_data kiblnd_data;
-
-static __u32 kiblnd_cksum(void *ptr, int nob)
-{
- char *c = ptr;
- __u32 sum = 0;
-
- while (nob-- > 0)
- sum = ((sum << 1) | (sum >> 31)) + *c++;
-
- /* ensure I don't return 0 (== no checksum) */
- return !sum ? 1 : sum;
-}
-
-static char *kiblnd_msgtype2str(int type)
-{
- switch (type) {
- case IBLND_MSG_CONNREQ:
- return "CONNREQ";
-
- case IBLND_MSG_CONNACK:
- return "CONNACK";
-
- case IBLND_MSG_NOOP:
- return "NOOP";
-
- case IBLND_MSG_IMMEDIATE:
- return "IMMEDIATE";
-
- case IBLND_MSG_PUT_REQ:
- return "PUT_REQ";
-
- case IBLND_MSG_PUT_NAK:
- return "PUT_NAK";
-
- case IBLND_MSG_PUT_ACK:
- return "PUT_ACK";
-
- case IBLND_MSG_PUT_DONE:
- return "PUT_DONE";
-
- case IBLND_MSG_GET_REQ:
- return "GET_REQ";
-
- case IBLND_MSG_GET_DONE:
- return "GET_DONE";
-
- default:
- return "???";
- }
-}
-
-static int kiblnd_msgtype2size(int type)
-{
- const int hdr_size = offsetof(struct kib_msg, ibm_u);
-
- switch (type) {
- case IBLND_MSG_CONNREQ:
- case IBLND_MSG_CONNACK:
- return hdr_size + sizeof(struct kib_connparams);
-
- case IBLND_MSG_NOOP:
- return hdr_size;
-
- case IBLND_MSG_IMMEDIATE:
- return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]);
-
- case IBLND_MSG_PUT_REQ:
- return hdr_size + sizeof(struct kib_putreq_msg);
-
- case IBLND_MSG_PUT_ACK:
- return hdr_size + sizeof(struct kib_putack_msg);
-
- case IBLND_MSG_GET_REQ:
- return hdr_size + sizeof(struct kib_get_msg);
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- return hdr_size + sizeof(struct kib_completion_msg);
- default:
- return -1;
- }
-}
-
-static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
-{
- struct kib_rdma_desc *rd;
- int msg_size;
- int nob;
- int n;
- int i;
-
- LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
- msg->ibm_type == IBLND_MSG_PUT_ACK);
-
- rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
- &msg->ibm_u.get.ibgm_rd :
- &msg->ibm_u.putack.ibpam_rd;
-
- if (flip) {
- __swab32s(&rd->rd_key);
- __swab32s(&rd->rd_nfrags);
- }
-
- n = rd->rd_nfrags;
-
- nob = offsetof(struct kib_msg, ibm_u) +
- kiblnd_rd_msg_size(rd, msg->ibm_type, n);
-
- if (msg->ibm_nob < nob) {
- CERROR("Short %s: %d(%d)\n",
- kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
- return 1;
- }
-
- msg_size = kiblnd_rd_size(rd);
- if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) {
- CERROR("Bad msg_size: %d, should be 0 < n <= %d\n",
- msg_size, LNET_MAX_PAYLOAD);
- return 1;
- }
-
- if (!flip)
- return 0;
-
- for (i = 0; i < n; i++) {
- __swab32s(&rd->rd_frags[i].rf_nob);
- __swab64s(&rd->rd_frags[i].rf_addr);
- }
-
- return 0;
-}
-
-void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
- int credits, lnet_nid_t dstnid, __u64 dststamp)
-{
- struct kib_net *net = ni->ni_data;
-
- /*
- * CAVEAT EMPTOR! all message fields not set here should have been
- * initialised previously.
- */
- msg->ibm_magic = IBLND_MSG_MAGIC;
- msg->ibm_version = version;
- /* ibm_type */
- msg->ibm_credits = credits;
- /* ibm_nob */
- msg->ibm_cksum = 0;
- msg->ibm_srcnid = ni->ni_nid;
- msg->ibm_srcstamp = net->ibn_incarnation;
- msg->ibm_dstnid = dstnid;
- msg->ibm_dststamp = dststamp;
-
- if (*kiblnd_tunables.kib_cksum) {
- /* NB ibm_cksum zero while computing cksum */
- msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
- }
-}
-
-int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
-{
- const int hdr_size = offsetof(struct kib_msg, ibm_u);
- __u32 msg_cksum;
- __u16 version;
- int msg_nob;
- int flip;
-
- /* 6 bytes are enough to have received magic + version */
- if (nob < 6) {
- CERROR("Short message: %d\n", nob);
- return -EPROTO;
- }
-
- if (msg->ibm_magic == IBLND_MSG_MAGIC) {
- flip = 0;
- } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
- flip = 1;
- } else {
- CERROR("Bad magic: %08x\n", msg->ibm_magic);
- return -EPROTO;
- }
-
- version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
- if (version != IBLND_MSG_VERSION &&
- version != IBLND_MSG_VERSION_1) {
- CERROR("Bad version: %x\n", version);
- return -EPROTO;
- }
-
- if (nob < hdr_size) {
- CERROR("Short message: %d\n", nob);
- return -EPROTO;
- }
-
- msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
- if (msg_nob > nob) {
- CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
- return -EPROTO;
- }
-
- /*
- * checksum must be computed with ibm_cksum zero and BEFORE anything
- * gets flipped
- */
- msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
- msg->ibm_cksum = 0;
- if (msg_cksum &&
- msg_cksum != kiblnd_cksum(msg, msg_nob)) {
- CERROR("Bad checksum\n");
- return -EPROTO;
- }
-
- msg->ibm_cksum = msg_cksum;
-
- if (flip) {
- /* leave magic unflipped as a clue to peer endianness */
- msg->ibm_version = version;
- BUILD_BUG_ON(sizeof(msg->ibm_type) != 1);
- BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1);
- msg->ibm_nob = msg_nob;
- __swab64s(&msg->ibm_srcnid);
- __swab64s(&msg->ibm_srcstamp);
- __swab64s(&msg->ibm_dstnid);
- __swab64s(&msg->ibm_dststamp);
- }
-
- if (msg->ibm_srcnid == LNET_NID_ANY) {
- CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
- return -EPROTO;
- }
-
- if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
- CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
- msg_nob, kiblnd_msgtype2size(msg->ibm_type));
- return -EPROTO;
- }
-
- switch (msg->ibm_type) {
- default:
- CERROR("Unknown message type %x\n", msg->ibm_type);
- return -EPROTO;
-
- case IBLND_MSG_NOOP:
- case IBLND_MSG_IMMEDIATE:
- case IBLND_MSG_PUT_REQ:
- break;
-
- case IBLND_MSG_PUT_ACK:
- case IBLND_MSG_GET_REQ:
- if (kiblnd_unpack_rd(msg, flip))
- return -EPROTO;
- break;
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- if (flip)
- __swab32s(&msg->ibm_u.completion.ibcm_status);
- break;
-
- case IBLND_MSG_CONNREQ:
- case IBLND_MSG_CONNACK:
- if (flip) {
- __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
- __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
- __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
- }
- break;
- }
- return 0;
-}
-
-int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp,
- lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct kib_net *net = ni->ni_data;
- int cpt = lnet_cpt_of_nid(nid);
- unsigned long flags;
-
- LASSERT(net);
- LASSERT(nid != LNET_NID_ANY);
-
- peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt);
- if (!peer) {
- CERROR("Cannot allocate peer\n");
- return -ENOMEM;
- }
-
- peer->ibp_ni = ni;
- peer->ibp_nid = nid;
- peer->ibp_error = 0;
- peer->ibp_last_alive = 0;
- peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
- peer->ibp_queue_depth = ni->ni_peertxcredits;
- atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
-
- INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
- INIT_LIST_HEAD(&peer->ibp_conns);
- INIT_LIST_HEAD(&peer->ibp_tx_queue);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- /* always called with a ref on ni, which prevents ni being shutdown */
- LASSERT(!net->ibn_shutdown);
-
- /* npeers only grows with the global lock held */
- atomic_inc(&net->ibn_npeers);
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- *peerp = peer;
- return 0;
-}
-
-void kiblnd_destroy_peer(struct kib_peer *peer)
-{
- struct kib_net *net = peer->ibp_ni->ni_data;
-
- LASSERT(net);
- LASSERT(!atomic_read(&peer->ibp_refcount));
- LASSERT(!kiblnd_peer_active(peer));
- LASSERT(kiblnd_peer_idle(peer));
- LASSERT(list_empty(&peer->ibp_tx_queue));
-
- kfree(peer);
-
- /*
- * NB a peer's connections keep a reference on their peer until
- * they are destroyed, so we can be assured that _all_ state to do
- * with this peer has been cleaned up when its refcount drops to
- * zero.
- */
- atomic_dec(&net->ibn_npeers);
-}
-
-struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid)
-{
- /*
- * the caller is responsible for accounting the additional reference
- * that this creates
- */
- struct list_head *peer_list = kiblnd_nid2peerlist(nid);
- struct list_head *tmp;
- struct kib_peer *peer;
-
- list_for_each(tmp, peer_list) {
- peer = list_entry(tmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_nid != nid)
- continue;
-
- CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
- peer, libcfs_nid2str(nid),
- atomic_read(&peer->ibp_refcount),
- peer->ibp_version);
- return peer;
- }
- return NULL;
-}
-
-void kiblnd_unlink_peer_locked(struct kib_peer *peer)
-{
- LASSERT(list_empty(&peer->ibp_conns));
-
- LASSERT(kiblnd_peer_active(peer));
- list_del_init(&peer->ibp_list);
- /* lose peerlist's ref */
- kiblnd_peer_decref(peer);
-}
-
-static int kiblnd_get_peer_info(struct lnet_ni *ni, int index,
- lnet_nid_t *nidp, int *count)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- int i;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
- list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (index-- > 0)
- continue;
-
- *nidp = peer->ibp_nid;
- *count = atomic_read(&peer->ibp_refcount);
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
- flags);
- return 0;
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- return -ENOENT;
-}
-
-static void kiblnd_del_peer_locked(struct kib_peer *peer)
-{
- struct list_head *ctmp;
- struct list_head *cnxt;
- struct kib_conn *conn;
-
- if (list_empty(&peer->ibp_conns)) {
- kiblnd_unlink_peer_locked(peer);
- } else {
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- kiblnd_close_conn_locked(conn, 0);
- }
- /* NB closing peer's last conn unlinked it. */
- }
- /*
- * NB peer now unlinked; might even be freed if the peer table had the
- * last ref on it.
- */
-}
-
-static int kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
-{
- LIST_HEAD(zombies);
- struct list_head *ptmp;
- struct list_head *pnxt;
- struct kib_peer *peer;
- int lo;
- int hi;
- int i;
- unsigned long flags;
- int rc = -ENOENT;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (nid != LNET_NID_ANY) {
- lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- } else {
- lo = 0;
- hi = kiblnd_data.kib_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
- continue;
-
- if (!list_empty(&peer->ibp_tx_queue)) {
- LASSERT(list_empty(&peer->ibp_conns));
-
- list_splice_init(&peer->ibp_tx_queue,
- &zombies);
- }
-
- kiblnd_del_peer_locked(peer);
- rc = 0; /* matched something */
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_txlist_done(ni, &zombies, -EIO);
-
- return rc;
-}
-
-static struct kib_conn *kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- struct kib_conn *conn;
- struct list_head *ctmp;
- int i;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
- list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- list_for_each(ctmp, &peer->ibp_conns) {
- if (index-- > 0)
- continue;
-
- conn = list_entry(ctmp, struct kib_conn,
- ibc_list);
- kiblnd_conn_addref(conn);
- read_unlock_irqrestore(
- &kiblnd_data.kib_global_lock,
- flags);
- return conn;
- }
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- return NULL;
-}
-
-int kiblnd_translate_mtu(int value)
-{
- switch (value) {
- default:
- return -1;
- case 0:
- return 0;
- case 256:
- return IB_MTU_256;
- case 512:
- return IB_MTU_512;
- case 1024:
- return IB_MTU_1024;
- case 2048:
- return IB_MTU_2048;
- case 4096:
- return IB_MTU_4096;
- }
-}
-
-static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
-{
- int mtu;
-
- /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
- if (!cmid->route.path_rec)
- return;
-
- mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
- LASSERT(mtu >= 0);
- if (mtu)
- cmid->route.path_rec->mtu = mtu;
-}
-
-static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
-{
- cpumask_var_t *mask;
- int vectors;
- int off;
- int i;
- lnet_nid_t nid = conn->ibc_peer->ibp_nid;
-
- vectors = conn->ibc_cmid->device->num_comp_vectors;
- if (vectors <= 1)
- return 0;
-
- mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
- if (!mask)
- return 0;
-
- /* hash NID to CPU id in this partition... */
- off = do_div(nid, cpumask_weight(*mask));
- for_each_cpu(i, *mask) {
- if (!off--)
- return i % vectors;
- }
-
- LBUG();
- return 1;
-}
-
-struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid,
- int state, int version)
-{
- /*
- * CAVEAT EMPTOR:
- * If the new conn is created successfully it takes over the caller's
- * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
- * is destroyed. On failure, the caller's ref on 'peer' remains and
- * she must dispose of 'cmid'. (Actually I'd block forever if I tried
- * to destroy 'cmid' here since I'm called from the CM which still has
- * its ref on 'cmid').
- */
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct kib_dev *dev;
- struct ib_qp_init_attr *init_qp_attr;
- struct kib_sched_info *sched;
- struct ib_cq_init_attr cq_attr = {};
- struct kib_conn *conn;
- struct ib_cq *cq;
- unsigned long flags;
- int cpt;
- int rc;
- int i;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
-
- dev = net->ibn_dev;
-
- cpt = lnet_cpt_of_nid(peer->ibp_nid);
- sched = kiblnd_data.kib_scheds[cpt];
-
- LASSERT(sched->ibs_nthreads > 0);
-
- init_qp_attr = kzalloc_cpt(sizeof(*init_qp_attr), GFP_NOFS, cpt);
- if (!init_qp_attr) {
- CERROR("Can't allocate qp_attr for %s\n",
- libcfs_nid2str(peer->ibp_nid));
- goto failed_0;
- }
-
- conn = kzalloc_cpt(sizeof(*conn), GFP_NOFS, cpt);
- if (!conn) {
- CERROR("Can't allocate connection for %s\n",
- libcfs_nid2str(peer->ibp_nid));
- goto failed_1;
- }
-
- conn->ibc_state = IBLND_CONN_INIT;
- conn->ibc_version = version;
- conn->ibc_peer = peer; /* I take the caller's ref */
- cmid->context = conn; /* for future CM callbacks */
- conn->ibc_cmid = cmid;
- conn->ibc_max_frags = peer->ibp_max_frags;
- conn->ibc_queue_depth = peer->ibp_queue_depth;
-
- INIT_LIST_HEAD(&conn->ibc_early_rxs);
- INIT_LIST_HEAD(&conn->ibc_tx_noops);
- INIT_LIST_HEAD(&conn->ibc_tx_queue);
- INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
- INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
- INIT_LIST_HEAD(&conn->ibc_active_txs);
- spin_lock_init(&conn->ibc_lock);
-
- conn->ibc_connvars = kzalloc_cpt(sizeof(*conn->ibc_connvars), GFP_NOFS, cpt);
- if (!conn->ibc_connvars) {
- CERROR("Can't allocate in-progress connection state\n");
- goto failed_2;
- }
-
- write_lock_irqsave(glock, flags);
- if (dev->ibd_failover) {
- write_unlock_irqrestore(glock, flags);
- CERROR("%s: failover in progress\n", dev->ibd_ifname);
- goto failed_2;
- }
-
- if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
- /* wakeup failover thread and teardown connection */
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- wake_up(&kiblnd_data.kib_failover_waitq);
- }
-
- write_unlock_irqrestore(glock, flags);
- CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
- cmid->device->name, dev->ibd_ifname);
- goto failed_2;
- }
-
- kiblnd_hdev_addref_locked(dev->ibd_hdev);
- conn->ibc_hdev = dev->ibd_hdev;
-
- kiblnd_setup_mtu_locked(cmid);
-
- write_unlock_irqrestore(glock, flags);
-
- conn->ibc_rxs = kzalloc_cpt(IBLND_RX_MSGS(conn) * sizeof(struct kib_rx),
- GFP_NOFS, cpt);
- if (!conn->ibc_rxs) {
- CERROR("Cannot allocate RX buffers\n");
- goto failed_2;
- }
-
- rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
- IBLND_RX_MSG_PAGES(conn));
- if (rc)
- goto failed_2;
-
- kiblnd_map_rx_descs(conn);
-
- cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
- cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
- cq = ib_create_cq(cmid->device,
- kiblnd_cq_completion, kiblnd_cq_event, conn,
- &cq_attr);
- if (IS_ERR(cq)) {
- CERROR("Failed to create CQ with %d CQEs: %ld\n",
- IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
- goto failed_2;
- }
-
- conn->ibc_cq = cq;
-
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
- CERROR("Can't request completion notification: %d\n", rc);
- goto failed_2;
- }
-
- init_qp_attr->event_handler = kiblnd_qp_event;
- init_qp_attr->qp_context = conn;
- init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
- init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
- init_qp_attr->cap.max_send_sge = 1;
- init_qp_attr->cap.max_recv_sge = 1;
- init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
- init_qp_attr->qp_type = IB_QPT_RC;
- init_qp_attr->send_cq = cq;
- init_qp_attr->recv_cq = cq;
-
- conn->ibc_sched = sched;
-
- rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
- if (rc) {
- CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
- rc, init_qp_attr->cap.max_send_wr,
- init_qp_attr->cap.max_recv_wr);
- goto failed_2;
- }
-
- kfree(init_qp_attr);
-
- /* 1 ref for caller and each rxmsg */
- atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
- conn->ibc_nrx = IBLND_RX_MSGS(conn);
-
- /* post receives */
- for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
- rc = kiblnd_post_rx(&conn->ibc_rxs[i],
- IBLND_POSTRX_NO_CREDIT);
- if (rc) {
- CERROR("Can't post rxmsg: %d\n", rc);
-
- /* Make posted receives complete */
- kiblnd_abort_receives(conn);
-
- /*
- * correct # of posted buffers
- * NB locking needed now I'm racing with completion
- */
- spin_lock_irqsave(&sched->ibs_lock, flags);
- conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- /*
- * cmid will be destroyed by CM(ofed) after cm_callback
- * returned, so we can't refer it anymore
- * (by kiblnd_connd()->kiblnd_destroy_conn)
- */
- rdma_destroy_qp(conn->ibc_cmid);
- conn->ibc_cmid = NULL;
-
- /* Drop my own and unused rxbuffer refcounts */
- while (i++ <= IBLND_RX_MSGS(conn))
- kiblnd_conn_decref(conn);
-
- return NULL;
- }
- }
-
- /* Init successful! */
- LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
- state == IBLND_CONN_PASSIVE_WAIT);
- conn->ibc_state = state;
-
- /* 1 more conn */
- atomic_inc(&net->ibn_nconns);
- return conn;
-
- failed_2:
- kiblnd_destroy_conn(conn);
- kfree(conn);
- failed_1:
- kfree(init_qp_attr);
- failed_0:
- return NULL;
-}
-
-void kiblnd_destroy_conn(struct kib_conn *conn)
-{
- struct rdma_cm_id *cmid = conn->ibc_cmid;
- struct kib_peer *peer = conn->ibc_peer;
- int rc;
-
- LASSERT(!in_interrupt());
- LASSERT(!atomic_read(&conn->ibc_refcount));
- LASSERT(list_empty(&conn->ibc_early_rxs));
- LASSERT(list_empty(&conn->ibc_tx_noops));
- LASSERT(list_empty(&conn->ibc_tx_queue));
- LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
- LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
- LASSERT(list_empty(&conn->ibc_active_txs));
- LASSERT(!conn->ibc_noops_posted);
- LASSERT(!conn->ibc_nsends_posted);
-
- switch (conn->ibc_state) {
- default:
- /* conn must be completely disengaged from the network */
- LBUG();
-
- case IBLND_CONN_DISCONNECTED:
- /* connvars should have been freed already */
- LASSERT(!conn->ibc_connvars);
- break;
-
- case IBLND_CONN_INIT:
- break;
- }
-
- /* conn->ibc_cmid might be destroyed by CM already */
- if (cmid && cmid->qp)
- rdma_destroy_qp(cmid);
-
- if (conn->ibc_cq) {
- rc = ib_destroy_cq(conn->ibc_cq);
- if (rc)
- CWARN("Error destroying CQ: %d\n", rc);
- }
-
- if (conn->ibc_rx_pages)
- kiblnd_unmap_rx_descs(conn);
-
- kfree(conn->ibc_rxs);
- kfree(conn->ibc_connvars);
-
- if (conn->ibc_hdev)
- kiblnd_hdev_decref(conn->ibc_hdev);
-
- /* See CAVEAT EMPTOR above in kiblnd_create_conn */
- if (conn->ibc_state != IBLND_CONN_INIT) {
- struct kib_net *net = peer->ibp_ni->ni_data;
-
- kiblnd_peer_decref(peer);
- rdma_destroy_id(cmid);
- atomic_dec(&net->ibn_nconns);
- }
-}
-
-int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why)
-{
- struct kib_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_version, why);
-
- kiblnd_close_conn_locked(conn, why);
- count++;
- }
-
- return count;
-}
-
-int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
- int version, __u64 incarnation)
-{
- struct kib_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- if (conn->ibc_version == version &&
- conn->ibc_incarnation == incarnation)
- continue;
-
- CDEBUG(D_NET,
- "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_version, conn->ibc_incarnation,
- version, incarnation);
-
- kiblnd_close_conn_locked(conn, -ESTALE);
- count++;
- }
-
- return count;
-}
-
-static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- struct list_head *pnxt;
- int lo;
- int hi;
- int i;
- unsigned long flags;
- int count = 0;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (nid != LNET_NID_ANY) {
- lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- } else {
- lo = 0;
- hi = kiblnd_data.kib_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
- continue;
-
- count += kiblnd_close_peer_conns_locked(peer, 0);
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /* wildcards always succeed */
- if (nid == LNET_NID_ANY)
- return 0;
-
- return !count ? -ENOENT : 0;
-}
-
-static int kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
-{
- struct libcfs_ioctl_data *data = arg;
- int rc = -EINVAL;
-
- switch (cmd) {
- case IOC_LIBCFS_GET_PEER: {
- lnet_nid_t nid = 0;
- int count = 0;
-
- rc = kiblnd_get_peer_info(ni, data->ioc_count,
- &nid, &count);
- data->ioc_nid = nid;
- data->ioc_count = count;
- break;
- }
-
- case IOC_LIBCFS_DEL_PEER: {
- rc = kiblnd_del_peer(ni, data->ioc_nid);
- break;
- }
- case IOC_LIBCFS_GET_CONN: {
- struct kib_conn *conn;
-
- rc = 0;
- conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
- if (!conn) {
- rc = -ENOENT;
- break;
- }
-
- LASSERT(conn->ibc_cmid);
- data->ioc_nid = conn->ibc_peer->ibp_nid;
- if (!conn->ibc_cmid->route.path_rec)
- data->ioc_u32[0] = 0; /* iWarp has no path MTU */
- else
- data->ioc_u32[0] =
- ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
- kiblnd_conn_decref(conn);
- break;
- }
- case IOC_LIBCFS_CLOSE_CONNECTION: {
- rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
- break;
- }
-
- default:
- break;
- }
-
- return rc;
-}
-
-static void kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid,
- unsigned long *when)
-{
- unsigned long last_alive = 0;
- unsigned long now = jiffies;
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_peer *peer;
- unsigned long flags;
-
- read_lock_irqsave(glock, flags);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer)
- last_alive = peer->ibp_last_alive;
-
- read_unlock_irqrestore(glock, flags);
-
- if (last_alive)
- *when = last_alive;
-
- /*
- * peer is not persistent in hash, trigger peer creation
- * and connection establishment with a NULL tx
- */
- if (!peer)
- kiblnd_launch_tx(ni, NULL, nid);
-
- CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
- libcfs_nid2str(nid), peer,
- last_alive ? (now - last_alive) / HZ : -1);
-}
-
-static void kiblnd_free_pages(struct kib_pages *p)
-{
- int npages = p->ibp_npages;
- int i;
-
- for (i = 0; i < npages; i++) {
- if (p->ibp_pages[i])
- __free_page(p->ibp_pages[i]);
- }
-
- kfree(p);
-}
-
-int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages)
-{
- struct kib_pages *p;
- int i;
-
- p = kzalloc_cpt(offsetof(struct kib_pages, ibp_pages[npages]),
- GFP_NOFS, cpt);
- if (!p) {
- CERROR("Can't allocate descriptor for %d pages\n", npages);
- return -ENOMEM;
- }
-
- p->ibp_npages = npages;
-
- for (i = 0; i < npages; i++) {
- p->ibp_pages[i] = alloc_pages_node(
- cfs_cpt_spread_node(lnet_cpt_table(), cpt),
- GFP_NOFS, 0);
- if (!p->ibp_pages[i]) {
- CERROR("Can't allocate page %d of %d\n", i, npages);
- kiblnd_free_pages(p);
- return -ENOMEM;
- }
- }
-
- *pp = p;
- return 0;
-}
-
-void kiblnd_unmap_rx_descs(struct kib_conn *conn)
-{
- struct kib_rx *rx;
- int i;
-
- LASSERT(conn->ibc_rxs);
- LASSERT(conn->ibc_hdev);
-
- for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
- rx = &conn->ibc_rxs[i];
-
- LASSERT(rx->rx_nob >= 0); /* not posted */
-
- kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
- KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
- rx->rx_msgaddr),
- IBLND_MSG_SIZE, DMA_FROM_DEVICE);
- }
-
- kiblnd_free_pages(conn->ibc_rx_pages);
-
- conn->ibc_rx_pages = NULL;
-}
-
-void kiblnd_map_rx_descs(struct kib_conn *conn)
-{
- struct kib_rx *rx;
- struct page *pg;
- int pg_off;
- int ipg;
- int i;
-
- for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
- pg = conn->ibc_rx_pages->ibp_pages[ipg];
- rx = &conn->ibc_rxs[i];
-
- rx->rx_conn = conn;
- rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off);
-
- rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
- rx->rx_msg,
- IBLND_MSG_SIZE,
- DMA_FROM_DEVICE);
- LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
- rx->rx_msgaddr));
- KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
-
- CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
- i, rx->rx_msg, rx->rx_msgaddr,
- (__u64)(page_to_phys(pg) + pg_off));
-
- pg_off += IBLND_MSG_SIZE;
- LASSERT(pg_off <= PAGE_SIZE);
-
- if (pg_off == PAGE_SIZE) {
- pg_off = 0;
- ipg++;
- LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
- }
- }
-}
-
-static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo)
-{
- struct kib_hca_dev *hdev = tpo->tpo_hdev;
- struct kib_tx *tx;
- int i;
-
- LASSERT(!tpo->tpo_pool.po_allocated);
-
- if (!hdev)
- return;
-
- for (i = 0; i < tpo->tpo_pool.po_size; i++) {
- tx = &tpo->tpo_tx_descs[i];
- kiblnd_dma_unmap_single(hdev->ibh_ibdev,
- KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
- tx->tx_msgaddr),
- IBLND_MSG_SIZE, DMA_TO_DEVICE);
- }
-
- kiblnd_hdev_decref(hdev);
- tpo->tpo_hdev = NULL;
-}
-
-static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev)
-{
- struct kib_hca_dev *hdev;
- unsigned long flags;
- int i = 0;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- while (dev->ibd_failover) {
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- if (!(i++ % 50))
- CDEBUG(D_NET, "%s: Wait for failover\n",
- dev->ibd_ifname);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 100);
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- }
-
- kiblnd_hdev_addref_locked(dev->ibd_hdev);
- hdev = dev->ibd_hdev;
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- return hdev;
-}
-
-static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
-{
- struct kib_pages *txpgs = tpo->tpo_tx_pages;
- struct kib_pool *pool = &tpo->tpo_pool;
- struct kib_net *net = pool->po_owner->ps_net;
- struct kib_dev *dev;
- struct page *page;
- struct kib_tx *tx;
- int page_offset;
- int ipage;
- int i;
-
- LASSERT(net);
-
- dev = net->ibn_dev;
-
- /* pre-mapped messages are not bigger than 1 page */
- BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE);
-
- /* No fancy arithmetic when we do the buffer calculations */
- BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE);
-
- tpo->tpo_hdev = kiblnd_current_hdev(dev);
-
- for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
- page = txpgs->ibp_pages[ipage];
- tx = &tpo->tpo_tx_descs[i];
-
- tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) +
- page_offset);
-
- tx->tx_msgaddr = kiblnd_dma_map_single(
- tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
- IBLND_MSG_SIZE, DMA_TO_DEVICE);
- LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
- tx->tx_msgaddr));
- KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
-
- list_add(&tx->tx_list, &pool->po_free_list);
-
- page_offset += IBLND_MSG_SIZE;
- LASSERT(page_offset <= PAGE_SIZE);
-
- if (page_offset == PAGE_SIZE) {
- page_offset = 0;
- ipage++;
- LASSERT(ipage <= txpgs->ibp_npages);
- }
- }
-}
-
-static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo)
-{
- LASSERT(!fpo->fpo_map_count);
-
- if (fpo->fpo_is_fmr) {
- if (fpo->fmr.fpo_fmr_pool)
- ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
- } else {
- struct kib_fast_reg_descriptor *frd, *tmp;
- int i = 0;
-
- list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
- frd_list) {
- list_del(&frd->frd_list);
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
- i++;
- }
- if (i < fpo->fast_reg.fpo_pool_size)
- CERROR("FastReg pool still has %d regions registered\n",
- fpo->fast_reg.fpo_pool_size - i);
- }
-
- if (fpo->fpo_hdev)
- kiblnd_hdev_decref(fpo->fpo_hdev);
-
- kfree(fpo);
-}
-
-static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
-{
- struct kib_fmr_pool *fpo, *tmp;
-
- list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
- list_del(&fpo->fpo_list);
- kiblnd_destroy_fmr_pool(fpo);
- }
-}
-
-static int
-kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
- int ncpts)
-{
- int size = tunables->lnd_fmr_pool_size / ncpts;
-
- return max(IBLND_FMR_POOL, size);
-}
-
-static int
-kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
- int ncpts)
-{
- int size = tunables->lnd_fmr_flush_trigger / ncpts;
-
- return max(IBLND_FMR_POOL_FLUSH, size);
-}
-
-static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
-{
- struct ib_fmr_pool_param param = {
- .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
- .page_shift = PAGE_SHIFT,
- .access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE),
- .pool_size = fps->fps_pool_size,
- .dirty_watermark = fps->fps_flush_trigger,
- .flush_function = NULL,
- .flush_arg = NULL,
- .cache = !!fps->fps_cache };
- int rc = 0;
-
- fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
- &param);
- if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
- rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
- if (rc != -ENOSYS)
- CERROR("Failed to create FMR pool: %d\n", rc);
- else
- CERROR("FMRs are not supported\n");
- }
-
- return rc;
-}
-
-static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
-{
- struct kib_fast_reg_descriptor *frd, *tmp;
- int i, rc;
-
- INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
- fpo->fast_reg.fpo_pool_size = 0;
- for (i = 0; i < fps->fps_pool_size; i++) {
- frd = kzalloc_cpt(sizeof(*frd), GFP_NOFS, fps->fps_cpt);
- if (!frd) {
- CERROR("Failed to allocate a new fast_reg descriptor\n");
- rc = -ENOMEM;
- goto out;
- }
-
- frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
- IB_MR_TYPE_MEM_REG,
- LNET_MAX_PAYLOAD / PAGE_SIZE);
- if (IS_ERR(frd->frd_mr)) {
- rc = PTR_ERR(frd->frd_mr);
- CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
- frd->frd_mr = NULL;
- goto out_middle;
- }
-
- frd->frd_valid = true;
-
- list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
- fpo->fast_reg.fpo_pool_size++;
- }
-
- return 0;
-
-out_middle:
- if (frd->frd_mr)
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
-
-out:
- list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
- frd_list) {
- list_del(&frd->frd_list);
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
- }
-
- return rc;
-}
-
-static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps,
- struct kib_fmr_pool **pp_fpo)
-{
- struct kib_dev *dev = fps->fps_net->ibn_dev;
- struct ib_device_attr *dev_attr;
- struct kib_fmr_pool *fpo;
- int rc;
-
- fpo = kzalloc_cpt(sizeof(*fpo), GFP_NOFS, fps->fps_cpt);
- if (!fpo)
- return -ENOMEM;
-
- fpo->fpo_hdev = kiblnd_current_hdev(dev);
- dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
-
- /* Check for FMR or FastReg support */
- fpo->fpo_is_fmr = 0;
- if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
- fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
- fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
- fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
- LCONSOLE_INFO("Using FMR for registration\n");
- fpo->fpo_is_fmr = 1;
- } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
- LCONSOLE_INFO("Using FastReg for registration\n");
- } else {
- rc = -ENOSYS;
- LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n");
- goto out_fpo;
- }
-
- if (fpo->fpo_is_fmr)
- rc = kiblnd_alloc_fmr_pool(fps, fpo);
- else
- rc = kiblnd_alloc_freg_pool(fps, fpo);
- if (rc)
- goto out_fpo;
-
- fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- fpo->fpo_owner = fps;
- *pp_fpo = fpo;
-
- return 0;
-
-out_fpo:
- kiblnd_hdev_decref(fpo->fpo_hdev);
- kfree(fpo);
- return rc;
-}
-
-static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps,
- struct list_head *zombies)
-{
- if (!fps->fps_net) /* initialized? */
- return;
-
- spin_lock(&fps->fps_lock);
-
- while (!list_empty(&fps->fps_pool_list)) {
- struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next,
- struct kib_fmr_pool, fpo_list);
- fpo->fpo_failed = 1;
- list_del(&fpo->fpo_list);
- if (!fpo->fpo_map_count)
- list_add(&fpo->fpo_list, zombies);
- else
- list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
- }
-
- spin_unlock(&fps->fps_lock);
-}
-
-static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
-{
- if (fps->fps_net) { /* initialized? */
- kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
- kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
- }
-}
-
-static int
-kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
- struct kib_net *net,
- struct lnet_ioctl_config_o2iblnd_tunables *tunables)
-{
- struct kib_fmr_pool *fpo;
- int rc;
-
- memset(fps, 0, sizeof(*fps));
-
- fps->fps_net = net;
- fps->fps_cpt = cpt;
-
- fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
- fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
- fps->fps_cache = tunables->lnd_fmr_cache;
-
- spin_lock_init(&fps->fps_lock);
- INIT_LIST_HEAD(&fps->fps_pool_list);
- INIT_LIST_HEAD(&fps->fps_failed_pool_list);
-
- rc = kiblnd_create_fmr_pool(fps, &fpo);
- if (!rc)
- list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
-
- return rc;
-}
-
-static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now)
-{
- if (fpo->fpo_map_count) /* still in use */
- return 0;
- if (fpo->fpo_failed)
- return 1;
- return time_after_eq(now, fpo->fpo_deadline);
-}
-
-static int
-kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
-{
- __u64 *pages = tx->tx_pages;
- struct kib_hca_dev *hdev;
- int npages;
- int size;
- int i;
-
- hdev = tx->tx_pool->tpo_hdev;
-
- for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
- for (size = 0; size < rd->rd_frags[i].rf_nob;
- size += hdev->ibh_page_size) {
- pages[npages++] = (rd->rd_frags[i].rf_addr &
- hdev->ibh_page_mask) + size;
- }
- }
-
- return npages;
-}
-
-void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
-{
- LIST_HEAD(zombies);
- struct kib_fmr_pool *fpo = fmr->fmr_pool;
- struct kib_fmr_poolset *fps;
- unsigned long now = jiffies;
- struct kib_fmr_pool *tmp;
- int rc;
-
- if (!fpo)
- return;
-
- fps = fpo->fpo_owner;
- if (fpo->fpo_is_fmr) {
- if (fmr->fmr_pfmr) {
- rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
- LASSERT(!rc);
- fmr->fmr_pfmr = NULL;
- }
-
- if (status) {
- rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
- LASSERT(!rc);
- }
- } else {
- struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
-
- if (frd) {
- frd->frd_valid = false;
- spin_lock(&fps->fps_lock);
- list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
- spin_unlock(&fps->fps_lock);
- fmr->fmr_frd = NULL;
- }
- }
- fmr->fmr_pool = NULL;
-
- spin_lock(&fps->fps_lock);
- fpo->fpo_map_count--; /* decref the pool */
-
- list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
- /* the first pool is persistent */
- if (fps->fps_pool_list.next == &fpo->fpo_list)
- continue;
-
- if (kiblnd_fmr_pool_is_idle(fpo, now)) {
- list_move(&fpo->fpo_list, &zombies);
- fps->fps_version++;
- }
- }
- spin_unlock(&fps->fps_lock);
-
- if (!list_empty(&zombies))
- kiblnd_destroy_fmr_pool_list(&zombies);
-}
-
-int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
- struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
- struct kib_fmr *fmr)
-{
- __u64 *pages = tx->tx_pages;
- bool is_rx = (rd != tx->tx_rd);
- bool tx_pages_mapped = false;
- struct kib_fmr_pool *fpo;
- int npages = 0;
- __u64 version;
- int rc;
-
- again:
- spin_lock(&fps->fps_lock);
- version = fps->fps_version;
- list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
- fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- fpo->fpo_map_count++;
-
- if (fpo->fpo_is_fmr) {
- struct ib_pool_fmr *pfmr;
-
- spin_unlock(&fps->fps_lock);
-
- if (!tx_pages_mapped) {
- npages = kiblnd_map_tx_pages(tx, rd);
- tx_pages_mapped = 1;
- }
-
- pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
- pages, npages, iov);
- if (likely(!IS_ERR(pfmr))) {
- fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
- pfmr->fmr->lkey;
- fmr->fmr_frd = NULL;
- fmr->fmr_pfmr = pfmr;
- fmr->fmr_pool = fpo;
- return 0;
- }
- rc = PTR_ERR(pfmr);
- } else {
- if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
- struct kib_fast_reg_descriptor *frd;
- struct ib_reg_wr *wr;
- struct ib_mr *mr;
- int n;
-
- frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
- struct kib_fast_reg_descriptor,
- frd_list);
- list_del(&frd->frd_list);
- spin_unlock(&fps->fps_lock);
-
- mr = frd->frd_mr;
-
- if (!frd->frd_valid) {
- __u32 key = is_rx ? mr->rkey : mr->lkey;
- struct ib_send_wr *inv_wr;
-
- inv_wr = &frd->frd_inv_wr;
- memset(inv_wr, 0, sizeof(*inv_wr));
- inv_wr->opcode = IB_WR_LOCAL_INV;
- inv_wr->wr_id = IBLND_WID_MR;
- inv_wr->ex.invalidate_rkey = key;
-
- /* Bump the key */
- key = ib_inc_rkey(key);
- ib_update_fast_reg_key(mr, key);
- }
-
- n = ib_map_mr_sg(mr, tx->tx_frags,
- tx->tx_nfrags, NULL, PAGE_SIZE);
- if (unlikely(n != tx->tx_nfrags)) {
- CERROR("Failed to map mr %d/%d elements\n",
- n, tx->tx_nfrags);
- return n < 0 ? n : -EINVAL;
- }
-
- mr->iova = iov;
-
- /* Prepare FastReg WR */
- wr = &frd->frd_fastreg_wr;
- memset(wr, 0, sizeof(*wr));
- wr->wr.opcode = IB_WR_REG_MR;
- wr->wr.wr_id = IBLND_WID_MR;
- wr->wr.num_sge = 0;
- wr->wr.send_flags = 0;
- wr->mr = mr;
- wr->key = is_rx ? mr->rkey : mr->lkey;
- wr->access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
-
- fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
- fmr->fmr_frd = frd;
- fmr->fmr_pfmr = NULL;
- fmr->fmr_pool = fpo;
- return 0;
- }
- spin_unlock(&fps->fps_lock);
- rc = -EAGAIN;
- }
-
- spin_lock(&fps->fps_lock);
- fpo->fpo_map_count--;
- if (rc != -EAGAIN) {
- spin_unlock(&fps->fps_lock);
- return rc;
- }
-
- /* EAGAIN and ... */
- if (version != fps->fps_version) {
- spin_unlock(&fps->fps_lock);
- goto again;
- }
- }
-
- if (fps->fps_increasing) {
- spin_unlock(&fps->fps_lock);
- CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n");
- schedule();
- goto again;
- }
-
- if (time_before(jiffies, fps->fps_next_retry)) {
- /* someone failed recently */
- spin_unlock(&fps->fps_lock);
- return -EAGAIN;
- }
-
- fps->fps_increasing = 1;
- spin_unlock(&fps->fps_lock);
-
- CDEBUG(D_NET, "Allocate new FMR pool\n");
- rc = kiblnd_create_fmr_pool(fps, &fpo);
- spin_lock(&fps->fps_lock);
- fps->fps_increasing = 0;
- if (!rc) {
- fps->fps_version++;
- list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
- } else {
- fps->fps_next_retry = jiffies + IBLND_POOL_RETRY * HZ;
- }
- spin_unlock(&fps->fps_lock);
-
- goto again;
-}
-
-static void kiblnd_fini_pool(struct kib_pool *pool)
-{
- LASSERT(list_empty(&pool->po_free_list));
- LASSERT(!pool->po_allocated);
-
- CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
-}
-
-static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size)
-{
- CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
-
- memset(pool, 0, sizeof(*pool));
- INIT_LIST_HEAD(&pool->po_free_list);
- pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- pool->po_owner = ps;
- pool->po_size = size;
-}
-
-static void kiblnd_destroy_pool_list(struct list_head *head)
-{
- struct kib_pool *pool;
-
- while (!list_empty(head)) {
- pool = list_entry(head->next, struct kib_pool, po_list);
- list_del(&pool->po_list);
-
- LASSERT(pool->po_owner);
- pool->po_owner->ps_pool_destroy(pool);
- }
-}
-
-static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
-{
- if (!ps->ps_net) /* initialized? */
- return;
-
- spin_lock(&ps->ps_lock);
- while (!list_empty(&ps->ps_pool_list)) {
- struct kib_pool *po = list_entry(ps->ps_pool_list.next,
- struct kib_pool, po_list);
- po->po_failed = 1;
- list_del(&po->po_list);
- if (!po->po_allocated)
- list_add(&po->po_list, zombies);
- else
- list_add(&po->po_list, &ps->ps_failed_pool_list);
- }
- spin_unlock(&ps->ps_lock);
-}
-
-static void kiblnd_fini_poolset(struct kib_poolset *ps)
-{
- if (ps->ps_net) { /* initialized? */
- kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
- kiblnd_destroy_pool_list(&ps->ps_pool_list);
- }
-}
-
-static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt,
- struct kib_net *net, char *name, int size,
- kib_ps_pool_create_t po_create,
- kib_ps_pool_destroy_t po_destroy,
- kib_ps_node_init_t nd_init,
- kib_ps_node_fini_t nd_fini)
-{
- struct kib_pool *pool;
- int rc;
-
- memset(ps, 0, sizeof(*ps));
-
- ps->ps_cpt = cpt;
- ps->ps_net = net;
- ps->ps_pool_create = po_create;
- ps->ps_pool_destroy = po_destroy;
- ps->ps_node_init = nd_init;
- ps->ps_node_fini = nd_fini;
- ps->ps_pool_size = size;
- if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
- >= sizeof(ps->ps_name))
- return -E2BIG;
- spin_lock_init(&ps->ps_lock);
- INIT_LIST_HEAD(&ps->ps_pool_list);
- INIT_LIST_HEAD(&ps->ps_failed_pool_list);
-
- rc = ps->ps_pool_create(ps, size, &pool);
- if (!rc)
- list_add(&pool->po_list, &ps->ps_pool_list);
- else
- CERROR("Failed to create the first pool for %s\n", ps->ps_name);
-
- return rc;
-}
-
-static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now)
-{
- if (pool->po_allocated) /* still in use */
- return 0;
- if (pool->po_failed)
- return 1;
- return time_after_eq(now, pool->po_deadline);
-}
-
-void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
-{
- LIST_HEAD(zombies);
- struct kib_poolset *ps = pool->po_owner;
- struct kib_pool *tmp;
- unsigned long now = jiffies;
-
- spin_lock(&ps->ps_lock);
-
- if (ps->ps_node_fini)
- ps->ps_node_fini(pool, node);
-
- LASSERT(pool->po_allocated > 0);
- list_add(node, &pool->po_free_list);
- pool->po_allocated--;
-
- list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
- /* the first pool is persistent */
- if (ps->ps_pool_list.next == &pool->po_list)
- continue;
-
- if (kiblnd_pool_is_idle(pool, now))
- list_move(&pool->po_list, &zombies);
- }
- spin_unlock(&ps->ps_lock);
-
- if (!list_empty(&zombies))
- kiblnd_destroy_pool_list(&zombies);
-}
-
-struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps)
-{
- struct list_head *node;
- struct kib_pool *pool;
- unsigned int interval = 1;
- unsigned long time_before;
- unsigned int trips = 0;
- int rc;
-
- again:
- spin_lock(&ps->ps_lock);
- list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
- if (list_empty(&pool->po_free_list))
- continue;
-
- pool->po_allocated++;
- pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- node = pool->po_free_list.next;
- list_del(node);
-
- if (ps->ps_node_init) {
- /* still hold the lock */
- ps->ps_node_init(pool, node);
- }
- spin_unlock(&ps->ps_lock);
- return node;
- }
-
- /* no available tx pool and ... */
- if (ps->ps_increasing) {
- /* another thread is allocating a new pool */
- spin_unlock(&ps->ps_lock);
- trips++;
- CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d HZs for her to complete. trips = %d\n",
- ps->ps_name, interval, trips);
-
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(interval);
- if (interval < HZ)
- interval *= 2;
-
- goto again;
- }
-
- if (time_before(jiffies, ps->ps_next_retry)) {
- /* someone failed recently */
- spin_unlock(&ps->ps_lock);
- return NULL;
- }
-
- ps->ps_increasing = 1;
- spin_unlock(&ps->ps_lock);
-
- CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
- time_before = jiffies;
- rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
- CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete",
- jiffies - time_before);
-
- spin_lock(&ps->ps_lock);
- ps->ps_increasing = 0;
- if (!rc) {
- list_add_tail(&pool->po_list, &ps->ps_pool_list);
- } else {
- ps->ps_next_retry = jiffies + IBLND_POOL_RETRY * HZ;
- CERROR("Can't allocate new %s pool because out of memory\n",
- ps->ps_name);
- }
- spin_unlock(&ps->ps_lock);
-
- goto again;
-}
-
-static void kiblnd_destroy_tx_pool(struct kib_pool *pool)
-{
- struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool);
- int i;
-
- LASSERT(!pool->po_allocated);
-
- if (tpo->tpo_tx_pages) {
- kiblnd_unmap_tx_pool(tpo);
- kiblnd_free_pages(tpo->tpo_tx_pages);
- }
-
- if (!tpo->tpo_tx_descs)
- goto out;
-
- for (i = 0; i < pool->po_size; i++) {
- struct kib_tx *tx = &tpo->tpo_tx_descs[i];
-
- list_del(&tx->tx_list);
- kfree(tx->tx_pages);
- kfree(tx->tx_frags);
- kfree(tx->tx_wrq);
- kfree(tx->tx_sge);
- kfree(tx->tx_rd);
- }
-
- kfree(tpo->tpo_tx_descs);
-out:
- kiblnd_fini_pool(pool);
- kfree(tpo);
-}
-
-static int kiblnd_tx_pool_size(int ncpts)
-{
- int ntx = *kiblnd_tunables.kib_ntx / ncpts;
-
- return max(IBLND_TX_POOL, ntx);
-}
-
-static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size,
- struct kib_pool **pp_po)
-{
- int i;
- int npg;
- struct kib_pool *pool;
- struct kib_tx_pool *tpo;
-
- tpo = kzalloc_cpt(sizeof(*tpo), GFP_NOFS, ps->ps_cpt);
- if (!tpo) {
- CERROR("Failed to allocate TX pool\n");
- return -ENOMEM;
- }
-
- pool = &tpo->tpo_pool;
- kiblnd_init_pool(ps, pool, size);
- tpo->tpo_tx_descs = NULL;
- tpo->tpo_tx_pages = NULL;
-
- npg = DIV_ROUND_UP(size * IBLND_MSG_SIZE, PAGE_SIZE);
- if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
- CERROR("Can't allocate tx pages: %d\n", npg);
- kfree(tpo);
- return -ENOMEM;
- }
-
- tpo->tpo_tx_descs = kzalloc_cpt(size * sizeof(struct kib_tx),
- GFP_NOFS, ps->ps_cpt);
- if (!tpo->tpo_tx_descs) {
- CERROR("Can't allocate %d tx descriptors\n", size);
- ps->ps_pool_destroy(pool);
- return -ENOMEM;
- }
-
- memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx));
-
- for (i = 0; i < size; i++) {
- struct kib_tx *tx = &tpo->tpo_tx_descs[i];
-
- tx->tx_pool = tpo;
- if (ps->ps_net->ibn_fmr_ps) {
- tx->tx_pages = kzalloc_cpt(LNET_MAX_IOV * sizeof(*tx->tx_pages),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_pages)
- break;
- }
-
- tx->tx_frags = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_frags),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_frags)
- break;
-
- sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
-
- tx->tx_wrq = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_wrq),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_wrq)
- break;
-
- tx->tx_sge = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_sge),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_sge)
- break;
-
- tx->tx_rd = kzalloc_cpt(offsetof(struct kib_rdma_desc,
- rd_frags[IBLND_MAX_RDMA_FRAGS]),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_rd)
- break;
- }
-
- if (i == size) {
- kiblnd_map_tx_pool(tpo);
- *pp_po = pool;
- return 0;
- }
-
- ps->ps_pool_destroy(pool);
- return -ENOMEM;
-}
-
-static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node)
-{
- struct kib_tx_poolset *tps = container_of(pool->po_owner,
- struct kib_tx_poolset,
- tps_poolset);
- struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list);
-
- tx->tx_cookie = tps->tps_next_tx_cookie++;
-}
-
-static void kiblnd_net_fini_pools(struct kib_net *net)
-{
- int i;
-
- cfs_cpt_for_each(i, lnet_cpt_table()) {
- struct kib_tx_poolset *tps;
- struct kib_fmr_poolset *fps;
-
- if (net->ibn_tx_ps) {
- tps = net->ibn_tx_ps[i];
- kiblnd_fini_poolset(&tps->tps_poolset);
- }
-
- if (net->ibn_fmr_ps) {
- fps = net->ibn_fmr_ps[i];
- kiblnd_fini_fmr_poolset(fps);
- }
- }
-
- if (net->ibn_tx_ps) {
- cfs_percpt_free(net->ibn_tx_ps);
- net->ibn_tx_ps = NULL;
- }
-
- if (net->ibn_fmr_ps) {
- cfs_percpt_free(net->ibn_fmr_ps);
- net->ibn_fmr_ps = NULL;
- }
-}
-
-static int kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni,
- __u32 *cpts, int ncpts)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int cpt;
- int rc;
- int i;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
- CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
- tunables->lnd_fmr_pool_size,
- *kiblnd_tunables.kib_ntx / 4);
- rc = -EINVAL;
- goto failed;
- }
-
- /*
- * TX pool must be created later than FMR, see LU-2268
- * for details
- */
- LASSERT(!net->ibn_tx_ps);
-
- /*
- * premapping can fail if ibd_nmr > 1, so we always create
- * FMR pool and map-on-demand if premapping failed
- *
- * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset
- * The number of struct kib_fmr_poolsets create is equal to the
- * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt].
- */
- net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(struct kib_fmr_poolset));
- if (!net->ibn_fmr_ps) {
- CERROR("Failed to allocate FMR pool array\n");
- rc = -ENOMEM;
- goto failed;
- }
-
- for (i = 0; i < ncpts; i++) {
- cpt = !cpts ? i : cpts[i];
- rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
- net, tunables);
- if (rc) {
- CERROR("Can't initialize FMR pool for CPT %d: %d\n",
- cpt, rc);
- goto failed;
- }
- }
-
- if (i > 0)
- LASSERT(i == ncpts);
-
- /*
- * cfs_precpt_alloc is creating an array of struct kib_tx_poolset
- * The number of struct kib_tx_poolsets create is equal to the
- * number of CPTs that exist, i.e net->ibn_tx_ps[cpt].
- */
- net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(struct kib_tx_poolset));
- if (!net->ibn_tx_ps) {
- CERROR("Failed to allocate tx pool array\n");
- rc = -ENOMEM;
- goto failed;
- }
-
- for (i = 0; i < ncpts; i++) {
- cpt = !cpts ? i : cpts[i];
- rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
- cpt, net, "TX",
- kiblnd_tx_pool_size(ncpts),
- kiblnd_create_tx_pool,
- kiblnd_destroy_tx_pool,
- kiblnd_tx_init, NULL);
- if (rc) {
- CERROR("Can't initialize TX pool for CPT %d: %d\n",
- cpt, rc);
- goto failed;
- }
- }
-
- return 0;
- failed:
- kiblnd_net_fini_pools(net);
- LASSERT(rc);
- return rc;
-}
-
-static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
-{
- /*
- * It's safe to assume an HCA can handle a page size
- * matching that of the native system
- */
- hdev->ibh_page_shift = PAGE_SHIFT;
- hdev->ibh_page_size = 1 << PAGE_SHIFT;
- hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
-
- hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
- if (hdev->ibh_mr_size == ~0ULL) {
- hdev->ibh_mr_shift = 64;
- return 0;
- }
-
- CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
- return -EINVAL;
-}
-
-void kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
-{
- if (hdev->ibh_pd)
- ib_dealloc_pd(hdev->ibh_pd);
-
- if (hdev->ibh_cmid)
- rdma_destroy_id(hdev->ibh_cmid);
-
- kfree(hdev);
-}
-
-/* DUMMY */
-static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
- struct rdma_cm_event *event)
-{
- return 0;
-}
-
-static int kiblnd_dev_need_failover(struct kib_dev *dev)
-{
- struct rdma_cm_id *cmid;
- struct sockaddr_in srcaddr;
- struct sockaddr_in dstaddr;
- int rc;
-
- if (!dev->ibd_hdev || /* initializing */
- !dev->ibd_hdev->ibh_cmid || /* listener is dead */
- *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
- return 1;
-
- /*
- * XXX: it's UGLY, but there is no better way to detect
- * ib-bonding HCA failover because:
- *
- * a. there is no reliable CM event for HCA failover...
- * b. there is no OFED API to get the ib_device for the current net_device...
- *
- * We have only two choices at this point:
- *
- * a. rdma_bind_addr(), which will conflict with the listener cmid
- * b. rdma_resolve_addr() to a zero address
- */
- cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
- IB_QPT_RC);
- if (IS_ERR(cmid)) {
- rc = PTR_ERR(cmid);
- CERROR("Failed to create cmid for failover: %d\n", rc);
- return rc;
- }
-
- memset(&srcaddr, 0, sizeof(srcaddr));
- srcaddr.sin_family = AF_INET;
- srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
-
- memset(&dstaddr, 0, sizeof(dstaddr));
- dstaddr.sin_family = AF_INET;
- rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr, 1);
- if (rc || !cmid->device) {
- CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
- dev->ibd_ifname, &dev->ibd_ifip,
- cmid->device, rc);
- rdma_destroy_id(cmid);
- return rc;
- }
-
- rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
- rdma_destroy_id(cmid);
-
- return rc;
-}
-
-int kiblnd_dev_failover(struct kib_dev *dev)
-{
- LIST_HEAD(zombie_tpo);
- LIST_HEAD(zombie_ppo);
- LIST_HEAD(zombie_fpo);
- struct rdma_cm_id *cmid = NULL;
- struct kib_hca_dev *hdev = NULL;
- struct ib_pd *pd;
- struct kib_net *net;
- struct sockaddr_in addr;
- unsigned long flags;
- int rc = 0;
- int i;
-
- LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
- dev->ibd_can_failover || !dev->ibd_hdev);
-
- rc = kiblnd_dev_need_failover(dev);
- if (rc <= 0)
- goto out;
-
- if (dev->ibd_hdev &&
- dev->ibd_hdev->ibh_cmid) {
- /*
- * XXX it's not good to close the old listener here, because
- * creating the new listener may fail. But we have to close it
- * now, otherwise rdma_bind_addr will return EADDRINUSE...
- */
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- cmid = dev->ibd_hdev->ibh_cmid;
- /*
- * make the next call to kiblnd_dev_need_failover()
- * return 1 for this device
- */
- dev->ibd_hdev->ibh_cmid = NULL;
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- rdma_destroy_id(cmid);
- }
-
- cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
- IB_QPT_RC);
- if (IS_ERR(cmid)) {
- rc = PTR_ERR(cmid);
- CERROR("Failed to create cmid for failover: %d\n", rc);
- goto out;
- }
-
- memset(&addr, 0, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(dev->ibd_ifip);
- addr.sin_port = htons(*kiblnd_tunables.kib_service);
-
- /* Bind to failover device or port */
- rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
- if (rc || !cmid->device) {
- CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
- dev->ibd_ifname, &dev->ibd_ifip,
- cmid->device, rc);
- rdma_destroy_id(cmid);
- goto out;
- }
-
- hdev = kzalloc(sizeof(*hdev), GFP_NOFS);
- if (!hdev) {
- CERROR("Failed to allocate kib_hca_dev\n");
- rdma_destroy_id(cmid);
- rc = -ENOMEM;
- goto out;
- }
-
- atomic_set(&hdev->ibh_ref, 1);
- hdev->ibh_dev = dev;
- hdev->ibh_cmid = cmid;
- hdev->ibh_ibdev = cmid->device;
-
- pd = ib_alloc_pd(cmid->device, 0);
- if (IS_ERR(pd)) {
- rc = PTR_ERR(pd);
- CERROR("Can't allocate PD: %d\n", rc);
- goto out;
- }
-
- hdev->ibh_pd = pd;
-
- rc = rdma_listen(cmid, 0);
- if (rc) {
- CERROR("Can't start new listener: %d\n", rc);
- goto out;
- }
-
- rc = kiblnd_hdev_get_attr(hdev);
- if (rc) {
- CERROR("Can't get device attributes: %d\n", rc);
- goto out;
- }
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- swap(dev->ibd_hdev, hdev); /* take over the refcount */
-
- list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
- cfs_cpt_for_each(i, lnet_cpt_table()) {
- kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
- &zombie_tpo);
-
- if (net->ibn_fmr_ps)
- kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
- &zombie_fpo);
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- out:
- if (!list_empty(&zombie_tpo))
- kiblnd_destroy_pool_list(&zombie_tpo);
- if (!list_empty(&zombie_ppo))
- kiblnd_destroy_pool_list(&zombie_ppo);
- if (!list_empty(&zombie_fpo))
- kiblnd_destroy_fmr_pool_list(&zombie_fpo);
- if (hdev)
- kiblnd_hdev_decref(hdev);
-
- if (rc)
- dev->ibd_failed_failover++;
- else
- dev->ibd_failed_failover = 0;
-
- return rc;
-}
-
-void kiblnd_destroy_dev(struct kib_dev *dev)
-{
- LASSERT(!dev->ibd_nnets);
- LASSERT(list_empty(&dev->ibd_nets));
-
- list_del(&dev->ibd_fail_list);
- list_del(&dev->ibd_list);
-
- if (dev->ibd_hdev)
- kiblnd_hdev_decref(dev->ibd_hdev);
-
- kfree(dev);
-}
-
-static struct kib_dev *kiblnd_create_dev(char *ifname)
-{
- struct net_device *netdev;
- struct kib_dev *dev;
- __u32 netmask;
- __u32 ip;
- int up;
- int rc;
-
- rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
- if (rc) {
- CERROR("Can't query IPoIB interface %s: %d\n",
- ifname, rc);
- return NULL;
- }
-
- if (!up) {
- CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
- return NULL;
- }
-
- dev = kzalloc(sizeof(*dev), GFP_NOFS);
- if (!dev)
- return NULL;
-
- netdev = dev_get_by_name(&init_net, ifname);
- if (!netdev) {
- dev->ibd_can_failover = 0;
- } else {
- dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
- dev_put(netdev);
- }
-
- INIT_LIST_HEAD(&dev->ibd_nets);
- INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
- INIT_LIST_HEAD(&dev->ibd_fail_list);
- dev->ibd_ifip = ip;
- strcpy(&dev->ibd_ifname[0], ifname);
-
- /* initialize the device */
- rc = kiblnd_dev_failover(dev);
- if (rc) {
- CERROR("Can't initialize device: %d\n", rc);
- kfree(dev);
- return NULL;
- }
-
- list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
- return dev;
-}
-
-static void kiblnd_base_shutdown(void)
-{
- struct kib_sched_info *sched;
- int i;
-
- LASSERT(list_empty(&kiblnd_data.kib_devs));
-
- switch (kiblnd_data.kib_init) {
- default:
- LBUG();
-
- case IBLND_INIT_ALL:
- case IBLND_INIT_DATA:
- LASSERT(kiblnd_data.kib_peers);
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
- LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
- LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
- LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
- LASSERT(list_empty(&kiblnd_data.kib_reconn_list));
- LASSERT(list_empty(&kiblnd_data.kib_reconn_wait));
-
- /* flag threads to terminate; wake and wait for them to die */
- kiblnd_data.kib_shutdown = 1;
-
- /*
- * NB: we really want to stop scheduler threads net by net
- * instead of for the whole module; this should be improved
- * with dynamic LNet configuration
- */
- cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
- wake_up_all(&sched->ibs_waitq);
-
- wake_up_all(&kiblnd_data.kib_connd_waitq);
- wake_up_all(&kiblnd_data.kib_failover_waitq);
-
- i = 2;
- while (atomic_read(&kiblnd_data.kib_nthreads)) {
- i++;
- /* power of 2 ? */
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
- "Waiting for %d threads to terminate\n",
- atomic_read(&kiblnd_data.kib_nthreads));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- }
-
- /* fall through */
-
- case IBLND_INIT_NOTHING:
- break;
- }
-
- kvfree(kiblnd_data.kib_peers);
-
- if (kiblnd_data.kib_scheds)
- cfs_percpt_free(kiblnd_data.kib_scheds);
-
- kiblnd_data.kib_init = IBLND_INIT_NOTHING;
- module_put(THIS_MODULE);
-}
-
-static void kiblnd_shutdown(struct lnet_ni *ni)
-{
- struct kib_net *net = ni->ni_data;
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- int i;
- unsigned long flags;
-
- LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
-
- if (!net)
- goto out;
-
- write_lock_irqsave(g_lock, flags);
- net->ibn_shutdown = 1;
- write_unlock_irqrestore(g_lock, flags);
-
- switch (net->ibn_init) {
- default:
- LBUG();
-
- case IBLND_INIT_ALL:
- /* nuke all existing peers within this net */
- kiblnd_del_peer(ni, LNET_NID_ANY);
-
- /* Wait for all peer state to clean up */
- i = 2;
- while (atomic_read(&net->ibn_npeers)) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
- "%s: waiting for %d peers to disconnect\n",
- libcfs_nid2str(ni->ni_nid),
- atomic_read(&net->ibn_npeers));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- }
-
- kiblnd_net_fini_pools(net);
-
- write_lock_irqsave(g_lock, flags);
- LASSERT(net->ibn_dev->ibd_nnets > 0);
- net->ibn_dev->ibd_nnets--;
- list_del(&net->ibn_list);
- write_unlock_irqrestore(g_lock, flags);
-
- /* fall through */
-
- case IBLND_INIT_NOTHING:
- LASSERT(!atomic_read(&net->ibn_nconns));
-
- if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
- kiblnd_destroy_dev(net->ibn_dev);
-
- break;
- }
-
- net->ibn_init = IBLND_INIT_NOTHING;
- ni->ni_data = NULL;
-
- kfree(net);
-
-out:
- if (list_empty(&kiblnd_data.kib_devs))
- kiblnd_base_shutdown();
-}
-
-static int kiblnd_base_startup(void)
-{
- struct kib_sched_info *sched;
- int rc;
- int i;
-
- LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
-
- try_module_get(THIS_MODULE);
- /* zero pointers, flags etc */
- memset(&kiblnd_data, 0, sizeof(kiblnd_data));
-
- rwlock_init(&kiblnd_data.kib_global_lock);
-
- INIT_LIST_HEAD(&kiblnd_data.kib_devs);
- INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
-
- kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
- kiblnd_data.kib_peers = kvmalloc_array(kiblnd_data.kib_peer_hash_size,
- sizeof(struct list_head),
- GFP_KERNEL);
- if (!kiblnd_data.kib_peers)
- goto failed;
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
- INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
-
- spin_lock_init(&kiblnd_data.kib_connd_lock);
- INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
- INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
- INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
- INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
-
- init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
- init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
-
- kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(*sched));
- if (!kiblnd_data.kib_scheds)
- goto failed;
-
- cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
- int nthrs;
-
- spin_lock_init(&sched->ibs_lock);
- INIT_LIST_HEAD(&sched->ibs_conns);
- init_waitqueue_head(&sched->ibs_waitq);
-
- nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
- if (*kiblnd_tunables.kib_nscheds > 0) {
- nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
- } else {
- /*
- * cap at half of the CPUs; the other half is reserved
- * for upper-layer modules
- */
- nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
- }
-
- sched->ibs_nthreads_max = nthrs;
- sched->ibs_cpt = i;
- }
-
- kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
-
- /* lists/ptrs/locks initialised */
- kiblnd_data.kib_init = IBLND_INIT_DATA;
- /*****************************************************/
-
- rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
- if (rc) {
- CERROR("Can't spawn o2iblnd connd: %d\n", rc);
- goto failed;
- }
-
- if (*kiblnd_tunables.kib_dev_failover)
- rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
- "kiblnd_failover");
-
- if (rc) {
- CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
- goto failed;
- }
-
- /* flag everything initialised */
- kiblnd_data.kib_init = IBLND_INIT_ALL;
- /*****************************************************/
-
- return 0;
-
- failed:
- kiblnd_base_shutdown();
- return -ENETDOWN;
-}
-
-static int kiblnd_start_schedulers(struct kib_sched_info *sched)
-{
- int rc = 0;
- int nthrs;
- int i;
-
- if (!sched->ibs_nthreads) {
- if (*kiblnd_tunables.kib_nscheds > 0) {
- nthrs = sched->ibs_nthreads_max;
- } else {
- nthrs = cfs_cpt_weight(lnet_cpt_table(),
- sched->ibs_cpt);
- nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
- nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
- }
- } else {
- LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
- /* add one thread if there is a new interface */
- nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
- }
-
- for (i = 0; i < nthrs; i++) {
- long id;
- char name[20];
-
- id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
- snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
- KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
- rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
- if (!rc)
- continue;
-
- CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
- sched->ibs_cpt, sched->ibs_nthreads + i, rc);
- break;
- }
-
- sched->ibs_nthreads += i;
- return rc;
-}
-
-static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts,
- int ncpts)
-{
- int cpt;
- int rc;
- int i;
-
- for (i = 0; i < ncpts; i++) {
- struct kib_sched_info *sched;
-
- cpt = !cpts ? i : cpts[i];
- sched = kiblnd_data.kib_scheds[cpt];
-
- if (!newdev && sched->ibs_nthreads > 0)
- continue;
-
- rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
- if (rc) {
- CERROR("Failed to start scheduler threads for %s\n",
- dev->ibd_ifname);
- return rc;
- }
- }
- return 0;
-}
-
-static struct kib_dev *kiblnd_dev_search(char *ifname)
-{
- struct kib_dev *alias = NULL;
- struct kib_dev *dev;
- char *colon;
- char *colon2;
-
- colon = strchr(ifname, ':');
- list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
- if (!strcmp(&dev->ibd_ifname[0], ifname))
- return dev;
-
- if (alias)
- continue;
-
- colon2 = strchr(dev->ibd_ifname, ':');
- if (colon)
- *colon = 0;
- if (colon2)
- *colon2 = 0;
-
- if (!strcmp(&dev->ibd_ifname[0], ifname))
- alias = dev;
-
- if (colon)
- *colon = ':';
- if (colon2)
- *colon2 = ':';
- }
- return alias;
-}
-
-static int kiblnd_startup(struct lnet_ni *ni)
-{
- char *ifname;
- struct kib_dev *ibdev = NULL;
- struct kib_net *net;
- struct timespec64 tv;
- unsigned long flags;
- int rc;
- int newdev;
-
- LASSERT(ni->ni_lnd == &the_o2iblnd);
-
- if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
- rc = kiblnd_base_startup();
- if (rc)
- return rc;
- }
-
- net = kzalloc(sizeof(*net), GFP_NOFS);
- ni->ni_data = net;
- if (!net)
- goto net_failed;
-
- ktime_get_real_ts64(&tv);
- net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
- tv.tv_nsec / NSEC_PER_USEC;
-
- rc = kiblnd_tunables_setup(ni);
- if (rc)
- goto net_failed;
-
- if (ni->ni_interfaces[0]) {
- /* Use the IPoIB interface specified in 'networks=' */
-
- BUILD_BUG_ON(LNET_MAX_INTERFACES <= 1);
- if (ni->ni_interfaces[1]) {
- CERROR("Multiple interfaces not supported\n");
- goto failed;
- }
-
- ifname = ni->ni_interfaces[0];
- } else {
- ifname = *kiblnd_tunables.kib_default_ipif;
- }
-
- if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
- CERROR("IPoIB interface name too long: %s\n", ifname);
- goto failed;
- }
-
- ibdev = kiblnd_dev_search(ifname);
-
- newdev = !ibdev;
- /* hmm...create kib_dev even for alias */
- if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
- ibdev = kiblnd_create_dev(ifname);
-
- if (!ibdev)
- goto failed;
-
- net->ibn_dev = ibdev;
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
-
- rc = kiblnd_dev_start_threads(ibdev, newdev,
- ni->ni_cpts, ni->ni_ncpts);
- if (rc)
- goto failed;
-
- rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
- if (rc) {
- CERROR("Failed to initialize NI pools: %d\n", rc);
- goto failed;
- }
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- ibdev->ibd_nnets++;
- list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- net->ibn_init = IBLND_INIT_ALL;
-
- return 0;
-
-failed:
- if (!net->ibn_dev && ibdev)
- kiblnd_destroy_dev(ibdev);
-
-net_failed:
- kiblnd_shutdown(ni);
-
- CDEBUG(D_NET, "%s failed\n", __func__);
- return -ENETDOWN;
-}
-
-static struct lnet_lnd the_o2iblnd = {
- .lnd_type = O2IBLND,
- .lnd_startup = kiblnd_startup,
- .lnd_shutdown = kiblnd_shutdown,
- .lnd_ctl = kiblnd_ctl,
- .lnd_query = kiblnd_query,
- .lnd_send = kiblnd_send,
- .lnd_recv = kiblnd_recv,
-};
-
-static void __exit ko2iblnd_exit(void)
-{
- lnet_unregister_lnd(&the_o2iblnd);
-}
-
-static int __init ko2iblnd_init(void)
-{
- int rc;
-
- BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE);
- BUILD_BUG_ON(offsetof(struct kib_msg,
- ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
- > IBLND_MSG_SIZE);
- BUILD_BUG_ON(offsetof(struct kib_msg,
- ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
- > IBLND_MSG_SIZE);
-
- kiblnd_tunables_init();
-
- rc = libcfs_setup();
- if (rc)
- return rc;
-
- lnet_register_lnd(&the_o2iblnd);
-
- return 0;
-}
-
-MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
-MODULE_VERSION("2.7.0");
-MODULE_LICENSE("GPL");
-
-module_init(ko2iblnd_init);
-module_exit(ko2iblnd_exit);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
deleted file mode 100644
index 217503f..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ /dev/null
@@ -1,1048 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd.h
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/unistd.h>
-#include <linux/uio.h>
-#include <linux/uaccess.h>
-
-#include <linux/io.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/list.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-#include <linux/pci.h>
-
-#include <net/sock.h>
-#include <linux/in.h>
-
-#include <rdma/rdma_cm.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_fmr_pool.h>
-
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <linux/lnet/lib-lnet.h>
-
-#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */
-/* # scheduler loops before reschedule */
-#define IBLND_RESCHED 100
-
-#define IBLND_N_SCHED 2
-#define IBLND_N_SCHED_HIGH 4
-
-struct kib_tunables {
- int *kib_dev_failover; /* HCA failover */
- unsigned int *kib_service; /* IB service number */
- int *kib_min_reconnect_interval; /* first failed connection retry... */
- int *kib_max_reconnect_interval; /* exponentially increasing to this */
- int *kib_cksum; /* checksum struct kib_msg? */
- int *kib_timeout; /* comms timeout (seconds) */
- int *kib_keepalive; /* keepalive timeout (seconds) */
- int *kib_ntx; /* # tx descs */
- char **kib_default_ipif; /* default IPoIB interface */
- int *kib_retry_count;
- int *kib_rnr_retry_count;
- int *kib_ib_mtu; /* IB MTU */
- int *kib_require_priv_port; /* accept only privileged ports */
- int *kib_use_priv_port; /* use privileged port for active connect */
- int *kib_nscheds; /* # threads on each CPT */
-};
-
-extern struct kib_tunables kiblnd_tunables;
-
-#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */
-#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when to eagerly return credits */
-
-#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */
-#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *)0)->ibm_credits)) - 1) /* Max # of peer credits */
-
-/* when to eagerly return credits */
-#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
- IBLND_CREDIT_HIGHWATER_V1 : \
- t->lnd_peercredits_hiw)
-
-#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \
- cb, dev, \
- ps, qpt)
-
-/* 2 OOB messages shall suffice for 1 keepalive and 1 credit return */
-#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
-#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
-
-#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */
-#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */
-#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */
-
-/************************/
-/* derived constants... */
-/* Pools (shared by connections on each CPT) */
-/* These pools can grow at runtime, so we don't need to give a very large value */
-#define IBLND_TX_POOL 256
-#define IBLND_FMR_POOL 256
-#define IBLND_FMR_POOL_FLUSH 192
-
-#define IBLND_RX_MSGS(c) \
- ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
-#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
-#define IBLND_RX_MSG_PAGES(c) \
- ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
-
-/* WRs and CQEs (per connection) */
-#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c)
-#define IBLND_SEND_WRS(c) \
- (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \
- kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni))
-#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
-
-struct kib_hca_dev;
-
-/* o2iblnd can run over an aliased interface */
-#ifdef IFALIASZ
-#define KIB_IFNAME_SIZE IFALIASZ
-#else
-#define KIB_IFNAME_SIZE 256
-#endif
-
-struct kib_dev {
- struct list_head ibd_list; /* chain on kib_devs */
- struct list_head ibd_fail_list; /* chain on kib_failed_devs */
- __u32 ibd_ifip; /* IPoIB interface IP */
-
- /* IPoIB interface name */
- char ibd_ifname[KIB_IFNAME_SIZE];
- int ibd_nnets; /* # nets extant */
-
- unsigned long ibd_next_failover;
- int ibd_failed_failover; /* # failover failures */
- unsigned int ibd_failover; /* failover in progress */
- unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */
- struct list_head ibd_nets;
- struct kib_hca_dev *ibd_hdev;
-};
-
-struct kib_hca_dev {
- struct rdma_cm_id *ibh_cmid; /* listener cmid */
- struct ib_device *ibh_ibdev; /* IB device */
- int ibh_page_shift; /* page shift of current HCA */
- int ibh_page_size; /* page size of current HCA */
- __u64 ibh_page_mask; /* page mask of current HCA */
- int ibh_mr_shift; /* bits shift of max MR size */
- __u64 ibh_mr_size; /* size of MR */
- struct ib_pd *ibh_pd; /* PD */
- struct kib_dev *ibh_dev; /* owner */
- atomic_t ibh_ref; /* refcount */
-};
-
-/** # of seconds to keep pool alive */
-#define IBLND_POOL_DEADLINE 300
-/** # of seconds to retry if allocation failed */
-#define IBLND_POOL_RETRY 1
-
-struct kib_pages {
- int ibp_npages; /* # pages */
- struct page *ibp_pages[0]; /* page array */
-};
-
-struct kib_pool;
-struct kib_poolset;
-
-typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps,
- int inc, struct kib_pool **pp_po);
-typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
-typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
-typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
-
-struct kib_net;
-
-#define IBLND_POOL_NAME_LEN 32
-
-struct kib_poolset {
- spinlock_t ps_lock; /* serialize */
- struct kib_net *ps_net; /* network it belongs to */
- char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
- struct list_head ps_pool_list; /* list of pools */
- struct list_head ps_failed_pool_list;/* failed pool list */
- unsigned long ps_next_retry; /* time stamp for retry if */
- /* failed to allocate */
- int ps_increasing; /* is allocating new pool */
- int ps_pool_size; /* new pool size */
- int ps_cpt; /* CPT id */
-
- kib_ps_pool_create_t ps_pool_create; /* create a new pool */
- kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */
- kib_ps_node_init_t ps_node_init; /* initialize new allocated node */
- kib_ps_node_fini_t ps_node_fini; /* finalize node */
-};
-
-struct kib_pool {
- struct list_head po_list; /* chain on pool list */
- struct list_head po_free_list; /* pre-allocated node */
- struct kib_poolset *po_owner; /* pool_set of this pool */
- unsigned long po_deadline; /* deadline of this pool */
- int po_allocated; /* # of elements in use */
- int po_failed; /* pool is created on failed HCA */
- int po_size; /* # of pre-allocated elements */
-};
-
-struct kib_tx_poolset {
- struct kib_poolset tps_poolset; /* pool-set */
- __u64 tps_next_tx_cookie; /* cookie of TX */
-};
-
-struct kib_tx_pool {
- struct kib_pool tpo_pool; /* pool */
- struct kib_hca_dev *tpo_hdev; /* device for this pool */
- struct kib_tx *tpo_tx_descs; /* all the tx descriptors */
- struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */
-};
-
-struct kib_fmr_poolset {
- spinlock_t fps_lock; /* serialize */
- struct kib_net *fps_net; /* IB network */
- struct list_head fps_pool_list; /* FMR pool list */
- struct list_head fps_failed_pool_list;/* FMR pool list */
- __u64 fps_version; /* validity stamp */
- int fps_cpt; /* CPT id */
- int fps_pool_size;
- int fps_flush_trigger;
- int fps_cache;
- int fps_increasing; /* is allocating new pool */
- unsigned long fps_next_retry; /* time stamp for retry if*/
- /* failed to allocate */
-};
-
-struct kib_fast_reg_descriptor { /* For fast registration */
- struct list_head frd_list;
- struct ib_send_wr frd_inv_wr;
- struct ib_reg_wr frd_fastreg_wr;
- struct ib_mr *frd_mr;
- bool frd_valid;
-};
-
-struct kib_fmr_pool {
- struct list_head fpo_list; /* chain on pool list */
- struct kib_hca_dev *fpo_hdev; /* device for this pool */
- struct kib_fmr_poolset *fpo_owner; /* owner of this pool */
- union {
- struct {
- struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
- } fmr;
- struct { /* For fast registration */
- struct list_head fpo_pool_list;
- int fpo_pool_size;
- } fast_reg;
- };
- unsigned long fpo_deadline; /* deadline of this pool */
- int fpo_failed; /* fmr pool is failed */
- int fpo_map_count; /* # of mapped FMR */
- int fpo_is_fmr;
-};
-
-struct kib_fmr {
- struct kib_fmr_pool *fmr_pool; /* pool of FMR */
- struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
- struct kib_fast_reg_descriptor *fmr_frd;
- u32 fmr_key;
-};
-
-struct kib_net {
- struct list_head ibn_list; /* chain on struct kib_dev::ibd_nets */
- __u64 ibn_incarnation;/* my epoch */
- int ibn_init; /* initialisation state */
- int ibn_shutdown; /* shutting down? */
-
- atomic_t ibn_npeers; /* # peers extant */
- atomic_t ibn_nconns; /* # connections extant */
-
- struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */
- struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */
-
- struct kib_dev *ibn_dev; /* underlying IB device */
-};
-
-#define KIB_THREAD_SHIFT 16
-#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
-#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT)
-#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
-
-struct kib_sched_info {
- spinlock_t ibs_lock; /* serialise */
- wait_queue_head_t ibs_waitq; /* schedulers sleep here */
- struct list_head ibs_conns; /* conns to check for rx completions */
- int ibs_nthreads; /* number of scheduler threads */
- int ibs_nthreads_max; /* max allowed scheduler threads */
- int ibs_cpt; /* CPT id */
-};
-
-struct kib_data {
- int kib_init; /* initialisation state */
- int kib_shutdown; /* shut down? */
- struct list_head kib_devs; /* IB devices extant */
- struct list_head kib_failed_devs; /* list head of failed devices */
- wait_queue_head_t kib_failover_waitq; /* schedulers sleep here */
- atomic_t kib_nthreads; /* # live threads */
- rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */
- struct list_head *kib_peers; /* hash table of all my known peers */
- int kib_peer_hash_size; /* size of kib_peers */
- void *kib_connd; /* the connd task (serialisation assertions) */
- struct list_head kib_connd_conns; /* connections to setup/teardown */
- struct list_head kib_connd_zombies; /* connections with zero refcount */
- /* connections to reconnect */
- struct list_head kib_reconn_list;
- /* peers wait for reconnection */
- struct list_head kib_reconn_wait;
- /**
- * The time (in seconds) at which peers are pulled out of
- * \a kib_reconn_wait for reconnection.
- */
- time64_t kib_reconn_sec;
-
- wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */
- spinlock_t kib_connd_lock; /* serialise */
- struct ib_qp_attr kib_error_qpa; /* QP->ERROR */
- struct kib_sched_info **kib_scheds; /* percpt data for schedulers */
-};
-
-#define IBLND_INIT_NOTHING 0
-#define IBLND_INIT_DATA 1
-#define IBLND_INIT_ALL 2
-
-/************************************************************************
- * IB Wire message format.
- * These are sent in sender's byte order (i.e. receiver flips).
- */
-
-struct kib_connparams {
- __u16 ibcp_queue_depth;
- __u16 ibcp_max_frags;
- __u32 ibcp_max_msg_size;
-} WIRE_ATTR;
-
-struct kib_immediate_msg {
- struct lnet_hdr ibim_hdr; /* portals header */
- char ibim_payload[0]; /* piggy-backed payload */
-} WIRE_ATTR;
-
-struct kib_rdma_frag {
- __u32 rf_nob; /* # bytes this frag */
- __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */
-} WIRE_ATTR;
-
-struct kib_rdma_desc {
- __u32 rd_key; /* local/remote key */
- __u32 rd_nfrags; /* # fragments */
- struct kib_rdma_frag rd_frags[0]; /* buffer frags */
-} WIRE_ATTR;
-
-struct kib_putreq_msg {
- struct lnet_hdr ibprm_hdr; /* portals header */
- __u64 ibprm_cookie; /* opaque completion cookie */
-} WIRE_ATTR;
-
-struct kib_putack_msg {
- __u64 ibpam_src_cookie; /* reflected completion cookie */
- __u64 ibpam_dst_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */
-} WIRE_ATTR;
-
-struct kib_get_msg {
- struct lnet_hdr ibgm_hdr; /* portals header */
- __u64 ibgm_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibgm_rd; /* rdma descriptor */
-} WIRE_ATTR;
-
-struct kib_completion_msg {
- __u64 ibcm_cookie; /* opaque completion cookie */
- __s32 ibcm_status; /* < 0 failure: >= 0 length */
-} WIRE_ATTR;
-
-struct kib_msg {
- /* First 2 fields fixed FOR ALL TIME */
- __u32 ibm_magic; /* I'm an ibnal message */
- __u16 ibm_version; /* this is my version number */
-
- __u8 ibm_type; /* msg type */
- __u8 ibm_credits; /* returned credits */
- __u32 ibm_nob; /* # bytes in whole message */
- __u32 ibm_cksum; /* checksum (0 == no checksum) */
- __u64 ibm_srcnid; /* sender's NID */
- __u64 ibm_srcstamp; /* sender's incarnation */
- __u64 ibm_dstnid; /* destination's NID */
- __u64 ibm_dststamp; /* destination's incarnation */
-
- union {
- struct kib_connparams connparams;
- struct kib_immediate_msg immediate;
- struct kib_putreq_msg putreq;
- struct kib_putack_msg putack;
- struct kib_get_msg get;
- struct kib_completion_msg completion;
- } WIRE_ATTR ibm_u;
-} WIRE_ATTR;
-
-#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */
-
-#define IBLND_MSG_VERSION_1 0x11
-#define IBLND_MSG_VERSION_2 0x12
-#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2
-
-#define IBLND_MSG_CONNREQ 0xc0 /* connection request */
-#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */
-#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */
-#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */
-#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */
-#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */
-#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */
-#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
-#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
-#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
-
-struct kib_rej {
- __u32 ibr_magic; /* sender's magic */
- __u16 ibr_version; /* sender's version */
- __u8 ibr_why; /* reject reason */
- __u8 ibr_padding; /* padding */
- __u64 ibr_incarnation; /* incarnation of peer */
- struct kib_connparams ibr_cp; /* connection parameters */
-} WIRE_ATTR;
-
-/* connection rejection reasons */
-#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */
-#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */
-#define IBLND_REJECT_FATAL 3 /* Anything else */
-#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */
-#define IBLND_REJECT_CONN_STALE 5 /* stale peer */
-/* peer's rdma frags don't match mine */
-#define IBLND_REJECT_RDMA_FRAGS 6
-/* peer's msg queue size doesn't match mine */
-#define IBLND_REJECT_MSG_QUEUE_SIZE 7
-
-/***********************************************************************/
-
-struct kib_rx { /* receive message */
- struct list_head rx_list; /* queue for attention */
- struct kib_conn *rx_conn; /* owning conn */
- int rx_nob; /* # bytes received (-1 while posted) */
- enum ib_wc_status rx_status; /* completion status */
- struct kib_msg *rx_msg; /* message buffer (host vaddr) */
- __u64 rx_msgaddr; /* message buffer (I/O addr) */
- DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); /* for dma_unmap_single() */
- struct ib_recv_wr rx_wrq; /* receive work item... */
- struct ib_sge rx_sge; /* ...and its memory */
-};
-
-#define IBLND_POSTRX_DONT_POST 0 /* don't post */
-#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */
-#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
-#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give self back 1 reserved credit */
-
-struct kib_tx { /* transmit message */
- struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
- struct kib_tx_pool *tx_pool; /* pool I'm from */
- struct kib_conn *tx_conn; /* owning conn */
- short tx_sending; /* # tx callbacks outstanding */
- short tx_queued; /* queued for sending */
- short tx_waiting; /* waiting for peer */
- int tx_status; /* LNET completion status */
- unsigned long tx_deadline; /* completion deadline */
- __u64 tx_cookie; /* completion cookie */
- struct lnet_msg *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
- struct kib_msg *tx_msg; /* message buffer (host vaddr) */
- __u64 tx_msgaddr; /* message buffer (I/O addr) */
- DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); /* for dma_unmap_single() */
- int tx_nwrq; /* # send work items */
- struct ib_rdma_wr *tx_wrq; /* send work items... */
- struct ib_sge *tx_sge; /* ...and their memory */
- struct kib_rdma_desc *tx_rd; /* rdma descriptor */
- int tx_nfrags; /* # entries in... */
- struct scatterlist *tx_frags; /* dma_map_sg descriptor */
- __u64 *tx_pages; /* rdma phys page addrs */
- struct kib_fmr fmr; /* FMR */
- int tx_dmadir; /* dma direction */
-};
-
-struct kib_connvars {
- struct kib_msg cv_msg; /* connection-in-progress variables */
-};
-
-struct kib_conn {
- struct kib_sched_info *ibc_sched; /* scheduler information */
- struct kib_peer *ibc_peer; /* owning peer */
- struct kib_hca_dev *ibc_hdev; /* HCA bound on */
- struct list_head ibc_list; /* stash on peer's conn list */
- struct list_head ibc_sched_list; /* schedule for attention */
- __u16 ibc_version; /* version of connection */
- /* reconnect later */
- __u16 ibc_reconnect:1;
- __u64 ibc_incarnation; /* which instance of the peer */
- atomic_t ibc_refcount; /* # users */
- int ibc_state; /* what's happening */
- int ibc_nsends_posted; /* # uncompleted sends */
- int ibc_noops_posted; /* # uncompleted NOOPs */
- int ibc_credits; /* # credits I have */
- int ibc_outstanding_credits; /* # credits to return */
- int ibc_reserved_credits; /* # ACK/DONE msg credits */
- int ibc_comms_error; /* set on comms error */
- /* connections queue depth */
- __u16 ibc_queue_depth;
- /* connections max frags */
- __u16 ibc_max_frags;
- unsigned int ibc_nrx:16; /* receive buffers owned */
- unsigned int ibc_scheduled:1; /* scheduled for attention */
- unsigned int ibc_ready:1; /* CQ callback fired */
- unsigned long ibc_last_send; /* time of last send */
- struct list_head ibc_connd_list; /* link chain for */
- /* kiblnd_check_conns only */
- struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */
- struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs for */
- /* IBLND_MSG_VERSION_1 */
- struct list_head ibc_tx_queue; /* sends that need a credit */
- struct list_head ibc_tx_queue_nocred; /* sends that don't need a */
- /* credit */
- struct list_head ibc_tx_queue_rsrvd; /* sends that need to */
- /* reserve an ACK/DONE msg */
- struct list_head ibc_active_txs; /* active tx awaiting completion */
- spinlock_t ibc_lock; /* serialise */
- struct kib_rx *ibc_rxs; /* the rx descs */
- struct kib_pages *ibc_rx_pages; /* premapped rx msg pages */
-
- struct rdma_cm_id *ibc_cmid; /* CM id */
- struct ib_cq *ibc_cq; /* completion queue */
-
- struct kib_connvars *ibc_connvars; /* in-progress connection state */
-};
-
-#define IBLND_CONN_INIT 0 /* being initialised */
-#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */
-#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */
-#define IBLND_CONN_ESTABLISHED 3 /* connection established */
-#define IBLND_CONN_CLOSING 4 /* being closed */
-#define IBLND_CONN_DISCONNECTED 5 /* disconnected */
-
-struct kib_peer {
- struct list_head ibp_list; /* stash on global peer list */
- lnet_nid_t ibp_nid; /* who's on the other end(s) */
- struct lnet_ni *ibp_ni; /* LNet interface */
- struct list_head ibp_conns; /* all active connections */
- struct kib_conn *ibp_next_conn; /* next connection to send on for
- * round robin */
- struct list_head ibp_tx_queue; /* msgs waiting for a conn */
- __u64 ibp_incarnation; /* incarnation of peer */
- /* when (in jiffies) I was last alive */
- unsigned long ibp_last_alive;
- /* # users */
- atomic_t ibp_refcount;
- /* version of peer */
- __u16 ibp_version;
- /* current passive connection attempts */
- unsigned short ibp_accepting;
- /* current active connection attempts */
- unsigned short ibp_connecting;
- /* reconnect this peer later */
- unsigned char ibp_reconnecting;
- /* counter of how many times we triggered a conn race */
- unsigned char ibp_races;
- /* # consecutive reconnection attempts to this peer */
- unsigned int ibp_reconnected;
- /* errno on closing this peer */
- int ibp_error;
- /* max map_on_demand */
- __u16 ibp_max_frags;
- /* max_peer_credits */
- __u16 ibp_queue_depth;
-};
-
-extern struct kib_data kiblnd_data;
-
-void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);
-
-int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
-
-/* max # of fragments configured by user */
-static inline int
-kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int mod;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- mod = tunables->lnd_map_on_demand;
- return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT;
-}
-
-static inline int
-kiblnd_rdma_frags(int version, struct lnet_ni *ni)
-{
- return version == IBLND_MSG_VERSION_1 ?
- (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) :
- kiblnd_cfg_rdma_frags(ni);
-}
-
-static inline int
-kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int concurrent_sends;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- concurrent_sends = tunables->lnd_concurrent_sends;
-
- if (version == IBLND_MSG_VERSION_1) {
- if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
- return IBLND_MSG_QUEUE_SIZE_V1 * 2;
-
- if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
- return IBLND_MSG_QUEUE_SIZE_V1 / 2;
- }
-
- return concurrent_sends;
-}
-
-static inline void
-kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
-{
- LASSERT(atomic_read(&hdev->ibh_ref) > 0);
- atomic_inc(&hdev->ibh_ref);
-}
-
-static inline void
-kiblnd_hdev_decref(struct kib_hca_dev *hdev)
-{
- LASSERT(atomic_read(&hdev->ibh_ref) > 0);
- if (atomic_dec_and_test(&hdev->ibh_ref))
- kiblnd_hdev_destroy(hdev);
-}
-
-static inline int
-kiblnd_dev_can_failover(struct kib_dev *dev)
-{
- if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
- return 0;
-
- if (!*kiblnd_tunables.kib_dev_failover) /* disabled */
- return 0;
-
- if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
- return 1;
-
- return dev->ibd_can_failover;
-}
-
-#define kiblnd_conn_addref(conn) \
-do { \
- CDEBUG(D_NET, "conn[%p] (%d)++\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- atomic_inc(&(conn)->ibc_refcount); \
-} while (0)
-
-#define kiblnd_conn_decref(conn) \
-do { \
- unsigned long flags; \
- \
- CDEBUG(D_NET, "conn[%p] (%d)--\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \
- if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \
- spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \
- list_add_tail(&(conn)->ibc_list, \
- &kiblnd_data.kib_connd_zombies); \
- wake_up(&kiblnd_data.kib_connd_waitq); \
- spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
- } \
-} while (0)
-
-#define kiblnd_peer_addref(peer) \
-do { \
- CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \
- (peer), libcfs_nid2str((peer)->ibp_nid), \
- atomic_read(&(peer)->ibp_refcount)); \
- atomic_inc(&(peer)->ibp_refcount); \
-} while (0)
-
-#define kiblnd_peer_decref(peer) \
-do { \
- CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \
- (peer), libcfs_nid2str((peer)->ibp_nid), \
- atomic_read(&(peer)->ibp_refcount)); \
- LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \
- if (atomic_dec_and_test(&(peer)->ibp_refcount)) \
- kiblnd_destroy_peer(peer); \
-} while (0)
-
-static inline bool
-kiblnd_peer_connecting(struct kib_peer *peer)
-{
- return peer->ibp_connecting ||
- peer->ibp_reconnecting ||
- peer->ibp_accepting;
-}
-
-static inline bool
-kiblnd_peer_idle(struct kib_peer *peer)
-{
- return !kiblnd_peer_connecting(peer) && list_empty(&peer->ibp_conns);
-}
-
-static inline struct list_head *
-kiblnd_nid2peerlist(lnet_nid_t nid)
-{
- unsigned int hash =
- ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
-
- return &kiblnd_data.kib_peers[hash];
-}
-
-static inline int
-kiblnd_peer_active(struct kib_peer *peer)
-{
- /* Am I in the peer hash table? */
- return !list_empty(&peer->ibp_list);
-}
-
-static inline struct kib_conn *
-kiblnd_get_conn_locked(struct kib_peer *peer)
-{
- struct list_head *next;
-
- LASSERT(!list_empty(&peer->ibp_conns));
-
- /* Advance to next connection, be sure to skip the head node */
- if (!peer->ibp_next_conn ||
- peer->ibp_next_conn->ibc_list.next == &peer->ibp_conns)
- next = peer->ibp_conns.next;
- else
- next = peer->ibp_next_conn->ibc_list.next;
- peer->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);
-
- return peer->ibp_next_conn;
-}
-
-static inline int
-kiblnd_send_keepalive(struct kib_conn *conn)
-{
- return (*kiblnd_tunables.kib_keepalive > 0) &&
- time_after(jiffies, conn->ibc_last_send +
- msecs_to_jiffies(*kiblnd_tunables.kib_keepalive *
- MSEC_PER_SEC));
-}
-
-static inline int
-kiblnd_need_noop(struct kib_conn *conn)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
-
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- if (conn->ibc_outstanding_credits <
- IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
- !kiblnd_send_keepalive(conn))
- return 0; /* No need to send NOOP */
-
- if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
- if (!list_empty(&conn->ibc_tx_queue_nocred))
- return 0; /* NOOP can be piggybacked */
-
- /* No tx to piggyback NOOP onto or no credit to send a tx */
- return (list_empty(&conn->ibc_tx_queue) ||
- !conn->ibc_credits);
- }
-
- if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
- !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
- !conn->ibc_credits) /* no credit */
- return 0;
-
- if (conn->ibc_credits == 1 && /* last credit reserved for */
- !conn->ibc_outstanding_credits) /* giving back credits */
- return 0;
-
- /* No tx to piggyback NOOP onto or no credit to send a tx */
- return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
-}
-
-static inline void
-kiblnd_abort_receives(struct kib_conn *conn)
-{
- ib_modify_qp(conn->ibc_cmid->qp,
- &kiblnd_data.kib_error_qpa, IB_QP_STATE);
-}
-
-static inline const char *
-kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
-{
- if (q == &conn->ibc_tx_queue)
- return "tx_queue";
-
- if (q == &conn->ibc_tx_queue_rsrvd)
- return "tx_queue_rsrvd";
-
- if (q == &conn->ibc_tx_queue_nocred)
- return "tx_queue_nocred";
-
- if (q == &conn->ibc_active_txs)
- return "active_txs";
-
- LBUG();
- return NULL;
-}
-
-/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */
-/* lowest bits of the work request id to stash the work item type. */
-
-#define IBLND_WID_INVAL 0
-#define IBLND_WID_TX 1
-#define IBLND_WID_RX 2
-#define IBLND_WID_RDMA 3
-#define IBLND_WID_MR 4
-#define IBLND_WID_MASK 7UL
-
-static inline __u64
-kiblnd_ptr2wreqid(void *ptr, int type)
-{
- unsigned long lptr = (unsigned long)ptr;
-
- LASSERT(!(lptr & IBLND_WID_MASK));
- LASSERT(!(type & ~IBLND_WID_MASK));
- return (__u64)(lptr | type);
-}
-
-static inline void *
-kiblnd_wreqid2ptr(__u64 wreqid)
-{
- return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
-}
-
-static inline int
-kiblnd_wreqid2type(__u64 wreqid)
-{
- return wreqid & IBLND_WID_MASK;
-}
-
-static inline void
-kiblnd_set_conn_state(struct kib_conn *conn, int state)
-{
- conn->ibc_state = state;
- mb();
-}
-
-static inline void
-kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
-{
- msg->ibm_type = type;
- msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
-}
-
-static inline int
-kiblnd_rd_size(struct kib_rdma_desc *rd)
-{
- int i;
- int size;
-
- for (i = size = 0; i < rd->rd_nfrags; i++)
- size += rd->rd_frags[i].rf_nob;
-
- return size;
-}
-
-static inline __u64
-kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_frags[index].rf_addr;
-}
-
-static inline __u32
-kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_frags[index].rf_nob;
-}
-
-static inline __u32
-kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_key;
-}
-
-static inline int
-kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
-{
- if (nob < rd->rd_frags[index].rf_nob) {
- rd->rd_frags[index].rf_addr += nob;
- rd->rd_frags[index].rf_nob -= nob;
- } else {
- index++;
- }
-
- return index;
-}
-
-static inline int
-kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
-{
- LASSERT(msgtype == IBLND_MSG_GET_REQ ||
- msgtype == IBLND_MSG_PUT_ACK);
-
- return msgtype == IBLND_MSG_GET_REQ ?
- offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
- offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
-}
-
-static inline __u64
-kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
- return ib_dma_mapping_error(dev, dma_addr);
-}
-
-static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
- void *msg, size_t size,
- enum dma_data_direction direction)
-{
- return ib_dma_map_single(dev, msg, size, direction);
-}
-
-static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
- __u64 addr, size_t size,
- enum dma_data_direction direction)
-{
- ib_dma_unmap_single(dev, addr, size, direction);
-}
-
-#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
-#define KIBLND_UNMAP_ADDR(p, m, a) (a)
-
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
-{
- return ib_dma_map_sg(dev, sg, nents, direction);
-}
-
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
-{
- ib_dma_unmap_sg(dev, sg, nents, direction);
-}
-
-static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
- struct scatterlist *sg)
-{
- return ib_sg_dma_address(dev, sg);
-}
-
-static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
- struct scatterlist *sg)
-{
- return ib_sg_dma_len(dev, sg);
-}
-
-/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer; this is not strictly */
-/* correct because OFED 1.2 defines it as const, so to use it we have to add a */
-/* (void *) cast to overcome the "const". */
-
-#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data)
-#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
-
-void kiblnd_map_rx_descs(struct kib_conn *conn);
-void kiblnd_unmap_rx_descs(struct kib_conn *conn);
-void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
-struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);
-
-int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
- struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
- struct kib_fmr *fmr);
-void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);
-
-int kiblnd_tunables_setup(struct lnet_ni *ni);
-void kiblnd_tunables_init(void);
-
-int kiblnd_connd(void *arg);
-int kiblnd_scheduler(void *arg);
-int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
-int kiblnd_failover_thread(void *arg);
-
-int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);
-
-int kiblnd_cm_callback(struct rdma_cm_id *cmid,
- struct rdma_cm_event *event);
-int kiblnd_translate_mtu(int value);
-
-int kiblnd_dev_failover(struct kib_dev *dev);
-int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp,
- lnet_nid_t nid);
-void kiblnd_destroy_peer(struct kib_peer *peer);
-bool kiblnd_reconnect_peer(struct kib_peer *peer);
-void kiblnd_destroy_dev(struct kib_dev *dev);
-void kiblnd_unlink_peer_locked(struct kib_peer *peer);
-struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid);
-int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
- int version, __u64 incarnation);
-int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why);
-
-struct kib_conn *kiblnd_create_conn(struct kib_peer *peer,
- struct rdma_cm_id *cmid,
- int state, int version);
-void kiblnd_destroy_conn(struct kib_conn *conn);
-void kiblnd_close_conn(struct kib_conn *conn, int error);
-void kiblnd_close_conn_locked(struct kib_conn *conn, int error);
-
-void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid);
-void kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist,
- int status);
-
-void kiblnd_qp_event(struct ib_event *event, void *arg);
-void kiblnd_cq_event(struct ib_event *event, void *arg);
-void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
-
-void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
- int credits, lnet_nid_t dstnid, __u64 dststamp);
-int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
-int kiblnd_post_rx(struct kib_rx *rx, int credit);
-
-int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
-int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
deleted file mode 100644
index 65b7a62..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ /dev/null
@@ -1,3763 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd_cb.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <linux/highmem.h>
-#include "o2iblnd.h"
-
-#define MAX_CONN_RACES_BEFORE_ABORT 20
-
-static void kiblnd_peer_alive(struct kib_peer *peer);
-static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error);
-static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx,
- int type, int body_nob);
-static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
- int resid, struct kib_rdma_desc *dstrd,
- __u64 dstcookie);
-static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn);
-static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn);
-static void kiblnd_unmap_tx(struct kib_tx *tx);
-static void kiblnd_check_sends_locked(struct kib_conn *conn);
-
-static void
-kiblnd_tx_done(struct lnet_ni *ni, struct kib_tx *tx)
-{
- struct lnet_msg *lntmsg[2];
- struct kib_net *net = ni->ni_data;
- int rc;
- int i;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
- LASSERT(!tx->tx_queued); /* mustn't be queued for sending */
- LASSERT(!tx->tx_sending); /* mustn't be awaiting sent callback */
- LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */
- LASSERT(tx->tx_pool);
-
- kiblnd_unmap_tx(tx);
-
- /* tx may have up to 2 lnet msgs to finalise */
- lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
- lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
- rc = tx->tx_status;
-
- if (tx->tx_conn) {
- LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);
-
- kiblnd_conn_decref(tx->tx_conn);
- tx->tx_conn = NULL;
- }
-
- tx->tx_nwrq = 0;
- tx->tx_status = 0;
-
- kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
-
- /* delay finalize until my descs have been freed */
- for (i = 0; i < 2; i++) {
- if (!lntmsg[i])
- continue;
-
- lnet_finalize(ni, lntmsg[i], rc);
- }
-}
-
-void
-kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int status)
-{
- struct kib_tx *tx;
-
- while (!list_empty(txlist)) {
- tx = list_entry(txlist->next, struct kib_tx, tx_list);
-
- list_del(&tx->tx_list);
- /* complete now */
- tx->tx_waiting = 0;
- tx->tx_status = status;
- kiblnd_tx_done(ni, tx);
- }
-}
-
-static struct kib_tx *
-kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target)
-{
- struct kib_net *net = (struct kib_net *)ni->ni_data;
- struct list_head *node;
- struct kib_tx *tx;
- struct kib_tx_poolset *tps;
-
- tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
- node = kiblnd_pool_alloc_node(&tps->tps_poolset);
- if (!node)
- return NULL;
- tx = list_entry(node, struct kib_tx, tx_list);
-
- LASSERT(!tx->tx_nwrq);
- LASSERT(!tx->tx_queued);
- LASSERT(!tx->tx_sending);
- LASSERT(!tx->tx_waiting);
- LASSERT(!tx->tx_status);
- LASSERT(!tx->tx_conn);
- LASSERT(!tx->tx_lntmsg[0]);
- LASSERT(!tx->tx_lntmsg[1]);
- LASSERT(!tx->tx_nfrags);
-
- return tx;
-}
-
-static void
-kiblnd_drop_rx(struct kib_rx *rx)
-{
- struct kib_conn *conn = rx->rx_conn;
- struct kib_sched_info *sched = conn->ibc_sched;
- unsigned long flags;
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- LASSERT(conn->ibc_nrx > 0);
- conn->ibc_nrx--;
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- kiblnd_conn_decref(conn);
-}
-
-int
-kiblnd_post_rx(struct kib_rx *rx, int credit)
-{
- struct kib_conn *conn = rx->rx_conn;
- struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data;
- struct ib_recv_wr *bad_wrq = NULL;
- int rc;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
- LASSERT(credit == IBLND_POSTRX_NO_CREDIT ||
- credit == IBLND_POSTRX_PEER_CREDIT ||
- credit == IBLND_POSTRX_RSRVD_CREDIT);
-
- rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey;
- rx->rx_sge.addr = rx->rx_msgaddr;
- rx->rx_sge.length = IBLND_MSG_SIZE;
-
- rx->rx_wrq.next = NULL;
- rx->rx_wrq.sg_list = &rx->rx_sge;
- rx->rx_wrq.num_sge = 1;
- rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
-
- LASSERT(conn->ibc_state >= IBLND_CONN_INIT);
- LASSERT(rx->rx_nob >= 0); /* not posted */
-
- if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
- kiblnd_drop_rx(rx); /* No more posts for this rx */
- return 0;
- }
-
- rx->rx_nob = -1; /* flag posted */
-
- /* NB: need an extra reference after ib_post_recv because we don't
- * own this rx (and rx::rx_conn) anymore, LU-5678.
- */
- kiblnd_conn_addref(conn);
- rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
- if (unlikely(rc)) {
- CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
- rx->rx_nob = 0;
- }
-
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
- goto out;
-
- if (unlikely(rc)) {
- kiblnd_close_conn(conn, rc);
- kiblnd_drop_rx(rx); /* No more posts for this rx */
- goto out;
- }
-
- if (credit == IBLND_POSTRX_NO_CREDIT)
- goto out;
-
- spin_lock(&conn->ibc_lock);
- if (credit == IBLND_POSTRX_PEER_CREDIT)
- conn->ibc_outstanding_credits++;
- else
- conn->ibc_reserved_credits++;
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
-out:
- kiblnd_conn_decref(conn);
- return rc;
-}
-
-static struct kib_tx *
-kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, __u64 cookie)
-{
- struct list_head *tmp;
-
- list_for_each(tmp, &conn->ibc_active_txs) {
- struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list);
-
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_sending || tx->tx_waiting);
-
- if (tx->tx_cookie != cookie)
- continue;
-
- if (tx->tx_waiting &&
- tx->tx_msg->ibm_type == txtype)
- return tx;
-
- CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
- tx->tx_waiting ? "" : "NOT ",
- tx->tx_msg->ibm_type, txtype);
- }
- return NULL;
-}
-
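-/* Match an incoming completion against the tx waiting for it and finish the
- * tx if it is now idle.
- */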
-static void
-kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, __u64 cookie)
-{
- struct kib_tx *tx;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- int idle;
-
- spin_lock(&conn->ibc_lock);
-
- tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
- if (!tx) {
- spin_unlock(&conn->ibc_lock);
-
- CWARN("Unmatched completion type %x cookie %#llx from %s\n",
- txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_close_conn(conn, -EPROTO);
- return;
- }
-
- if (!tx->tx_status) { /* success so far */
- if (status < 0) /* failed? */
- tx->tx_status = status;
- else if (txtype == IBLND_MSG_GET_REQ)
- lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
- }
-
- tx->tx_waiting = 0;
-
- idle = !tx->tx_queued && !tx->tx_sending;
- if (idle)
- list_del(&tx->tx_list);
-
- spin_unlock(&conn->ibc_lock);
-
- if (idle)
- kiblnd_tx_done(ni, tx);
-}
-
-static void
-kiblnd_send_completion(struct kib_conn *conn, int type, int status, __u64 cookie)
-{
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
-
- if (!tx) {
- CERROR("Can't get tx for completion %x for %s\n",
- type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- return;
- }
-
- tx->tx_msg->ibm_u.completion.ibcm_status = status;
- tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
- kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg));
-
- kiblnd_queue_tx(tx, conn);
-}
-
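-/* Act on a received message: bank any returned credits, dispatch on the
- * message type, then repost the receive buffer unless it must be kept for
- * lnet_parse().
- */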
-static void
-kiblnd_handle_rx(struct kib_rx *rx)
-{
- struct kib_msg *msg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- int credits = msg->ibm_credits;
- struct kib_tx *tx;
- int rc = 0;
- int rc2;
- int post_credit;
-
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- CDEBUG(D_NET, "Received %x[%d] from %s\n",
- msg->ibm_type, credits,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
- if (credits) {
- /* Have I received credits that will let me send? */
- spin_lock(&conn->ibc_lock);
-
- if (conn->ibc_credits + credits >
- conn->ibc_queue_depth) {
- rc2 = conn->ibc_credits;
- spin_unlock(&conn->ibc_lock);
-
- CERROR("Bad credits from %s: %d + %d > %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc2, credits, conn->ibc_queue_depth);
-
- kiblnd_close_conn(conn, -EPROTO);
- kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
- return;
- }
-
- conn->ibc_credits += credits;
-
- /* This ensures the credit taken by NOOP can be returned */
- if (msg->ibm_type == IBLND_MSG_NOOP &&
- !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
- conn->ibc_outstanding_credits++;
-
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
- }
-
- switch (msg->ibm_type) {
- default:
- CERROR("Bad IBLND message type %x from %s\n",
- msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- post_credit = IBLND_POSTRX_NO_CREDIT;
- rc = -EPROTO;
- break;
-
- case IBLND_MSG_NOOP:
- if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
- post_credit = IBLND_POSTRX_NO_CREDIT;
- break;
- }
-
- if (credits) /* credit already posted */
- post_credit = IBLND_POSTRX_NO_CREDIT;
- else /* a keepalive NOOP */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_IMMEDIATE:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
- msg->ibm_srcnid, rx, 0);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_PUT_REQ:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
- msg->ibm_srcnid, rx, 1);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_PUT_NAK:
- CWARN("PUT_NACK from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
-
- case IBLND_MSG_PUT_ACK:
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
-
- spin_lock(&conn->ibc_lock);
- tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
- msg->ibm_u.putack.ibpam_src_cookie);
- if (tx)
- list_del(&tx->tx_list);
- spin_unlock(&conn->ibc_lock);
-
- if (!tx) {
- CERROR("Unmatched PUT_ACK from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- rc = -EPROTO;
- break;
- }
-
- LASSERT(tx->tx_waiting);
- /*
- * CAVEAT EMPTOR: I could be racing with tx_complete, but...
- * (a) I can overwrite tx_msg since my peer has received it!
- * (b) tx_waiting set tells tx_complete() it's not done.
- */
- tx->tx_nwrq = 0; /* overwrite PUT_REQ */
-
- rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
- kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
- &msg->ibm_u.putack.ibpam_rd,
- msg->ibm_u.putack.ibpam_dst_cookie);
- if (rc2 < 0)
- CERROR("Can't setup rdma for PUT to %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
-
- spin_lock(&conn->ibc_lock);
- tx->tx_waiting = 0; /* clear waiting and queue atomically */
- kiblnd_queue_tx_locked(tx, conn);
- spin_unlock(&conn->ibc_lock);
- break;
-
- case IBLND_MSG_PUT_DONE:
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
-
- case IBLND_MSG_GET_REQ:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
- msg->ibm_srcnid, rx, 1);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_GET_DONE:
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
- }
-
- if (rc < 0) /* protocol error */
- kiblnd_close_conn(conn, rc);
-
- if (post_credit != IBLND_POSTRX_DONT_POST)
- kiblnd_post_rx(rx, post_credit);
-}
-
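-/* Receive completion handler: validate the message and hand it to
- * kiblnd_handle_rx(), stashing it on ibc_early_rxs if the connection is not
- * yet established.
- */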
-static void
-kiblnd_rx_complete(struct kib_rx *rx, int status, int nob)
-{
- struct kib_msg *msg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_net *net = ni->ni_data;
- int rc;
- int err = -EIO;
-
- LASSERT(net);
- LASSERT(rx->rx_nob < 0); /* was posted */
- rx->rx_nob = 0; /* isn't now */
-
- if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
- goto ignore;
-
- if (status != IB_WC_SUCCESS) {
- CNETERR("Rx from %s failed: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
- goto failed;
- }
-
- LASSERT(nob >= 0);
- rx->rx_nob = nob;
-
- rc = kiblnd_unpack_msg(msg, rx->rx_nob);
- if (rc) {
- CERROR("Error %d unpacking rx from %s\n",
- rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- goto failed;
- }
-
- if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
- msg->ibm_dstnid != ni->ni_nid ||
- msg->ibm_srcstamp != conn->ibc_incarnation ||
- msg->ibm_dststamp != net->ibn_incarnation) {
- CERROR("Stale rx from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- err = -ESTALE;
- goto failed;
- }
-
- /* set time last known alive */
- kiblnd_peer_alive(conn->ibc_peer);
-
- /* racing with connection establishment/teardown! */
-
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- unsigned long flags;
-
- write_lock_irqsave(g_lock, flags);
- /* must check holding global lock to eliminate race */
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
- write_unlock_irqrestore(g_lock, flags);
- return;
- }
- write_unlock_irqrestore(g_lock, flags);
- }
- kiblnd_handle_rx(rx);
- return;
-
- failed:
- CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- kiblnd_close_conn(conn, err);
- ignore:
- kiblnd_drop_rx(rx); /* Don't re-post rx. */
-}
-
-static struct page *
-kiblnd_kvaddr_to_page(unsigned long vaddr)
-{
- struct page *page;
-
- if (is_vmalloc_addr((void *)vaddr)) {
- page = vmalloc_to_page((void *)vaddr);
- LASSERT(page);
- return page;
- }
-#ifdef CONFIG_HIGHMEM
- if (vaddr >= PKMAP_BASE &&
- vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
-		/* No highmem pages here: highmem is only used for bulk (kiov) I/O */
-		CERROR("cannot map a highmem kernel virtual address\n");
- LBUG();
- }
-#endif
- page = virt_to_page(vaddr);
- LASSERT(page);
- return page;
-}
-
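-/* Map the tx's fragments with the CPT's FMR poolset; on success rd describes
- * a single virtually contiguous fragment keyed by the FMR.
- */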
-static int
-kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, struct kib_rdma_desc *rd, __u32 nob)
-{
- struct kib_hca_dev *hdev;
- struct kib_fmr_poolset *fps;
- int cpt;
- int rc;
-
- LASSERT(tx->tx_pool);
- LASSERT(tx->tx_pool->tpo_pool.po_owner);
-
- hdev = tx->tx_pool->tpo_hdev;
- cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
-
- fps = net->ibn_fmr_ps[cpt];
- rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
- if (rc) {
- CERROR("Can't map %u bytes: %d\n", nob, rc);
- return rc;
- }
-
- /*
- * If rd is not tx_rd, it's going to get sent to a peer, who will need
- * the rkey
- */
- rd->rd_key = tx->fmr.fmr_key;
- rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
- rd->rd_frags[0].rf_nob = nob;
- rd->rd_nfrags = 1;
-
- return 0;
-}
-
-static void kiblnd_unmap_tx(struct kib_tx *tx)
-{
- if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd)
- kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
-
- if (tx->tx_nfrags) {
- kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
- tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
- tx->tx_nfrags = 0;
- }
-}
-
-static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, int nfrags)
-{
- struct kib_net *net = ni->ni_data;
- struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev;
- __u32 nob;
- int i;
-
- /*
- * If rd is not tx_rd, it's going to get sent to a peer and I'm the
- * RDMA sink
- */
- tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
- tx->tx_nfrags = nfrags;
-
- rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
- tx->tx_nfrags, tx->tx_dmadir);
-
- for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
- rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
- hdev->ibh_ibdev, &tx->tx_frags[i]);
- rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
- hdev->ibh_ibdev, &tx->tx_frags[i]);
- nob += rd->rd_frags[i].rf_nob;
- }
-
- if (net->ibn_fmr_ps)
- return kiblnd_fmr_map_tx(net, tx, rd, nob);
-
- return -EINVAL;
-}
-
-static int
-kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, unsigned int niov,
- const struct kvec *iov, int offset, int nob)
-{
- struct kib_net *net = ni->ni_data;
- struct page *page;
- struct scatterlist *sg;
- unsigned long vaddr;
- int fragnob;
- int page_offset;
-
- LASSERT(nob > 0);
- LASSERT(niov > 0);
- LASSERT(net);
-
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- niov--;
- iov++;
- LASSERT(niov > 0);
- }
-
- sg = tx->tx_frags;
- do {
- LASSERT(niov > 0);
-
- vaddr = ((unsigned long)iov->iov_base) + offset;
- page_offset = vaddr & (PAGE_SIZE - 1);
- page = kiblnd_kvaddr_to_page(vaddr);
- if (!page) {
- CERROR("Can't find page\n");
- return -EFAULT;
- }
-
- fragnob = min((int)(iov->iov_len - offset), nob);
- fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
-
- sg_set_page(sg, page, fragnob, page_offset);
- sg = sg_next(sg);
- if (!sg) {
-			CERROR("not enough sg entries to map tx\n");
- return -EFAULT;
- }
-
- if (offset + fragnob < iov->iov_len) {
- offset += fragnob;
- } else {
- offset = 0;
- iov++;
- niov--;
- }
- nob -= fragnob;
- } while (nob > 0);
-
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
-}
-
-static int
-kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, int nkiov,
- const struct bio_vec *kiov, int offset, int nob)
-{
- struct kib_net *net = ni->ni_data;
- struct scatterlist *sg;
- int fragnob;
-
- CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
-
- LASSERT(nob > 0);
- LASSERT(nkiov > 0);
- LASSERT(net);
-
- while (offset >= kiov->bv_len) {
- offset -= kiov->bv_len;
- nkiov--;
- kiov++;
- LASSERT(nkiov > 0);
- }
-
- sg = tx->tx_frags;
- do {
- LASSERT(nkiov > 0);
-
- fragnob = min((int)(kiov->bv_len - offset), nob);
-
- sg_set_page(sg, kiov->bv_page, fragnob,
- kiov->bv_offset + offset);
- sg = sg_next(sg);
- if (!sg) {
-			CERROR("not enough sg entries to map tx\n");
- return -EFAULT;
- }
-
- offset = 0;
- kiov++;
- nkiov--;
- nob -= fragnob;
- } while (nob > 0);
-
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
-}
-
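-/* Post a queued tx to the QP if the send queue depth and flow-control
- * credits allow it; returns -EAGAIN if the tx must stay queued.
- */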
-static int
-kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
- __must_hold(&conn->ibc_lock)
-{
- struct kib_msg *msg = tx->tx_msg;
- struct kib_peer *peer = conn->ibc_peer;
- struct lnet_ni *ni = peer->ibp_ni;
- int ver = conn->ibc_version;
- int rc;
- int done;
-
- LASSERT(tx->tx_queued);
- /* We rely on this for QP sizing */
- LASSERT(tx->tx_nwrq > 0);
-
- LASSERT(!credit || credit == 1);
- LASSERT(conn->ibc_outstanding_credits >= 0);
- LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth);
- LASSERT(conn->ibc_credits >= 0);
- LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
-
- if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) {
- /* tx completions outstanding... */
- CDEBUG(D_NET, "%s: posted enough\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- if (credit && !conn->ibc_credits) { /* no credits */
- CDEBUG(D_NET, "%s: no credits\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- if (credit && !IBLND_OOB_CAPABLE(ver) &&
- conn->ibc_credits == 1 && /* last credit reserved */
- msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */
- CDEBUG(D_NET, "%s: not using last credit\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- /* NB don't drop ibc_lock before bumping tx_sending */
- list_del(&tx->tx_list);
- tx->tx_queued = 0;
-
- if (msg->ibm_type == IBLND_MSG_NOOP &&
- (!kiblnd_need_noop(conn) || /* redundant NOOP */
- (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
- conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
- /*
- * OK to drop when posted enough NOOPs, since
- * kiblnd_check_sends_locked will queue NOOP again when
- * posted NOOPs complete
- */
- spin_unlock(&conn->ibc_lock);
- kiblnd_tx_done(peer->ibp_ni, tx);
- spin_lock(&conn->ibc_lock);
- CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_noops_posted);
- return 0;
- }
-
- kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
- peer->ibp_nid, conn->ibc_incarnation);
-
- conn->ibc_credits -= credit;
- conn->ibc_outstanding_credits = 0;
- conn->ibc_nsends_posted++;
- if (msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted++;
-
- /*
- * CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
- * PUT. If so, it was first queued here as a PUT_REQ, sent and
- * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
- * and then re-queued here. It's (just) possible that
- * tx_sending is non-zero if we've not done the tx_complete()
- * from the first send; hence the ++ rather than = below.
- */
- tx->tx_sending++;
- list_add(&tx->tx_list, &conn->ibc_active_txs);
-
- /* I'm still holding ibc_lock! */
- if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
- rc = -ECONNABORTED;
- } else if (tx->tx_pool->tpo_pool.po_failed ||
- conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
- /* close_conn will launch failover */
- rc = -ENETDOWN;
- } else {
- struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
- struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
- struct ib_send_wr *wrq = &tx->tx_wrq[0].wr;
-
- if (frd) {
- if (!frd->frd_valid) {
- wrq = &frd->frd_inv_wr;
- wrq->next = &frd->frd_fastreg_wr.wr;
- } else {
- wrq = &frd->frd_fastreg_wr.wr;
- }
- frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
- }
-
- LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
- "bad wr_id %llx, opc %d, flags %d, peer: %s\n",
- bad->wr_id, bad->opcode, bad->send_flags,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- bad = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
- }
-
- conn->ibc_last_send = jiffies;
-
- if (!rc)
- return 0;
-
- /*
- * NB credits are transferred in the actual
- * message, which can only be the last work item
- */
- conn->ibc_credits += credit;
- conn->ibc_outstanding_credits += msg->ibm_credits;
- conn->ibc_nsends_posted--;
- if (msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted--;
-
- tx->tx_status = rc;
- tx->tx_waiting = 0;
- tx->tx_sending--;
-
- done = !tx->tx_sending;
- if (done)
- list_del(&tx->tx_list);
-
- spin_unlock(&conn->ibc_lock);
-
- if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
- CERROR("Error %d posting transmit to %s\n",
- rc, libcfs_nid2str(peer->ibp_nid));
- else
- CDEBUG(D_NET, "Error %d posting transmit to %s\n",
- rc, libcfs_nid2str(peer->ibp_nid));
-
- kiblnd_close_conn(conn, rc);
-
- if (done)
- kiblnd_tx_done(peer->ibp_ni, tx);
-
- spin_lock(&conn->ibc_lock);
-
- return -EIO;
-}
-
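-/* Drain the connection's tx queues, posting sends while credits and the send
- * queue depth permit; called with ibc_lock held.
- */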
-static void
-kiblnd_check_sends_locked(struct kib_conn *conn)
-{
- int ver = conn->ibc_version;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_tx *tx;
-
- /* Don't send anything until after the connection is established */
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- CDEBUG(D_NET, "%s too soon\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- return;
- }
-
- LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
- LASSERT(!IBLND_OOB_CAPABLE(ver) ||
- conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
- LASSERT(conn->ibc_reserved_credits >= 0);
-
- while (conn->ibc_reserved_credits > 0 &&
- !list_empty(&conn->ibc_tx_queue_rsrvd)) {
- tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
- struct kib_tx, tx_list);
- list_del(&tx->tx_list);
- list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
- conn->ibc_reserved_credits--;
- }
-
- if (kiblnd_need_noop(conn)) {
- spin_unlock(&conn->ibc_lock);
-
- tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
- if (tx)
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
-
- spin_lock(&conn->ibc_lock);
- if (tx)
- kiblnd_queue_tx_locked(tx, conn);
- }
-
- for (;;) {
- int credit;
-
- if (!list_empty(&conn->ibc_tx_queue_nocred)) {
- credit = 0;
- tx = list_entry(conn->ibc_tx_queue_nocred.next,
- struct kib_tx, tx_list);
- } else if (!list_empty(&conn->ibc_tx_noops)) {
- LASSERT(!IBLND_OOB_CAPABLE(ver));
- credit = 1;
- tx = list_entry(conn->ibc_tx_noops.next,
- struct kib_tx, tx_list);
- } else if (!list_empty(&conn->ibc_tx_queue)) {
- credit = 1;
- tx = list_entry(conn->ibc_tx_queue.next,
- struct kib_tx, tx_list);
- } else {
- break;
- }
-
- if (kiblnd_post_tx_locked(conn, tx, credit))
- break;
- }
-}
-
-static void
-kiblnd_tx_complete(struct kib_tx *tx, int status)
-{
- int failed = (status != IB_WC_SUCCESS);
- struct kib_conn *conn = tx->tx_conn;
- int idle;
-
- LASSERT(tx->tx_sending > 0);
-
- if (failed) {
- if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
- CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
- status);
-
- kiblnd_close_conn(conn, -EIO);
- } else {
- kiblnd_peer_alive(conn->ibc_peer);
- }
-
- spin_lock(&conn->ibc_lock);
-
- /*
- * I could be racing with rdma completion. Whoever makes 'tx' idle
- * gets to free it, which also drops its ref on 'conn'.
- */
- tx->tx_sending--;
- conn->ibc_nsends_posted--;
- if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted--;
-
- if (failed) {
- tx->tx_waiting = 0; /* don't wait for peer */
- tx->tx_status = -EIO;
- }
-
- idle = !tx->tx_sending && /* This is the final callback */
- !tx->tx_waiting && /* Not waiting for peer */
- !tx->tx_queued; /* Not re-queued (PUT_DONE) */
- if (idle)
- list_del(&tx->tx_list);
-
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- if (idle)
- kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
-}
-
-static void
-kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type,
- int body_nob)
-{
- struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev;
- struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
- struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
- int nob = offsetof(struct kib_msg, ibm_u) + body_nob;
-
- LASSERT(tx->tx_nwrq >= 0);
- LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
- LASSERT(nob <= IBLND_MSG_SIZE);
-
- kiblnd_init_msg(tx->tx_msg, type, body_nob);
-
- sge->lkey = hdev->ibh_pd->local_dma_lkey;
- sge->addr = tx->tx_msgaddr;
- sge->length = nob;
-
- memset(wrq, 0, sizeof(*wrq));
-
- wrq->wr.next = NULL;
- wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
- wrq->wr.sg_list = sge;
- wrq->wr.num_sge = 1;
- wrq->wr.opcode = IB_WR_SEND;
- wrq->wr.send_flags = IB_SEND_SIGNALED;
-
- tx->tx_nwrq++;
-}
-
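-/* Build the RDMA WRITE work requests that copy 'resid' bytes from the local
- * source descriptor into the peer's destination descriptor, followed by the
- * PUT_DONE/GET_DONE completion message.
- */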
-static int
-kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
- int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie)
-{
- struct kib_msg *ibmsg = tx->tx_msg;
- struct kib_rdma_desc *srcrd = tx->tx_rd;
- struct ib_sge *sge = &tx->tx_sge[0];
- struct ib_rdma_wr *wrq, *next;
- int rc = resid;
- int srcidx = 0;
- int dstidx = 0;
- int wrknob;
-
- LASSERT(!in_interrupt());
- LASSERT(!tx->tx_nwrq);
- LASSERT(type == IBLND_MSG_GET_DONE ||
- type == IBLND_MSG_PUT_DONE);
-
- if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
- CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- conn->ibc_max_frags << PAGE_SHIFT,
- kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
- rc = -EMSGSIZE;
- goto too_big;
- }
-
- while (resid > 0) {
- if (srcidx >= srcrd->rd_nfrags) {
- CERROR("Src buffer exhausted: %d frags\n", srcidx);
- rc = -EPROTO;
- break;
- }
-
- if (dstidx == dstrd->rd_nfrags) {
- CERROR("Dst buffer exhausted: %d frags\n", dstidx);
- rc = -EPROTO;
- break;
- }
-
- if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) {
- CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- IBLND_MAX_RDMA_FRAGS,
- srcidx, srcrd->rd_nfrags,
- dstidx, dstrd->rd_nfrags);
- rc = -EMSGSIZE;
- break;
- }
-
- wrknob = min3(kiblnd_rd_frag_size(srcrd, srcidx),
- kiblnd_rd_frag_size(dstrd, dstidx),
- (__u32)resid);
-
- sge = &tx->tx_sge[tx->tx_nwrq];
- sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx);
- sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx);
- sge->length = wrknob;
-
- wrq = &tx->tx_wrq[tx->tx_nwrq];
- next = wrq + 1;
-
- wrq->wr.next = &next->wr;
- wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
- wrq->wr.sg_list = sge;
- wrq->wr.num_sge = 1;
- wrq->wr.opcode = IB_WR_RDMA_WRITE;
- wrq->wr.send_flags = 0;
-
- wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
- wrq->rkey = kiblnd_rd_frag_key(dstrd, dstidx);
-
- srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
- dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
-
- resid -= wrknob;
-
- tx->tx_nwrq++;
- wrq++;
- sge++;
- }
-too_big:
- if (rc < 0) /* no RDMA if completing with failure */
- tx->tx_nwrq = 0;
-
- ibmsg->ibm_u.completion.ibcm_status = rc;
- ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
- kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
- type, sizeof(struct kib_completion_msg));
-
- return rc;
-}
-
-static void
-kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn)
-{
- struct list_head *q;
-
- LASSERT(tx->tx_nwrq > 0); /* work items set up */
- LASSERT(!tx->tx_queued); /* not queued for sending already */
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- tx->tx_queued = 1;
- tx->tx_deadline = jiffies +
- msecs_to_jiffies(*kiblnd_tunables.kib_timeout *
- MSEC_PER_SEC);
-
- if (!tx->tx_conn) {
- kiblnd_conn_addref(conn);
- tx->tx_conn = conn;
- LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
- } else {
- /* PUT_DONE first attached to conn as a PUT_REQ */
- LASSERT(tx->tx_conn == conn);
- LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
- }
-
- switch (tx->tx_msg->ibm_type) {
- default:
- LBUG();
-
- case IBLND_MSG_PUT_REQ:
- case IBLND_MSG_GET_REQ:
- q = &conn->ibc_tx_queue_rsrvd;
- break;
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_ACK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- q = &conn->ibc_tx_queue_nocred;
- break;
-
- case IBLND_MSG_NOOP:
- if (IBLND_OOB_CAPABLE(conn->ibc_version))
- q = &conn->ibc_tx_queue_nocred;
- else
- q = &conn->ibc_tx_noops;
- break;
-
- case IBLND_MSG_IMMEDIATE:
- q = &conn->ibc_tx_queue;
- break;
- }
-
- list_add_tail(&tx->tx_list, q);
-}
-
-static void
-kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn)
-{
- spin_lock(&conn->ibc_lock);
- kiblnd_queue_tx_locked(tx, conn);
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-}
-
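-/* Bind the cmid to a free privileged source port and start resolving the
- * peer's address.
- */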
-static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
- struct sockaddr_in *srcaddr,
- struct sockaddr_in *dstaddr,
- int timeout_ms)
-{
- unsigned short port;
- int rc;
-
- /* allow the port to be reused */
- rc = rdma_set_reuseaddr(cmid, 1);
- if (rc) {
- CERROR("Unable to set reuse on cmid: %d\n", rc);
- return rc;
- }
-
- /* look for a free privileged port */
- for (port = PROT_SOCK - 1; port > 0; port--) {
- srcaddr->sin_port = htons(port);
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)srcaddr,
- (struct sockaddr *)dstaddr,
- timeout_ms);
- if (!rc) {
- CDEBUG(D_NET, "bound to port %hu\n", port);
- return 0;
- } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
- CDEBUG(D_NET, "bind to port %hu failed: %d\n",
- port, rc);
- } else {
- return rc;
- }
- }
-
- CERROR("Failed to bind to a free privileged port\n");
- return rc;
-}
-
-static void
-kiblnd_connect_peer(struct kib_peer *peer)
-{
- struct rdma_cm_id *cmid;
- struct kib_dev *dev;
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct sockaddr_in srcaddr;
- struct sockaddr_in dstaddr;
- int rc;
-
- LASSERT(net);
- LASSERT(peer->ibp_connecting > 0);
-
- cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
- IB_QPT_RC);
-
- if (IS_ERR(cmid)) {
- CERROR("Can't create CMID for %s: %ld\n",
- libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
- rc = PTR_ERR(cmid);
- goto failed;
- }
-
- dev = net->ibn_dev;
- memset(&srcaddr, 0, sizeof(srcaddr));
- srcaddr.sin_family = AF_INET;
- srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
-
- memset(&dstaddr, 0, sizeof(dstaddr));
- dstaddr.sin_family = AF_INET;
- dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
- dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
-
- kiblnd_peer_addref(peer); /* cmid's ref */
-
- if (*kiblnd_tunables.kib_use_priv_port) {
- rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- } else {
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- }
- if (rc) {
- /* Can't initiate address resolution: */
- CERROR("Can't resolve addr for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- goto failed2;
- }
-
- return;
-
- failed2:
- kiblnd_peer_connect_failed(peer, 1, rc);
- kiblnd_peer_decref(peer); /* cmid's ref */
- rdma_destroy_id(cmid);
- return;
- failed:
- kiblnd_peer_connect_failed(peer, 1, rc);
-}
-
-bool
-kiblnd_reconnect_peer(struct kib_peer *peer)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- char *reason = NULL;
- struct list_head txs;
- unsigned long flags;
-
- INIT_LIST_HEAD(&txs);
-
- write_lock_irqsave(glock, flags);
- if (!peer->ibp_reconnecting) {
- if (peer->ibp_accepting)
- reason = "accepting";
- else if (peer->ibp_connecting)
- reason = "connecting";
- else if (!list_empty(&peer->ibp_conns))
- reason = "connected";
- else /* connected then closed */
- reason = "closed";
-
- goto no_reconnect;
- }
-
- LASSERT(!peer->ibp_accepting && !peer->ibp_connecting &&
- list_empty(&peer->ibp_conns));
- peer->ibp_reconnecting--;
-
- if (!kiblnd_peer_active(peer)) {
- list_splice_init(&peer->ibp_tx_queue, &txs);
- reason = "unlinked";
- goto no_reconnect;
- }
-
- peer->ibp_connecting++;
- peer->ibp_reconnected++;
- write_unlock_irqrestore(glock, flags);
-
- kiblnd_connect_peer(peer);
- return true;
-
-no_reconnect:
- write_unlock_irqrestore(glock, flags);
-
- CWARN("Abort reconnection of %s: %s\n",
- libcfs_nid2str(peer->ibp_nid), reason);
- kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED);
- return false;
-}
-
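-/* Queue the tx on an established connection if one exists; otherwise queue
- * it on the peer (creating the peer if necessary) and start connecting.
- */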
-void
-kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct kib_peer *peer2;
- struct kib_conn *conn;
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- unsigned long flags;
- int rc;
- int i;
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-
- /*
- * If I get here, I've committed to send, so I complete the tx with
- * failure on any problems
- */
- LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */
- LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */
-
- /*
- * First time, just use a read lock since I expect to find my peer
- * connected
- */
- read_lock_irqsave(g_lock, flags);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer && !list_empty(&peer->ibp_conns)) {
- /* Found a peer with an established connection */
- conn = kiblnd_get_conn_locked(peer);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- read_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- return;
- }
-
- read_unlock(g_lock);
- /* Re-try with a write lock */
- write_lock(g_lock);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer) {
- if (list_empty(&peer->ibp_conns)) {
- /* found a peer, but it's still connecting... */
- LASSERT(kiblnd_peer_connecting(peer));
- if (tx)
- list_add_tail(&tx->tx_list,
- &peer->ibp_tx_queue);
- write_unlock_irqrestore(g_lock, flags);
- } else {
- conn = kiblnd_get_conn_locked(peer);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- write_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- }
- return;
- }
-
- write_unlock_irqrestore(g_lock, flags);
-
- /* Allocate a peer ready to add to the peer table and retry */
- rc = kiblnd_create_peer(ni, &peer, nid);
- if (rc) {
- CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
- if (tx) {
- tx->tx_status = -EHOSTUNREACH;
- tx->tx_waiting = 0;
- kiblnd_tx_done(ni, tx);
- }
- return;
- }
-
- write_lock_irqsave(g_lock, flags);
-
- peer2 = kiblnd_find_peer_locked(nid);
- if (peer2) {
- if (list_empty(&peer2->ibp_conns)) {
- /* found a peer, but it's still connecting... */
- LASSERT(kiblnd_peer_connecting(peer2));
- if (tx)
- list_add_tail(&tx->tx_list,
- &peer2->ibp_tx_queue);
- write_unlock_irqrestore(g_lock, flags);
- } else {
- conn = kiblnd_get_conn_locked(peer2);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- write_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- }
-
- kiblnd_peer_decref(peer);
- return;
- }
-
- /* Brand new peer */
- LASSERT(!peer->ibp_connecting);
- tunables = &peer->ibp_ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- peer->ibp_connecting = tunables->lnd_conns_per_peer;
-
-	/* always called with a ref on ni, which prevents ni from being shut down */
- LASSERT(!((struct kib_net *)ni->ni_data)->ibn_shutdown);
-
- if (tx)
- list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
-
- kiblnd_peer_addref(peer);
- list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
-
- write_unlock_irqrestore(g_lock, flags);
-
- for (i = 0; i < tunables->lnd_conns_per_peer; i++)
- kiblnd_connect_peer(peer);
- kiblnd_peer_decref(peer);
-}
-
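-/* LND send handler: choose between the IMMEDIATE, PUT and GET protocols
- * based on message type and payload size, set up the tx and launch it.
- */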
-int
-kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
-{
- struct lnet_hdr *hdr = &lntmsg->msg_hdr;
- int type = lntmsg->msg_type;
- struct lnet_process_id target = lntmsg->msg_target;
- int target_is_router = lntmsg->msg_target_is_router;
- int routing = lntmsg->msg_routing;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct kvec *payload_iov = lntmsg->msg_iov;
- struct bio_vec *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- struct iov_iter from;
- struct kib_msg *ibmsg;
- struct kib_rdma_desc *rd;
- struct kib_tx *tx;
- int nob;
- int rc;
-
- /* NB 'private' is different depending on what we're sending.... */
-
- CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT(!payload_nob || payload_niov > 0);
- LASSERT(payload_niov <= LNET_MAX_IOV);
-
- /* Thread context */
- LASSERT(!in_interrupt());
- /* payload is either all vaddrs or all pages */
- LASSERT(!(payload_kiov && payload_iov));
-
- if (payload_kiov)
- iov_iter_bvec(&from, ITER_BVEC | WRITE,
- payload_kiov, payload_niov,
- payload_nob + payload_offset);
- else
- iov_iter_kvec(&from, ITER_KVEC | WRITE,
- payload_iov, payload_niov,
- payload_nob + payload_offset);
-
- iov_iter_advance(&from, payload_offset);
-
- switch (type) {
- default:
- LBUG();
- return -EIO;
-
- case LNET_MSG_ACK:
- LASSERT(!payload_nob);
- break;
-
- case LNET_MSG_GET:
- if (routing || target_is_router)
- break; /* send IMMEDIATE */
-
-		/* is the REPLY small enough not to need RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
- if (nob <= IBLND_MSG_SIZE)
- break; /* send IMMEDIATE */
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't allocate txd for GET to %s\n",
- libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- ibmsg = tx->tx_msg;
- rd = &ibmsg->ibm_u.get.ibgm_rd;
- if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV))
- rc = kiblnd_setup_rd_iov(ni, tx, rd,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.iov,
- 0, lntmsg->msg_md->md_length);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, rd,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.kiov,
- 0, lntmsg->msg_md->md_length);
- if (rc) {
- CERROR("Can't setup GET sink for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]);
- ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
- ibmsg->ibm_u.get.ibgm_hdr = *hdr;
-
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
-
- tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
- if (!tx->tx_lntmsg[1]) {
- CERROR("Can't create reply for GET -> %s\n",
- libcfs_nid2str(target.nid));
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
- tx->tx_waiting = 1; /* waiting for GET_DONE */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
-
- case LNET_MSG_REPLY:
- case LNET_MSG_PUT:
- /* Is the payload small enough not to need RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob <= IBLND_MSG_SIZE)
- break; /* send IMMEDIATE */
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't allocate %s txd for %s\n",
- type == LNET_MSG_PUT ? "PUT" : "REPLY",
- libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- if (!payload_kiov)
- rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
- payload_niov, payload_iov,
- payload_offset, payload_nob);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
- if (rc) {
- CERROR("Can't setup PUT src for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- ibmsg = tx->tx_msg;
- ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
- ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(struct kib_putreq_msg));
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
- }
-
- /* send IMMEDIATE */
-
- LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob])
- <= IBLND_MSG_SIZE);
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't send %d to %s: tx descs exhausted\n",
- type, libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- ibmsg = tx->tx_msg;
- ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
-
- rc = copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, payload_nob,
- &from);
- if (rc != payload_nob) {
- kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
- return -EFAULT;
- }
-
- nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]);
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
-}
-
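-/* Optimized GET reply: RDMA the REPLY payload straight into the sink
- * described by the peer's GET_REQ.
- */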
-static void
-kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
-{
- struct lnet_process_id target = lntmsg->msg_target;
- unsigned int niov = lntmsg->msg_niov;
- struct kvec *iov = lntmsg->msg_iov;
- struct bio_vec *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- struct kib_tx *tx;
- int rc;
-
- tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
- if (!tx) {
- CERROR("Can't get tx for REPLY to %s\n",
- libcfs_nid2str(target.nid));
- goto failed_0;
- }
-
- if (!nob)
- rc = 0;
- else if (!kiov)
- rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
- niov, iov, offset, nob);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
- niov, kiov, offset, nob);
-
- if (rc) {
- CERROR("Can't setup GET src for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- goto failed_1;
- }
-
- rc = kiblnd_init_rdma(rx->rx_conn, tx,
- IBLND_MSG_GET_DONE, nob,
- &rx->rx_msg->ibm_u.get.ibgm_rd,
- rx->rx_msg->ibm_u.get.ibgm_cookie);
- if (rc < 0) {
- CERROR("Can't setup rdma for GET from %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- goto failed_1;
- }
-
- if (!nob) {
- /* No RDMA: local completion may happen now! */
- lnet_finalize(ni, lntmsg, 0);
- } else {
- /* RDMA: lnet_finalize(lntmsg) when it completes */
- tx->tx_lntmsg[0] = lntmsg;
- }
-
- kiblnd_queue_tx(tx, rx->rx_conn);
- return;
-
- failed_1:
- kiblnd_tx_done(ni, tx);
- failed_0:
- lnet_finalize(ni, lntmsg, -EIO);
-}
-
-int
-kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen)
-{
- struct kib_rx *rx = private;
- struct kib_msg *rxmsg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct kib_tx *tx;
- int nob;
- int post_credit = IBLND_POSTRX_PEER_CREDIT;
- int rc = 0;
-
- LASSERT(iov_iter_count(to) <= rlen);
- LASSERT(!in_interrupt());
- /* Either all pages or all vaddrs */
-
- switch (rxmsg->ibm_type) {
- default:
- LBUG();
-
- case IBLND_MSG_IMMEDIATE:
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]);
- if (nob > rx->rx_nob) {
- CERROR("Immediate message from %s too big: %d(%d)\n",
- libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
- nob, rx->rx_nob);
- rc = -EPROTO;
- break;
- }
-
- rc = copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, rlen,
- to);
- if (rc != rlen) {
- rc = -EFAULT;
- break;
- }
-
- rc = 0;
- lnet_finalize(ni, lntmsg, 0);
- break;
-
- case IBLND_MSG_PUT_REQ: {
- struct kib_msg *txmsg;
- struct kib_rdma_desc *rd;
-
- if (!iov_iter_count(to)) {
- lnet_finalize(ni, lntmsg, 0);
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
- rxmsg->ibm_u.putreq.ibprm_cookie);
- break;
- }
-
- tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
- if (!tx) {
- CERROR("Can't allocate tx for %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- /* Not replying will break the connection */
- rc = -ENOMEM;
- break;
- }
-
- txmsg = tx->tx_msg;
- rd = &txmsg->ibm_u.putack.ibpam_rd;
- if (!(to->type & ITER_BVEC))
- rc = kiblnd_setup_rd_iov(ni, tx, rd,
- to->nr_segs, to->kvec,
- to->iov_offset,
- iov_iter_count(to));
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, rd,
- to->nr_segs, to->bvec,
- to->iov_offset,
- iov_iter_count(to));
- if (rc) {
- CERROR("Can't setup PUT sink for %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- kiblnd_tx_done(ni, tx);
- /* tell peer it's over */
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
- rxmsg->ibm_u.putreq.ibprm_cookie);
- break;
- }
-
- nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]);
- txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
- txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
-
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- tx->tx_waiting = 1; /* waiting for PUT_DONE */
- kiblnd_queue_tx(tx, conn);
-
- /* reposted buffer reserved for PUT_DONE */
- post_credit = IBLND_POSTRX_NO_CREDIT;
- break;
- }
-
- case IBLND_MSG_GET_REQ:
- if (lntmsg) {
- /* Optimized GET; RDMA lntmsg's payload */
- kiblnd_reply(ni, rx, lntmsg);
- } else {
- /* GET didn't match anything */
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
- -ENODATA,
- rxmsg->ibm_u.get.ibgm_cookie);
- }
- break;
- }
-
- kiblnd_post_rx(rx, post_credit);
- return rc;
-}
-
-int
-kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = kthread_run(fn, arg, "%s", name);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- atomic_inc(&kiblnd_data.kib_nthreads);
- return 0;
-}
-
-static void
-kiblnd_thread_fini(void)
-{
- atomic_dec(&kiblnd_data.kib_nthreads);
-}
-
-static void
-kiblnd_peer_alive(struct kib_peer *peer)
-{
- /* This is racy, but everyone's only writing jiffies */
- peer->ibp_last_alive = jiffies;
- mb();
-}
-
-static void
-kiblnd_peer_notify(struct kib_peer *peer)
-{
- int error = 0;
- unsigned long last_alive = 0;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (kiblnd_peer_idle(peer) && peer->ibp_error) {
- error = peer->ibp_error;
- peer->ibp_error = 0;
-
- last_alive = peer->ibp_last_alive;
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- if (error)
- lnet_notify(peer->ibp_ni,
- peer->ibp_nid, 0, last_alive);
-}
-
-void
-kiblnd_close_conn_locked(struct kib_conn *conn, int error)
-{
- /*
- * This just does the immediate housekeeping. 'error' is zero for a
- * normal shutdown which can happen only after the connection has been
- * established. If the connection is established, schedule the
- * connection to be finished off by the connd. Otherwise the connd is
- * already dealing with it (either to set it up or tear it down).
- * Caller holds kib_global_lock exclusively in irq context
- */
- struct kib_peer *peer = conn->ibc_peer;
- struct kib_dev *dev;
- unsigned long flags;
-
- LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- if (error && !conn->ibc_comms_error)
- conn->ibc_comms_error = error;
-
- if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
- return; /* already being handled */
-
- if (!error &&
- list_empty(&conn->ibc_tx_noops) &&
- list_empty(&conn->ibc_tx_queue) &&
- list_empty(&conn->ibc_tx_queue_rsrvd) &&
- list_empty(&conn->ibc_tx_queue_nocred) &&
- list_empty(&conn->ibc_active_txs)) {
- CDEBUG(D_NET, "closing conn to %s\n",
- libcfs_nid2str(peer->ibp_nid));
- } else {
- CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
- libcfs_nid2str(peer->ibp_nid), error,
- list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
- list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
- list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
- list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
- list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
- }
-
- dev = ((struct kib_net *)peer->ibp_ni->ni_data)->ibn_dev;
- if (peer->ibp_next_conn == conn)
- /* clear next_conn so it won't be used */
- peer->ibp_next_conn = NULL;
- list_del(&conn->ibc_list);
- /* connd (see below) takes over ibc_list's ref */
-
- if (list_empty(&peer->ibp_conns) && /* no more conns */
- kiblnd_peer_active(peer)) { /* still in peer table */
- kiblnd_unlink_peer_locked(peer);
-
- /* set/clear error on last conn */
- peer->ibp_error = conn->ibc_comms_error;
- }
-
- kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
-
- if (error &&
- kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- wake_up(&kiblnd_data.kib_failover_waitq);
- }
-
- spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
-
- list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
- wake_up(&kiblnd_data.kib_connd_waitq);
-
- spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
-}
-
-void
-kiblnd_close_conn(struct kib_conn *conn, int error)
-{
- unsigned long flags;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_close_conn_locked(conn, error);
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-}
-
-static void
-kiblnd_handle_early_rxs(struct kib_conn *conn)
-{
- unsigned long flags;
- struct kib_rx *rx;
-
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- while (!list_empty(&conn->ibc_early_rxs)) {
- rx = list_entry(conn->ibc_early_rxs.next,
- struct kib_rx, rx_list);
- list_del(&rx->rx_list);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_handle_rx(rx);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- }
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-}
-
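-/* Fail every tx on 'txs' with -ECONNABORTED; txs not currently being sent
- * are completed immediately.
- */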
-static void
-kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
-{
- LIST_HEAD(zombies);
- struct list_head *tmp;
- struct list_head *nxt;
- struct kib_tx *tx;
-
- spin_lock(&conn->ibc_lock);
-
- list_for_each_safe(tmp, nxt, txs) {
- tx = list_entry(tmp, struct kib_tx, tx_list);
-
- if (txs == &conn->ibc_active_txs) {
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_waiting || tx->tx_sending);
- } else {
- LASSERT(tx->tx_queued);
- }
-
- tx->tx_status = -ECONNABORTED;
- tx->tx_waiting = 0;
-
- if (!tx->tx_sending) {
- tx->tx_queued = 0;
- list_del(&tx->tx_list);
- list_add(&tx->tx_list, &zombies);
- }
- }
-
- spin_unlock(&conn->ibc_lock);
-
- kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
-}
-
-static void
-kiblnd_finalise_conn(struct kib_conn *conn)
-{
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state > IBLND_CONN_INIT);
-
- kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
-
- /*
- * abort_receives moves QP state to IB_QPS_ERR. This is only required
- * for connections that didn't get as far as being connected, because
- * rdma_disconnect() does this for free.
- */
- kiblnd_abort_receives(conn);
-
- /*
- * Complete all tx descs not waiting for sends to complete.
- * NB we should be safe from RDMA now that the QP has changed state
- */
- kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
- kiblnd_abort_txs(conn, &conn->ibc_active_txs);
-
- kiblnd_handle_early_rxs(conn);
-}
-
-static void
-kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error)
-{
- LIST_HEAD(zombies);
- unsigned long flags;
-
- LASSERT(error);
- LASSERT(!in_interrupt());
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (active) {
- LASSERT(peer->ibp_connecting > 0);
- peer->ibp_connecting--;
- } else {
- LASSERT(peer->ibp_accepting > 0);
- peer->ibp_accepting--;
- }
-
- if (kiblnd_peer_connecting(peer)) {
- /* another connection attempt under way... */
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
- flags);
- return;
- }
-
- peer->ibp_reconnected = 0;
- if (list_empty(&peer->ibp_conns)) {
- /* Take peer's blocked transmits to complete with error */
- list_add(&zombies, &peer->ibp_tx_queue);
- list_del_init(&peer->ibp_tx_queue);
-
- if (kiblnd_peer_active(peer))
- kiblnd_unlink_peer_locked(peer);
-
- peer->ibp_error = error;
- } else {
- /* Can't have blocked transmits if there are connections */
- LASSERT(list_empty(&peer->ibp_tx_queue));
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_peer_notify(peer);
-
- if (list_empty(&zombies))
- return;
-
- CNETERR("Deleting messages for %s: connection failed\n",
- libcfs_nid2str(peer->ibp_nid));
-
- kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
-}
-
-static void
-kiblnd_connreq_done(struct kib_conn *conn, int status)
-{
- struct kib_peer *peer = conn->ibc_peer;
- struct kib_tx *tx;
- struct kib_tx *tmp;
- struct list_head txs;
- unsigned long flags;
- int active;
-
- active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-
- CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
- libcfs_nid2str(peer->ibp_nid), active,
- conn->ibc_version, status);
-
- LASSERT(!in_interrupt());
- LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
- peer->ibp_connecting > 0) ||
- (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
- peer->ibp_accepting > 0));
-
- kfree(conn->ibc_connvars);
- conn->ibc_connvars = NULL;
-
- if (status) {
- /* failed to establish connection */
- kiblnd_peer_connect_failed(peer, active, status);
- kiblnd_finalise_conn(conn);
- return;
- }
-
- /* connection established */
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- conn->ibc_last_send = jiffies;
- kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
- kiblnd_peer_alive(peer);
-
- /*
- * Add conn to peer's list and nuke any dangling conns from a different
- * peer instance...
- */
- kiblnd_conn_addref(conn); /* +1 ref for ibc_list */
- list_add(&conn->ibc_list, &peer->ibp_conns);
- peer->ibp_reconnected = 0;
- if (active)
- peer->ibp_connecting--;
- else
- peer->ibp_accepting--;
-
- if (!peer->ibp_version) {
- peer->ibp_version = conn->ibc_version;
- peer->ibp_incarnation = conn->ibc_incarnation;
- }
-
- if (peer->ibp_version != conn->ibc_version ||
- peer->ibp_incarnation != conn->ibc_incarnation) {
- kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
- conn->ibc_incarnation);
- peer->ibp_version = conn->ibc_version;
- peer->ibp_incarnation = conn->ibc_incarnation;
- }
-
- /* grab pending txs while I have the lock */
- list_add(&txs, &peer->ibp_tx_queue);
- list_del_init(&peer->ibp_tx_queue);
-
- if (!kiblnd_peer_active(peer) || /* peer has been deleted */
- conn->ibc_comms_error) { /* error has happened already */
- struct lnet_ni *ni = peer->ibp_ni;
-
- /* start to shut down connection */
- kiblnd_close_conn_locked(conn, -ECONNABORTED);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
-
- return;
- }
-
- /*
- * +1 ref for myself, this connection is visible to other threads
- * now, refcount of peer:ibp_conns can be released by connection
- * close from either a different thread, or the calling of
- * kiblnd_check_sends_locked() below. See bz21911 for details.
- */
- kiblnd_conn_addref(conn);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /* Schedule blocked txs
- * Note: if we are running with conns_per_peer > 1, these blocked
- * txs will all get scheduled to the first connection which gets
- * scheduled. We won't be using round robin on this first batch.
- */
- spin_lock(&conn->ibc_lock);
- list_for_each_entry_safe(tx, tmp, &txs, tx_list) {
- list_del(&tx->tx_list);
-
- kiblnd_queue_tx_locked(tx, conn);
- }
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- /* schedule blocked rxs */
- kiblnd_handle_early_rxs(conn);
-
- kiblnd_conn_decref(conn);
-}
-
-static void
-kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej)
-{
- int rc;
-
- rc = rdma_reject(cmid, rej, sizeof(*rej));
-
- if (rc)
- CWARN("Error %d sending reject\n", rc);
-}
-
-static int
-kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
-{
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- struct kib_msg *reqmsg = priv;
- struct kib_msg *ackmsg;
- struct kib_dev *ibdev;
- struct kib_peer *peer;
- struct kib_peer *peer2;
- struct kib_conn *conn;
- struct lnet_ni *ni = NULL;
- struct kib_net *net = NULL;
- lnet_nid_t nid;
- struct rdma_conn_param cp;
- struct kib_rej rej;
- int version = IBLND_MSG_VERSION;
- unsigned long flags;
- int max_frags;
- int rc;
- struct sockaddr_in *peer_addr;
-
- LASSERT(!in_interrupt());
-
- /* cmid inherits 'context' from the corresponding listener id */
- ibdev = (struct kib_dev *)cmid->context;
- LASSERT(ibdev);
-
- memset(&rej, 0, sizeof(rej));
- rej.ibr_magic = IBLND_MSG_MAGIC;
- rej.ibr_why = IBLND_REJECT_FATAL;
- rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr;
- if (*kiblnd_tunables.kib_require_priv_port &&
- ntohs(peer_addr->sin_port) >= PROT_SOCK) {
- __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
-
- CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
- &ip, ntohs(peer_addr->sin_port));
- goto failed;
- }
-
- if (priv_nob < offsetof(struct kib_msg, ibm_type)) {
- CERROR("Short connection request\n");
- goto failed;
- }
-
- /*
- * Future protocol version compatibility support! If the
- * o2iblnd-specific protocol changes, or when LNET unifies
- * protocols over all LNDs, the initial connection will
- * negotiate a protocol version. I trap this here to avoid
- * console errors; the reject tells the peer which protocol I
- * speak.
- */
- if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
- reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
- goto failed;
- if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
- reqmsg->ibm_version != IBLND_MSG_VERSION &&
- reqmsg->ibm_version != IBLND_MSG_VERSION_1)
- goto failed;
- if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
- reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
- reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
- goto failed;
-
- rc = kiblnd_unpack_msg(reqmsg, priv_nob);
- if (rc) {
- CERROR("Can't parse connection request: %d\n", rc);
- goto failed;
- }
-
- nid = reqmsg->ibm_srcnid;
- ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
-
- if (ni) {
- net = (struct kib_net *)ni->ni_data;
- rej.ibr_incarnation = net->ibn_incarnation;
- }
-
- if (!ni || /* no matching net */
- ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */
- net->ibn_dev != ibdev) { /* wrong device */
- CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n",
- libcfs_nid2str(nid),
- !ni ? "NA" : libcfs_nid2str(ni->ni_nid),
- ibdev->ibd_ifname, ibdev->ibd_nnets,
- &ibdev->ibd_ifip,
- libcfs_nid2str(reqmsg->ibm_dstnid));
-
- goto failed;
- }
-
- /* check time stamp as soon as possible */
- if (reqmsg->ibm_dststamp &&
- reqmsg->ibm_dststamp != net->ibn_incarnation) {
- CWARN("Stale connection request\n");
- rej.ibr_why = IBLND_REJECT_CONN_STALE;
- goto failed;
- }
-
- /* I can accept peer's version */
- version = reqmsg->ibm_version;
-
- if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
- CERROR("Unexpected connreq msg type: %x from %s\n",
- reqmsg->ibm_type, libcfs_nid2str(nid));
- goto failed;
- }
-
- if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
- kiblnd_msg_queue_size(version, ni)) {
- CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n",
- libcfs_nid2str(nid),
- reqmsg->ibm_u.connparams.ibcp_queue_depth,
- kiblnd_msg_queue_size(version, ni));
-
- if (version == IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
-
- goto failed;
- }
-
- max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
- if (max_frags > kiblnd_rdma_frags(version, ni)) {
-		CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n",
- libcfs_nid2str(nid), version, max_frags,
- kiblnd_rdma_frags(version, ni));
-
- if (version >= IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
-
- goto failed;
- } else if (max_frags < kiblnd_rdma_frags(version, ni) &&
- !net->ibn_fmr_ps) {
-		CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n",
- libcfs_nid2str(nid), version, max_frags,
- kiblnd_rdma_frags(version, ni));
-
- if (version == IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
-
- goto failed;
- }
-
- if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
- CERROR("Can't accept %s: message size %d too big (%d max)\n",
- libcfs_nid2str(nid),
- reqmsg->ibm_u.connparams.ibcp_max_msg_size,
- IBLND_MSG_SIZE);
- goto failed;
- }
-
- /* assume 'nid' is a new peer; create */
- rc = kiblnd_create_peer(ni, &peer, nid);
- if (rc) {
- CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
- rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
- goto failed;
- }
-
- /* We have validated the peer's parameters so use those */
- peer->ibp_max_frags = max_frags;
- peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
-
- write_lock_irqsave(g_lock, flags);
-
- peer2 = kiblnd_find_peer_locked(nid);
- if (peer2) {
- if (!peer2->ibp_version) {
- peer2->ibp_version = version;
- peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
- }
-
- /* not the guy I've talked with */
- if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
- peer2->ibp_version != version) {
- kiblnd_close_peer_conns_locked(peer2, -ESTALE);
-
- if (kiblnd_peer_active(peer2)) {
- peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
- peer2->ibp_version = version;
- }
- write_unlock_irqrestore(g_lock, flags);
-
- CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n",
- libcfs_nid2str(nid), peer2->ibp_version, version,
- peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
-
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_CONN_STALE;
- goto failed;
- }
-
- /*
- * Tie-break connection race in favour of the higher NID.
- * If we keep running into a race condition multiple times,
- * we have to assume that the connection attempt with the
- * higher NID is stuck in a connecting state and will never
- * recover. As such, we pass through this if-block and let
- * the lower NID connection win so we can move forward.
- */
- if (peer2->ibp_connecting &&
- nid < ni->ni_nid && peer2->ibp_races <
- MAX_CONN_RACES_BEFORE_ABORT) {
- peer2->ibp_races++;
- write_unlock_irqrestore(g_lock, flags);
-
- CDEBUG(D_NET, "Conn race %s\n",
- libcfs_nid2str(peer2->ibp_nid));
-
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_CONN_RACE;
- goto failed;
- }
- if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT)
- CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n",
- libcfs_nid2str(peer2->ibp_nid),
- MAX_CONN_RACES_BEFORE_ABORT);
-		/*
-		 * A passive connection is allowed even if this peer is
-		 * waiting for reconnection.
-		 */
- peer2->ibp_reconnecting = 0;
- peer2->ibp_races = 0;
- peer2->ibp_accepting++;
- kiblnd_peer_addref(peer2);
-
-		/*
-		 * We raced with kiblnd_launch_tx (active connect) to create
-		 * this peer, so copy the validated parameters now that we
-		 * know the peer's limits.
-		 */
- peer2->ibp_max_frags = peer->ibp_max_frags;
- peer2->ibp_queue_depth = peer->ibp_queue_depth;
-
- write_unlock_irqrestore(g_lock, flags);
- kiblnd_peer_decref(peer);
- peer = peer2;
- } else {
- /* Brand new peer */
- LASSERT(!peer->ibp_accepting);
- LASSERT(!peer->ibp_version &&
- !peer->ibp_incarnation);
-
- peer->ibp_accepting = 1;
- peer->ibp_version = version;
- peer->ibp_incarnation = reqmsg->ibm_srcstamp;
-
- /* I have a ref on ni that prevents it being shutdown */
- LASSERT(!net->ibn_shutdown);
-
- kiblnd_peer_addref(peer);
- list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
-
- write_unlock_irqrestore(g_lock, flags);
- }
-
- conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT,
- version);
- if (!conn) {
- kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
- goto failed;
- }
-
- /*
- * conn now "owns" cmid, so I return success from here on to ensure the
- * CM callback doesn't destroy cmid.
- */
- conn->ibc_incarnation = reqmsg->ibm_srcstamp;
- conn->ibc_credits = conn->ibc_queue_depth;
- conn->ibc_reserved_credits = conn->ibc_queue_depth;
- LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
- IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn));
-
- ackmsg = &conn->ibc_connvars->cv_msg;
- memset(ackmsg, 0, sizeof(*ackmsg));
-
- kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
- sizeof(ackmsg->ibm_u.connparams));
- ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
- ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
- ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
-
- memset(&cp, 0, sizeof(cp));
- cp.private_data = ackmsg;
- cp.private_data_len = ackmsg->ibm_nob;
- cp.responder_resources = 0; /* No atomic ops or RDMA reads */
- cp.initiator_depth = 0;
- cp.flow_control = 1;
- cp.retry_count = *kiblnd_tunables.kib_retry_count;
- cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
-
- CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
-
- rc = rdma_accept(cmid, &cp);
- if (rc) {
- CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
- rej.ibr_version = version;
- rej.ibr_why = IBLND_REJECT_FATAL;
-
- kiblnd_reject(cmid, &rej);
- kiblnd_connreq_done(conn, rc);
- kiblnd_conn_decref(conn);
- }
-
- lnet_ni_decref(ni);
- return 0;
-
- failed:
- if (ni) {
- rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
- rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
- lnet_ni_decref(ni);
- }
-
- rej.ibr_version = version;
- kiblnd_reject(cmid, &rej);
-
- return -ECONNREFUSED;
-}
-
-static void
-kiblnd_check_reconnect(struct kib_conn *conn, int version,
- __u64 incarnation, int why, struct kib_connparams *cp)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_peer *peer = conn->ibc_peer;
- char *reason;
- int msg_size = IBLND_MSG_SIZE;
- int frag_num = -1;
- int queue_dep = -1;
- bool reconnect;
- unsigned long flags;
-
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
- LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */
-
- if (cp) {
- msg_size = cp->ibcp_max_msg_size;
- frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT;
- queue_dep = cp->ibcp_queue_depth;
- }
-
- write_lock_irqsave(glock, flags);
-	/*
-	 * Retry the connection if it's still needed and no other connection
-	 * attempt (active or passive) is in progress.
-	 * NB: reconnect is still needed even when ibp_tx_queue is empty if
-	 * ibp_version != version, because the reconnect may be initiated by
-	 * kiblnd_query().
-	 */
- reconnect = (!list_empty(&peer->ibp_tx_queue) ||
- peer->ibp_version != version) &&
- peer->ibp_connecting &&
- !peer->ibp_accepting;
- if (!reconnect) {
- reason = "no need";
- goto out;
- }
-
- switch (why) {
- default:
- reason = "Unknown";
- break;
-
- case IBLND_REJECT_RDMA_FRAGS: {
- struct lnet_ioctl_config_lnd_tunables *tunables;
-
- if (!cp) {
- reason = "can't negotiate max frags";
- goto out;
- }
- tunables = peer->ibp_ni->ni_lnd_tunables;
- if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
- reason = "map_on_demand must be enabled";
- goto out;
- }
- if (conn->ibc_max_frags <= frag_num) {
- reason = "unsupported max frags";
- goto out;
- }
-
- peer->ibp_max_frags = frag_num;
- reason = "rdma fragments";
- break;
- }
- case IBLND_REJECT_MSG_QUEUE_SIZE:
- if (!cp) {
- reason = "can't negotiate queue depth";
- goto out;
- }
- if (conn->ibc_queue_depth <= queue_dep) {
- reason = "unsupported queue depth";
- goto out;
- }
-
- peer->ibp_queue_depth = queue_dep;
- reason = "queue depth";
- break;
-
- case IBLND_REJECT_CONN_STALE:
- reason = "stale";
- break;
-
- case IBLND_REJECT_CONN_RACE:
- reason = "conn race";
- break;
-
- case IBLND_REJECT_CONN_UNCOMPAT:
- reason = "version negotiation";
- break;
- }
-
- conn->ibc_reconnect = 1;
- peer->ibp_reconnecting++;
- peer->ibp_version = version;
- if (incarnation)
- peer->ibp_incarnation = incarnation;
-out:
- write_unlock_irqrestore(glock, flags);
-
- CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
- libcfs_nid2str(peer->ibp_nid),
- reconnect ? "reconnect" : "don't reconnect",
- reason, IBLND_MSG_VERSION, version, msg_size,
- conn->ibc_queue_depth, queue_dep,
- conn->ibc_max_frags, frag_num);
- /**
- * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer
- * while destroying the zombie
- */
-}
-
-static void
-kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
-{
- struct kib_peer *peer = conn->ibc_peer;
-
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-
- switch (reason) {
- case IB_CM_REJ_STALE_CONN:
- kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
- IBLND_REJECT_CONN_STALE, NULL);
- break;
-
- case IB_CM_REJ_INVALID_SERVICE_ID:
- CNETERR("%s rejected: no listener at %d\n",
- libcfs_nid2str(peer->ibp_nid),
- *kiblnd_tunables.kib_service);
- break;
-
- case IB_CM_REJ_CONSUMER_DEFINED:
- if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
- struct kib_rej *rej = priv;
- struct kib_connparams *cp = NULL;
- int flip = 0;
- __u64 incarnation = -1;
-
-			/*
-			 * NB. The default incarnation is -1 because:
-			 * a) V1 ignores the dst incarnation in the connreq.
-			 * b) V2 provides its incarnation while rejecting me,
-			 *    so the -1 will be overwritten.
-			 *
-			 * If I try to connect to a V1 peer with the V2
-			 * protocol, it rejects me and then upgrades to V2.
-			 * Knowing nothing about the upgrade, I reconnect with
-			 * V1; the upgraded V2 peer can then tell that I'm
-			 * talking to the old version and reject me
-			 * (incarnation is -1).
-			 */
-
- if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
- rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
- __swab32s(&rej->ibr_magic);
- __swab16s(&rej->ibr_version);
- flip = 1;
- }
-
- if (priv_nob >= sizeof(struct kib_rej) &&
- rej->ibr_version > IBLND_MSG_VERSION_1) {
-				/*
-				 * priv_nob is always 148 (the define of
-				 * IB_CM_REJ_PRIVATE_DATA_SIZE) in the current
-				 * version of OFED, so we still need to check
-				 * the version.
-				 */
- cp = &rej->ibr_cp;
-
- if (flip) {
- __swab64s(&rej->ibr_incarnation);
- __swab16s(&cp->ibcp_queue_depth);
- __swab16s(&cp->ibcp_max_frags);
- __swab32s(&cp->ibcp_max_msg_size);
- }
-
- incarnation = rej->ibr_incarnation;
- }
-
- if (rej->ibr_magic != IBLND_MSG_MAGIC &&
- rej->ibr_magic != LNET_PROTO_MAGIC) {
- CERROR("%s rejected: consumer defined fatal error\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
- }
-
- if (rej->ibr_version != IBLND_MSG_VERSION &&
- rej->ibr_version != IBLND_MSG_VERSION_1) {
- CERROR("%s rejected: o2iblnd version %x error\n",
- libcfs_nid2str(peer->ibp_nid),
- rej->ibr_version);
- break;
- }
-
- if (rej->ibr_why == IBLND_REJECT_FATAL &&
- rej->ibr_version == IBLND_MSG_VERSION_1) {
- CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
- libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
-
- if (conn->ibc_version != IBLND_MSG_VERSION_1)
- rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
- }
-
- switch (rej->ibr_why) {
- case IBLND_REJECT_CONN_RACE:
- case IBLND_REJECT_CONN_STALE:
- case IBLND_REJECT_CONN_UNCOMPAT:
- case IBLND_REJECT_MSG_QUEUE_SIZE:
- case IBLND_REJECT_RDMA_FRAGS:
- kiblnd_check_reconnect(conn, rej->ibr_version,
- incarnation,
- rej->ibr_why, cp);
- break;
-
- case IBLND_REJECT_NO_RESOURCES:
- CERROR("%s rejected: o2iblnd no resources\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
-
- case IBLND_REJECT_FATAL:
- CERROR("%s rejected: o2iblnd fatal error\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
-
- default:
- CERROR("%s rejected: o2iblnd reason %d\n",
- libcfs_nid2str(peer->ibp_nid),
- rej->ibr_why);
- break;
- }
- break;
- }
- /* fall through */
- default:
- CNETERR("%s rejected: reason %d, size %d\n",
- libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
- break;
- }
-
- kiblnd_connreq_done(conn, -ECONNREFUSED);
-}
-
-static void
-kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
-{
- struct kib_peer *peer = conn->ibc_peer;
- struct lnet_ni *ni = peer->ibp_ni;
- struct kib_net *net = ni->ni_data;
- struct kib_msg *msg = priv;
- int ver = conn->ibc_version;
- int rc = kiblnd_unpack_msg(msg, priv_nob);
- unsigned long flags;
-
- LASSERT(net);
-
- if (rc) {
- CERROR("Can't unpack connack from %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- goto failed;
- }
-
- if (msg->ibm_type != IBLND_MSG_CONNACK) {
- CERROR("Unexpected message %d from %s\n",
- msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
- rc = -EPROTO;
- goto failed;
- }
-
- if (ver != msg->ibm_version) {
-		CERROR("%s replied version %x which differs from the requested version %x\n",
- libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
- rc = -EPROTO;
- goto failed;
- }
-
- if (msg->ibm_u.connparams.ibcp_queue_depth >
- conn->ibc_queue_depth) {
- CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_queue_depth,
- conn->ibc_queue_depth);
- rc = -EPROTO;
- goto failed;
- }
-
- if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
- conn->ibc_max_frags) {
- CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
- conn->ibc_max_frags);
- rc = -EPROTO;
- goto failed;
- }
-
- if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
- CERROR("%s max message size %d too big (%d max)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_max_msg_size,
- IBLND_MSG_SIZE);
- rc = -EPROTO;
- goto failed;
- }
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- if (msg->ibm_dstnid == ni->ni_nid &&
- msg->ibm_dststamp == net->ibn_incarnation)
- rc = 0;
- else
- rc = -ESTALE;
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- if (rc) {
- CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc,
- msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
- goto failed;
- }
-
- conn->ibc_incarnation = msg->ibm_srcstamp;
- conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
- LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
- IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
-
- kiblnd_connreq_done(conn, 0);
- return;
-
- failed:
- /*
- * NB My QP has already established itself, so I handle anything going
- * wrong here by setting ibc_comms_error.
- * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
- * immediately tears it down.
- */
- LASSERT(rc);
- conn->ibc_comms_error = rc;
- kiblnd_connreq_done(conn, 0);
-}
-
-static int
-kiblnd_active_connect(struct rdma_cm_id *cmid)
-{
- struct kib_peer *peer = (struct kib_peer *)cmid->context;
- struct kib_conn *conn;
- struct kib_msg *msg;
- struct rdma_conn_param cp;
- int version;
- __u64 incarnation;
- unsigned long flags;
- int rc;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- incarnation = peer->ibp_incarnation;
- version = !peer->ibp_version ? IBLND_MSG_VERSION :
- peer->ibp_version;
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
- version);
- if (!conn) {
- kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
- kiblnd_peer_decref(peer); /* lose cmid's ref */
- return -ENOMEM;
- }
-
- /*
- * conn "owns" cmid now, so I return success from here on to ensure the
- * CM callback doesn't destroy cmid. conn also takes over cmid's ref
- * on peer
- */
- msg = &conn->ibc_connvars->cv_msg;
-
- memset(msg, 0, sizeof(*msg));
- kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
- msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
- msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
- msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- kiblnd_pack_msg(peer->ibp_ni, msg, version,
- 0, peer->ibp_nid, incarnation);
-
- memset(&cp, 0, sizeof(cp));
- cp.private_data = msg;
- cp.private_data_len = msg->ibm_nob;
- cp.responder_resources = 0; /* No atomic ops or RDMA reads */
- cp.initiator_depth = 0;
- cp.flow_control = 1;
- cp.retry_count = *kiblnd_tunables.kib_retry_count;
- cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
-
- LASSERT(cmid->context == (void *)conn);
- LASSERT(conn->ibc_cmid == cmid);
-
- rc = rdma_connect(cmid, &cp);
- if (rc) {
- CERROR("Can't connect to %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- kiblnd_connreq_done(conn, rc);
- kiblnd_conn_decref(conn);
- }
-
- return 0;
-}
-
-int
-kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
-{
- struct kib_peer *peer;
- struct kib_conn *conn;
- int rc;
-
- switch (event->event) {
- default:
- CERROR("Unexpected event: %d, status: %d\n",
- event->event, event->status);
- LBUG();
-
- case RDMA_CM_EVENT_CONNECT_REQUEST:
- /* destroy cmid on failure */
- rc = kiblnd_passive_connect(cmid,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- CDEBUG(D_NET, "connreq: %d\n", rc);
- return rc;
-
- case RDMA_CM_EVENT_ADDR_ERROR:
- peer = (struct kib_peer *)cmid->context;
- CNETERR("%s: ADDR ERROR %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
- kiblnd_peer_decref(peer);
- return -EHOSTUNREACH; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ADDR_RESOLVED:
- peer = (struct kib_peer *)cmid->context;
-
- CDEBUG(D_NET, "%s Addr resolved: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
-
- if (event->status) {
- CNETERR("Can't resolve address for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- rc = event->status;
- } else {
- rc = rdma_resolve_route(
- cmid, *kiblnd_tunables.kib_timeout * 1000);
- if (!rc) {
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct kib_dev *dev = net->ibn_dev;
-
-				CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n",
-				       libcfs_nid2str(peer->ibp_nid),
-				       dev->ibd_ifname,
-				       &dev->ibd_ifip, cmid->device->name);
-
- return 0;
- }
-
- /* Can't initiate route resolution */
- CERROR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- }
- kiblnd_peer_connect_failed(peer, 1, rc);
- kiblnd_peer_decref(peer);
- return rc; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ROUTE_ERROR:
- peer = (struct kib_peer *)cmid->context;
- CNETERR("%s: ROUTE ERROR %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
- kiblnd_peer_decref(peer);
- return -EHOSTUNREACH; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ROUTE_RESOLVED:
- peer = (struct kib_peer *)cmid->context;
- CDEBUG(D_NET, "%s Route resolved: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
-
- if (!event->status)
- return kiblnd_active_connect(cmid);
-
- CNETERR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, event->status);
- kiblnd_peer_decref(peer);
- return event->status; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_UNREACHABLE:
- conn = (struct kib_conn *)cmid->context;
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
- conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
- CNETERR("%s: UNREACHABLE %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
- kiblnd_connreq_done(conn, -ENETDOWN);
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_CONNECT_ERROR:
- conn = (struct kib_conn *)cmid->context;
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
- conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
- CNETERR("%s: CONNECT ERROR %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
- kiblnd_connreq_done(conn, -ENOTCONN);
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_REJECTED:
- conn = (struct kib_conn *)cmid->context;
- switch (conn->ibc_state) {
- default:
- LBUG();
-
- case IBLND_CONN_PASSIVE_WAIT:
- CERROR("%s: REJECTED %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- event->status);
- kiblnd_connreq_done(conn, -ECONNRESET);
- break;
-
- case IBLND_CONN_ACTIVE_CONNECT:
- kiblnd_rejected(conn, event->status,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- break;
- }
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_ESTABLISHED:
- conn = (struct kib_conn *)cmid->context;
- switch (conn->ibc_state) {
- default:
- LBUG();
-
- case IBLND_CONN_PASSIVE_WAIT:
- CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_connreq_done(conn, 0);
- break;
-
- case IBLND_CONN_ACTIVE_CONNECT:
- CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_check_connreply(conn,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- break;
- }
- /* net keeps its ref on conn! */
- return 0;
-
- case RDMA_CM_EVENT_TIMEWAIT_EXIT:
- CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
- return 0;
- case RDMA_CM_EVENT_DISCONNECTED:
- conn = (struct kib_conn *)cmid->context;
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- CERROR("%s DISCONNECTED\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_connreq_done(conn, -ECONNRESET);
- } else {
- kiblnd_close_conn(conn, 0);
- }
- kiblnd_conn_decref(conn);
- cmid->context = NULL;
- return 0;
-
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- LCONSOLE_ERROR_MSG(0x131,
- "Received notification of device removal\n"
- "Please shutdown LNET to allow this to proceed\n");
- /*
- * Can't remove network from underneath LNET for now, so I have
- * to ignore this
- */
- return 0;
-
- case RDMA_CM_EVENT_ADDR_CHANGE:
- LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
- return 0;
- }
-}
-
-static int
-kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
-{
- struct kib_tx *tx;
- struct list_head *ttmp;
-
- list_for_each(ttmp, txs) {
- tx = list_entry(ttmp, struct kib_tx, tx_list);
-
- if (txs != &conn->ibc_active_txs) {
- LASSERT(tx->tx_queued);
- } else {
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_waiting || tx->tx_sending);
- }
-
- if (time_after_eq(jiffies, tx->tx_deadline)) {
- CERROR("Timed out tx: %s, %lu seconds\n",
- kiblnd_queue2str(conn, txs),
- (jiffies - tx->tx_deadline) / HZ);
- return 1;
- }
- }
-
- return 0;
-}
-
-static int
-kiblnd_conn_timed_out_locked(struct kib_conn *conn)
-{
- return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
-}
-
-static void
-kiblnd_check_conns(int idx)
-{
- LIST_HEAD(closes);
- LIST_HEAD(checksends);
- struct list_head *peers = &kiblnd_data.kib_peers[idx];
- struct list_head *ptmp;
- struct kib_peer *peer;
- struct kib_conn *conn;
- struct kib_conn *temp;
- struct kib_conn *tmp;
- struct list_head *ctmp;
- unsigned long flags;
-
- /*
- * NB. We expect to have a look at all the peers and not find any
- * RDMAs to time out, so we just use a shared lock while we
- * take a look...
- */
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- list_for_each(ptmp, peers) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
-
- list_for_each(ctmp, &peer->ibp_conns) {
- int timedout;
- int sendnoop;
-
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
-
- spin_lock(&conn->ibc_lock);
-
- sendnoop = kiblnd_need_noop(conn);
- timedout = kiblnd_conn_timed_out_locked(conn);
- if (!sendnoop && !timedout) {
- spin_unlock(&conn->ibc_lock);
- continue;
- }
-
- if (timedout) {
- CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n",
- libcfs_nid2str(peer->ibp_nid),
- (jiffies - peer->ibp_last_alive) / HZ,
- conn->ibc_credits,
- conn->ibc_outstanding_credits,
- conn->ibc_reserved_credits);
- list_add(&conn->ibc_connd_list, &closes);
- } else {
- list_add(&conn->ibc_connd_list, &checksends);
- }
- /* +ref for 'closes' or 'checksends' */
- kiblnd_conn_addref(conn);
-
- spin_unlock(&conn->ibc_lock);
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /*
- * Handle timeout by closing the whole
- * connection. We can only be sure RDMA activity
- * has ceased once the QP has been modified.
- */
- list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) {
- list_del(&conn->ibc_connd_list);
- kiblnd_close_conn(conn, -ETIMEDOUT);
- kiblnd_conn_decref(conn);
- }
-
- /*
- * In case we have enough credits to return via a
- * NOOP, but there were no non-blocking tx descs
- * free to do it last time...
- */
- list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) {
- list_del(&conn->ibc_connd_list);
-
- spin_lock(&conn->ibc_lock);
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- kiblnd_conn_decref(conn);
- }
-}
-
-static void
-kiblnd_disconnect_conn(struct kib_conn *conn)
-{
- LASSERT(!in_interrupt());
- LASSERT(current == kiblnd_data.kib_connd);
- LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
-
- rdma_disconnect(conn->ibc_cmid);
- kiblnd_finalise_conn(conn);
-
- kiblnd_peer_notify(conn->ibc_peer);
-}
-
-/*
- * High-water mark for reconnection to the same peer; reconnection attempts
- * should be delayed after racing more than KIB_RECONN_HIGH_RACE times.
- */
-#define KIB_RECONN_HIGH_RACE 10
-/**
- * Allow connd to take a break and handle other things after consecutive
- * reconnection attempts.
- */
-#define KIB_RECONN_BREAK 100
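-
-/*
- * Taken together: each pass of kiblnd_connd() issues at most KIB_RECONN_BREAK
- * reconnection attempts, and a peer that has raced more than
- * KIB_RECONN_HIGH_RACE times is parked on kib_reconn_wait, which is spliced
- * back onto kib_reconn_list at most once per second.
- */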
-
-int
-kiblnd_connd(void *arg)
-{
- spinlock_t *lock = &kiblnd_data.kib_connd_lock;
- wait_queue_entry_t wait;
- unsigned long flags;
- struct kib_conn *conn;
- int timeout;
- int i;
- int dropped_lock;
- int peer_index = 0;
- unsigned long deadline = jiffies;
-
- init_waitqueue_entry(&wait, current);
- kiblnd_data.kib_connd = current;
-
- spin_lock_irqsave(lock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- int reconn = 0;
-
- dropped_lock = 0;
-
- if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
- struct kib_peer *peer = NULL;
-
- conn = list_entry(kiblnd_data.kib_connd_zombies.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
- if (conn->ibc_reconnect) {
- peer = conn->ibc_peer;
- kiblnd_peer_addref(peer);
- }
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- kiblnd_destroy_conn(conn);
-
- spin_lock_irqsave(lock, flags);
- if (!peer) {
- kfree(conn);
- continue;
- }
-
- conn->ibc_peer = peer;
- if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE)
- list_add_tail(&conn->ibc_list,
- &kiblnd_data.kib_reconn_list);
- else
- list_add_tail(&conn->ibc_list,
- &kiblnd_data.kib_reconn_wait);
- }
-
- if (!list_empty(&kiblnd_data.kib_connd_conns)) {
- conn = list_entry(kiblnd_data.kib_connd_conns.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- kiblnd_disconnect_conn(conn);
- kiblnd_conn_decref(conn);
-
- spin_lock_irqsave(lock, flags);
- }
-
- while (reconn < KIB_RECONN_BREAK) {
- if (kiblnd_data.kib_reconn_sec !=
- ktime_get_real_seconds()) {
- kiblnd_data.kib_reconn_sec = ktime_get_real_seconds();
- list_splice_init(&kiblnd_data.kib_reconn_wait,
- &kiblnd_data.kib_reconn_list);
- }
-
- if (list_empty(&kiblnd_data.kib_reconn_list))
- break;
-
- conn = list_entry(kiblnd_data.kib_reconn_list.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- reconn += kiblnd_reconnect_peer(conn->ibc_peer);
- kiblnd_peer_decref(conn->ibc_peer);
- kfree(conn);
-
- spin_lock_irqsave(lock, flags);
- }
-
- /* careful with the jiffy wrap... */
- timeout = (int)(deadline - jiffies);
- if (timeout <= 0) {
- const int n = 4;
- const int p = 1;
- int chunk = kiblnd_data.kib_peer_hash_size;
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- /*
- * Time to check for RDMA timeouts on a few more
- * peers: I do checks every 'p' seconds on a
- * proportion of the peer table and I need to check
- * every connection 'n' times within a timeout
- * interval, to ensure I detect a timeout on any
- * connection within (n+1)/n times the timeout
- * interval.
- */
- if (*kiblnd_tunables.kib_timeout > n * p)
- chunk = (chunk * n * p) /
- *kiblnd_tunables.kib_timeout;
- if (!chunk)
- chunk = 1;
-
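-			/*
-			 * Illustrative numbers, assuming the default timeout
-			 * of 50 seconds: with n = 4 and p = 1 the chunk is
-			 * hash_size * 4 / 50, so the whole peer table is
-			 * covered roughly every timeout / n seconds.
-			 */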
- for (i = 0; i < chunk; i++) {
- kiblnd_check_conns(peer_index);
- peer_index = (peer_index + 1) %
- kiblnd_data.kib_peer_hash_size;
- }
-
- deadline += msecs_to_jiffies(p * MSEC_PER_SEC);
- spin_lock_irqsave(lock, flags);
- }
-
- if (dropped_lock)
- continue;
-
- /* Nothing to do for 'timeout' */
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
- spin_unlock_irqrestore(lock, flags);
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
- spin_lock_irqsave(lock, flags);
- }
-
- spin_unlock_irqrestore(lock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
-
-void
-kiblnd_qp_event(struct ib_event *event, void *arg)
-{
- struct kib_conn *conn = arg;
-
- switch (event->event) {
- case IB_EVENT_COMM_EST:
- CDEBUG(D_NET, "%s established\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
-		/*
-		 * We received a packet but the connection isn't established
-		 * yet; the handshake packet was probably lost, so it is safe
-		 * to force the connection into the established state.
-		 */
- rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
- return;
-
- default:
- CERROR("%s: Async QP event type %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
- return;
- }
-}
-
-static void
-kiblnd_complete(struct ib_wc *wc)
-{
- switch (kiblnd_wreqid2type(wc->wr_id)) {
- default:
- LBUG();
-
- case IBLND_WID_MR:
- if (wc->status != IB_WC_SUCCESS &&
- wc->status != IB_WC_WR_FLUSH_ERR)
- CNETERR("FastReg failed: %d\n", wc->status);
- break;
-
- case IBLND_WID_RDMA:
- /*
- * We only get RDMA completion notification if it fails. All
- * subsequent work items, including the final SEND will fail
- * too. However we can't print out any more info about the
- * failing RDMA because 'tx' might be back on the idle list or
- * even reused already if we didn't manage to post all our work
- * items
- */
- CNETERR("RDMA (tx: %p) failed: %d\n",
- kiblnd_wreqid2ptr(wc->wr_id), wc->status);
- return;
-
- case IBLND_WID_TX:
- kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
- return;
-
- case IBLND_WID_RX:
- kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
- wc->byte_len);
- return;
- }
-}
-
-void
-kiblnd_cq_completion(struct ib_cq *cq, void *arg)
-{
- /*
- * NB I'm not allowed to schedule this conn once its refcount has
- * reached 0. Since fundamentally I'm racing with scheduler threads
- * consuming my CQ I could be called after all completions have
- * occurred. But in this case, !ibc_nrx && !ibc_nsends_posted
- * and this CQ is about to be destroyed so I NOOP.
- */
- struct kib_conn *conn = arg;
- struct kib_sched_info *sched = conn->ibc_sched;
- unsigned long flags;
-
- LASSERT(cq == conn->ibc_cq);
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- conn->ibc_ready = 1;
-
- if (!conn->ibc_scheduled &&
- (conn->ibc_nrx > 0 ||
- conn->ibc_nsends_posted > 0)) {
- kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
- conn->ibc_scheduled = 1;
- list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
-
- if (waitqueue_active(&sched->ibs_waitq))
- wake_up(&sched->ibs_waitq);
- }
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-}
-
-void
-kiblnd_cq_event(struct ib_event *event, void *arg)
-{
- struct kib_conn *conn = arg;
-
- CERROR("%s: async CQ event type %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
-}
-
-int
-kiblnd_scheduler(void *arg)
-{
- long id = (long)arg;
- struct kib_sched_info *sched;
- struct kib_conn *conn;
- wait_queue_entry_t wait;
- unsigned long flags;
- struct ib_wc wc;
- int did_something;
- int busy_loops = 0;
- int rc;
-
- init_waitqueue_entry(&wait, current);
-
- sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
-
- rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
- if (rc) {
-		CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might be at risk of low performance\n",
- sched->ibs_cpt);
- }
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- if (busy_loops++ >= IBLND_RESCHED) {
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- cond_resched();
- busy_loops = 0;
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- did_something = 0;
-
- if (!list_empty(&sched->ibs_conns)) {
- conn = list_entry(sched->ibs_conns.next, struct kib_conn,
- ibc_sched_list);
- /* take over kib_sched_conns' ref on conn... */
- LASSERT(conn->ibc_scheduled);
- list_del(&conn->ibc_sched_list);
- conn->ibc_ready = 0;
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- wc.wr_id = IBLND_WID_INVAL;
-
- rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
- if (!rc) {
- rc = ib_req_notify_cq(conn->ibc_cq,
- IB_CQ_NEXT_COMP);
- if (rc < 0) {
- CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- kiblnd_close_conn(conn, -EIO);
- kiblnd_conn_decref(conn);
- spin_lock_irqsave(&sched->ibs_lock,
- flags);
- continue;
- }
-
- rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
- }
-
- if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
- LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n",
- rc, wc.opcode, wc.status,
- wc.vendor_err,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- conn->ibc_state);
- rc = -EINVAL;
- }
-
- if (rc < 0) {
- CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc);
- kiblnd_close_conn(conn, -EIO);
- kiblnd_conn_decref(conn);
- spin_lock_irqsave(&sched->ibs_lock, flags);
- continue;
- }
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- if (rc || conn->ibc_ready) {
- /*
- * There may be another completion waiting; get
- * another scheduler to check while I handle
- * this one...
- */
- /* +1 ref for sched_conns */
- kiblnd_conn_addref(conn);
- list_add_tail(&conn->ibc_sched_list,
- &sched->ibs_conns);
- if (waitqueue_active(&sched->ibs_waitq))
- wake_up(&sched->ibs_waitq);
- } else {
- conn->ibc_scheduled = 0;
- }
-
- if (rc) {
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
- kiblnd_complete(&wc);
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- kiblnd_conn_decref(conn); /* ...drop my ref from above */
- did_something = 1;
- }
-
- if (did_something)
- continue;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- schedule();
- busy_loops = 0;
-
- remove_wait_queue(&sched->ibs_waitq, &wait);
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
-
-int
-kiblnd_failover_thread(void *arg)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_dev *dev;
- wait_queue_entry_t wait;
- unsigned long flags;
- int rc;
-
- LASSERT(*kiblnd_tunables.kib_dev_failover);
-
- init_waitqueue_entry(&wait, current);
- write_lock_irqsave(glock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- int do_failover = 0;
- int long_sleep;
-
- list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
- ibd_fail_list) {
- if (time_before(jiffies,
- dev->ibd_next_failover))
- continue;
- do_failover = 1;
- break;
- }
-
- if (do_failover) {
- list_del_init(&dev->ibd_fail_list);
- dev->ibd_failover = 1;
- write_unlock_irqrestore(glock, flags);
-
- rc = kiblnd_dev_failover(dev);
-
- write_lock_irqsave(glock, flags);
-
- LASSERT(dev->ibd_failover);
- dev->ibd_failover = 0;
-			if (rc >= 0) { /* Device is OK or failover succeeded */
- dev->ibd_next_failover = jiffies + 3 * HZ;
- continue;
- }
-
- /* failed to failover, retry later */
- dev->ibd_next_failover =
- jiffies + min(dev->ibd_failed_failover, 10) * HZ;
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- }
-
- continue;
- }
-
- /* long sleep if no more pending failover */
- long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
- write_unlock_irqrestore(glock, flags);
-
- rc = schedule_timeout(long_sleep ? 10 * HZ :
- HZ);
- remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
- write_lock_irqsave(glock, flags);
-
- if (!long_sleep || rc)
- continue;
-
-		/*
-		 * After a long sleep, routinely check all active devices.
-		 * We need a check like this because, with no active
-		 * connection on a dev and no SENDs from the local node, we
-		 * may keep listening on the wrong HCA forever after a
-		 * bonding failover.
-		 */
- list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- }
- }
- }
-
- write_unlock_irqrestore(glock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
deleted file mode 100644
index 39d0792..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd_modparams.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "o2iblnd.h"
-
-static int service = 987;
-module_param(service, int, 0444);
-MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)");
-
-static int cksum;
-module_param(cksum, int, 0644);
-MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums");
-
-static int timeout = 50;
-module_param(timeout, int, 0644);
-MODULE_PARM_DESC(timeout, "timeout (seconds)");
-
-/*
- * Number of threads in each scheduler pool (which is per-CPT);
- * if set to zero, a reasonable value is estimated from the number of CPUs.
- */
-static int nscheds;
-module_param(nscheds, int, 0444);
-MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool");
-
-static unsigned int conns_per_peer = 1;
-module_param(conns_per_peer, uint, 0444);
-MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int ntx = 512;
-module_param(ntx, int, 0444);
-MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool");
-
-/* NB: this value is shared by all CPTs */
-static int credits = 256;
-module_param(credits, int, 0444);
-MODULE_PARM_DESC(credits, "# concurrent sends");
-
-static int peer_credits = 8;
-module_param(peer_credits, int, 0444);
-MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
-
-static int peer_credits_hiw;
-module_param(peer_credits_hiw, int, 0444);
-MODULE_PARM_DESC(peer_credits_hiw, "when to eagerly return credits");
-
-static int peer_buffer_credits;
-module_param(peer_buffer_credits, int, 0444);
-MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
-
-static int peer_timeout = 180;
-module_param(peer_timeout, int, 0444);
-MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
-
-static char *ipif_name = "ib0";
-module_param(ipif_name, charp, 0444);
-MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
-
-static int retry_count = 5;
-module_param(retry_count, int, 0644);
-MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
-
-static int rnr_retry_count = 6;
-module_param(rnr_retry_count, int, 0644);
-MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions");
-
-static int keepalive = 100;
-module_param(keepalive, int, 0644);
-MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive");
-
-static int ib_mtu;
-module_param(ib_mtu, int, 0444);
-MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096");
-
-static int concurrent_sends;
-module_param(concurrent_sends, int, 0444);
-MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
-
-#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS
-static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
-module_param(map_on_demand, int, 0444);
-MODULE_PARM_DESC(map_on_demand, "map on demand");
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int fmr_pool_size = 512;
-module_param(fmr_pool_size, int, 0444);
-MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)");
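-
-/*
- * For example, with the default ntx of 512 the suggested minimum pool size is
- * ntx / 4 = 128, so the default of 512 satisfies it comfortably.
- */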
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int fmr_flush_trigger = 384;
-module_param(fmr_flush_trigger, int, 0444);
-MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush");
-
-static int fmr_cache = 1;
-module_param(fmr_cache, int, 0444);
-MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching");
-
-/*
- * 0: disable failover
- * 1: enable failover if necessary
- * 2: force to failover (for debug)
- */
-static int dev_failover;
-module_param(dev_failover, int, 0444);
-MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)");
-
-static int require_privileged_port;
-module_param(require_privileged_port, int, 0644);
-MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection");
-
-static int use_privileged_port = 1;
-module_param(use_privileged_port, int, 0644);
-MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
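-
-/*
- * All of the above can be overridden at module load time; for example (the
- * module name is assumed to be ko2iblnd):
- *
- *     options ko2iblnd timeout=100 peer_credits=16
- */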
-
-struct kib_tunables kiblnd_tunables = {
- .kib_dev_failover = &dev_failover,
- .kib_service = &service,
- .kib_cksum = &cksum,
- .kib_timeout = &timeout,
- .kib_keepalive = &keepalive,
- .kib_ntx = &ntx,
- .kib_default_ipif = &ipif_name,
- .kib_retry_count = &retry_count,
- .kib_rnr_retry_count = &rnr_retry_count,
- .kib_ib_mtu = &ib_mtu,
- .kib_require_priv_port = &require_privileged_port,
- .kib_use_priv_port = &use_privileged_port,
- .kib_nscheds = &nscheds
-};
-
-static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
-
-/* # messages/RDMAs in-flight */
-int kiblnd_msg_queue_size(int version, struct lnet_ni *ni)
-{
- if (version == IBLND_MSG_VERSION_1)
- return IBLND_MSG_QUEUE_SIZE_V1;
- else if (ni)
- return ni->ni_peertxcredits;
- else
- return peer_credits;
-}
-
-int kiblnd_tunables_setup(struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-
-	/*
-	 * If no tunables were specified, set them up with the default
-	 * values.
-	 */
- if (!ni->ni_lnd_tunables) {
- ni->ni_lnd_tunables = kzalloc(sizeof(*ni->ni_lnd_tunables),
- GFP_NOFS);
- if (!ni->ni_lnd_tunables)
- return -ENOMEM;
-
- memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib,
- &default_tunables, sizeof(*tunables));
- }
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- /* Current API version */
- tunables->lnd_version = 0;
-
- if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
- CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
- *kiblnd_tunables.kib_ib_mtu);
- return -EINVAL;
- }
-
- if (!ni->ni_peertimeout)
- ni->ni_peertimeout = peer_timeout;
-
- if (!ni->ni_maxtxcredits)
- ni->ni_maxtxcredits = credits;
-
- if (!ni->ni_peertxcredits)
- ni->ni_peertxcredits = peer_credits;
-
- if (!ni->ni_peerrtrcredits)
- ni->ni_peerrtrcredits = peer_buffer_credits;
-
- if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT)
- ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT;
-
- if (ni->ni_peertxcredits > IBLND_CREDITS_MAX)
- ni->ni_peertxcredits = IBLND_CREDITS_MAX;
-
- if (ni->ni_peertxcredits > credits)
- ni->ni_peertxcredits = credits;
-
- if (!tunables->lnd_peercredits_hiw)
- tunables->lnd_peercredits_hiw = peer_credits_hiw;
-
- if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2)
- tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2;
-
- if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits)
- tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1;
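-
-	/*
-	 * Worked example, assuming ni_peertxcredits ends up at the default
-	 * peer_credits of 8: an unset peer_credits_hiw of 0 is first raised
-	 * to 8 / 2 = 4, which is below the 8 - 1 = 7 cap, so 4 is used.
-	 */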
-
- if (tunables->lnd_map_on_demand <= 0 ||
- tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
- /* Use the default */
- CWARN("Invalid map_on_demand (%d), expects 1 - %d. Using default of %d\n",
- tunables->lnd_map_on_demand,
- IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND);
- tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
- }
-
- if (tunables->lnd_map_on_demand == 1) {
-		/* it makes no sense to create a map for only one fragment */
- tunables->lnd_map_on_demand = 2;
- }
-
- if (!tunables->lnd_concurrent_sends) {
- if (tunables->lnd_map_on_demand > 0 &&
- tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
- tunables->lnd_concurrent_sends =
- ni->ni_peertxcredits * 2;
- } else {
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits;
- }
- }
-
- if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2)
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2;
-
- if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2)
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2;
-
- if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) {
- CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n",
- tunables->lnd_concurrent_sends, ni->ni_peertxcredits);
- }
-
- if (!tunables->lnd_fmr_pool_size)
- tunables->lnd_fmr_pool_size = fmr_pool_size;
- if (!tunables->lnd_fmr_flush_trigger)
- tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
- if (!tunables->lnd_fmr_cache)
- tunables->lnd_fmr_cache = fmr_cache;
- if (!tunables->lnd_conns_per_peer) {
- tunables->lnd_conns_per_peer = (conns_per_peer) ?
- conns_per_peer : 1;
- }
-
- return 0;
-}
-
-void kiblnd_tunables_init(void)
-{
- default_tunables.lnd_version = 0;
-	default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
- default_tunables.lnd_map_on_demand = map_on_demand;
- default_tunables.lnd_concurrent_sends = concurrent_sends;
- default_tunables.lnd_fmr_pool_size = fmr_pool_size;
- default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
- default_tunables.lnd_fmr_cache = fmr_cache;
- default_tunables.lnd_conns_per_peer = conns_per_peer;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
deleted file mode 100644
index a7da1ab..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LNET) += ksocklnd.o
-
-ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib.o
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
deleted file mode 100644
index f01b34a..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ /dev/null
@@ -1,2921 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/socklnd/socklnd.c
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "socklnd.h"
-
-static struct lnet_lnd the_ksocklnd;
-struct ksock_nal_data ksocknal_data;
-
-static struct ksock_interface *
-ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- struct ksock_interface *iface;
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- LASSERT(i < LNET_MAX_INTERFACES);
- iface = &net->ksnn_interfaces[i];
-
- if (iface->ksni_ipaddr == ip)
- return iface;
- }
-
- return NULL;
-}
-
-static struct ksock_route *
-ksocknal_create_route(__u32 ipaddr, int port)
-{
- struct ksock_route *route;
-
- route = kzalloc(sizeof(*route), GFP_NOFS);
- if (!route)
- return NULL;
-
- atomic_set(&route->ksnr_refcount, 1);
- route->ksnr_peer = NULL;
- route->ksnr_retry_interval = 0; /* OK to connect at any time */
- route->ksnr_ipaddr = ipaddr;
- route->ksnr_port = port;
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
- route->ksnr_connected = 0;
- route->ksnr_deleted = 0;
- route->ksnr_conn_count = 0;
- route->ksnr_share_count = 0;
-
- return route;
-}
-
-void
-ksocknal_destroy_route(struct ksock_route *route)
-{
- LASSERT(!atomic_read(&route->ksnr_refcount));
-
- if (route->ksnr_peer)
- ksocknal_peer_decref(route->ksnr_peer);
-
- kfree(route);
-}
-
-static int
-ksocknal_create_peer(struct ksock_peer **peerp, struct lnet_ni *ni,
- struct lnet_process_id id)
-{
- int cpt = lnet_cpt_of_nid(id.nid);
- struct ksock_net *net = ni->ni_data;
- struct ksock_peer *peer;
-
- LASSERT(id.nid != LNET_NID_ANY);
- LASSERT(id.pid != LNET_PID_ANY);
- LASSERT(!in_interrupt());
-
- peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt);
- if (!peer)
- return -ENOMEM;
-
- peer->ksnp_ni = ni;
- peer->ksnp_id = id;
- atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */
- peer->ksnp_closing = 0;
- peer->ksnp_accepting = 0;
- peer->ksnp_proto = NULL;
- peer->ksnp_last_alive = 0;
- peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
-
- INIT_LIST_HEAD(&peer->ksnp_conns);
- INIT_LIST_HEAD(&peer->ksnp_routes);
- INIT_LIST_HEAD(&peer->ksnp_tx_queue);
- INIT_LIST_HEAD(&peer->ksnp_zc_req_list);
- spin_lock_init(&peer->ksnp_lock);
-
- spin_lock_bh(&net->ksnn_lock);
-
- if (net->ksnn_shutdown) {
- spin_unlock_bh(&net->ksnn_lock);
-
- kfree(peer);
- CERROR("Can't create peer: network shutdown\n");
- return -ESHUTDOWN;
- }
-
- net->ksnn_npeers++;
-
- spin_unlock_bh(&net->ksnn_lock);
-
- *peerp = peer;
- return 0;
-}
-
-void
-ksocknal_destroy_peer(struct ksock_peer *peer)
-{
- struct ksock_net *net = peer->ksnp_ni->ni_data;
-
- CDEBUG(D_NET, "peer %s %p deleted\n",
- libcfs_id2str(peer->ksnp_id), peer);
-
- LASSERT(!atomic_read(&peer->ksnp_refcount));
- LASSERT(!peer->ksnp_accepting);
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
- LASSERT(list_empty(&peer->ksnp_tx_queue));
- LASSERT(list_empty(&peer->ksnp_zc_req_list));
-
- kfree(peer);
-
- /*
- * NB a peer's connections and routes keep a reference on their peer
- * until they are destroyed, so we can be assured that _all_ state to
- * do with this peer has been cleaned up when its refcount drops to
- * zero.
- */
- spin_lock_bh(&net->ksnn_lock);
- net->ksnn_npeers--;
- spin_unlock_bh(&net->ksnn_lock);
-}
-
-struct ksock_peer *
-ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
- struct ksock_peer *peer;
-
- list_for_each_entry(peer, peer_list, ksnp_list) {
- LASSERT(!peer->ksnp_closing);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (peer->ksnp_id.nid != id.nid ||
- peer->ksnp_id.pid != id.pid)
- continue;
-
- CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
- peer, libcfs_id2str(id),
- atomic_read(&peer->ksnp_refcount));
- return peer;
- }
- return NULL;
-}
-
-struct ksock_peer *
-ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct ksock_peer *peer;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) /* +1 ref for caller? */
- ksocknal_peer_addref(peer);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- return peer;
-}
-
-static void
-ksocknal_unlink_peer_locked(struct ksock_peer *peer)
-{
- int i;
- __u32 ip;
- struct ksock_interface *iface;
-
- for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
- LASSERT(i < LNET_MAX_INTERFACES);
- ip = peer->ksnp_passive_ips[i];
-
- iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
- /*
- * All IPs in peer->ksnp_passive_ips[] come from the
- * interface list, therefore the call must succeed.
- */
- LASSERT(iface);
-
- CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
- peer, iface, iface->ksni_nroutes);
- iface->ksni_npeers--;
- }
-
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
- LASSERT(!peer->ksnp_closing);
- peer->ksnp_closing = 1;
- list_del(&peer->ksnp_list);
- /* lose peerlist's ref */
- ksocknal_peer_decref(peer);
-}
-
-static int
-ksocknal_get_peer_info(struct lnet_ni *ni, int index,
- struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip,
- int *port, int *conn_count, int *share_count)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct ksock_route *route;
- struct list_head *rtmp;
- int i;
- int j;
- int rc = -ENOENT;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (!peer->ksnp_n_passive_ips &&
- list_empty(&peer->ksnp_routes)) {
- if (index-- > 0)
- continue;
-
- *id = peer->ksnp_id;
- *myip = 0;
- *peer_ip = 0;
- *port = 0;
- *conn_count = 0;
- *share_count = 0;
- rc = 0;
- goto out;
- }
-
- for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
- if (index-- > 0)
- continue;
-
- *id = peer->ksnp_id;
- *myip = peer->ksnp_passive_ips[j];
- *peer_ip = 0;
- *port = 0;
- *conn_count = 0;
- *share_count = 0;
- rc = 0;
- goto out;
- }
-
- list_for_each(rtmp, &peer->ksnp_routes) {
- if (index-- > 0)
- continue;
-
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- *id = peer->ksnp_id;
- *myip = route->ksnr_myipaddr;
- *peer_ip = route->ksnr_ipaddr;
- *port = route->ksnr_port;
- *conn_count = route->ksnr_conn_count;
- *share_count = route->ksnr_share_count;
- rc = 0;
- goto out;
- }
- }
- }
- out:
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return rc;
-}
-
-static void
-ksocknal_associate_route_conn_locked(struct ksock_route *route,
- struct ksock_conn *conn)
-{
- struct ksock_peer *peer = route->ksnr_peer;
- int type = conn->ksnc_type;
- struct ksock_interface *iface;
-
- conn->ksnc_route = route;
- ksocknal_route_addref(route);
-
- if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
- if (!route->ksnr_myipaddr) {
- /* route wasn't bound locally yet (the initial route) */
- CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &conn->ksnc_myipaddr);
- } else {
- CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &route->ksnr_myipaddr,
- &conn->ksnc_myipaddr);
-
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes--;
- }
- route->ksnr_myipaddr = conn->ksnc_myipaddr;
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes++;
- }
-
- route->ksnr_connected |= (1 << type);
- route->ksnr_conn_count++;
-
- /*
- * Successful connection => further attempts can
- * proceed immediately
- */
- route->ksnr_retry_interval = 0;
-}
-
-static void
-ksocknal_add_route_locked(struct ksock_peer *peer, struct ksock_route *route)
-{
- struct list_head *tmp;
- struct ksock_conn *conn;
- struct ksock_route *route2;
-
- LASSERT(!peer->ksnp_closing);
- LASSERT(!route->ksnr_peer);
- LASSERT(!route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
- LASSERT(!route->ksnr_connected);
-
- /* LASSERT(unique) */
- list_for_each(tmp, &peer->ksnp_routes) {
- route2 = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
- CERROR("Duplicate route %s %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr);
- LBUG();
- }
- }
-
- route->ksnr_peer = peer;
- ksocknal_peer_addref(peer);
- /* peer's routelist takes over my ref on 'route' */
- list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
-
- list_for_each(tmp, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
- continue;
-
- ksocknal_associate_route_conn_locked(route, conn);
- /* keep going (typed routes) */
- }
-}
-
-static void
-ksocknal_del_route_locked(struct ksock_route *route)
-{
- struct ksock_peer *peer = route->ksnr_peer;
- struct ksock_interface *iface;
- struct ksock_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
-
- LASSERT(!route->ksnr_deleted);
-
- /* Close associated conns */
- list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_route != route)
- continue;
-
- ksocknal_close_conn_locked(conn, 0);
- }
-
- if (route->ksnr_myipaddr) {
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes--;
- }
-
- route->ksnr_deleted = 1;
- list_del(&route->ksnr_list);
- ksocknal_route_decref(route); /* drop peer's ref */
-
- if (list_empty(&peer->ksnp_routes) &&
- list_empty(&peer->ksnp_conns)) {
- /*
- * I've just removed the last route to a peer with no active
- * connections
- */
- ksocknal_unlink_peer_locked(peer);
- }
-}
-
-int
-ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr,
- int port)
-{
- struct ksock_peer *peer;
- struct ksock_peer *peer2;
- struct ksock_route *route;
- struct ksock_route *route2;
- int rc;
-
- if (id.nid == LNET_NID_ANY ||
- id.pid == LNET_PID_ANY)
- return -EINVAL;
-
- /* Have a brand new peer ready... */
- rc = ksocknal_create_peer(&peer, ni, id);
- if (rc)
- return rc;
-
- route = ksocknal_create_route(ipaddr, port);
- if (!route) {
- ksocknal_peer_decref(peer);
- return -ENOMEM;
- }
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- /* always called with a ref on ni, so shutdown can't have started */
- LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown);
-
- peer2 = ksocknal_find_peer_locked(ni, id);
- if (peer2) {
- ksocknal_peer_decref(peer);
- peer = peer2;
- } else {
- /* peer table takes my ref on peer */
- list_add_tail(&peer->ksnp_list,
- ksocknal_nid2peerlist(id.nid));
- }
-
- list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) {
- if (route2->ksnr_ipaddr == ipaddr) {
- /* Route already exists, use the old one */
- ksocknal_route_decref(route);
- route2->ksnr_share_count++;
- goto out;
- }
- }
- /* Route doesn't already exist, add the new one */
- ksocknal_add_route_locked(peer, route);
- route->ksnr_share_count++;
-out:
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return 0;
-}
-
-static void
-ksocknal_del_peer_locked(struct ksock_peer *peer, __u32 ip)
-{
- struct ksock_conn *conn;
- struct ksock_route *route;
- struct list_head *tmp;
- struct list_head *nxt;
- int nshared;
-
- LASSERT(!peer->ksnp_closing);
-
- /* Extra ref prevents peer disappearing until I'm done with it */
- ksocknal_peer_addref(peer);
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- /* no match */
- if (!(!ip || route->ksnr_ipaddr == ip))
- continue;
-
- route->ksnr_share_count = 0;
- /* This deletes associated conns too */
- ksocknal_del_route_locked(route);
- }
-
- nshared = 0;
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
- nshared += route->ksnr_share_count;
- }
-
- if (!nshared) {
- /*
- * remove everything else if there are no explicit entries
- * left
- */
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- /* we should only be removing auto-entries */
- LASSERT(!route->ksnr_share_count);
- ksocknal_del_route_locked(route);
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- ksocknal_close_conn_locked(conn, 0);
- }
- }
-
- ksocknal_peer_decref(peer);
- /* NB peer unlinks itself when last conn/route is removed */
-}
-
-static int
-ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
-{
- LIST_HEAD(zombies);
- struct list_head *ptmp;
- struct list_head *pnxt;
- struct ksock_peer *peer;
- int lo;
- int hi;
- int i;
- int rc = -ENOENT;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (id.nid != LNET_NID_ANY) {
- lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- } else {
- lo = 0;
- hi = ksocknal_data.ksnd_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
- (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
- continue;
-
- ksocknal_peer_addref(peer); /* a ref for me... */
-
- ksocknal_del_peer_locked(peer, ip);
-
- if (peer->ksnp_closing &&
- !list_empty(&peer->ksnp_tx_queue)) {
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
-
- list_splice_init(&peer->ksnp_tx_queue,
- &zombies);
- }
-
- ksocknal_peer_decref(peer); /* ...till here */
-
- rc = 0; /* matched! */
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_txlist_done(ni, &zombies, 1);
-
- return rc;
-}
-
-static struct ksock_conn *
-ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct ksock_conn *conn;
- struct list_head *ctmp;
- int i;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- LASSERT(!peer->ksnp_closing);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- list_for_each(ctmp, &peer->ksnp_conns) {
- if (index-- > 0)
- continue;
-
- conn = list_entry(ctmp, struct ksock_conn,
- ksnc_list);
- ksocknal_conn_addref(conn);
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return conn;
- }
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return NULL;
-}
-
-static struct ksock_sched *
-ksocknal_choose_scheduler_locked(unsigned int cpt)
-{
- struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
- struct ksock_sched *sched;
- int i;
-
- LASSERT(info->ksi_nthreads > 0);
-
- sched = &info->ksi_scheds[0];
- /*
- * NB: it's safe so far, but info->ksi_nthreads could be changed
- * at runtime when we have dynamic LNet configuration, then we
- * need to take care of this.
- */
- for (i = 1; i < info->ksi_nthreads; i++) {
- if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
- sched = &info->ksi_scheds[i];
- }
-
- return sched;
-}
-
-static int
-ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- int nip;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- nip = net->ksnn_ninterfaces;
- LASSERT(nip <= LNET_MAX_INTERFACES);
-
- /*
- * Only offer interfaces for additional connections if I have
- * more than one.
- */
- if (nip < 2) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return 0;
- }
-
- for (i = 0; i < nip; i++) {
- ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
- LASSERT(ipaddrs[i]);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return nip;
-}
-
-static int
-ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips)
-{
- int best_netmatch = 0;
- int best_xor = 0;
- int best = -1;
- int this_xor;
- int this_netmatch;
- int i;
-
- for (i = 0; i < nips; i++) {
- if (!ips[i])
- continue;
-
- this_xor = ips[i] ^ iface->ksni_ipaddr;
- this_netmatch = !(this_xor & iface->ksni_netmask) ? 1 : 0;
-
- if (!(best < 0 ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_xor > this_xor)))
- continue;
-
- best = i;
- best_netmatch = this_netmatch;
- best_xor = this_xor;
- }
-
- LASSERT(best >= 0);
- return best;
-}
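The selection above prefers a peer address on the same subnet as the local interface and breaks ties by the smallest XOR distance. A stand-alone user-space sketch of that heuristic follows; the sample addresses are hypothetical, the distance is compared as unsigned rather than as the driver's signed int, and the skip of already-consumed zero entries is omitted.

/*
 * Illustrative sketch only: mirrors the prefer-same-subnet,
 * smallest-XOR-distance selection of ksocknal_match_peerip();
 * names and sample addresses are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static int closest_ip(uint32_t local, uint32_t netmask,
		      const uint32_t *peers, int npeers)
{
	int best = -1, best_netmatch = 0;
	uint32_t best_xor = 0;

	for (int i = 0; i < npeers; i++) {
		uint32_t x = peers[i] ^ local;
		int netmatch = !(x & netmask);

		if (best < 0 || netmatch > best_netmatch ||
		    (netmatch == best_netmatch && x < best_xor)) {
			best = i;
			best_netmatch = netmatch;
			best_xor = x;
		}
	}
	return best;
}

int main(void)
{
	uint32_t local = 0xC0A8010A;	/* 192.168.1.10 */
	uint32_t mask = 0xFFFFFF00;	/* /24 */
	uint32_t peers[] = { 0xC0A80105,	/* 192.168.1.5: same subnet */
			     0x0A000001 };	/* 10.0.0.1: different net */

	printf("best peer index: %d\n", closest_ip(local, mask, peers, 2));
	return 0;	/* prints 0: the same-subnet address wins */
}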
-
-static int
-ksocknal_select_ips(struct ksock_peer *peer, __u32 *peerips, int n_peerips)
-{
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- struct ksock_net *net = peer->ksnp_ni->ni_data;
- struct ksock_interface *iface;
- struct ksock_interface *best_iface;
- int n_ips;
- int i;
- int j;
- int k;
- __u32 ip;
- __u32 xor;
- int this_netmatch;
- int best_netmatch;
- int best_npeers;
-
- /*
- * CAVEAT EMPTOR: We do all our interface matching with an
- * exclusive hold of global lock at IRQ priority. We're only
- * expecting to be dealing with small numbers of interfaces, so the
- * O(n**3)-ness shouldn't matter
- */
- /*
- * Also note that I'm not going to return more than n_peerips
- * interfaces, even if I have more myself
- */
- write_lock_bh(global_lock);
-
- LASSERT(n_peerips <= LNET_MAX_INTERFACES);
- LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
-
- /*
- * Only match interfaces for additional connections
- * if I have > 1 interface
- */
- n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
- min(n_peerips, net->ksnn_ninterfaces);
-
- for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
- /* ^ yes really... */
-
- /*
- * If we have any new interfaces, first tick off all the
- * peer IPs that match old interfaces, then choose new
-		 * interfaces to match the remaining peer IPs.
- * We don't forget interfaces we've stopped using; we might
- * start using them again...
- */
- if (i < peer->ksnp_n_passive_ips) {
- /* Old interface. */
- ip = peer->ksnp_passive_ips[i];
- best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
-
- /* peer passive ips are kept up to date */
- LASSERT(best_iface);
- } else {
- /* choose a new interface */
- LASSERT(i == peer->ksnp_n_passive_ips);
-
- best_iface = NULL;
- best_netmatch = 0;
- best_npeers = 0;
-
- for (j = 0; j < net->ksnn_ninterfaces; j++) {
- iface = &net->ksnn_interfaces[j];
- ip = iface->ksni_ipaddr;
-
- for (k = 0; k < peer->ksnp_n_passive_ips; k++)
- if (peer->ksnp_passive_ips[k] == ip)
- break;
-
- if (k < peer->ksnp_n_passive_ips) /* using it already */
- continue;
-
- k = ksocknal_match_peerip(iface, peerips,
- n_peerips);
- xor = ip ^ peerips[k];
- this_netmatch = !(xor & iface->ksni_netmask) ? 1 : 0;
-
- if (!(!best_iface ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_npeers > iface->ksni_npeers)))
- continue;
-
- best_iface = iface;
- best_netmatch = this_netmatch;
- best_npeers = iface->ksni_npeers;
- }
-
- LASSERT(best_iface);
-
- best_iface->ksni_npeers++;
- ip = best_iface->ksni_ipaddr;
- peer->ksnp_passive_ips[i] = ip;
- peer->ksnp_n_passive_ips = i + 1;
- }
-
- /* mark the best matching peer IP used */
- j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
- peerips[j] = 0;
- }
-
- /* Overwrite input peer IP addresses */
- memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
-
- write_unlock_bh(global_lock);
-
- return n_ips;
-}
-
-static void
-ksocknal_create_routes(struct ksock_peer *peer, int port,
- __u32 *peer_ipaddrs, int npeer_ipaddrs)
-{
- struct ksock_route *newroute = NULL;
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- struct lnet_ni *ni = peer->ksnp_ni;
- struct ksock_net *net = ni->ni_data;
- struct list_head *rtmp;
- struct ksock_route *route;
- struct ksock_interface *iface;
- struct ksock_interface *best_iface;
- int best_netmatch;
- int this_netmatch;
- int best_nroutes;
- int i;
- int j;
-
- /*
- * CAVEAT EMPTOR: We do all our interface matching with an
- * exclusive hold of global lock at IRQ priority. We're only
- * expecting to be dealing with small numbers of interfaces, so the
- * O(n**3)-ness here shouldn't matter
- */
- write_lock_bh(global_lock);
-
- if (net->ksnn_ninterfaces < 2) {
- /*
- * Only create additional connections
- * if I have > 1 interface
- */
- write_unlock_bh(global_lock);
- return;
- }
-
- LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES);
-
- for (i = 0; i < npeer_ipaddrs; i++) {
- if (newroute) {
- newroute->ksnr_ipaddr = peer_ipaddrs[i];
- } else {
- write_unlock_bh(global_lock);
-
- newroute = ksocknal_create_route(peer_ipaddrs[i], port);
- if (!newroute)
- return;
-
- write_lock_bh(global_lock);
- }
-
- if (peer->ksnp_closing) {
- /* peer got closed under me */
- break;
- }
-
- /* Already got a route? */
- route = NULL;
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
- break;
-
- route = NULL;
- }
- if (route)
- continue;
-
- best_iface = NULL;
- best_nroutes = 0;
- best_netmatch = 0;
-
- LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
-
- /* Select interface to connect from */
- for (j = 0; j < net->ksnn_ninterfaces; j++) {
- iface = &net->ksnn_interfaces[j];
-
- /* Using this interface already? */
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- if (route->ksnr_myipaddr == iface->ksni_ipaddr)
- break;
-
- route = NULL;
- }
- if (route)
- continue;
-
- this_netmatch = (!((iface->ksni_ipaddr ^
- newroute->ksnr_ipaddr) &
- iface->ksni_netmask)) ? 1 : 0;
-
- if (!(!best_iface ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_nroutes > iface->ksni_nroutes)))
- continue;
-
- best_iface = iface;
- best_netmatch = this_netmatch;
- best_nroutes = iface->ksni_nroutes;
- }
-
- if (!best_iface)
- continue;
-
- newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
- best_iface->ksni_nroutes++;
-
- ksocknal_add_route_locked(peer, newroute);
- newroute = NULL;
- }
-
- write_unlock_bh(global_lock);
- if (newroute)
- ksocknal_route_decref(newroute);
-}
-
-int
-ksocknal_accept(struct lnet_ni *ni, struct socket *sock)
-{
- struct ksock_connreq *cr;
- int rc;
- __u32 peer_ip;
- int peer_port;
-
- rc = lnet_sock_getaddr(sock, 1, &peer_ip, &peer_port);
- LASSERT(!rc); /* we succeeded before */
-
- cr = kzalloc(sizeof(*cr), GFP_NOFS);
- if (!cr) {
- LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n",
- &peer_ip);
- return -ENOMEM;
- }
-
- lnet_ni_addref(ni);
- cr->ksncr_ni = ni;
- cr->ksncr_sock = sock;
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
-
- list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
- wake_up(&ksocknal_data.ksnd_connd_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
- return 0;
-}
-
-static int
-ksocknal_connecting(struct ksock_peer *peer, __u32 ipaddr)
-{
- struct ksock_route *route;
-
- list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
- if (route->ksnr_ipaddr == ipaddr)
- return route->ksnr_connecting;
- }
- return 0;
-}
-
-int
-ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route,
- struct socket *sock, int type)
-{
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- LIST_HEAD(zombies);
- struct lnet_process_id peerid;
- struct list_head *tmp;
- __u64 incarnation;
- struct ksock_conn *conn;
- struct ksock_conn *conn2;
- struct ksock_peer *peer = NULL;
- struct ksock_peer *peer2;
- struct ksock_sched *sched;
- struct ksock_hello_msg *hello;
- int cpt;
- struct ksock_tx *tx;
- struct ksock_tx *txtmp;
- int rc;
- int active;
- char *warn = NULL;
-
- active = !!route;
-
- LASSERT(active == (type != SOCKLND_CONN_NONE));
-
- conn = kzalloc(sizeof(*conn), GFP_NOFS);
- if (!conn) {
- rc = -ENOMEM;
- goto failed_0;
- }
-
- conn->ksnc_peer = NULL;
- conn->ksnc_route = NULL;
- conn->ksnc_sock = sock;
- /*
-	 * Two refs: one for the conn, plus an extra ref that prevents the
-	 * socket being closed before the connection is established
- */
- atomic_set(&conn->ksnc_sock_refcount, 2);
- conn->ksnc_type = type;
- ksocknal_lib_save_callback(sock, conn);
- atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
-
- conn->ksnc_rx_ready = 0;
- conn->ksnc_rx_scheduled = 0;
-
- INIT_LIST_HEAD(&conn->ksnc_tx_queue);
- conn->ksnc_tx_ready = 0;
- conn->ksnc_tx_scheduled = 0;
- conn->ksnc_tx_carrier = NULL;
- atomic_set(&conn->ksnc_tx_nob, 0);
-
- hello = kvzalloc(offsetof(struct ksock_hello_msg,
- kshm_ips[LNET_MAX_INTERFACES]),
- GFP_KERNEL);
- if (!hello) {
- rc = -ENOMEM;
- goto failed_1;
- }
-
- /* stash conn's local and remote addrs */
- rc = ksocknal_lib_get_conn_addrs(conn);
- if (rc)
- goto failed_1;
-
- /*
- * Find out/confirm peer's NID and connection type and get the
- * vector of interfaces she's willing to let me connect to.
- * Passive connections use the listener timeout since the peer sends
- * eagerly
- */
- if (active) {
- peer = route->ksnr_peer;
- LASSERT(ni == peer->ksnp_ni);
-
- /* Active connection sends HELLO eagerly */
- hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
- peerid = peer->ksnp_id;
-
- write_lock_bh(global_lock);
- conn->ksnc_proto = peer->ksnp_proto;
- write_unlock_bh(global_lock);
-
- if (!conn->ksnc_proto) {
- conn->ksnc_proto = &ksocknal_protocol_v3x;
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 2)
- conn->ksnc_proto = &ksocknal_protocol_v2x;
- else if (*ksocknal_tunables.ksnd_protocol == 1)
- conn->ksnc_proto = &ksocknal_protocol_v1x;
-#endif
- }
-
- rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
- if (rc)
- goto failed_1;
- } else {
- peerid.nid = LNET_NID_ANY;
- peerid.pid = LNET_PID_ANY;
-
- /* Passive, get protocol from peer */
- conn->ksnc_proto = NULL;
- }
-
- rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation);
- if (rc < 0)
- goto failed_1;
-
- LASSERT(!rc || active);
- LASSERT(conn->ksnc_proto);
- LASSERT(peerid.nid != LNET_NID_ANY);
-
- cpt = lnet_cpt_of_nid(peerid.nid);
-
- if (active) {
- ksocknal_peer_addref(peer);
- write_lock_bh(global_lock);
- } else {
- rc = ksocknal_create_peer(&peer, ni, peerid);
- if (rc)
- goto failed_1;
-
- write_lock_bh(global_lock);
-
- /* called with a ref on ni, so shutdown can't have started */
- LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown);
-
- peer2 = ksocknal_find_peer_locked(ni, peerid);
- if (!peer2) {
- /*
- * NB this puts an "empty" peer in the peer
- * table (which takes my ref)
- */
- list_add_tail(&peer->ksnp_list,
- ksocknal_nid2peerlist(peerid.nid));
- } else {
- ksocknal_peer_decref(peer);
- peer = peer2;
- }
-
- /* +1 ref for me */
- ksocknal_peer_addref(peer);
- peer->ksnp_accepting++;
-
- /*
- * Am I already connecting to this guy? Resolve in
- * favour of higher NID...
- */
- if (peerid.nid < ni->ni_nid &&
- ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
- rc = EALREADY;
- warn = "connection race resolution";
- goto failed_2;
- }
- }
-
- if (peer->ksnp_closing ||
- (active && route->ksnr_deleted)) {
- /* peer/route got closed under me */
- rc = -ESTALE;
- warn = "peer/route removed";
- goto failed_2;
- }
-
- if (!peer->ksnp_proto) {
- /*
- * Never connected before.
- * NB recv_hello may have returned EPROTO to signal my peer
- * wants a different protocol than the one I asked for.
- */
- LASSERT(list_empty(&peer->ksnp_conns));
-
- peer->ksnp_proto = conn->ksnc_proto;
- peer->ksnp_incarnation = incarnation;
- }
-
- if (peer->ksnp_proto != conn->ksnc_proto ||
- peer->ksnp_incarnation != incarnation) {
- /* Peer rebooted or I've got the wrong protocol version */
- ksocknal_close_peer_conns_locked(peer, 0, 0);
-
- peer->ksnp_proto = NULL;
- rc = ESTALE;
- warn = peer->ksnp_incarnation != incarnation ?
- "peer rebooted" :
- "wrong proto version";
- goto failed_2;
- }
-
- switch (rc) {
- default:
- LBUG();
- case 0:
- break;
- case EALREADY:
- warn = "lost conn race";
- goto failed_2;
- case EPROTO:
- warn = "retry with different protocol version";
- goto failed_2;
- }
-
- /*
- * Refuse to duplicate an existing connection, unless this is a
- * loopback connection
- */
- if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
- list_for_each(tmp, &peer->ksnp_conns) {
- conn2 = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
- conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
- conn2->ksnc_type != conn->ksnc_type)
- continue;
-
- /*
- * Reply on a passive connection attempt so the peer
- * realises we're connected.
- */
- LASSERT(!rc);
- if (!active)
- rc = EALREADY;
-
- warn = "duplicate";
- goto failed_2;
- }
- }
-
- /*
- * If the connection created by this route didn't bind to the IP
- * address the route connected to, the connection/route matching
- * code below probably isn't going to work.
- */
- if (active &&
- route->ksnr_ipaddr != conn->ksnc_ipaddr) {
- CERROR("Route %s %pI4h connected to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &conn->ksnc_ipaddr);
- }
-
- /*
- * Search for a route corresponding to the new connection and
- * create an association. This allows incoming connections created
- * by routes in my peer to match my own route entries so I don't
- * continually create duplicate routes.
- */
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
- continue;
-
- ksocknal_associate_route_conn_locked(route, conn);
- break;
- }
-
- conn->ksnc_peer = peer; /* conn takes my ref on peer */
- peer->ksnp_last_alive = jiffies;
- peer->ksnp_send_keepalive = 0;
- peer->ksnp_error = 0;
-
- sched = ksocknal_choose_scheduler_locked(cpt);
- sched->kss_nconns++;
- conn->ksnc_scheduler = sched;
-
- conn->ksnc_tx_last_post = jiffies;
- /* Set the deadline for the outgoing HELLO to drain */
- conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued;
- conn->ksnc_tx_deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- mb(); /* order with adding to peer's conn list */
-
- list_add(&conn->ksnc_list, &peer->ksnp_conns);
- ksocknal_conn_addref(conn);
-
- ksocknal_new_packet(conn, 0);
-
- conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
-
- /* Take packets blocking for this connection. */
- list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
- int match = conn->ksnc_proto->pro_match_tx(conn, tx,
- tx->tx_nonblk);
-
- if (match == SOCKNAL_MATCH_NO)
- continue;
-
- list_del(&tx->tx_list);
- ksocknal_queue_tx_locked(tx, conn);
- }
-
- write_unlock_bh(global_lock);
-
- /*
- * We've now got a new connection. Any errors from here on are just
- * like "normal" comms errors and we close the connection normally.
- * NB (a) we still have to send the reply HELLO for passive
- * connections,
-	 * (b) normal I/O on the conn is blocked until I set up and call the
- * socket callbacks.
- */
- CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n",
- libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
- &conn->ksnc_myipaddr, &conn->ksnc_ipaddr,
- conn->ksnc_port, incarnation, cpt,
- (int)(sched - &sched->kss_info->ksi_scheds[0]));
-
- if (active) {
- /* additional routes after interface exchange? */
- ksocknal_create_routes(peer, conn->ksnc_port,
- hello->kshm_ips, hello->kshm_nips);
- } else {
- hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
- hello->kshm_nips);
- rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
- }
-
- kvfree(hello);
-
- /*
- * setup the socket AFTER I've received hello (it disables
- * SO_LINGER). I might call back to the acceptor who may want
- * to send a protocol version response and then close the
- * socket; this ensures the socket only tears down after the
- * response has been sent.
- */
- if (!rc)
- rc = ksocknal_lib_setup_sock(sock);
-
- write_lock_bh(global_lock);
-
- /* NB my callbacks block while I hold ksnd_global_lock */
- ksocknal_lib_set_callback(sock, conn);
-
- if (!active)
- peer->ksnp_accepting--;
-
- write_unlock_bh(global_lock);
-
- if (rc) {
- write_lock_bh(global_lock);
- if (!conn->ksnc_closing) {
- /* could be closed by another thread */
- ksocknal_close_conn_locked(conn, rc);
- }
- write_unlock_bh(global_lock);
- } else if (!ksocknal_connsock_addref(conn)) {
- /* Allow I/O to proceed. */
- ksocknal_read_callback(conn);
- ksocknal_write_callback(conn);
- ksocknal_connsock_decref(conn);
- }
-
- ksocknal_connsock_decref(conn);
- ksocknal_conn_decref(conn);
- return rc;
-
- failed_2:
- if (!peer->ksnp_closing &&
- list_empty(&peer->ksnp_conns) &&
- list_empty(&peer->ksnp_routes)) {
- list_add(&zombies, &peer->ksnp_tx_queue);
- list_del_init(&peer->ksnp_tx_queue);
- ksocknal_unlink_peer_locked(peer);
- }
-
- write_unlock_bh(global_lock);
-
- if (warn) {
- if (rc < 0)
- CERROR("Not creating conn %s type %d: %s\n",
- libcfs_id2str(peerid), conn->ksnc_type, warn);
- else
- CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
- libcfs_id2str(peerid), conn->ksnc_type, warn);
- }
-
- if (!active) {
- if (rc > 0) {
- /*
-			 * Request retry by replying with CONN_NONE;
- * ksnc_proto has been set already
- */
- conn->ksnc_type = SOCKLND_CONN_NONE;
- hello->kshm_nips = 0;
- ksocknal_send_hello(ni, conn, peerid.nid, hello);
- }
-
- write_lock_bh(global_lock);
- peer->ksnp_accepting--;
- write_unlock_bh(global_lock);
- }
-
- ksocknal_txlist_done(ni, &zombies, 1);
- ksocknal_peer_decref(peer);
-
-failed_1:
- kvfree(hello);
-
- kfree(conn);
-
-failed_0:
- sock_release(sock);
- return rc;
-}
-
-void
-ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
-{
- /*
-	 * This just does the immediate housekeeping, and queues the
- * connection for the reaper to terminate.
- * Caller holds ksnd_global_lock exclusively in irq context
- */
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_route *route;
- struct ksock_conn *conn2;
- struct list_head *tmp;
-
- LASSERT(!peer->ksnp_error);
- LASSERT(!conn->ksnc_closing);
- conn->ksnc_closing = 1;
-
- /* ksnd_deathrow_conns takes over peer's ref */
- list_del(&conn->ksnc_list);
-
- route = conn->ksnc_route;
- if (route) {
- /* dissociate conn from route... */
- LASSERT(!route->ksnr_deleted);
- LASSERT(route->ksnr_connected & (1 << conn->ksnc_type));
-
- conn2 = NULL;
- list_for_each(tmp, &peer->ksnp_conns) {
- conn2 = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn2->ksnc_route == route &&
- conn2->ksnc_type == conn->ksnc_type)
- break;
-
- conn2 = NULL;
- }
- if (!conn2)
- route->ksnr_connected &= ~(1 << conn->ksnc_type);
-
- conn->ksnc_route = NULL;
-
- ksocknal_route_decref(route); /* drop conn's ref on route */
- }
-
- if (list_empty(&peer->ksnp_conns)) {
- /* No more connections to this peer */
-
- if (!list_empty(&peer->ksnp_tx_queue)) {
- struct ksock_tx *tx;
-
- LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
-
- /*
- * throw them to the last connection...,
-			 * these TXs will be sent to /dev/null by the scheduler
- */
- list_for_each_entry(tx, &peer->ksnp_tx_queue,
- tx_list)
- ksocknal_tx_prep(conn, tx);
-
- spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
- list_splice_init(&peer->ksnp_tx_queue,
- &conn->ksnc_tx_queue);
- spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
- }
-
- peer->ksnp_proto = NULL; /* renegotiate protocol version */
- peer->ksnp_error = error; /* stash last conn close reason */
-
- if (list_empty(&peer->ksnp_routes)) {
- /*
- * I've just closed last conn belonging to a
- * peer with no routes to it
- */
- ksocknal_unlink_peer_locked(peer);
- }
- }
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- list_add_tail(&conn->ksnc_list,
- &ksocknal_data.ksnd_deathrow_conns);
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-}
-
-void
-ksocknal_peer_failed(struct ksock_peer *peer)
-{
- int notify = 0;
- unsigned long last_alive = 0;
-
- /*
- * There has been a connection failure or comms error; but I'll only
- * tell LNET I think the peer is dead if it's to another kernel and
- * there are no connections or connection attempts in existence.
- */
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- if (!(peer->ksnp_id.pid & LNET_PID_USERFLAG) &&
- list_empty(&peer->ksnp_conns) &&
- !peer->ksnp_accepting &&
- !ksocknal_find_connecting_route_locked(peer)) {
- notify = 1;
- last_alive = peer->ksnp_last_alive;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (notify)
- lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0,
- last_alive);
-}
-
-void
-ksocknal_finalize_zcreq(struct ksock_conn *conn)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
- struct ksock_tx *tmp;
- LIST_HEAD(zlist);
-
- /*
- * NB safe to finalize TXs because closing of socket will
- * abort all buffered data
- */
- LASSERT(!conn->ksnc_sock);
-
- spin_lock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
- if (tx->tx_conn != conn)
- continue;
-
- LASSERT(tx->tx_msg.ksm_zc_cookies[0]);
-
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_zc_aborted = 1; /* mark it as not-acked */
- list_del(&tx->tx_zc_list);
- list_add(&tx->tx_zc_list, &zlist);
- }
-
- spin_unlock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) {
- list_del(&tx->tx_zc_list);
- ksocknal_tx_decref(tx);
- }
-}
-
-void
-ksocknal_terminate_conn(struct ksock_conn *conn)
-{
- /*
- * This gets called by the reaper (guaranteed thread context) to
- * disengage the socket from its callbacks and close it.
- * ksnc_refcount will eventually hit zero, and then the reaper will
- * destroy it.
- */
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_sched *sched = conn->ksnc_scheduler;
- int failed = 0;
-
- LASSERT(conn->ksnc_closing);
-
- /* wake up the scheduler to "send" all remaining packets to /dev/null */
- spin_lock_bh(&sched->kss_lock);
-
- /* a closing conn is always ready to tx */
- conn->ksnc_tx_ready = 1;
-
- if (!conn->ksnc_tx_scheduled &&
- !list_empty(&conn->ksnc_tx_queue)) {
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-
- /* serialise with callbacks */
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
-
- /*
- * OK, so this conn may not be completely disengaged from its
- * scheduler yet, but it _has_ committed to terminate...
- */
- conn->ksnc_scheduler->kss_nconns--;
-
- if (peer->ksnp_error) {
- /* peer's last conn closed in error */
- LASSERT(list_empty(&peer->ksnp_conns));
- failed = 1;
- peer->ksnp_error = 0; /* avoid multiple notifications */
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (failed)
- ksocknal_peer_failed(peer);
-
- /*
- * The socket is closed on the final put; either here, or in
- * ksocknal_{send,recv}msg(). Since we set up the linger2 option
- * when the connection was established, this will close the socket
- * immediately, aborting anything buffered in it. Any hung
- * zero-copy transmits will therefore complete in finite time.
- */
- ksocknal_connsock_decref(conn);
-}
-
-void
-ksocknal_queue_zombie_conn(struct ksock_conn *conn)
-{
- /* Queue the conn for the reaper to destroy */
-
- LASSERT(!atomic_read(&conn->ksnc_conn_refcount));
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-}
-
-void
-ksocknal_destroy_conn(struct ksock_conn *conn)
-{
- unsigned long last_rcv;
-
- /* Final coup-de-grace of the reaper */
- CDEBUG(D_NET, "connection %p\n", conn);
-
- LASSERT(!atomic_read(&conn->ksnc_conn_refcount));
- LASSERT(!atomic_read(&conn->ksnc_sock_refcount));
- LASSERT(!conn->ksnc_sock);
- LASSERT(!conn->ksnc_route);
- LASSERT(!conn->ksnc_tx_scheduled);
- LASSERT(!conn->ksnc_rx_scheduled);
- LASSERT(list_empty(&conn->ksnc_tx_queue));
-
- /* complete current receive if any */
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_LNET_PAYLOAD:
- last_rcv = conn->ksnc_rx_deadline -
- *ksocknal_tunables.ksnd_timeout * HZ;
- CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %zd, left: %d, last alive is %ld secs ago\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
- &conn->ksnc_ipaddr, conn->ksnc_port,
- iov_iter_count(&conn->ksnc_rx_to), conn->ksnc_rx_nob_left,
- (jiffies - last_rcv) / HZ);
- lnet_finalize(conn->ksnc_peer->ksnp_ni,
- conn->ksnc_cookie, -EIO);
- break;
- case SOCKNAL_RX_LNET_HEADER:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port,
- conn->ksnc_proto->pro_version);
- break;
- case SOCKNAL_RX_KSM_HEADER:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of ksock message from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port,
- conn->ksnc_proto->pro_version);
- break;
- case SOCKNAL_RX_SLOP:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port);
- break;
- default:
- LBUG();
- break;
- }
-
- ksocknal_peer_decref(conn->ksnc_peer);
-
- kfree(conn);
-}
-
-int
-ksocknal_close_peer_conns_locked(struct ksock_peer *peer, __u32 ipaddr, int why)
-{
- struct ksock_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- if (!ipaddr || conn->ksnc_ipaddr == ipaddr) {
- count++;
- ksocknal_close_conn_locked(conn, why);
- }
- }
-
- return count;
-}
-
-int
-ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- __u32 ipaddr = conn->ksnc_ipaddr;
- int count;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- count = ksocknal_close_peer_conns_locked(peer, ipaddr, why);
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return count;
-}
-
-int
-ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct list_head *pnxt;
- int lo;
- int hi;
- int i;
- int count = 0;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (id.nid != LNET_NID_ANY) {
- lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- } else {
- lo = 0;
- hi = ksocknal_data.ksnd_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt,
- &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
- (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
- continue;
-
- count += ksocknal_close_peer_conns_locked(peer, ipaddr,
- 0);
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- /* wildcards always succeed */
- if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || !ipaddr)
- return 0;
-
- if (!count)
- return -ENOENT;
- else
- return 0;
-}
-
-void
-ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive)
-{
- /*
- * The router is telling me she's been notified of a change in
- * gateway state....
- */
- struct lnet_process_id id = {0};
-
- id.nid = gw_nid;
- id.pid = LNET_PID_ANY;
-
- CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
- alive ? "up" : "down");
-
- if (!alive) {
- /* If the gateway crashed, close all open connections... */
- ksocknal_close_matching_conns(id, 0);
- return;
- }
-
- /*
- * ...otherwise do nothing. We can only establish new connections
-	 * if we have autoroutes, and these connect on demand.
- */
-}
-
-void
-ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when)
-{
- int connect = 1;
- unsigned long last_alive = 0;
- unsigned long now = jiffies;
- struct ksock_peer *peer = NULL;
- rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
- struct lnet_process_id id = {
- .nid = nid,
- .pid = LNET_PID_LUSTRE,
- };
-
- read_lock(glock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) {
- struct ksock_conn *conn;
- int bufnob;
-
- list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
-
- if (bufnob < conn->ksnc_tx_bufnob) {
- /* something got ACKed */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- peer->ksnp_last_alive = now;
- conn->ksnc_tx_bufnob = bufnob;
- }
- }
-
- last_alive = peer->ksnp_last_alive;
- if (!ksocknal_find_connectable_route_locked(peer))
- connect = 0;
- }
-
- read_unlock(glock);
-
- if (last_alive)
- *when = last_alive;
-
- CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
- libcfs_nid2str(nid), peer,
- last_alive ? (now - last_alive) / HZ : -1,
- connect);
-
- if (!connect)
- return;
-
- ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
-
- write_lock_bh(glock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer)
- ksocknal_launch_all_connections_locked(peer);
-
- write_unlock_bh(glock);
-}
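The deadline refresh in ksocknal_query() above relies on a passive ACK check: if the socket's queued send bytes have shrunk since the last sample, the peer must have acknowledged data, so the peer is marked alive and the transmit deadline is pushed out without sending anything. A minimal sketch of that bookkeeping, with illustrative names and no locking, looks like this:

/*
 * Sketch only: mirrors the "something got ACKed" deadline refresh in
 * ksocknal_query(); field and function names are illustrative.
 */
struct tx_state {
	int last_bufnob;	/* send-queue bytes at the last sample */
	unsigned long deadline;	/* jiffies-style transmit deadline */
	unsigned long last_alive;
};

/* Called periodically with the socket's current queued-byte count. */
static void sample_tx(struct tx_state *s, int bufnob,
		      unsigned long now, unsigned long timeout)
{
	if (bufnob < s->last_bufnob) {
		/* the queue drained, so the peer ACKed something */
		s->deadline = now + timeout;
		s->last_alive = now;
		s->last_bufnob = bufnob;
	}
}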
-
-static void
-ksocknal_push_peer(struct ksock_peer *peer)
-{
- int index;
- int i;
- struct list_head *tmp;
- struct ksock_conn *conn;
-
- for (index = 0; ; index++) {
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- i = 0;
- conn = NULL;
-
- list_for_each(tmp, &peer->ksnp_conns) {
- if (i++ == index) {
- conn = list_entry(tmp, struct ksock_conn,
- ksnc_list);
- ksocknal_conn_addref(conn);
- break;
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (!conn)
- break;
-
- ksocknal_lib_push_conn(conn);
- ksocknal_conn_decref(conn);
- }
-}
-
-static int ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct list_head *start;
- struct list_head *end;
- struct list_head *tmp;
- int rc = -ENOENT;
- unsigned int hsize = ksocknal_data.ksnd_peer_hash_size;
-
- if (id.nid == LNET_NID_ANY) {
- start = &ksocknal_data.ksnd_peers[0];
- end = &ksocknal_data.ksnd_peers[hsize - 1];
- } else {
- start = ksocknal_nid2peerlist(id.nid);
- end = ksocknal_nid2peerlist(id.nid);
- }
-
- for (tmp = start; tmp <= end; tmp++) {
- int peer_off; /* searching offset in peer hash table */
-
- for (peer_off = 0; ; peer_off++) {
- struct ksock_peer *peer;
- int i = 0;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- list_for_each_entry(peer, tmp, ksnp_list) {
- if (!((id.nid == LNET_NID_ANY ||
- id.nid == peer->ksnp_id.nid) &&
- (id.pid == LNET_PID_ANY ||
- id.pid == peer->ksnp_id.pid)))
- continue;
-
- if (i++ == peer_off) {
- ksocknal_peer_addref(peer);
- break;
- }
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (!i) /* no match */
- break;
-
- rc = 0;
- ksocknal_push_peer(peer);
- ksocknal_peer_decref(peer);
- }
- }
- return rc;
-}
-
-static int
-ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask)
-{
- struct ksock_net *net = ni->ni_data;
- struct ksock_interface *iface;
- int rc;
- int i;
- int j;
- struct list_head *ptmp;
- struct ksock_peer *peer;
- struct list_head *rtmp;
- struct ksock_route *route;
-
- if (!ipaddress || !netmask)
- return -EINVAL;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- iface = ksocknal_ip2iface(ni, ipaddress);
- if (iface) {
- /* silently ignore dups */
- rc = 0;
- } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
- rc = -ENOSPC;
- } else {
- iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
-
- iface->ksni_ipaddr = ipaddress;
- iface->ksni_netmask = netmask;
- iface->ksni_nroutes = 0;
- iface->ksni_npeers = 0;
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer,
- ksnp_list);
-
- for (j = 0; j < peer->ksnp_n_passive_ips; j++)
- if (peer->ksnp_passive_ips[j] == ipaddress)
- iface->ksni_npeers++;
-
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- if (route->ksnr_myipaddr == ipaddress)
- iface->ksni_nroutes++;
- }
- }
- }
-
- rc = 0;
- /*
- * NB only new connections will pay attention to the
- * new interface!
- */
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-static void
-ksocknal_peer_del_interface_locked(struct ksock_peer *peer, __u32 ipaddr)
-{
- struct list_head *tmp;
- struct list_head *nxt;
- struct ksock_route *route;
- struct ksock_conn *conn;
- int i;
- int j;
-
- for (i = 0; i < peer->ksnp_n_passive_ips; i++)
- if (peer->ksnp_passive_ips[i] == ipaddr) {
- for (j = i + 1; j < peer->ksnp_n_passive_ips; j++)
- peer->ksnp_passive_ips[j - 1] =
- peer->ksnp_passive_ips[j];
- peer->ksnp_n_passive_ips--;
- break;
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_myipaddr != ipaddr)
- continue;
-
- if (route->ksnr_share_count) {
- /* Manually created; keep, but unbind */
- route->ksnr_myipaddr = 0;
- } else {
- ksocknal_del_route_locked(route);
- }
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_myipaddr == ipaddr)
- ksocknal_close_conn_locked(conn, 0);
- }
-}
-
-static int
-ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress)
-{
- struct ksock_net *net = ni->ni_data;
- int rc = -ENOENT;
- struct list_head *tmp;
- struct list_head *nxt;
- struct ksock_peer *peer;
- __u32 this_ip;
- int i;
- int j;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
-
- if (!(!ipaddress || ipaddress == this_ip))
- continue;
-
- rc = 0;
-
- for (j = i + 1; j < net->ksnn_ninterfaces; j++)
- net->ksnn_interfaces[j - 1] =
- net->ksnn_interfaces[j];
-
- net->ksnn_ninterfaces--;
-
- for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
- list_for_each_safe(tmp, nxt,
- &ksocknal_data.ksnd_peers[j]) {
- peer = list_entry(tmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- ksocknal_peer_del_interface_locked(peer, this_ip);
- }
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-int
-ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
-{
- struct lnet_process_id id = {0};
- struct libcfs_ioctl_data *data = arg;
- int rc;
-
- switch (cmd) {
- case IOC_LIBCFS_GET_INTERFACE: {
- struct ksock_net *net = ni->ni_data;
- struct ksock_interface *iface;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
- rc = -ENOENT;
- } else {
- rc = 0;
- iface = &net->ksnn_interfaces[data->ioc_count];
-
- data->ioc_u32[0] = iface->ksni_ipaddr;
- data->ioc_u32[1] = iface->ksni_netmask;
- data->ioc_u32[2] = iface->ksni_npeers;
- data->ioc_u32[3] = iface->ksni_nroutes;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return rc;
- }
-
- case IOC_LIBCFS_ADD_INTERFACE:
- return ksocknal_add_interface(ni,
- data->ioc_u32[0], /* IP address */
- data->ioc_u32[1]); /* net mask */
-
- case IOC_LIBCFS_DEL_INTERFACE:
- return ksocknal_del_interface(ni,
- data->ioc_u32[0]); /* IP address */
-
- case IOC_LIBCFS_GET_PEER: {
- __u32 myip = 0;
- __u32 ip = 0;
- int port = 0;
- int conn_count = 0;
- int share_count = 0;
-
- rc = ksocknal_get_peer_info(ni, data->ioc_count,
- &id, &myip, &ip, &port,
- &conn_count, &share_count);
- if (rc)
- return rc;
-
- data->ioc_nid = id.nid;
- data->ioc_count = share_count;
- data->ioc_u32[0] = ip;
- data->ioc_u32[1] = port;
- data->ioc_u32[2] = myip;
- data->ioc_u32[3] = conn_count;
- data->ioc_u32[4] = id.pid;
- return 0;
- }
-
- case IOC_LIBCFS_ADD_PEER:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_LUSTRE;
- return ksocknal_add_peer(ni, id,
- data->ioc_u32[0], /* IP */
- data->ioc_u32[1]); /* port */
-
- case IOC_LIBCFS_DEL_PEER:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_del_peer(ni, id,
- data->ioc_u32[0]); /* IP */
-
- case IOC_LIBCFS_GET_CONN: {
- int txmem;
- int rxmem;
- int nagle;
- struct ksock_conn *conn;
-
- conn = ksocknal_get_conn_by_idx(ni, data->ioc_count);
- if (!conn)
- return -ENOENT;
-
- ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
-
- data->ioc_count = txmem;
- data->ioc_nid = conn->ksnc_peer->ksnp_id.nid;
- data->ioc_flags = nagle;
- data->ioc_u32[0] = conn->ksnc_ipaddr;
- data->ioc_u32[1] = conn->ksnc_port;
- data->ioc_u32[2] = conn->ksnc_myipaddr;
- data->ioc_u32[3] = conn->ksnc_type;
- data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
- data->ioc_u32[5] = rxmem;
- data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
- ksocknal_conn_decref(conn);
- return 0;
- }
-
- case IOC_LIBCFS_CLOSE_CONNECTION:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_close_matching_conns(id,
- data->ioc_u32[0]);
-
- case IOC_LIBCFS_REGISTER_MYNID:
- /* Ignore if this is a noop */
- if (data->ioc_nid == ni->ni_nid)
- return 0;
-
- CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
- libcfs_nid2str(data->ioc_nid),
- libcfs_nid2str(ni->ni_nid));
- return -EINVAL;
-
- case IOC_LIBCFS_PUSH_CONNECTION:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_push(ni, id);
-
- default:
- return -EINVAL;
- }
- /* not reached */
-}
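IOC_LIBCFS_GET_PEER and IOC_LIBCFS_GET_CONN share one enumeration convention: the caller supplies a monotonically increasing index in ioc_count and stops at the first -ENOENT, which is exactly what the index-countdown loops in ksocknal_get_peer_info() and ksocknal_get_conn_by_idx() serve. A hypothetical user-space sketch of that calling pattern follows; get_peer_by_index() stands in for the real libcfs ioctl plumbing and is assumed here, not provided by the driver.

#include <errno.h>
#include <stdio.h>

struct peer_info {	/* illustrative subset of the ioctl reply */
	unsigned int ip;
	int port;
	int conn_count;
};

/* Hypothetical wrapper around the ioctl; returns -ENOENT past the end. */
int get_peer_by_index(int index, struct peer_info *info);

void dump_all_peers(void)
{
	struct peer_info info;

	for (int index = 0; get_peer_by_index(index, &info) != -ENOENT; index++)
		printf("peer %u.%u.%u.%u:%d conns=%d\n",
		       (info.ip >> 24) & 0xff, (info.ip >> 16) & 0xff,
		       (info.ip >> 8) & 0xff, info.ip & 0xff,
		       info.port, info.conn_count);
}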
-
-static void
-ksocknal_free_buffers(void)
-{
- LASSERT(!atomic_read(&ksocknal_data.ksnd_nactive_txs));
-
- if (ksocknal_data.ksnd_sched_info) {
- struct ksock_sched_info *info;
- int i;
-
- cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info)
- kfree(info->ksi_scheds);
- cfs_percpt_free(ksocknal_data.ksnd_sched_info);
- }
-
- kvfree(ksocknal_data.ksnd_peers);
-
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
- struct list_head zlist;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
-
- list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
- list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_list) {
- list_del(&tx->tx_list);
- kfree(tx);
- }
- } else {
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- }
-}
-
-static void
-ksocknal_base_shutdown(void)
-{
- struct ksock_sched_info *info;
- struct ksock_sched *sched;
- int i;
- int j;
-
- LASSERT(!ksocknal_data.ksnd_nnets);
-
- switch (ksocknal_data.ksnd_init) {
- default:
- LASSERT(0);
- /* fall through */
- case SOCKNAL_INIT_ALL:
- case SOCKNAL_INIT_DATA:
- LASSERT(ksocknal_data.ksnd_peers);
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
- LASSERT(list_empty(&ksocknal_data.ksnd_peers[i]));
-
- LASSERT(list_empty(&ksocknal_data.ksnd_nets));
- LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns));
- LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns));
- LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs));
- LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes));
-
- if (ksocknal_data.ksnd_sched_info) {
- cfs_percpt_for_each(info, i,
- ksocknal_data.ksnd_sched_info) {
- if (!info->ksi_scheds)
- continue;
-
- for (j = 0; j < info->ksi_nthreads_max; j++) {
- sched = &info->ksi_scheds[j];
- LASSERT(list_empty(
- &sched->kss_tx_conns));
- LASSERT(list_empty(
- &sched->kss_rx_conns));
- LASSERT(list_empty(
- &sched->kss_zombie_noop_txs));
- LASSERT(!sched->kss_nconns);
- }
- }
- }
-
- /* flag threads to terminate; wake and wait for them to die */
- ksocknal_data.ksnd_shuttingdown = 1;
- wake_up_all(&ksocknal_data.ksnd_connd_waitq);
- wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
-
- if (ksocknal_data.ksnd_sched_info) {
- cfs_percpt_for_each(info, i,
- ksocknal_data.ksnd_sched_info) {
- if (!info->ksi_scheds)
- continue;
-
- for (j = 0; j < info->ksi_nthreads_max; j++) {
- sched = &info->ksi_scheds[j];
- wake_up_all(&sched->kss_waitq);
- }
- }
- }
-
- i = 4;
- read_lock(&ksocknal_data.ksnd_global_lock);
- while (ksocknal_data.ksnd_nthreads) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "waiting for %d threads to terminate\n",
- ksocknal_data.ksnd_nthreads);
- read_unlock(&ksocknal_data.ksnd_global_lock);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- read_lock(&ksocknal_data.ksnd_global_lock);
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_free_buffers();
-
- ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
- break;
- }
-
- module_put(THIS_MODULE);
-}
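The wait loop above (and the similar one in ksocknal_shutdown() below) rate-limits its "waiting for ... threads" message with the test ((i & (-i)) == i), which holds exactly when i is a power of two, so the warning backs off exponentially instead of firing on every pass. A small stand-alone illustration, assuming the counter starts just past 4 as in the loop above:

#include <stdio.h>

static int is_pow2(int i)
{
	return (i & (-i)) == i;	/* lowest set bit equals the value itself */
}

int main(void)
{
	for (int i = 5; i <= 64; i++)
		if (is_pow2(i))
			printf("iteration %d would warn\n", i);
	return 0;	/* prints 8, 16, 32, 64 */
}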
-
-static __u64
-ksocknal_new_incarnation(void)
-{
-	/* The incarnation number is the time this module was loaded, and it
- * identifies this particular instance of the socknal.
- */
- return ktime_get_ns();
-}
-
-static int
-ksocknal_base_startup(void)
-{
- struct ksock_sched_info *info;
- int rc;
- int i;
-
- LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
- LASSERT(!ksocknal_data.ksnd_nnets);
-
- memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */
-
- ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
- ksocknal_data.ksnd_peers = kvmalloc_array(ksocknal_data.ksnd_peer_hash_size,
- sizeof(struct list_head),
- GFP_KERNEL);
- if (!ksocknal_data.ksnd_peers)
- return -ENOMEM;
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
- INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
-
- rwlock_init(&ksocknal_data.ksnd_global_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
-
- spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns);
- init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_lock_init(&ksocknal_data.ksnd_connd_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes);
- init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
-
- spin_lock_init(&ksocknal_data.ksnd_tx_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs);
-
-	/* NB memset above zeros the whole of ksocknal_data */
-
- /* flag lists/ptrs/locks initialised */
- ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
- try_module_get(THIS_MODULE);
-
- ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(*info));
- if (!ksocknal_data.ksnd_sched_info)
- goto failed;
-
- cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
- struct ksock_sched *sched;
- int nthrs;
-
- nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
- if (*ksocknal_tunables.ksnd_nscheds > 0) {
- nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
- } else {
- /*
-			 * cap at half of the CPUs; assume the other half is
-			 * reserved for upper-layer modules
- */
- nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
- }
-
- info->ksi_nthreads_max = nthrs;
- info->ksi_cpt = i;
-
- info->ksi_scheds = kzalloc_cpt(info->ksi_nthreads_max * sizeof(*sched),
- GFP_NOFS, i);
- if (!info->ksi_scheds)
- goto failed;
-
- for (; nthrs > 0; nthrs--) {
- sched = &info->ksi_scheds[nthrs - 1];
-
- sched->kss_info = info;
- spin_lock_init(&sched->kss_lock);
- INIT_LIST_HEAD(&sched->kss_rx_conns);
- INIT_LIST_HEAD(&sched->kss_tx_conns);
- INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
- init_waitqueue_head(&sched->kss_waitq);
- }
- }
-
- ksocknal_data.ksnd_connd_starting = 0;
- ksocknal_data.ksnd_connd_failed_stamp = 0;
- ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds();
- /*
- * must have at least 2 connds to remain responsive to accepts while
- * connecting
- */
- if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
- *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
-
- if (*ksocknal_tunables.ksnd_nconnds_max <
- *ksocknal_tunables.ksnd_nconnds) {
- ksocknal_tunables.ksnd_nconnds_max =
- ksocknal_tunables.ksnd_nconnds;
- }
-
- for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
- char name[16];
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- ksocknal_data.ksnd_connd_starting++;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-
- snprintf(name, sizeof(name), "socknal_cd%02d", i);
- rc = ksocknal_thread_start(ksocknal_connd,
- (void *)((uintptr_t)i), name);
- if (rc) {
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- ksocknal_data.ksnd_connd_starting--;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
- CERROR("Can't spawn socknal connd: %d\n", rc);
- goto failed;
- }
- }
-
- rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
- if (rc) {
- CERROR("Can't spawn socknal reaper: %d\n", rc);
- goto failed;
- }
-
- /* flag everything initialised */
- ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
-
- return 0;
-
- failed:
- ksocknal_base_shutdown();
- return -ENETDOWN;
-}
-
-static void
-ksocknal_debug_peerhash(struct lnet_ni *ni)
-{
- struct ksock_peer *peer = NULL;
- struct list_head *tmp;
- int i;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(tmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni == ni)
- break;
-
- peer = NULL;
- }
- }
-
- if (peer) {
- struct ksock_route *route;
- struct ksock_conn *conn;
-
- CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n",
- libcfs_id2str(peer->ksnp_id),
- atomic_read(&peer->ksnp_refcount),
- peer->ksnp_sharecount, peer->ksnp_closing,
- peer->ksnp_accepting, peer->ksnp_error,
- peer->ksnp_zc_next_cookie,
- !list_empty(&peer->ksnp_tx_queue),
- !list_empty(&peer->ksnp_zc_req_list));
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
- CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n",
- atomic_read(&route->ksnr_refcount),
- route->ksnr_scheduled, route->ksnr_connecting,
- route->ksnr_connected, route->ksnr_deleted);
- }
-
- list_for_each(tmp, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
- CWARN("Conn: ref %d, sref %d, t %d, c %d\n",
- atomic_read(&conn->ksnc_conn_refcount),
- atomic_read(&conn->ksnc_sock_refcount),
- conn->ksnc_type, conn->ksnc_closing);
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-void
-ksocknal_shutdown(struct lnet_ni *ni)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- struct lnet_process_id anyid = {0};
-
- anyid.nid = LNET_NID_ANY;
- anyid.pid = LNET_PID_ANY;
-
- LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
- LASSERT(ksocknal_data.ksnd_nnets > 0);
-
- spin_lock_bh(&net->ksnn_lock);
- net->ksnn_shutdown = 1; /* prevent new peers */
- spin_unlock_bh(&net->ksnn_lock);
-
- /* Delete all peers */
- ksocknal_del_peer(ni, anyid, 0);
-
- /* Wait for all peer state to clean up */
- i = 2;
- spin_lock_bh(&net->ksnn_lock);
- while (net->ksnn_npeers) {
- spin_unlock_bh(&net->ksnn_lock);
-
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "waiting for %d peers to disconnect\n",
- net->ksnn_npeers);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
-
- ksocknal_debug_peerhash(ni);
-
- spin_lock_bh(&net->ksnn_lock);
- }
- spin_unlock_bh(&net->ksnn_lock);
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- LASSERT(!net->ksnn_interfaces[i].ksni_npeers);
- LASSERT(!net->ksnn_interfaces[i].ksni_nroutes);
- }
-
- list_del(&net->ksnn_list);
- kfree(net);
-
- ksocknal_data.ksnd_nnets--;
- if (!ksocknal_data.ksnd_nnets)
- ksocknal_base_shutdown();
-}
-
-static int
-ksocknal_enumerate_interfaces(struct ksock_net *net)
-{
- char **names;
- int i;
- int j;
- int rc;
- int n;
-
- n = lnet_ipif_enumerate(&names);
- if (n <= 0) {
- CERROR("Can't enumerate interfaces: %d\n", n);
- return n;
- }
-
- for (i = j = 0; i < n; i++) {
- int up;
- __u32 ip;
- __u32 mask;
-
- if (!strcmp(names[i], "lo")) /* skip the loopback IF */
- continue;
-
- rc = lnet_ipif_query(names[i], &up, &ip, &mask);
- if (rc) {
- CWARN("Can't get interface %s info: %d\n",
- names[i], rc);
- continue;
- }
-
- if (!up) {
- CWARN("Ignoring interface %s (down)\n",
- names[i]);
- continue;
- }
-
- if (j == LNET_MAX_INTERFACES) {
- CWARN("Ignoring interface %s (too many interfaces)\n",
- names[i]);
- continue;
- }
-
- net->ksnn_interfaces[j].ksni_ipaddr = ip;
- net->ksnn_interfaces[j].ksni_netmask = mask;
- strlcpy(net->ksnn_interfaces[j].ksni_name,
- names[i], sizeof(net->ksnn_interfaces[j].ksni_name));
- j++;
- }
-
- lnet_ipif_free_enumeration(names, n);
-
- if (!j)
- CERROR("Can't find any usable interfaces\n");
-
- return j;
-}
-
-static int
-ksocknal_search_new_ipif(struct ksock_net *net)
-{
- int new_ipif = 0;
- int i;
-
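-	/* count how many of this net's interfaces use a base device name
-	 * (alias ":suffix" stripped) that no other configured net is using
-	 */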
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- char *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
- char *colon = strchr(ifnam, ':');
- int found = 0;
- struct ksock_net *tmp;
- int j;
-
- if (colon) /* ignore alias device */
- *colon = 0;
-
- list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) {
- for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
- char *ifnam2 =
- &tmp->ksnn_interfaces[j].ksni_name[0];
- char *colon2 = strchr(ifnam2, ':');
-
- if (colon2)
- *colon2 = 0;
-
- found = !strcmp(ifnam, ifnam2);
- if (colon2)
- *colon2 = ':';
- }
- if (found)
- break;
- }
-
- new_ipif += !found;
- if (colon)
- *colon = ':';
- }
-
- return new_ipif;
-}
-
-static int
-ksocknal_start_schedulers(struct ksock_sched_info *info)
-{
- int nthrs;
- int rc = 0;
- int i;
-
- if (!info->ksi_nthreads) {
- if (*ksocknal_tunables.ksnd_nscheds > 0) {
- nthrs = info->ksi_nthreads_max;
- } else {
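-			/* default: roughly half the CPUs in this CPT,
-			 * at least SOCKNAL_NSCHEDS (but no more than the
-			 * CPT weight) and at most SOCKNAL_NSCHEDS_HIGH
-			 */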
- nthrs = cfs_cpt_weight(lnet_cpt_table(),
- info->ksi_cpt);
- nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
- nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
- }
- nthrs = min(nthrs, info->ksi_nthreads_max);
- } else {
- LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
-		/* add up to two more threads if there is a new interface */
- nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
- }
-
- for (i = 0; i < nthrs; i++) {
- long id;
- char name[20];
- struct ksock_sched *sched;
-
- id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
- sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
- snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
- info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
-
- rc = ksocknal_thread_start(ksocknal_scheduler,
- (void *)id, name);
- if (!rc)
- continue;
-
- CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
- info->ksi_cpt, info->ksi_nthreads + i, rc);
- break;
- }
-
- info->ksi_nthreads += i;
- return rc;
-}
-
-static int
-ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts)
-{
- int newif = ksocknal_search_new_ipif(net);
- int rc;
- int i;
-
- LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
-
- for (i = 0; i < ncpts; i++) {
- struct ksock_sched_info *info;
- int cpt = !cpts ? i : cpts[i];
-
- LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
- info = ksocknal_data.ksnd_sched_info[cpt];
-
- if (!newif && info->ksi_nthreads > 0)
- continue;
-
- rc = ksocknal_start_schedulers(info);
- if (rc)
- return rc;
- }
- return 0;
-}
-
-int
-ksocknal_startup(struct lnet_ni *ni)
-{
- struct ksock_net *net;
- int rc;
- int i;
-
- LASSERT(ni->ni_lnd == &the_ksocklnd);
-
- if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
- rc = ksocknal_base_startup();
- if (rc)
- return rc;
- }
-
- net = kzalloc(sizeof(*net), GFP_NOFS);
- if (!net)
- goto fail_0;
-
- spin_lock_init(&net->ksnn_lock);
- net->ksnn_incarnation = ksocknal_new_incarnation();
- ni->ni_data = net;
- ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout;
- ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
- ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits;
- ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
-
- if (!ni->ni_interfaces[0]) {
- rc = ksocknal_enumerate_interfaces(net);
- if (rc <= 0)
- goto fail_1;
-
- net->ksnn_ninterfaces = 1;
- } else {
- for (i = 0; i < LNET_MAX_INTERFACES; i++) {
- int up;
-
- if (!ni->ni_interfaces[i])
- break;
-
- rc = lnet_ipif_query(ni->ni_interfaces[i], &up,
- &net->ksnn_interfaces[i].ksni_ipaddr,
- &net->ksnn_interfaces[i].ksni_netmask);
-
- if (rc) {
- CERROR("Can't get interface %s info: %d\n",
- ni->ni_interfaces[i], rc);
- goto fail_1;
- }
-
- if (!up) {
- CERROR("Interface %s is down\n",
- ni->ni_interfaces[i]);
- goto fail_1;
- }
-
- strlcpy(net->ksnn_interfaces[i].ksni_name,
- ni->ni_interfaces[i],
- sizeof(net->ksnn_interfaces[i].ksni_name));
- }
- net->ksnn_ninterfaces = i;
- }
-
-	/* call it before adding net to ksocknal_data.ksnd_nets */
- rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
- if (rc)
- goto fail_1;
-
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
- net->ksnn_interfaces[0].ksni_ipaddr);
- list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
-
- ksocknal_data.ksnd_nnets++;
-
- return 0;
-
- fail_1:
- kfree(net);
- fail_0:
- if (!ksocknal_data.ksnd_nnets)
- ksocknal_base_shutdown();
-
- return -ENETDOWN;
-}
-
-static void __exit ksocklnd_exit(void)
-{
- lnet_unregister_lnd(&the_ksocklnd);
-}
-
-static int __init ksocklnd_init(void)
-{
- int rc;
-
-	/* check that the ksnr_connected/connecting fields are large enough */
- BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4);
- BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN);
-
- /* initialize the_ksocklnd */
- the_ksocklnd.lnd_type = SOCKLND;
- the_ksocklnd.lnd_startup = ksocknal_startup;
- the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
- the_ksocklnd.lnd_ctl = ksocknal_ctl;
- the_ksocklnd.lnd_send = ksocknal_send;
- the_ksocklnd.lnd_recv = ksocknal_recv;
- the_ksocklnd.lnd_notify = ksocknal_notify;
- the_ksocklnd.lnd_query = ksocknal_query;
- the_ksocklnd.lnd_accept = ksocknal_accept;
-
- rc = ksocknal_tunables_init();
- if (rc)
- return rc;
-
- rc = libcfs_setup();
- if (rc)
- return rc;
-
- lnet_register_lnd(&the_ksocklnd);
-
- return 0;
-}
-
-MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("TCP Socket LNet Network Driver");
-MODULE_VERSION("2.7.0");
-MODULE_LICENSE("GPL");
-
-module_init(ksocklnd_init);
-module_exit(ksocklnd_exit);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
deleted file mode 100644
index 4e5c89a..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
+++ /dev/null
@@ -1,704 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Lustre, http://www.lustre.org
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _SOCKLND_SOCKLND_H_
-#define _SOCKLND_SOCKLND_H_
-
-#define DEBUG_PORTAL_ALLOC
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <linux/crc32.h>
-#include <linux/errno.h>
-#include <linux/if.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/sysctl.h>
-#include <linux/uio.h>
-#include <linux/unistd.h>
-#include <asm/irq.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-
-#include <linux/lnet/lib-lnet.h>
-#include <linux/lnet/socklnd.h>
-
-/* assume one thread for each connection type */
-#define SOCKNAL_NSCHEDS 3
-#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1)
-
-#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
-#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */
-#define SOCKNAL_INSANITY_RECONN 5000 /* connd is retrying reconnection endlessly */
-#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */
-
-#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */
-#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */
-
-#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */
-
-/*
- * risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
- * no risk if we're not running on a CONFIG_HIGHMEM platform.
- */
-#ifdef CONFIG_HIGHMEM
-# define SOCKNAL_RISK_KMAP_DEADLOCK 0
-#else
-# define SOCKNAL_RISK_KMAP_DEADLOCK 1
-#endif
-
-struct ksock_sched_info;
-
-struct ksock_sched { /* per scheduler state */
- spinlock_t kss_lock; /* serialise */
- struct list_head kss_rx_conns; /* conn waiting to be read */
- struct list_head kss_tx_conns; /* conn waiting to be written */
- struct list_head kss_zombie_noop_txs; /* zombie noop tx list */
- wait_queue_head_t kss_waitq; /* where scheduler sleeps */
- int kss_nconns; /* # connections assigned to
- * this scheduler
- */
- struct ksock_sched_info *kss_info; /* owner of it */
-};
-
-struct ksock_sched_info {
- int ksi_nthreads_max; /* max allowed threads */
- int ksi_nthreads; /* number of threads */
- int ksi_cpt; /* CPT id */
- struct ksock_sched *ksi_scheds; /* array of schedulers */
-};
-
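-/* a scheduler thread id packs the owning CPT into the high bits and the
- * per-CPT scheduler index into the low bits
- */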
-#define KSOCK_CPT_SHIFT 16
-#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid))
-#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT)
-#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
-
-struct ksock_interface { /* in-use interface */
- __u32 ksni_ipaddr; /* interface's IP address */
- __u32 ksni_netmask; /* interface's network mask */
- int ksni_nroutes; /* # routes using (active) */
- int ksni_npeers; /* # peers using (passive) */
- char ksni_name[IFNAMSIZ]; /* interface name */
-};
-
-struct ksock_tunables {
- int *ksnd_timeout; /* "stuck" socket timeout
- * (seconds)
- */
- int *ksnd_nscheds; /* # scheduler threads in each
- * pool while starting
- */
- int *ksnd_nconnds; /* # connection daemons */
- int *ksnd_nconnds_max; /* max # connection daemons */
- int *ksnd_min_reconnectms; /* first connection retry after
- * (ms)...
- */
- int *ksnd_max_reconnectms; /* ...exponentially increasing to
- * this
- */
- int *ksnd_eager_ack; /* make TCP ack eagerly? */
- int *ksnd_typed_conns; /* drive sockets by type? */
- int *ksnd_min_bulk; /* smallest "large" message */
- int *ksnd_tx_buffer_size; /* socket tx buffer size */
- int *ksnd_rx_buffer_size; /* socket rx buffer size */
- int *ksnd_nagle; /* enable NAGLE? */
- int *ksnd_round_robin; /* round robin for multiple
- * interfaces
- */
- int *ksnd_keepalive; /* # secs for sending keepalive
- * NOOP
- */
- int *ksnd_keepalive_idle; /* # idle secs before 1st probe
- */
- int *ksnd_keepalive_count; /* # probes */
- int *ksnd_keepalive_intvl; /* time between probes */
- int *ksnd_credits; /* # concurrent sends */
- int *ksnd_peertxcredits; /* # concurrent sends to 1 peer
- */
- int *ksnd_peerrtrcredits; /* # per-peer router buffer
- * credits
- */
- int *ksnd_peertimeout; /* seconds to consider peer dead
- */
- int *ksnd_enable_csum; /* enable check sum */
- int *ksnd_inject_csum_error; /* set non-zero to inject
- * checksum error
- */
- int *ksnd_nonblk_zcack; /* always send zc-ack on
- * non-blocking connection
- */
- unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload
- * size
- */
- int *ksnd_zc_recv; /* enable ZC receive (for
- * Chelsio TOE)
- */
- int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to
- * enable ZC receive
- */
-};
-
-struct ksock_net {
- __u64 ksnn_incarnation; /* my epoch */
- spinlock_t ksnn_lock; /* serialise */
- struct list_head ksnn_list; /* chain on global list */
- int ksnn_npeers; /* # peers */
- int ksnn_shutdown; /* shutting down? */
- int ksnn_ninterfaces; /* IP interfaces */
- struct ksock_interface ksnn_interfaces[LNET_MAX_INTERFACES];
-};
-
-/** connd timeout */
-#define SOCKNAL_CONND_TIMEOUT 120
-/** reserved thread for accepting & creating new connd */
-#define SOCKNAL_CONND_RESV 1
-
-struct ksock_nal_data {
- int ksnd_init; /* initialisation state
- */
- int ksnd_nnets; /* # networks set up */
- struct list_head ksnd_nets; /* list of nets */
- rwlock_t ksnd_global_lock; /* stabilize peer/conn
- * ops
- */
- struct list_head *ksnd_peers; /* hash table of all my
- * known peers
- */
- int ksnd_peer_hash_size; /* size of ksnd_peers */
-
- int ksnd_nthreads; /* # live threads */
- int ksnd_shuttingdown; /* tell threads to exit
- */
- struct ksock_sched_info **ksnd_sched_info; /* schedulers info */
-
- atomic_t ksnd_nactive_txs; /* #active txs */
-
- struct list_head ksnd_deathrow_conns; /* conns to close:
- * reaper_lock
- */
- struct list_head ksnd_zombie_conns; /* conns to free:
- * reaper_lock
- */
- struct list_head ksnd_enomem_conns; /* conns to retry:
- * reaper_lock
- */
- wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */
- unsigned long ksnd_reaper_waketime; /* when reaper will wake
- */
- spinlock_t ksnd_reaper_lock; /* serialise */
-
- int ksnd_enomem_tx; /* test ENOMEM sender */
- int ksnd_stall_tx; /* test sluggish sender
- */
- int ksnd_stall_rx; /* test sluggish
- * receiver
- */
- struct list_head ksnd_connd_connreqs; /* incoming connection
- * requests
- */
- struct list_head ksnd_connd_routes; /* routes waiting to be
- * connected
- */
- wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */
- int ksnd_connd_connecting; /* # connds connecting
- */
- time64_t ksnd_connd_failed_stamp;/* time stamp of the
- * last failed
- * connecting attempt
- */
- time64_t ksnd_connd_starting_stamp;/* time stamp of the
- * last starting connd
- */
- unsigned int ksnd_connd_starting; /* # starting connd */
- unsigned int ksnd_connd_running; /* # running connd */
- spinlock_t ksnd_connd_lock; /* serialise */
-
- struct list_head ksnd_idle_noop_txs; /* list head for freed
- * noop tx
- */
- spinlock_t ksnd_tx_lock; /* serialise, g_lock
- * unsafe
- */
-};
-
-#define SOCKNAL_INIT_NOTHING 0
-#define SOCKNAL_INIT_DATA 1
-#define SOCKNAL_INIT_ALL 2
-
-/*
- * A packet just assembled for transmission is represented by 1 or more
- * struct iovec fragments (the first frag contains the portals header),
- * followed by 0 or more struct bio_vec fragments.
- *
- * On the receive side, initially 1 struct iovec fragment is posted for
- * receive (the header). Once the header has been received, the payload is
- * received into either struct iovec or struct bio_vec fragments, depending on
- * what the header matched or whether the message needs forwarding.
- */
-struct ksock_conn; /* forward ref */
-struct ksock_peer; /* forward ref */
-struct ksock_route; /* forward ref */
-struct ksock_proto; /* forward ref */
-
-struct ksock_tx { /* transmit packet */
- struct list_head tx_list; /* queue on conn for transmission etc
- */
- struct list_head tx_zc_list; /* queue on peer for ZC request */
- atomic_t tx_refcount; /* tx reference count */
- int tx_nob; /* # packet bytes */
- int tx_resid; /* residual bytes */
- int tx_niov; /* # packet iovec frags */
- struct kvec *tx_iov; /* packet iovec frags */
- int tx_nkiov; /* # packet page frags */
- unsigned short tx_zc_aborted; /* aborted ZC request */
- unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
- unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
- unsigned short tx_nonblk:1; /* it's a non-blocking ACK */
- struct bio_vec *tx_kiov; /* packet page frags */
- struct ksock_conn *tx_conn; /* owning conn */
- struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize()
- */
- unsigned long tx_deadline; /* when (in jiffies) tx times out */
- struct ksock_msg tx_msg; /* socklnd message buffer */
- int tx_desc_size; /* size of this descriptor */
- union {
- struct {
- struct kvec iov; /* virt hdr */
- struct bio_vec kiov[0]; /* paged payload */
- } paged;
- struct {
- struct kvec iov[1]; /* virt hdr + payload */
- } virt;
- } tx_frags;
-};
-
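-/* a NOOP tx carries no paged payload, so its descriptor ends right before
- * the variable-length kiov[] array in tx_frags.paged
- */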
-#define KSOCK_NOOP_TX_SIZE (offsetof(struct ksock_tx, tx_frags.paged.kiov[0]))
-
-/* network zero copy callback descriptor embedded in struct ksock_tx */
-
-#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */
-#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */
-#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */
-#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */
-#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */
-#define SOCKNAL_RX_SLOP 6 /* skipping body */
-
-struct ksock_conn {
- struct ksock_peer *ksnc_peer; /* owning peer */
- struct ksock_route *ksnc_route; /* owning route */
- struct list_head ksnc_list; /* stash on peer's conn list */
- struct socket *ksnc_sock; /* actual socket */
- void *ksnc_saved_data_ready; /* socket's original
- * data_ready() callback
- */
- void *ksnc_saved_write_space; /* socket's original
- * write_space() callback
- */
- atomic_t ksnc_conn_refcount;/* conn refcount */
- atomic_t ksnc_sock_refcount;/* sock refcount */
- struct ksock_sched *ksnc_scheduler; /* who schedules this connection
- */
- __u32 ksnc_myipaddr; /* my IP */
- __u32 ksnc_ipaddr; /* peer's IP */
- int ksnc_port; /* peer's port */
- signed int ksnc_type:3; /* type of connection, should be
- * signed value
- */
- unsigned int ksnc_closing:1; /* being shut down */
- unsigned int ksnc_flip:1; /* flip or not, only for V2.x */
-	unsigned int ksnc_zc_capable:1; /* conn can do zero-copy sends */
- struct ksock_proto *ksnc_proto; /* protocol for the connection */
-
- /* reader */
- struct list_head ksnc_rx_list; /* where I enq waiting input or a
- * forwarding descriptor
- */
- unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times
- * out
- */
- __u8 ksnc_rx_started; /* started receiving a message */
- __u8 ksnc_rx_ready; /* data ready to read */
- __u8 ksnc_rx_scheduled; /* being progressed */
- __u8 ksnc_rx_state; /* what is being read */
- int ksnc_rx_nob_left; /* # bytes to next hdr/body */
- struct iov_iter ksnc_rx_to; /* copy destination */
- struct kvec ksnc_rx_iov_space[LNET_MAX_IOV]; /* space for frag descriptors */
- __u32 ksnc_rx_csum; /* partial checksum for incoming
- * data
- */
- void *ksnc_cookie; /* rx lnet_finalize passthru arg
- */
- struct ksock_msg ksnc_msg; /* incoming message buffer:
- * V2.x message takes the
- * whole struct
- * V1.x message is a bare
- * struct lnet_hdr, it's stored in
- * ksnc_msg.ksm_u.lnetmsg
- */
- /* WRITER */
- struct list_head ksnc_tx_list; /* where I enq waiting for output
- * space
- */
- struct list_head ksnc_tx_queue; /* packets waiting to be sent */
- struct ksock_tx *ksnc_tx_carrier; /* next TX that can carry a LNet
- * message or ZC-ACK
- */
- unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out
- */
- int ksnc_tx_bufnob; /* send buffer marker */
- atomic_t ksnc_tx_nob; /* # bytes queued */
- int ksnc_tx_ready; /* write space */
- int ksnc_tx_scheduled; /* being progressed */
- unsigned long ksnc_tx_last_post; /* time stamp of the last posted
- * TX
- */
-};
-
-struct ksock_route {
- struct list_head ksnr_list; /* chain on peer route list */
- struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */
- struct ksock_peer *ksnr_peer; /* owning peer */
- atomic_t ksnr_refcount; /* # users */
- unsigned long ksnr_timeout; /* when (in jiffies) reconnection
- * can happen next
- */
- long ksnr_retry_interval; /* how long between retries */
- __u32 ksnr_myipaddr; /* my IP */
- __u32 ksnr_ipaddr; /* IP address to connect to */
- int ksnr_port; /* port to connect to */
- unsigned int ksnr_scheduled:1; /* scheduled for attention */
- unsigned int ksnr_connecting:1; /* connection establishment in
- * progress
- */
- unsigned int ksnr_connected:4; /* connections established by
- * type
- */
- unsigned int ksnr_deleted:1; /* been removed from peer? */
- unsigned int ksnr_share_count; /* created explicitly? */
- int ksnr_conn_count; /* # conns established by this
- * route
- */
-};
-
-#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */
-
-struct ksock_peer {
- struct list_head ksnp_list; /* stash on global peer list */
- unsigned long ksnp_last_alive; /* when (in jiffies) I was last
- * alive
- */
- struct lnet_process_id ksnp_id; /* who's on the other end(s) */
- atomic_t ksnp_refcount; /* # users */
- int ksnp_sharecount; /* lconf usage counter */
- int ksnp_closing; /* being closed */
- int ksnp_accepting; /* # passive connections pending
- */
- int ksnp_error; /* errno on closing last conn */
- __u64 ksnp_zc_next_cookie; /* ZC completion cookie */
- __u64 ksnp_incarnation; /* latest known peer incarnation
- */
- struct ksock_proto *ksnp_proto; /* latest known peer protocol */
- struct list_head ksnp_conns; /* all active connections */
- struct list_head ksnp_routes; /* routes */
- struct list_head ksnp_tx_queue; /* waiting packets */
- spinlock_t ksnp_lock; /* serialize, g_lock unsafe */
- struct list_head ksnp_zc_req_list; /* zero copy requests wait for
- * ACK
- */
- unsigned long ksnp_send_keepalive; /* time to send keepalive */
- struct lnet_ni *ksnp_ni; /* which network */
-	int ksnp_n_passive_ips; /* # entries in ksnp_passive_ips */
-
- /* preferred local interfaces */
- __u32 ksnp_passive_ips[LNET_MAX_INTERFACES];
-};
-
-struct ksock_connreq {
- struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */
- struct lnet_ni *ksncr_ni; /* chosen NI */
- struct socket *ksncr_sock; /* accepted socket */
-};
-
-extern struct ksock_nal_data ksocknal_data;
-extern struct ksock_tunables ksocknal_tunables;
-
-#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */
-#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */
-#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not
- * preferred
- */
-
-struct ksock_proto {
- /* version number of protocol */
- int pro_version;
-
- /* handshake function */
- int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *);
-
- /* handshake function */
- int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);
-
- /* message pack */
- void (*pro_pack)(struct ksock_tx *);
-
- /* message unpack */
- void (*pro_unpack)(struct ksock_msg *);
-
- /* queue tx on the connection */
- struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *);
-
- /* queue ZC ack on the connection */
- int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64);
-
- /* handle ZC request */
- int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int);
-
- /* handle ZC ACK */
- int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64);
-
- /*
- * msg type matches the connection type:
- * return value:
- * return MATCH_NO : no
- * return MATCH_YES : matching type
- * return MATCH_MAY : can be backup
- */
- int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int);
-};
-
-extern struct ksock_proto ksocknal_protocol_v1x;
-extern struct ksock_proto ksocknal_protocol_v2x;
-extern struct ksock_proto ksocknal_protocol_v3x;
-
-#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR
-#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR
-#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR
-
-#ifndef CPU_MASK_NONE
-#define CPU_MASK_NONE 0UL
-#endif
-
-static inline int
-ksocknal_route_mask(void)
-{
- if (!*ksocknal_tunables.ksnd_typed_conns)
- return (1 << SOCKLND_CONN_ANY);
-
- return ((1 << SOCKLND_CONN_CONTROL) |
- (1 << SOCKLND_CONN_BULK_IN) |
- (1 << SOCKLND_CONN_BULK_OUT));
-}
-
-static inline struct list_head *
-ksocknal_nid2peerlist(lnet_nid_t nid)
-{
- unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
-
- return &ksocknal_data.ksnd_peers[hash];
-}
-
-static inline void
-ksocknal_conn_addref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
- atomic_inc(&conn->ksnc_conn_refcount);
-}
-
-void ksocknal_queue_zombie_conn(struct ksock_conn *conn);
-void ksocknal_finalize_zcreq(struct ksock_conn *conn);
-
-static inline void
-ksocknal_conn_decref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
- if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
- ksocknal_queue_zombie_conn(conn);
-}
-
-static inline int
-ksocknal_connsock_addref(struct ksock_conn *conn)
-{
- int rc = -ESHUTDOWN;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- if (!conn->ksnc_closing) {
- LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
- atomic_inc(&conn->ksnc_sock_refcount);
- rc = 0;
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-static inline void
-ksocknal_connsock_decref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
- if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
- LASSERT(conn->ksnc_closing);
- sock_release(conn->ksnc_sock);
- conn->ksnc_sock = NULL;
- ksocknal_finalize_zcreq(conn);
- }
-}
-
-static inline void
-ksocknal_tx_addref(struct ksock_tx *tx)
-{
- LASSERT(atomic_read(&tx->tx_refcount) > 0);
- atomic_inc(&tx->tx_refcount);
-}
-
-void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx);
-void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx);
-
-static inline void
-ksocknal_tx_decref(struct ksock_tx *tx)
-{
- LASSERT(atomic_read(&tx->tx_refcount) > 0);
- if (atomic_dec_and_test(&tx->tx_refcount))
- ksocknal_tx_done(NULL, tx);
-}
-
-static inline void
-ksocknal_route_addref(struct ksock_route *route)
-{
- LASSERT(atomic_read(&route->ksnr_refcount) > 0);
- atomic_inc(&route->ksnr_refcount);
-}
-
-void ksocknal_destroy_route(struct ksock_route *route);
-
-static inline void
-ksocknal_route_decref(struct ksock_route *route)
-{
- LASSERT(atomic_read(&route->ksnr_refcount) > 0);
- if (atomic_dec_and_test(&route->ksnr_refcount))
- ksocknal_destroy_route(route);
-}
-
-static inline void
-ksocknal_peer_addref(struct ksock_peer *peer)
-{
- LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
- atomic_inc(&peer->ksnp_refcount);
-}
-
-void ksocknal_destroy_peer(struct ksock_peer *peer);
-
-static inline void
-ksocknal_peer_decref(struct ksock_peer *peer)
-{
- LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
- if (atomic_dec_and_test(&peer->ksnp_refcount))
- ksocknal_destroy_peer(peer);
-}
-
-int ksocknal_startup(struct lnet_ni *ni);
-void ksocknal_shutdown(struct lnet_ni *ni);
-int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg);
-int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
-int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen);
-int ksocknal_accept(struct lnet_ni *ni, struct socket *sock);
-
-int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip,
- int port);
-struct ksock_peer *ksocknal_find_peer_locked(struct lnet_ni *ni,
- struct lnet_process_id id);
-struct ksock_peer *ksocknal_find_peer(struct lnet_ni *ni,
- struct lnet_process_id id);
-void ksocknal_peer_failed(struct ksock_peer *peer);
-int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route,
- struct socket *sock, int type);
-void ksocknal_close_conn_locked(struct ksock_conn *conn, int why);
-void ksocknal_terminate_conn(struct ksock_conn *conn);
-void ksocknal_destroy_conn(struct ksock_conn *conn);
-int ksocknal_close_peer_conns_locked(struct ksock_peer *peer,
- __u32 ipaddr, int why);
-int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why);
-int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr);
-struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer,
- struct ksock_tx *tx, int nonblk);
-
-int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
- struct lnet_process_id id);
-struct ksock_tx *ksocknal_alloc_tx(int type, int size);
-void ksocknal_free_tx(struct ksock_tx *tx);
-struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
-void ksocknal_next_tx_carrier(struct ksock_conn *conn);
-void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn);
-void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error);
-void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive);
-void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when);
-int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
-void ksocknal_thread_fini(void);
-void ksocknal_launch_all_connections_locked(struct ksock_peer *peer);
-struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer *peer);
-struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer *peer);
-int ksocknal_new_packet(struct ksock_conn *conn, int skip);
-int ksocknal_scheduler(void *arg);
-int ksocknal_connd(void *arg);
-int ksocknal_reaper(void *arg);
-int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- lnet_nid_t peer_nid, struct ksock_hello_msg *hello);
-int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- struct ksock_hello_msg *hello,
- struct lnet_process_id *id,
- __u64 *incarnation);
-void ksocknal_read_callback(struct ksock_conn *conn);
-void ksocknal_write_callback(struct ksock_conn *conn);
-
-int ksocknal_lib_zc_capable(struct ksock_conn *conn);
-void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_push_conn(struct ksock_conn *conn);
-int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn);
-int ksocknal_lib_setup_sock(struct socket *so);
-int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx);
-int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx);
-void ksocknal_lib_eager_ack(struct ksock_conn *conn);
-int ksocknal_lib_recv(struct ksock_conn *conn);
-int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem,
- int *rxmem, int *nagle);
-
-int ksocknal_tunables_init(void);
-
-void ksocknal_lib_csum_tx(struct ksock_tx *tx);
-
-int ksocknal_lib_memory_pressure(struct ksock_conn *conn);
-int ksocknal_lib_bind_thread_to_cpu(int id);
-
-#endif /* _SOCKLND_SOCKLND_H_ */
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
deleted file mode 100644
index 01b31a6..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ /dev/null
@@ -1,2586 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/sched/mm.h>
-#include "socklnd.h"
-
-struct ksock_tx *
-ksocknal_alloc_tx(int type, int size)
-{
- struct ksock_tx *tx = NULL;
-
- if (type == KSOCK_MSG_NOOP) {
- LASSERT(size == KSOCK_NOOP_TX_SIZE);
-
- /* searching for a noop tx in free list */
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
- tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
- struct ksock_tx, tx_list);
- LASSERT(tx->tx_desc_size == size);
- list_del(&tx->tx_list);
- }
-
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- }
-
- if (!tx)
- tx = kzalloc(size, GFP_NOFS);
-
- if (!tx)
- return NULL;
-
- atomic_set(&tx->tx_refcount, 1);
- tx->tx_zc_aborted = 0;
- tx->tx_zc_capable = 0;
- tx->tx_zc_checked = 0;
- tx->tx_desc_size = size;
-
- atomic_inc(&ksocknal_data.ksnd_nactive_txs);
-
- return tx;
-}
-
-struct ksock_tx *
-ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
-{
- struct ksock_tx *tx;
-
- tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
- if (!tx) {
- CERROR("Can't allocate noop tx desc\n");
- return NULL;
- }
-
- tx->tx_conn = NULL;
- tx->tx_lnetmsg = NULL;
- tx->tx_kiov = NULL;
- tx->tx_nkiov = 0;
- tx->tx_iov = tx->tx_frags.virt.iov;
- tx->tx_niov = 1;
- tx->tx_nonblk = nonblk;
-
- tx->tx_msg.ksm_csum = 0;
- tx->tx_msg.ksm_type = KSOCK_MSG_NOOP;
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
-
- return tx;
-}
-
-void
-ksocknal_free_tx(struct ksock_tx *tx)
-{
- atomic_dec(&ksocknal_data.ksnd_nactive_txs);
-
- if (!tx->tx_lnetmsg && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
- /* it's a noop tx */
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
-
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- } else {
- kfree(tx);
- }
-}
-
-static int
-ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct kvec *iov = tx->tx_iov;
- int nob;
- int rc;
-
- LASSERT(tx->tx_niov > 0);
-
- /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
- rc = ksocknal_lib_send_iov(conn, tx);
-
- if (rc <= 0) /* sent nothing? */
- return rc;
-
- nob = rc;
- LASSERT(nob <= tx->tx_resid);
- tx->tx_resid -= nob;
-
- /* "consume" iov */
- do {
- LASSERT(tx->tx_niov > 0);
-
- if (nob < (int)iov->iov_len) {
- iov->iov_base = (void *)((char *)iov->iov_base + nob);
- iov->iov_len -= nob;
- return rc;
- }
-
- nob -= iov->iov_len;
- tx->tx_iov = ++iov;
- tx->tx_niov--;
- } while (nob);
-
- return rc;
-}
-
-static int
-ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct bio_vec *kiov = tx->tx_kiov;
- int nob;
- int rc;
-
- LASSERT(!tx->tx_niov);
- LASSERT(tx->tx_nkiov > 0);
-
- /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
- rc = ksocknal_lib_send_kiov(conn, tx);
-
- if (rc <= 0) /* sent nothing? */
- return rc;
-
- nob = rc;
- LASSERT(nob <= tx->tx_resid);
- tx->tx_resid -= nob;
-
- /* "consume" kiov */
- do {
- LASSERT(tx->tx_nkiov > 0);
-
- if (nob < (int)kiov->bv_len) {
- kiov->bv_offset += nob;
- kiov->bv_len -= nob;
- return rc;
- }
-
- nob -= (int)kiov->bv_len;
- tx->tx_kiov = ++kiov;
- tx->tx_nkiov--;
- } while (nob);
-
- return rc;
-}
-
-static int
-ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- int rc;
- int bufnob;
-
- if (ksocknal_data.ksnd_stall_tx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ksocknal_data.ksnd_stall_tx * HZ);
- }
-
- LASSERT(tx->tx_resid);
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- return -ESHUTDOWN;
- }
-
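-	/* keep writing until the whole tx is sent or the socket stops
-	 * accepting data; mapped (iov) fragments go out before paged
-	 * (kiov) fragments
-	 */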
- do {
- if (ksocknal_data.ksnd_enomem_tx > 0) {
- /* testing... */
- ksocknal_data.ksnd_enomem_tx--;
- rc = -EAGAIN;
- } else if (tx->tx_niov) {
- rc = ksocknal_send_iov(conn, tx);
- } else {
- rc = ksocknal_send_kiov(conn, tx);
- }
-
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
- if (rc > 0) /* sent something? */
- conn->ksnc_tx_bufnob += rc; /* account it */
-
- if (bufnob < conn->ksnc_tx_bufnob) {
- /*
- * allocated send buffer bytes < computed; infer
- * something got ACKed
- */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_tx_bufnob = bufnob;
- mb();
- }
-
- if (rc <= 0) { /* Didn't write anything? */
-
- if (!rc) /* some stacks return 0 instead of -EAGAIN */
- rc = -EAGAIN;
-
- /* Check if EAGAIN is due to memory pressure */
- if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
- rc = -ENOMEM;
-
- break;
- }
-
- /* socket's wmem_queued now includes 'rc' bytes */
- atomic_sub(rc, &conn->ksnc_tx_nob);
- rc = 0;
-
- } while (tx->tx_resid);
-
- ksocknal_connsock_decref(conn);
- return rc;
-}
-
-static int
-ksocknal_recv_iter(struct ksock_conn *conn)
-{
- int nob;
- int rc;
-
- /*
- * Never touch conn->ksnc_rx_to or change connection
- * status inside ksocknal_lib_recv
- */
- rc = ksocknal_lib_recv(conn);
-
- if (rc <= 0)
- return rc;
-
- /* received something... */
- nob = rc;
-
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_rx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- mb(); /* order with setting rx_started */
- conn->ksnc_rx_started = 1;
-
- conn->ksnc_rx_nob_left -= nob;
-
- iov_iter_advance(&conn->ksnc_rx_to, nob);
- if (iov_iter_count(&conn->ksnc_rx_to))
- return -EAGAIN;
-
- return 1;
-}
-
-static int
-ksocknal_receive(struct ksock_conn *conn)
-{
- /*
- * Return 1 on success, 0 on EOF, < 0 on error.
- * Caller checks ksnc_rx_to to determine
- * progress/completion.
- */
- int rc;
-
- if (ksocknal_data.ksnd_stall_rx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ksocknal_data.ksnd_stall_rx * HZ);
- }
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- return -ESHUTDOWN;
- }
-
- for (;;) {
- rc = ksocknal_recv_iter(conn);
- if (rc <= 0) {
- /* error/EOF or partial receive */
- if (rc == -EAGAIN) {
- rc = 1;
- } else if (!rc && conn->ksnc_rx_started) {
- /* EOF in the middle of a message */
- rc = -EPROTO;
- }
- break;
- }
-
- /* Completed a fragment */
-
- if (!iov_iter_count(&conn->ksnc_rx_to)) {
- rc = 1;
- break;
- }
- }
-
- ksocknal_connsock_decref(conn);
- return rc;
-}
-
-void
-ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx)
-{
- struct lnet_msg *lnetmsg = tx->tx_lnetmsg;
- int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 0 : -EIO;
-
- LASSERT(ni || tx->tx_conn);
-
- if (tx->tx_conn)
- ksocknal_conn_decref(tx->tx_conn);
-
- if (!ni && tx->tx_conn)
- ni = tx->tx_conn->ksnc_peer->ksnp_ni;
-
- ksocknal_free_tx(tx);
- if (lnetmsg) /* KSOCK_MSG_NOOP go without lnetmsg */
- lnet_finalize(ni, lnetmsg, rc);
-}
-
-void
-ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error)
-{
- struct ksock_tx *tx;
-
- while (!list_empty(txlist)) {
- tx = list_entry(txlist->next, struct ksock_tx, tx_list);
-
- if (error && tx->tx_lnetmsg) {
- CNETERR("Deleting packet type %d len %d %s->%s\n",
- le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type),
- le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length),
- libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
- libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
- } else if (error) {
- CNETERR("Deleting noop packet\n");
- }
-
- list_del(&tx->tx_list);
-
- LASSERT(atomic_read(&tx->tx_refcount) == 1);
- ksocknal_tx_done(ni, tx);
- }
-}
-
-static void
-ksocknal_check_zc_req(struct ksock_tx *tx)
-{
- struct ksock_conn *conn = tx->tx_conn;
- struct ksock_peer *peer = conn->ksnc_peer;
-
- /*
- * Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
- * to ksnp_zc_req_list if some fragment of this message should be sent
- * zero-copy. Our peer will send an ACK containing this cookie when
- * she has received this message to tell us we can signal completion.
- * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
- * ksnp_zc_req_list.
- */
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_zc_capable);
-
- tx->tx_zc_checked = 1;
-
- if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
- !conn->ksnc_zc_capable)
- return;
-
- /*
- * assign cookie and queue tx to pending list, it will be released when
- * a matching ack is received. See ksocknal_handle_zcack()
- */
- ksocknal_tx_addref(tx);
-
- spin_lock(&peer->ksnp_lock);
-
- /* ZC_REQ is going to be pinned to the peer */
- tx->tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- LASSERT(!tx->tx_msg.ksm_zc_cookies[0]);
-
- tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
-
- if (!peer->ksnp_zc_next_cookie)
- peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
-
- list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
-
- spin_unlock(&peer->ksnp_lock);
-}
-
-static void
-ksocknal_uncheck_zc_req(struct ksock_tx *tx)
-{
- struct ksock_peer *peer = tx->tx_conn->ksnc_peer;
-
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_zc_capable);
-
- tx->tx_zc_checked = 0;
-
- spin_lock(&peer->ksnp_lock);
-
- if (!tx->tx_msg.ksm_zc_cookies[0]) {
- /* Not waiting for an ACK */
- spin_unlock(&peer->ksnp_lock);
- return;
- }
-
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- list_del(&tx->tx_zc_list);
-
- spin_unlock(&peer->ksnp_lock);
-
- ksocknal_tx_decref(tx);
-}
-
-static int
-ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- int rc;
-
- if (tx->tx_zc_capable && !tx->tx_zc_checked)
- ksocknal_check_zc_req(tx);
-
- rc = ksocknal_transmit(conn, tx);
-
- CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc);
-
- if (!tx->tx_resid) {
- /* Sent everything OK */
- LASSERT(!rc);
-
- return 0;
- }
-
- if (rc == -EAGAIN)
- return rc;
-
- if (rc == -ENOMEM) {
- static int counter;
-
- counter++; /* exponential backoff warnings */
- if ((counter & (-counter)) == counter)
- CWARN("%u ENOMEM tx %p\n", counter, conn);
-
- /* Queue on ksnd_enomem_conns for retry after a timeout */
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- /* enomem list takes over scheduler's ref... */
- LASSERT(conn->ksnc_tx_scheduled);
- list_add_tail(&conn->ksnc_tx_list,
- &ksocknal_data.ksnd_enomem_conns);
- if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY,
- ksocknal_data.ksnd_reaper_waketime))
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
- return rc;
- }
-
- /* Actual error */
- LASSERT(rc < 0);
-
- if (!conn->ksnc_closing) {
- switch (rc) {
- case -ECONNRESET:
- LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n",
- &conn->ksnc_ipaddr);
- break;
- default:
- LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n",
- &conn->ksnc_ipaddr, rc);
- break;
- }
- CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n",
- conn, rc,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- }
-
- if (tx->tx_zc_checked)
- ksocknal_uncheck_zc_req(tx);
-
- /* it's not an error if conn is being closed */
- ksocknal_close_conn_and_siblings(conn, (conn->ksnc_closing) ? 0 : rc);
-
- return rc;
-}
-
-static void
-ksocknal_launch_connection_locked(struct ksock_route *route)
-{
- /* called holding write lock on ksnd_global_lock */
-
- LASSERT(!route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
- LASSERT(ksocknal_route_mask() & ~route->ksnr_connected);
-
- route->ksnr_scheduled = 1; /* scheduling conn for connd */
- ksocknal_route_addref(route); /* extra ref for connd */
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
-
- list_add_tail(&route->ksnr_connd_list,
- &ksocknal_data.ksnd_connd_routes);
- wake_up(&ksocknal_data.ksnd_connd_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-}
-
-void
-ksocknal_launch_all_connections_locked(struct ksock_peer *peer)
-{
- struct ksock_route *route;
-
- /* called holding write lock on ksnd_global_lock */
- for (;;) {
- /* launch any/all connections that need it */
- route = ksocknal_find_connectable_route_locked(peer);
- if (!route)
- return;
-
- ksocknal_launch_connection_locked(route);
- }
-}
-
-struct ksock_conn *
-ksocknal_find_conn_locked(struct ksock_peer *peer, struct ksock_tx *tx,
- int nonblk)
-{
- struct list_head *tmp;
- struct ksock_conn *conn;
- struct ksock_conn *typed = NULL;
- struct ksock_conn *fallback = NULL;
- int tnob = 0;
- int fnob = 0;
-
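-	/* pick the connection with the least data already queued; on a tie,
-	 * round-robin (if enabled) prefers the conn least recently posted to
-	 */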
- list_for_each(tmp, &peer->ksnp_conns) {
- struct ksock_conn *c;
- int nob, rc;
-
- c = list_entry(tmp, struct ksock_conn, ksnc_list);
- nob = atomic_read(&c->ksnc_tx_nob) +
- c->ksnc_sock->sk->sk_wmem_queued;
-
- LASSERT(!c->ksnc_closing);
- LASSERT(c->ksnc_proto &&
- c->ksnc_proto->pro_match_tx);
-
- rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
-
- switch (rc) {
- default:
- LBUG();
- case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
- continue;
-
- case SOCKNAL_MATCH_YES: /* typed connection */
- if (!typed || tnob > nob ||
- (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
- time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
- typed = c;
- tnob = nob;
- }
- break;
-
- case SOCKNAL_MATCH_MAY: /* fallback connection */
- if (!fallback || fnob > nob ||
- (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
- time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
- fallback = c;
- fnob = nob;
- }
- break;
- }
- }
-
- /* prefer the typed selection */
- conn = (typed) ? typed : fallback;
-
- if (conn)
- conn->ksnc_tx_last_post = jiffies;
-
- return conn;
-}
-
-void
-ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- conn->ksnc_proto->pro_pack(tx);
-
- atomic_add(tx->tx_nob, &conn->ksnc_tx_nob);
- ksocknal_conn_addref(conn); /* +1 ref for tx */
- tx->tx_conn = conn;
-}
-
-void
-ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn)
-{
- struct ksock_sched *sched = conn->ksnc_scheduler;
- struct ksock_msg *msg = &tx->tx_msg;
- struct ksock_tx *ztx = NULL;
- int bufnob = 0;
-
- /*
- * called holding global lock (read or irq-write) and caller may
- * not have dropped this lock between finding conn and calling me,
- * so we don't need the {get,put}connsock dance to deref
- * ksnc_sock...
- */
- LASSERT(!conn->ksnc_closing);
-
- CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port);
-
- ksocknal_tx_prep(conn, tx);
-
- /*
- * Ensure the frags we've been given EXACTLY match the number of
- * bytes we want to send. Many TCP/IP stacks disregard any total
- * size parameters passed to them and just look at the frags.
- *
- * We always expect at least 1 mapped fragment containing the
- * complete ksocknal message header.
- */
- LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) +
- lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
- (unsigned int)tx->tx_nob);
- LASSERT(tx->tx_niov >= 1);
- LASSERT(tx->tx_resid == tx->tx_nob);
-
- CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
- tx, (tx->tx_lnetmsg) ? tx->tx_lnetmsg->msg_hdr.type :
- KSOCK_MSG_NOOP,
- tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
-
- /*
- * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
- * but they're used inside spinlocks a lot.
- */
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
- spin_lock_bh(&sched->kss_lock);
-
- if (list_empty(&conn->ksnc_tx_queue) && !bufnob) {
- /* First packet starts the timeout */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_tx_bufnob = 0;
- mb(); /* order with adding to tx_queue */
- }
-
- if (msg->ksm_type == KSOCK_MSG_NOOP) {
- /*
- * The packet is noop ZC ACK, try to piggyback the ack_cookie
- * on a normal packet so I don't need to send it
- */
- LASSERT(msg->ksm_zc_cookies[1]);
- LASSERT(conn->ksnc_proto->pro_queue_tx_zcack);
-
- /* ZC ACK piggybacked on ztx release tx later */
- if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
- ztx = tx;
- } else {
- /*
-		 * It's a normal packet - can it piggyback a noop zc-ack that
- * has been queued already?
- */
- LASSERT(!msg->ksm_zc_cookies[1]);
- LASSERT(conn->ksnc_proto->pro_queue_tx_msg);
-
- ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
- /* ztx will be released later */
- }
-
- if (ztx) {
- atomic_sub(ztx->tx_nob, &conn->ksnc_tx_nob);
- list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
- }
-
- if (conn->ksnc_tx_ready && /* able to send */
- !conn->ksnc_tx_scheduled) { /* not scheduled to send */
- /* +1 ref for scheduler */
- ksocknal_conn_addref(conn);
- list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-}
-
-struct ksock_route *
-ksocknal_find_connectable_route_locked(struct ksock_peer *peer)
-{
- unsigned long now = jiffies;
- struct list_head *tmp;
- struct ksock_route *route;
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- LASSERT(!route->ksnr_connecting || route->ksnr_scheduled);
-
- /* connections being established */
- if (route->ksnr_scheduled)
- continue;
-
- /* all route types connected ? */
- if (!(ksocknal_route_mask() & ~route->ksnr_connected))
- continue;
-
- if (!(!route->ksnr_retry_interval || /* first attempt */
- time_after_eq(now, route->ksnr_timeout))) {
- CDEBUG(D_NET,
- "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n",
- &route->ksnr_ipaddr,
- route->ksnr_connected,
- route->ksnr_retry_interval,
- (route->ksnr_timeout - now) / HZ);
- continue;
- }
-
- return route;
- }
-
- return NULL;
-}
-
-struct ksock_route *
-ksocknal_find_connecting_route_locked(struct ksock_peer *peer)
-{
- struct list_head *tmp;
- struct ksock_route *route;
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- LASSERT(!route->ksnr_connecting || route->ksnr_scheduled);
-
- if (route->ksnr_scheduled)
- return route;
- }
-
- return NULL;
-}
-
-int
-ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
- struct lnet_process_id id)
-{
- struct ksock_peer *peer;
- struct ksock_conn *conn;
- rwlock_t *g_lock;
- int retry;
- int rc;
-
- LASSERT(!tx->tx_conn);
-
- g_lock = &ksocknal_data.ksnd_global_lock;
-
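-	/* at most two passes: if the peer isn't found on the first pass it
-	 * is added here and looked up again on the second
-	 */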
- for (retry = 0;; retry = 1) {
- read_lock(g_lock);
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) {
- if (!ksocknal_find_connectable_route_locked(peer)) {
- conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
- if (conn) {
- /*
- * I've got no routes that need to be
- * connecting and I do have an actual
- * connection...
- */
- ksocknal_queue_tx_locked(tx, conn);
- read_unlock(g_lock);
- return 0;
- }
- }
- }
-
- /* I'll need a write lock... */
- read_unlock(g_lock);
-
- write_lock_bh(g_lock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer)
- break;
-
- write_unlock_bh(g_lock);
-
- if (id.pid & LNET_PID_USERFLAG) {
- CERROR("Refusing to create a connection to userspace process %s\n",
- libcfs_id2str(id));
- return -EHOSTUNREACH;
- }
-
- if (retry) {
- CERROR("Can't find peer %s\n", libcfs_id2str(id));
- return -EHOSTUNREACH;
- }
-
- rc = ksocknal_add_peer(ni, id,
- LNET_NIDADDR(id.nid),
- lnet_acceptor_port());
- if (rc) {
- CERROR("Can't add peer %s: %d\n",
- libcfs_id2str(id), rc);
- return rc;
- }
- }
-
- ksocknal_launch_all_connections_locked(peer);
-
- conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
- if (conn) {
- /* Connection exists; queue message on it */
- ksocknal_queue_tx_locked(tx, conn);
- write_unlock_bh(g_lock);
- return 0;
- }
-
- if (peer->ksnp_accepting > 0 ||
- ksocknal_find_connecting_route_locked(peer)) {
- /* the message is going to be pinned to the peer */
- tx->tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- /* Queue the message until a connection is established */
- list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue);
- write_unlock_bh(g_lock);
- return 0;
- }
-
- write_unlock_bh(g_lock);
-
- /* NB Routes may be ignored if connections to them failed recently */
- CNETERR("No usable routes to %s\n", libcfs_id2str(id));
- return -EHOSTUNREACH;
-}
-
-int
-ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
-{
- unsigned int mpflag = 0;
- int type = lntmsg->msg_type;
- struct lnet_process_id target = lntmsg->msg_target;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct kvec *payload_iov = lntmsg->msg_iov;
- struct bio_vec *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- struct ksock_tx *tx;
- int desc_size;
- int rc;
-
- /*
- * NB 'private' is different depending on what we're sending.
- * Just ignore it...
- */
- CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT(!payload_nob || payload_niov > 0);
- LASSERT(payload_niov <= LNET_MAX_IOV);
- /* payload is either all vaddrs or all pages */
- LASSERT(!(payload_kiov && payload_iov));
- LASSERT(!in_interrupt());
-
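-	/* size the tx descriptor for exactly the payload fragment count;
-	 * an iovec payload needs one extra kvec up front for the socklnd
-	 * header
-	 */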
- if (payload_iov)
- desc_size = offsetof(struct ksock_tx,
- tx_frags.virt.iov[1 + payload_niov]);
- else
- desc_size = offsetof(struct ksock_tx,
- tx_frags.paged.kiov[payload_niov]);
-
- if (lntmsg->msg_vmflush)
- mpflag = memalloc_noreclaim_save();
- tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
- if (!tx) {
- CERROR("Can't allocate tx desc type %d size %d\n",
- type, desc_size);
- if (lntmsg->msg_vmflush)
- memalloc_noreclaim_restore(mpflag);
- return -ENOMEM;
- }
-
- tx->tx_conn = NULL; /* set when assigned a conn */
- tx->tx_lnetmsg = lntmsg;
-
- if (payload_iov) {
- tx->tx_kiov = NULL;
- tx->tx_nkiov = 0;
- tx->tx_iov = tx->tx_frags.virt.iov;
- tx->tx_niov = 1 +
- lnet_extract_iov(payload_niov, &tx->tx_iov[1],
- payload_niov, payload_iov,
- payload_offset, payload_nob);
- } else {
- tx->tx_niov = 1;
- tx->tx_iov = &tx->tx_frags.paged.iov;
- tx->tx_kiov = tx->tx_frags.paged.kiov;
- tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
-
- if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
- tx->tx_zc_capable = 1;
- }
-
- tx->tx_msg.ksm_csum = 0;
- tx->tx_msg.ksm_type = KSOCK_MSG_LNET;
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_msg.ksm_zc_cookies[1] = 0;
-
- /* The first fragment will be set later in pro_pack */
- rc = ksocknal_launch_packet(ni, tx, target);
- if (mpflag)
- memalloc_noreclaim_restore(mpflag);
-
- if (!rc)
- return 0;
-
- ksocknal_free_tx(tx);
- return -EIO;
-}
-
-int
-ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = kthread_run(fn, arg, "%s", name);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_data.ksnd_nthreads++;
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- return 0;
-}
-
-void
-ksocknal_thread_fini(void)
-{
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_data.ksnd_nthreads--;
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-}
-
-int
-ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip)
-{
- static char ksocknal_slop_buffer[4096];
- struct kvec *kvec = conn->ksnc_rx_iov_space;
-
- int nob;
- unsigned int niov;
- int skipped;
-
- LASSERT(conn->ksnc_proto);
-
- if (*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) {
- /* Remind the socket to ack eagerly... */
- ksocknal_lib_eager_ack(conn);
- }
-
- if (!nob_to_skip) { /* right at next packet boundary now */
- conn->ksnc_rx_started = 0;
- mb(); /* racing with timeout thread */
-
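-		/* post the receive of the next header: V2/V3 peers send a
-		 * struct ksock_msg header, V1 peers send a bare struct lnet_hdr
-		 */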
- switch (conn->ksnc_proto->pro_version) {
- case KSOCK_PROTO_V2:
- case KSOCK_PROTO_V3:
- conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
- kvec->iov_base = &conn->ksnc_msg;
- kvec->iov_len = offsetof(struct ksock_msg, ksm_u);
- conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u);
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, offsetof(struct ksock_msg, ksm_u));
- break;
-
- case KSOCK_PROTO_V1:
- /* Receiving bare struct lnet_hdr */
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
- kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
- kvec->iov_len = sizeof(struct lnet_hdr);
- conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr);
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, sizeof(struct lnet_hdr));
- break;
-
- default:
- LBUG();
- }
- conn->ksnc_rx_csum = ~0;
- return 1;
- }
-
- /*
- * Set up to skip as much as possible now. If there's more left
- * (ran out of iov entries) we'll get called again
- */
- conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
- conn->ksnc_rx_nob_left = nob_to_skip;
- skipped = 0;
- niov = 0;
-
- do {
- nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer));
-
- kvec[niov].iov_base = ksocknal_slop_buffer;
- kvec[niov].iov_len = nob;
- niov++;
- skipped += nob;
- nob_to_skip -= nob;
-
- } while (nob_to_skip && /* mustn't overflow conn's rx iov */
- niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct iovec));
-
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, niov, skipped);
- return 0;
-}
-
-static int
-ksocknal_process_receive(struct ksock_conn *conn)
-{
- struct kvec *kvec = conn->ksnc_rx_iov_space;
- struct lnet_hdr *lhdr;
- struct lnet_process_id *id;
- int rc;
-
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
-
- /* NB: sched lock NOT held */
- /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
- LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
- conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
- again:
- if (iov_iter_count(&conn->ksnc_rx_to)) {
- rc = ksocknal_receive(conn);
-
- if (rc <= 0) {
- LASSERT(rc != -EAGAIN);
-
- if (!rc)
- CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n",
- conn,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- else if (!conn->ksnc_closing)
- CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n",
- conn, rc,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
-
- /* it's not an error if conn is being closed */
- ksocknal_close_conn_and_siblings(conn,
- (conn->ksnc_closing) ? 0 : rc);
- return (!rc ? -ESHUTDOWN : rc);
- }
-
- if (iov_iter_count(&conn->ksnc_rx_to)) {
- /* short read */
- return -EAGAIN;
- }
- }
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_KSM_HEADER:
- if (conn->ksnc_flip) {
- __swab32s(&conn->ksnc_msg.ksm_type);
- __swab32s(&conn->ksnc_msg.ksm_csum);
- __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
- __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
- }
-
- if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
- conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
- CERROR("%s: Unknown message type: %x\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_type);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return -EPROTO;
- }
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
- conn->ksnc_msg.ksm_csum && /* has checksum */
- conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
- /* NOOP Checksum error */
- CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return -EIO;
- }
-
- if (conn->ksnc_msg.ksm_zc_cookies[1]) {
- __u64 cookie = 0;
-
- LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
- cookie = conn->ksnc_msg.ksm_zc_cookies[0];
-
- rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
- conn->ksnc_msg.ksm_zc_cookies[1]);
-
- if (rc) {
- CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return rc;
- }
- }
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
- ksocknal_new_packet(conn, 0);
- return 0; /* NOOP is done and just return */
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
- conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg);
-
- kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
- kvec->iov_len = sizeof(struct ksock_lnet_msg);
-
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, sizeof(struct ksock_lnet_msg));
-
- goto again; /* read lnet header now */
-
- case SOCKNAL_RX_LNET_HEADER:
- /* unpack message header */
- conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
-
- if (conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) {
- /* Userspace peer */
- lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
- id = &conn->ksnc_peer->ksnp_id;
-
- /* Substitute process ID assigned at connection time */
- lhdr->src_pid = cpu_to_le32(id->pid);
- lhdr->src_nid = cpu_to_le64(id->nid);
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
- ksocknal_conn_addref(conn); /* ++ref while parsing */
-
- rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
- &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
- conn->ksnc_peer->ksnp_id.nid, conn, 0);
- if (rc < 0) {
- /* I just received garbage: give up on this conn */
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, rc);
- ksocknal_conn_decref(conn);
- return -EPROTO;
- }
-
- /* I'm racing with ksocknal_recv() */
- LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
-
- if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
- return 0;
-
- /* ksocknal_recv() got called */
- goto again;
-
- case SOCKNAL_RX_LNET_PAYLOAD:
- /* payload all received */
- rc = 0;
-
- if (!conn->ksnc_rx_nob_left && /* not truncating */
- conn->ksnc_msg.ksm_csum && /* has checksum */
- conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
- CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
- rc = -EIO;
- }
-
- if (!rc && conn->ksnc_msg.ksm_zc_cookies[0]) {
- LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
-
- lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
- id = &conn->ksnc_peer->ksnp_id;
-
- rc = conn->ksnc_proto->pro_handle_zcreq(conn,
- conn->ksnc_msg.ksm_zc_cookies[0],
- *ksocknal_tunables.ksnd_nonblk_zcack ||
- le64_to_cpu(lhdr->src_nid) != id->nid);
- }
-
- lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
-
- if (rc) {
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, rc);
- return -EPROTO;
- }
- /* Fall through */
-
- case SOCKNAL_RX_SLOP:
- /* starting new packet? */
- if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left))
- return 0; /* come back later */
- goto again; /* try to finish reading slop now */
-
- default:
- break;
- }
-
- /* Not Reached */
- LBUG();
- return -EINVAL; /* keep gcc happy */
-}
-
-int
-ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg,
- int delayed, struct iov_iter *to, unsigned int rlen)
-{
- struct ksock_conn *conn = private;
- struct ksock_sched *sched = conn->ksnc_scheduler;
-
- LASSERT(iov_iter_count(to) <= rlen);
- LASSERT(to->nr_segs <= LNET_MAX_IOV);
-
- conn->ksnc_cookie = msg;
- conn->ksnc_rx_nob_left = rlen;
-
- conn->ksnc_rx_to = *to;
-
- LASSERT(conn->ksnc_rx_scheduled);
-
- spin_lock_bh(&sched->kss_lock);
-
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_PARSE_WAIT:
- list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
- wake_up(&sched->kss_waitq);
- LASSERT(conn->ksnc_rx_ready);
- break;
-
- case SOCKNAL_RX_PARSE:
- /* scheduler hasn't noticed I'm parsing yet */
- break;
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
-
- spin_unlock_bh(&sched->kss_lock);
- ksocknal_conn_decref(conn);
- return 0;
-}
-
-static inline int
-ksocknal_sched_cansleep(struct ksock_sched *sched)
-{
- int rc;
-
- spin_lock_bh(&sched->kss_lock);
-
- rc = !ksocknal_data.ksnd_shuttingdown &&
- list_empty(&sched->kss_rx_conns) &&
- list_empty(&sched->kss_tx_conns);
-
- spin_unlock_bh(&sched->kss_lock);
- return rc;
-}
-
-int ksocknal_scheduler(void *arg)
-{
- struct ksock_sched_info *info;
- struct ksock_sched *sched;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
- int rc;
- int nloops = 0;
- long id = (long)arg;
-
- info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
- sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
-
- rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
- if (rc) {
- CWARN("Can't set CPU partition affinity to %d: %d\n",
- info->ksi_cpt, rc);
- }
-
- spin_lock_bh(&sched->kss_lock);
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- int did_something = 0;
-
- /* Ensure I progress everything semi-fairly */
-
- if (!list_empty(&sched->kss_rx_conns)) {
- conn = list_entry(sched->kss_rx_conns.next,
- struct ksock_conn, ksnc_rx_list);
- list_del(&conn->ksnc_rx_list);
-
- LASSERT(conn->ksnc_rx_scheduled);
- LASSERT(conn->ksnc_rx_ready);
-
- /*
- * clear rx_ready in case receive isn't complete.
- * Do it BEFORE we call process_recv, since
- * data_ready can set it any time after we release
- * kss_lock.
- */
- conn->ksnc_rx_ready = 0;
- spin_unlock_bh(&sched->kss_lock);
-
- rc = ksocknal_process_receive(conn);
-
- spin_lock_bh(&sched->kss_lock);
-
- /* I'm the only one that can clear this flag */
- LASSERT(conn->ksnc_rx_scheduled);
-
- /* Did process_receive get everything it wanted? */
- if (!rc)
- conn->ksnc_rx_ready = 1;
-
- if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
- /*
- * Conn blocked waiting for ksocknal_recv()
- * I change its state (under lock) to signal
- * it can be rescheduled
- */
- conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
- } else if (conn->ksnc_rx_ready) {
- /* reschedule for rx */
- list_add_tail(&conn->ksnc_rx_list,
- &sched->kss_rx_conns);
- } else {
- conn->ksnc_rx_scheduled = 0;
- /* drop my ref */
- ksocknal_conn_decref(conn);
- }
-
- did_something = 1;
- }
-
- if (!list_empty(&sched->kss_tx_conns)) {
- LIST_HEAD(zlist);
-
- if (!list_empty(&sched->kss_zombie_noop_txs)) {
- list_add(&zlist, &sched->kss_zombie_noop_txs);
- list_del_init(&sched->kss_zombie_noop_txs);
- }
-
- conn = list_entry(sched->kss_tx_conns.next,
- struct ksock_conn, ksnc_tx_list);
- list_del(&conn->ksnc_tx_list);
-
- LASSERT(conn->ksnc_tx_scheduled);
- LASSERT(conn->ksnc_tx_ready);
- LASSERT(!list_empty(&conn->ksnc_tx_queue));
-
- tx = list_entry(conn->ksnc_tx_queue.next,
- struct ksock_tx, tx_list);
-
- if (conn->ksnc_tx_carrier == tx)
- ksocknal_next_tx_carrier(conn);
-
- /* dequeue now so empty list => more to send */
- list_del(&tx->tx_list);
-
- /*
- * Clear tx_ready in case send isn't complete. Do
- * it BEFORE we call process_transmit, since
- * write_space can set it any time after we release
- * kss_lock.
- */
- conn->ksnc_tx_ready = 0;
- spin_unlock_bh(&sched->kss_lock);
-
- if (!list_empty(&zlist)) {
- /*
- * free zombie noop txs, it's fast because
- * noop txs are just put in freelist
- */
- ksocknal_txlist_done(NULL, &zlist, 0);
- }
-
- rc = ksocknal_process_transmit(conn, tx);
-
- if (rc == -ENOMEM || rc == -EAGAIN) {
- /*
- * Incomplete send: replace tx on HEAD of
- * tx_queue
- */
- spin_lock_bh(&sched->kss_lock);
- list_add(&tx->tx_list, &conn->ksnc_tx_queue);
- } else {
- /* Complete send; tx -ref */
- ksocknal_tx_decref(tx);
-
- spin_lock_bh(&sched->kss_lock);
- /* assume space for more */
- conn->ksnc_tx_ready = 1;
- }
-
- if (rc == -ENOMEM) {
- /*
- * Do nothing; after a short timeout, this
- * conn will be reposted on kss_tx_conns.
- */
- } else if (conn->ksnc_tx_ready &&
- !list_empty(&conn->ksnc_tx_queue)) {
- /* reschedule for tx */
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- } else {
- conn->ksnc_tx_scheduled = 0;
- /* drop my ref */
- ksocknal_conn_decref(conn);
- }
-
- did_something = 1;
- }
- if (!did_something || /* nothing to do */
- ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
- spin_unlock_bh(&sched->kss_lock);
-
- nloops = 0;
-
- if (!did_something) { /* wait for something to do */
- rc = wait_event_interruptible_exclusive(
- sched->kss_waitq,
- !ksocknal_sched_cansleep(sched));
- LASSERT(!rc);
- } else {
- cond_resched();
- }
-
- spin_lock_bh(&sched->kss_lock);
- }
- }
-
- spin_unlock_bh(&sched->kss_lock);
- ksocknal_thread_fini();
- return 0;
-}
-
-/*
- * Add connection to the scheduler's kss_rx_conns
- * and wake up the scheduler.
- */
-void ksocknal_read_callback(struct ksock_conn *conn)
-{
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- conn->ksnc_rx_ready = 1;
-
- if (!conn->ksnc_rx_scheduled) { /* not being progressed */
- list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
- conn->ksnc_rx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
- spin_unlock_bh(&sched->kss_lock);
-}
-
-/*
- * Add connection to the scheduler's kss_tx_conns
- * and wake up the scheduler.
- */
-void ksocknal_write_callback(struct ksock_conn *conn)
-{
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- conn->ksnc_tx_ready = 1;
-
- if (!conn->ksnc_tx_scheduled && /* not being progressed */
- !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
- list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-}
-
-static struct ksock_proto *
-ksocknal_parse_proto_version(struct ksock_hello_msg *hello)
-{
- __u32 version = 0;
-
- if (hello->kshm_magic == LNET_PROTO_MAGIC)
- version = hello->kshm_version;
- else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
- version = __swab32(hello->kshm_version);
-
- if (version) {
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 1)
- return NULL;
-
- if (*ksocknal_tunables.ksnd_protocol == 2 &&
- version == KSOCK_PROTO_V3)
- return NULL;
-#endif
- if (version == KSOCK_PROTO_V2)
- return &ksocknal_protocol_v2x;
-
- if (version == KSOCK_PROTO_V3)
- return &ksocknal_protocol_v3x;
-
- return NULL;
- }
-
- if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
- struct lnet_magicversion *hmv = (struct lnet_magicversion *)hello;
-
- BUILD_BUG_ON(sizeof(struct lnet_magicversion) !=
- offsetof(struct ksock_hello_msg, kshm_src_nid));
-
- if (hmv->version_major == cpu_to_le16(KSOCK_PROTO_V1_MAJOR) &&
- hmv->version_minor == cpu_to_le16(KSOCK_PROTO_V1_MINOR))
- return &ksocknal_protocol_v1x;
- }
-
- return NULL;
-}
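
The parser above uses the wire magic as an endianness probe as well as a protocol identifier: a byte-swapped magic means the peer is opposite-endian, so the version (and later fields) must be swabbed before use. A minimal user-space sketch of that check follows; DEMO_PROTO_MAGIC and the helper names are hypothetical stand-ins, not the real LNet constants.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PROTO_MAGIC 0x45726963u	/* placeholder magic value */

static uint32_t swab32(uint32_t v)
{
	return ((v & 0x000000ffu) << 24) |
	       ((v & 0x0000ff00u) << 8)  |
	       ((v & 0x00ff0000u) >> 8)  |
	       ((v & 0xff000000u) >> 24);
}

/* A matching magic means a same-endian peer; a byte-swapped magic means
 * every subsequent field arrives byte-swapped too. */
static int parse_version(uint32_t magic, uint32_t version, uint32_t *out)
{
	if (magic == DEMO_PROTO_MAGIC) {
		*out = version;
		return 0;
	}
	if (magic == swab32(DEMO_PROTO_MAGIC)) {
		*out = swab32(version);
		return 0;
	}
	return -1;	/* unknown magic: fall back to legacy handling */
}

int main(void)
{
	uint32_t v;

	if (!parse_version(DEMO_PROTO_MAGIC, 3, &v))
		printf("native peer speaks protocol V%u\n", v);

	if (!parse_version(swab32(DEMO_PROTO_MAGIC), swab32(3), &v))
		printf("swabbed peer speaks protocol V%u\n", v);

	return 0;
}
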
-
-int
-ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- lnet_nid_t peer_nid, struct ksock_hello_msg *hello)
-{
- /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
- struct ksock_net *net = (struct ksock_net *)ni->ni_data;
-
- LASSERT(hello->kshm_nips <= LNET_MAX_INTERFACES);
-
- /* rely on caller to hold a ref on socket so it wouldn't disappear */
- LASSERT(conn->ksnc_proto);
-
- hello->kshm_src_nid = ni->ni_nid;
- hello->kshm_dst_nid = peer_nid;
- hello->kshm_src_pid = the_lnet.ln_pid;
-
- hello->kshm_src_incarnation = net->ksnn_incarnation;
- hello->kshm_ctype = conn->ksnc_type;
-
- return conn->ksnc_proto->pro_send_hello(conn, hello);
-}
-
-static int
-ksocknal_invert_type(int type)
-{
- switch (type) {
- case SOCKLND_CONN_ANY:
- case SOCKLND_CONN_CONTROL:
- return type;
- case SOCKLND_CONN_BULK_IN:
- return SOCKLND_CONN_BULK_OUT;
- case SOCKLND_CONN_BULK_OUT:
- return SOCKLND_CONN_BULK_IN;
- default:
- return SOCKLND_CONN_NONE;
- }
-}
-
-int
-ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- struct ksock_hello_msg *hello,
- struct lnet_process_id *peerid,
- __u64 *incarnation)
-{
- /* Return < 0 fatal error
- * 0 success
- * EALREADY lost connection race
- * EPROTO protocol version mismatch
- */
- struct socket *sock = conn->ksnc_sock;
- int active = !!conn->ksnc_proto;
- int timeout;
- int proto_match;
- int rc;
- struct ksock_proto *proto;
- struct lnet_process_id recv_id;
-
- /* socket type set on active connections - not set on passive */
- LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
-
- timeout = active ? *ksocknal_tunables.ksnd_timeout :
- lnet_acceptor_timeout();
-
- rc = lnet_sock_read(sock, &hello->kshm_magic,
- sizeof(hello->kshm_magic), timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- if (hello->kshm_magic != LNET_PROTO_MAGIC &&
- hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
- hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
- /* Unexpected magic! */
- CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n",
- __cpu_to_le32(hello->kshm_magic),
- LNET_PROTO_TCP_MAGIC,
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- rc = lnet_sock_read(sock, &hello->kshm_version,
- sizeof(hello->kshm_version), timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- proto = ksocknal_parse_proto_version(hello);
- if (!proto) {
- if (!active) {
- /* unknown protocol from peer, tell peer my protocol */
- conn->ksnc_proto = &ksocknal_protocol_v3x;
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 2)
- conn->ksnc_proto = &ksocknal_protocol_v2x;
- else if (*ksocknal_tunables.ksnd_protocol == 1)
- conn->ksnc_proto = &ksocknal_protocol_v1x;
-#endif
- hello->kshm_nips = 0;
- ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
- }
-
- CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
- conn->ksnc_proto->pro_version,
- &conn->ksnc_ipaddr);
-
- return -EPROTO;
- }
-
- proto_match = (conn->ksnc_proto == proto);
- conn->ksnc_proto = proto;
-
- /* receive the rest of hello message anyway */
- rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
- if (rc) {
- CERROR("Error %d reading or checking hello from from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- *incarnation = hello->kshm_src_incarnation;
-
- if (hello->kshm_src_nid == LNET_NID_ANY) {
- CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n",
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- if (!active &&
- conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
- /* Userspace NAL assigns peer process ID from socket */
- recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
- recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
- conn->ksnc_ipaddr);
- } else {
- recv_id.nid = hello->kshm_src_nid;
- recv_id.pid = hello->kshm_src_pid;
- }
-
- if (!active) {
- *peerid = recv_id;
-
- /* peer determines type */
- conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
- if (conn->ksnc_type == SOCKLND_CONN_NONE) {
- CERROR("Unexpected type %d from %s ip %pI4h\n",
- hello->kshm_ctype, libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- return 0;
- }
-
- if (peerid->pid != recv_id.pid ||
- peerid->nid != recv_id.nid) {
- LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n",
- libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr,
- libcfs_id2str(recv_id));
- return -EPROTO;
- }
-
- if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
- /* Possible protocol mismatch or I lost the connection race */
- return proto_match ? EALREADY : EPROTO;
- }
-
- if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
- CERROR("Mismatched types: me %d, %s ip %pI4h %d\n",
- conn->ksnc_type, libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr, hello->kshm_ctype);
- return -EPROTO;
- }
-
- return 0;
-}
-
-static int
-ksocknal_connect(struct ksock_route *route)
-{
- LIST_HEAD(zombies);
- struct ksock_peer *peer = route->ksnr_peer;
- int type;
- int wanted;
- struct socket *sock;
- unsigned long deadline;
- int retry_later = 0;
- int rc = 0;
-
- deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- LASSERT(route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
-
- route->ksnr_connecting = 1;
-
- for (;;) {
- wanted = ksocknal_route_mask() & ~route->ksnr_connected;
-
- /*
- * stop connecting if peer/route got closed under me, or
- * route got connected while queued
- */
- if (peer->ksnp_closing || route->ksnr_deleted ||
- !wanted) {
- retry_later = 0;
- break;
- }
-
- /* reschedule if peer is connecting to me */
- if (peer->ksnp_accepting > 0) {
- CDEBUG(D_NET,
- "peer %s(%d) already connecting to me, retry later.\n",
- libcfs_nid2str(peer->ksnp_id.nid),
- peer->ksnp_accepting);
- retry_later = 1;
- }
-
- if (retry_later) /* needs reschedule */
- break;
-
- if (wanted & BIT(SOCKLND_CONN_ANY)) {
- type = SOCKLND_CONN_ANY;
- } else if (wanted & BIT(SOCKLND_CONN_CONTROL)) {
- type = SOCKLND_CONN_CONTROL;
- } else if (wanted & BIT(SOCKLND_CONN_BULK_IN)) {
- type = SOCKLND_CONN_BULK_IN;
- } else {
- LASSERT(wanted & BIT(SOCKLND_CONN_BULK_OUT));
- type = SOCKLND_CONN_BULK_OUT;
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (time_after_eq(jiffies, deadline)) {
- rc = -ETIMEDOUT;
- lnet_connect_console_error(rc, peer->ksnp_id.nid,
- route->ksnr_ipaddr,
- route->ksnr_port);
- goto failed;
- }
-
- rc = lnet_connect(&sock, peer->ksnp_id.nid,
- route->ksnr_myipaddr,
- route->ksnr_ipaddr, route->ksnr_port);
- if (rc)
- goto failed;
-
- rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
- if (rc < 0) {
- lnet_connect_console_error(rc, peer->ksnp_id.nid,
- route->ksnr_ipaddr,
- route->ksnr_port);
- goto failed;
- }
-
- /*
- * A +ve RC means I have to retry because I lost the connection
- * race or I have to renegotiate protocol version
- */
- retry_later = (rc);
- if (retry_later)
- CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
- libcfs_nid2str(peer->ksnp_id.nid));
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- }
-
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
-
- if (retry_later) {
- /*
- * re-queue for attention; this frees me up to handle
- * the peer's incoming connection request
- */
- if (rc == EALREADY ||
- (!rc && peer->ksnp_accepting > 0)) {
- /*
- * We want to introduce a delay before the next
- * connection attempt if we lost the conn race, but
- * the race is usually resolved quickly, so
- * min_reconnectms should be a good heuristic
- */
- route->ksnr_retry_interval =
- *ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000;
- route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
- }
-
- ksocknal_launch_connection_locked(route);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- return retry_later;
-
- failed:
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
-
- /* This is a retry rather than a new connection */
- route->ksnr_retry_interval *= 2;
- route->ksnr_retry_interval =
- max(route->ksnr_retry_interval,
- (long)*ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000);
- route->ksnr_retry_interval =
- min(route->ksnr_retry_interval,
- (long)*ksocknal_tunables.ksnd_max_reconnectms * HZ / 1000);
-
- LASSERT(route->ksnr_retry_interval);
- route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
-
- if (!list_empty(&peer->ksnp_tx_queue) &&
- !peer->ksnp_accepting &&
- !ksocknal_find_connecting_route_locked(peer)) {
- struct ksock_conn *conn;
-
- /*
- * ksnp_tx_queue is queued on a conn on successful
- * connection for V1.x and V2.x
- */
- if (!list_empty(&peer->ksnp_conns)) {
- conn = list_entry(peer->ksnp_conns.next,
- struct ksock_conn, ksnc_list);
- LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
- }
-
- /*
- * take all the blocked packets while I've got the lock and
- * complete below...
- */
- list_splice_init(&peer->ksnp_tx_queue, &zombies);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_peer_failed(peer);
- ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
- return 0;
-}
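
On the failed: path above the route's retry interval doubles after every unsuccessful attempt and is clamped between the min_reconnectms and max_reconnectms tunables. The standalone sketch below shows the resulting backoff curve; the two bounds are illustrative defaults, not authoritative values.

#include <stdio.h>

#define MIN_RECONNECT_MS 1000	/* illustrative lower bound */
#define MAX_RECONNECT_MS 60000	/* illustrative upper bound */

/* Double the retry interval on every failure, clamped to [min, max]. */
static long next_retry_ms(long cur_ms)
{
	long next = cur_ms * 2;

	if (next < MIN_RECONNECT_MS)
		next = MIN_RECONNECT_MS;
	if (next > MAX_RECONNECT_MS)
		next = MAX_RECONNECT_MS;
	return next;
}

int main(void)
{
	long ms = 0;
	int i;

	for (i = 0; i < 10; i++) {
		ms = next_retry_ms(ms);
		printf("attempt %d: wait %ld ms\n", i + 1, ms);
	}
	return 0;
}
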
-
-/*
- * Check whether we need to create more connds.
- * It will try to create a new thread if necessary; @timeout can be
- * updated if thread creation fails, so the caller won't keep retrying
- * while resources are exhausted.
- */
-static int
-ksocknal_connd_check_start(time64_t sec, long *timeout)
-{
- char name[16];
- int rc;
- int total = ksocknal_data.ksnd_connd_starting +
- ksocknal_data.ksnd_connd_running;
-
- if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
- /* still in initializing */
- return 0;
- }
-
- if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
- total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
- /*
- * can't create more connds, or there are still enough
- * threads to handle the pending connections
- */
- return 0;
- }
-
- if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
- /* no pending connecting request */
- return 0;
- }
-
- if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
- /* may run out of resource, retry later */
- *timeout = HZ;
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_starting > 0) {
- /* serialize starting to avoid flood */
- return 0;
- }
-
- ksocknal_data.ksnd_connd_starting_stamp = sec;
- ksocknal_data.ksnd_connd_starting++;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-
- /* NB: total is the next id */
- snprintf(name, sizeof(name), "socknal_cd%02d", total);
- rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- if (!rc)
- return 1;
-
- /* we tried ... */
- LASSERT(ksocknal_data.ksnd_connd_starting > 0);
- ksocknal_data.ksnd_connd_starting--;
- ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds();
-
- return 1;
-}
-
-/*
- * Check whether the current thread can exit: return 1 if there are too
- * many threads and none was created in the past 120 seconds.
- * This function may also update @timeout so the caller comes back
- * later to recheck these conditions.
- */
-static int
-ksocknal_connd_check_stop(time64_t sec, long *timeout)
-{
- int val;
-
- if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
- /* still in initializing */
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_starting > 0) {
- /* in progress of starting new thread */
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_running <=
- *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
- return 0;
- }
-
- /* created thread in past 120 seconds? */
- val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
- SOCKNAL_CONND_TIMEOUT - sec);
-
- *timeout = (val > 0) ? val * HZ :
- SOCKNAL_CONND_TIMEOUT * HZ;
- if (val > 0)
- return 0;
-
- /* no thread created in the past 120 seconds */
-
- return ksocknal_data.ksnd_connd_running >
- ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
-}
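
Taken together, the two checks above give the connd pool hysteresis: it grows only while there is pending work and nearly every daemon is busy connecting, and it shrinks only after a long quiet period. A simplified user-space sketch of that policy follows; the thresholds are hypothetical stand-ins for the nconnds/nconnds_max module parameters and the SOCKNAL_CONND_* constants.

#include <stdio.h>

#define CONND_MIN	4	/* illustrative pool floor */
#define CONND_MAX	64	/* illustrative pool cap */
#define CONND_RESV	1	/* reserved threads for accepting/starting */
#define CONND_TIMEOUT	120	/* quiet period (seconds) before shrinking */

/* Start another daemon only if there is pending work, nothing is already
 * starting, the cap isn't reached, and (almost) all daemons are busy. */
static int should_start(int running, int starting, int connecting, int pending)
{
	int total = running + starting;

	if (!pending || starting > 0)
		return 0;
	if (total >= CONND_MAX)
		return 0;
	return total <= connecting + CONND_RESV;
}

/* Let a daemon exit only when the pool exceeds its floor, nothing has been
 * started recently, and there are clearly idle daemons left over. */
static int should_stop(int running, int connecting, long secs_since_start)
{
	if (running <= CONND_MIN)
		return 0;
	if (secs_since_start < CONND_TIMEOUT)
		return 0;
	return running > connecting + CONND_RESV;
}

int main(void)
{
	printf("start? %d\n", should_start(4, 0, 4, 1));	/* all busy -> 1 */
	printf("stop?  %d\n", should_stop(8, 2, 300));		/* idle pool -> 1 */
	return 0;
}
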
-
-/*
- * Go through connd_routes queue looking for a route that we can process
- * right now, @timeout_p can be updated if we need to come back later
- */
-static struct ksock_route *
-ksocknal_connd_get_route_locked(signed long *timeout_p)
-{
- struct ksock_route *route;
- unsigned long now;
-
- now = jiffies;
-
- /* connd_routes can contain both pending and ordinary routes */
- list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes,
- ksnr_connd_list) {
- if (!route->ksnr_retry_interval ||
- time_after_eq(now, route->ksnr_timeout))
- return route;
-
- if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
- (int)*timeout_p > (int)(route->ksnr_timeout - now))
- *timeout_p = (int)(route->ksnr_timeout - now);
- }
-
- return NULL;
-}
-
-int
-ksocknal_connd(void *arg)
-{
- spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
- struct ksock_connreq *cr;
- wait_queue_entry_t wait;
- int nloops = 0;
- int cons_retry = 0;
-
- init_waitqueue_entry(&wait, current);
-
- spin_lock_bh(connd_lock);
-
- LASSERT(ksocknal_data.ksnd_connd_starting > 0);
- ksocknal_data.ksnd_connd_starting--;
- ksocknal_data.ksnd_connd_running++;
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- struct ksock_route *route = NULL;
- time64_t sec = ktime_get_real_seconds();
- long timeout = MAX_SCHEDULE_TIMEOUT;
- int dropped_lock = 0;
-
- if (ksocknal_connd_check_stop(sec, &timeout)) {
- /* wake up another one to check stop */
- wake_up(&ksocknal_data.ksnd_connd_waitq);
- break;
- }
-
- if (ksocknal_connd_check_start(sec, &timeout)) {
- /* created new thread */
- dropped_lock = 1;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
- /* Connection accepted by the listener */
- cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
- struct ksock_connreq, ksncr_list);
-
- list_del(&cr->ksncr_list);
- spin_unlock_bh(connd_lock);
- dropped_lock = 1;
-
- ksocknal_create_conn(cr->ksncr_ni, NULL,
- cr->ksncr_sock, SOCKLND_CONN_NONE);
- lnet_ni_decref(cr->ksncr_ni);
- kfree(cr);
-
- spin_lock_bh(connd_lock);
- }
-
- /*
- * Only handle an outgoing connection request if there
- * is a thread left to handle incoming connections and
- * to create new connds
- */
- if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
- ksocknal_data.ksnd_connd_running) {
- route = ksocknal_connd_get_route_locked(&timeout);
- }
- if (route) {
- list_del(&route->ksnr_connd_list);
- ksocknal_data.ksnd_connd_connecting++;
- spin_unlock_bh(connd_lock);
- dropped_lock = 1;
-
- if (ksocknal_connect(route)) {
- /* consecutive retry */
- if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
- CWARN("massive consecutive re-connecting to %pI4h\n",
- &route->ksnr_ipaddr);
- cons_retry = 0;
- }
- } else {
- cons_retry = 0;
- }
-
- ksocknal_route_decref(route);
-
- spin_lock_bh(connd_lock);
- ksocknal_data.ksnd_connd_connecting--;
- }
-
- if (dropped_lock) {
- if (++nloops < SOCKNAL_RESCHED)
- continue;
- spin_unlock_bh(connd_lock);
- nloops = 0;
- cond_resched();
- spin_lock_bh(connd_lock);
- continue;
- }
-
- /* Nothing to do for 'timeout' */
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq,
- &wait);
- spin_unlock_bh(connd_lock);
-
- nloops = 0;
- schedule_timeout(timeout);
-
- remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
- spin_lock_bh(connd_lock);
- }
- ksocknal_data.ksnd_connd_running--;
- spin_unlock_bh(connd_lock);
-
- ksocknal_thread_fini();
- return 0;
-}
-
-static struct ksock_conn *
-ksocknal_find_timed_out_conn(struct ksock_peer *peer)
-{
- /* We're called with a shared lock on ksnd_global_lock */
- struct ksock_conn *conn;
- struct list_head *ctmp;
-
- list_for_each(ctmp, &peer->ksnp_conns) {
- int error;
-
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- /* Don't need the {get,put}connsock dance to deref ksnc_sock */
- LASSERT(!conn->ksnc_closing);
-
- /*
- * SOCK_ERROR will reset the socket's error code on
- * some platforms (like Darwin 8.x)
- */
- error = conn->ksnc_sock->sk->sk_err;
- if (error) {
- ksocknal_conn_addref(conn);
-
- switch (error) {
- case ECONNRESET:
- CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- case ETIMEDOUT:
- CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- default:
- CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d\n",
- error,
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- }
-
- return conn;
- }
-
- if (conn->ksnc_rx_started &&
- time_after_eq(jiffies,
- conn->ksnc_rx_deadline)) {
- /* Timed out incomplete incoming message */
- ksocknal_conn_addref(conn);
- CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %zd left %d\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port,
- conn->ksnc_rx_state,
- iov_iter_count(&conn->ksnc_rx_to),
- conn->ksnc_rx_nob_left);
- return conn;
- }
-
- if ((!list_empty(&conn->ksnc_tx_queue) ||
- conn->ksnc_sock->sk->sk_wmem_queued) &&
- time_after_eq(jiffies,
- conn->ksnc_tx_deadline)) {
- /*
- * Timed out messages queued for sending or
- * buffered in the socket's send buffer
- */
- ksocknal_conn_addref(conn);
- CNETERR("Timeout sending data to %s (%pI4h:%d) the network or that node may be down.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- return conn;
- }
- }
-
- return NULL;
-}
-
-static inline void
-ksocknal_flush_stale_txs(struct ksock_peer *peer)
-{
- struct ksock_tx *tx;
- struct ksock_tx *tmp;
- LIST_HEAD(stale_txs);
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_tx_queue, tx_list) {
- if (!time_after_eq(jiffies,
- tx->tx_deadline))
- break;
-
- list_del(&tx->tx_list);
- list_add_tail(&tx->tx_list, &stale_txs);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
-}
-
-static int
-ksocknal_send_keepalive_locked(struct ksock_peer *peer)
- __must_hold(&ksocknal_data.ksnd_global_lock)
-{
- struct ksock_sched *sched;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
-
- /* last_alive will be updated by create_conn */
- if (list_empty(&peer->ksnp_conns))
- return 0;
-
- if (peer->ksnp_proto != &ksocknal_protocol_v3x)
- return 0;
-
- if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
- time_before(jiffies,
- peer->ksnp_last_alive + *ksocknal_tunables.ksnd_keepalive * HZ))
- return 0;
-
- if (time_before(jiffies, peer->ksnp_send_keepalive))
- return 0;
-
- /*
- * retry 10 secs later, so we don't put pressure
- * on this peer if we fail to send a keepalive this time
- */
- peer->ksnp_send_keepalive = jiffies + 10 * HZ;
-
- conn = ksocknal_find_conn_locked(peer, NULL, 1);
- if (conn) {
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
- if (!list_empty(&conn->ksnc_tx_queue)) {
- spin_unlock_bh(&sched->kss_lock);
- /* there is a queued ACK; no keepalive needed */
- return 0;
- }
-
- spin_unlock_bh(&sched->kss_lock);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- /* cookie = 1 is reserved for keepalive PING */
- tx = ksocknal_alloc_tx_noop(1, 1);
- if (!tx) {
- read_lock(&ksocknal_data.ksnd_global_lock);
- return -ENOMEM;
- }
-
- if (!ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) {
- read_lock(&ksocknal_data.ksnd_global_lock);
- return 1;
- }
-
- ksocknal_free_tx(tx);
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- return -EIO;
-}
-
-static void
-ksocknal_check_peer_timeouts(int idx)
-{
- struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
- struct ksock_peer *peer;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
-
- again:
- /*
- * NB. We expect to have a look at all the peers and not find any
- * connections to time out, so we just use a shared lock while we
- * take a look...
- */
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- list_for_each_entry(peer, peers, ksnp_list) {
- unsigned long deadline = 0;
- struct ksock_tx *tx_stale;
- int resid = 0;
- int n = 0;
-
- if (ksocknal_send_keepalive_locked(peer)) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
- goto again;
- }
-
- conn = ksocknal_find_timed_out_conn(peer);
-
- if (conn) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
-
- /*
- * NB we won't find this one again, but we can't
- * just proceed with the next peer, since we dropped
- * ksnd_global_lock and it might be dead already!
- */
- ksocknal_conn_decref(conn);
- goto again;
- }
-
- /*
- * we can't process stale txs right here because we're
- * holding only shared lock
- */
- if (!list_empty(&peer->ksnp_tx_queue)) {
- tx = list_entry(peer->ksnp_tx_queue.next,
- struct ksock_tx, tx_list);
-
- if (time_after_eq(jiffies,
- tx->tx_deadline)) {
- ksocknal_peer_addref(peer);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_flush_stale_txs(peer);
-
- ksocknal_peer_decref(peer);
- goto again;
- }
- }
-
- if (list_empty(&peer->ksnp_zc_req_list))
- continue;
-
- tx_stale = NULL;
- spin_lock(&peer->ksnp_lock);
- list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
- if (!time_after_eq(jiffies,
- tx->tx_deadline))
- break;
- /* ignore the TX if connection is being closed */
- if (tx->tx_conn->ksnc_closing)
- continue;
- if (!tx_stale)
- tx_stale = tx;
- n++;
- }
-
- if (!tx_stale) {
- spin_unlock(&peer->ksnp_lock);
- continue;
- }
-
- deadline = tx_stale->tx_deadline;
- resid = tx_stale->tx_resid;
- conn = tx_stale->tx_conn;
- ksocknal_conn_addref(conn);
-
- spin_unlock(&peer->ksnp_lock);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n",
- n, libcfs_nid2str(peer->ksnp_id.nid), tx_stale,
- (jiffies - deadline) / HZ,
- resid, conn->ksnc_sock->sk->sk_wmem_queued);
-
- ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
- ksocknal_conn_decref(conn);
- goto again;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-int
-ksocknal_reaper(void *arg)
-{
- wait_queue_entry_t wait;
- struct ksock_conn *conn;
- struct ksock_sched *sched;
- struct list_head enomem_conns;
- int nenomem_conns;
- long timeout;
- int i;
- int peer_index = 0;
- unsigned long deadline = jiffies;
-
- INIT_LIST_HEAD(&enomem_conns);
- init_waitqueue_entry(&wait, current);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) {
- conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next,
- struct ksock_conn, ksnc_list);
- list_del(&conn->ksnc_list);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_terminate_conn(conn);
- ksocknal_conn_decref(conn);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- continue;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) {
- conn = list_entry(ksocknal_data.ksnd_zombie_conns.next,
- struct ksock_conn, ksnc_list);
- list_del(&conn->ksnc_list);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_destroy_conn(conn);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- continue;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) {
- list_add(&enomem_conns,
- &ksocknal_data.ksnd_enomem_conns);
- list_del_init(&ksocknal_data.ksnd_enomem_conns);
- }
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- /* reschedule all the connections that stalled with ENOMEM... */
- nenomem_conns = 0;
- while (!list_empty(&enomem_conns)) {
- conn = list_entry(enomem_conns.next, struct ksock_conn,
- ksnc_tx_list);
- list_del(&conn->ksnc_tx_list);
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- LASSERT(conn->ksnc_tx_scheduled);
- conn->ksnc_tx_ready = 1;
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- wake_up(&sched->kss_waitq);
-
- spin_unlock_bh(&sched->kss_lock);
- nenomem_conns++;
- }
-
- /* careful with the jiffy wrap... */
- while ((timeout = deadline - jiffies) <= 0) {
- const int n = 4;
- const int p = 1;
- int chunk = ksocknal_data.ksnd_peer_hash_size;
-
- /*
- * Time to check for timeouts on a few more peers: I do
- * checks every 'p' seconds on a proportion of the peer
- * table and I need to check every connection 'n' times
- * within a timeout interval, to ensure I detect a
- * timeout on any connection within (n+1)/n times the
- * timeout interval.
- */
- if (*ksocknal_tunables.ksnd_timeout > n * p)
- chunk = (chunk * n * p) /
- *ksocknal_tunables.ksnd_timeout;
- if (!chunk)
- chunk = 1;
-
- for (i = 0; i < chunk; i++) {
- ksocknal_check_peer_timeouts(peer_index);
- peer_index = (peer_index + 1) %
- ksocknal_data.ksnd_peer_hash_size;
- }
-
- deadline = deadline + p * HZ;
- }
-
- if (nenomem_conns) {
- /*
- * Reduce my timeout if I rescheduled ENOMEM conns.
- * This also prevents me getting woken immediately
- * if any go back on my enomem list.
- */
- timeout = SOCKNAL_ENOMEM_RETRY;
- }
- ksocknal_data.ksnd_reaper_waketime = jiffies + timeout;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait);
-
- if (!ksocknal_data.ksnd_shuttingdown &&
- list_empty(&ksocknal_data.ksnd_deathrow_conns) &&
- list_empty(&ksocknal_data.ksnd_zombie_conns))
- schedule_timeout(timeout);
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- }
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_thread_fini();
- return 0;
-}
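
The reaper above spreads its peer-timeout checks over time: every p seconds it scans a chunk of the peer hash table sized so that each connection is still examined roughly n times per timeout interval. A worked example of that chunk arithmetic is sketched below; the table size and timeout are illustrative values only.

#include <stdio.h>

/* With n = 4 checks per timeout interval and a pass every p = 1 second,
 * scan 'chunk' buckets per pass so the whole table of 'size' buckets is
 * covered about n times per 'timeout' seconds. */
static int peer_check_chunk(int size, int timeout)
{
	const int n = 4;
	const int p = 1;
	int chunk = size;

	if (timeout > n * p)
		chunk = (chunk * n * p) / timeout;
	if (!chunk)
		chunk = 1;
	return chunk;
}

int main(void)
{
	/* e.g. 251 hash buckets and a 50 s socket timeout: 251*4/50 = 20 */
	printf("buckets per 1 s pass: %d\n", peer_check_chunk(251, 50));
	return 0;
}
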
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
deleted file mode 100644
index 93a02cd..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
+++ /dev/null
@@ -1,534 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-
-#include <linux/highmem.h>
-#include "socklnd.h"
-
-int
-ksocknal_lib_get_conn_addrs(struct ksock_conn *conn)
-{
- int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr,
- &conn->ksnc_port);
-
- /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
- LASSERT(!conn->ksnc_closing);
-
- if (rc) {
- CERROR("Error %d getting sock peer IP\n", rc);
- return rc;
- }
-
- rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL);
- if (rc) {
- CERROR("Error %d getting sock local IP\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-int
-ksocknal_lib_zc_capable(struct ksock_conn *conn)
-{
- int caps = conn->ksnc_sock->sk->sk_route_caps;
-
- if (conn->ksnc_proto == &ksocknal_protocol_v1x)
- return 0;
-
- /*
- * ZC if the socket supports scatter/gather and doesn't need software
- * checksums
- */
- return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK));
-}
-
-int
-ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
- struct socket *sock = conn->ksnc_sock;
- int nob, i;
-
- if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
- conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
- tx->tx_nob == tx->tx_resid && /* first sending */
- !tx->tx_msg.ksm_csum) /* not checksummed */
- ksocknal_lib_csum_tx(tx);
-
- for (nob = i = 0; i < tx->tx_niov; i++)
- nob += tx->tx_iov[i].iov_len;
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- nob < tx->tx_resid)
- msg.msg_flags |= MSG_MORE;
-
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
- tx->tx_iov, tx->tx_niov, nob);
- return sock_sendmsg(sock, &msg);
-}
-
-int
-ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct socket *sock = conn->ksnc_sock;
- struct bio_vec *kiov = tx->tx_kiov;
- int rc;
- int nob;
-
- /* Not NOOP message */
- LASSERT(tx->tx_lnetmsg);
-
- if (tx->tx_msg.ksm_zc_cookies[0]) {
- /* Zero copy is enabled */
- struct sock *sk = sock->sk;
- struct page *page = kiov->bv_page;
- int offset = kiov->bv_offset;
- int fragsize = kiov->bv_len;
- int msgflg = MSG_DONTWAIT;
-
- CDEBUG(D_NET, "page %p + offset %x for %d\n",
- page, offset, kiov->bv_len);
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- fragsize < tx->tx_resid)
- msgflg |= MSG_MORE;
-
- if (sk->sk_prot->sendpage) {
- rc = sk->sk_prot->sendpage(sk, page,
- offset, fragsize, msgflg);
- } else {
- rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
- }
- } else {
- struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
- int i;
-
- for (nob = i = 0; i < tx->tx_nkiov; i++)
- nob += kiov[i].bv_len;
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- nob < tx->tx_resid)
- msg.msg_flags |= MSG_MORE;
-
- iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
- kiov, tx->tx_nkiov, nob);
- rc = sock_sendmsg(sock, &msg);
- }
- return rc;
-}
-
-void
-ksocknal_lib_eager_ack(struct ksock_conn *conn)
-{
- int opt = 1;
- struct socket *sock = conn->ksnc_sock;
-
- /*
- * Remind the socket to ACK eagerly. If I don't, the socket might
- * think I'm about to send something it could piggy-back the ACK
- * on, introducing delay in completing zero-copy sends in my
- * peer.
- */
- kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt,
- sizeof(opt));
-}
-
-static int lustre_csum(struct kvec *v, void *context)
-{
- struct ksock_conn *conn = context;
- conn->ksnc_rx_csum = crc32_le(conn->ksnc_rx_csum,
- v->iov_base, v->iov_len);
- return 0;
-}
-
-int
-ksocknal_lib_recv(struct ksock_conn *conn)
-{
- struct msghdr msg = { .msg_iter = conn->ksnc_rx_to };
- __u32 saved_csum;
- int rc;
-
- rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
- if (rc <= 0)
- return rc;
-
- saved_csum = conn->ksnc_msg.ksm_csum;
- if (!saved_csum)
- return rc;
-
- /* the header is checksummed only in V2; V3 checksums only the bulk data */
- if (!(conn->ksnc_rx_to.type & ITER_BVEC) &&
- conn->ksnc_proto != &ksocknal_protocol_v2x)
- return rc;
-
- /* accumulate checksum */
- conn->ksnc_msg.ksm_csum = 0;
- iov_iter_for_each_range(&conn->ksnc_rx_to, rc, lustre_csum, conn);
- conn->ksnc_msg.ksm_csum = saved_csum;
-
- return rc;
-}
-
-void
-ksocknal_lib_csum_tx(struct ksock_tx *tx)
-{
- int i;
- __u32 csum;
- void *base;
-
- LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg);
- LASSERT(tx->tx_conn);
- LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
-
- tx->tx_msg.ksm_csum = 0;
-
- csum = crc32_le(~0, tx->tx_iov[0].iov_base,
- tx->tx_iov[0].iov_len);
-
- if (tx->tx_kiov) {
- for (i = 0; i < tx->tx_nkiov; i++) {
- base = kmap(tx->tx_kiov[i].bv_page) +
- tx->tx_kiov[i].bv_offset;
-
- csum = crc32_le(csum, base, tx->tx_kiov[i].bv_len);
-
- kunmap(tx->tx_kiov[i].bv_page);
- }
- } else {
- for (i = 1; i < tx->tx_niov; i++)
- csum = crc32_le(csum, tx->tx_iov[i].iov_base,
- tx->tx_iov[i].iov_len);
- }
-
- if (*ksocknal_tunables.ksnd_inject_csum_error) {
- csum++;
- *ksocknal_tunables.ksnd_inject_csum_error = 0;
- }
-
- tx->tx_msg.ksm_csum = csum;
-}
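
Both the receive path (lustre_csum) and the transmit path above accumulate the checksum fragment by fragment instead of requiring one contiguous buffer. The small demo below shows the same accumulation property using zlib's crc32(); the kernel helper crc32_le() seeds with ~0 and so produces different values, but the point being illustrated holds either way: a checksum folded in per fragment equals one computed over the whole payload.

/* Build with: cc crc_demo.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char *frags[] = { "lustre ", "socklnd ", "payload" };
	char whole[64] = "";
	uLong csum = crc32(0L, Z_NULL, 0);
	size_t i;

	/* fold in one fragment at a time, as the tx path does per iov/kiov */
	for (i = 0; i < sizeof(frags) / sizeof(frags[0]); i++) {
		csum = crc32(csum, (const Bytef *)frags[i], strlen(frags[i]));
		strcat(whole, frags[i]);
	}

	/* the fragment-wise result matches a single pass over the data */
	printf("fragmented: %08lx\n", csum);
	printf("whole     : %08lx\n",
	       crc32(crc32(0L, Z_NULL, 0), (const Bytef *)whole,
		     strlen(whole)));
	return 0;
}
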
-
-int
-ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem,
- int *rxmem, int *nagle)
-{
- struct socket *sock = conn->ksnc_sock;
- int len;
- int rc;
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- *txmem = *rxmem = *nagle = 0;
- return -ESHUTDOWN;
- }
-
- rc = lnet_sock_getbuf(sock, txmem, rxmem);
- if (!rc) {
- len = sizeof(*nagle);
- rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char *)nagle, &len);
- }
-
- ksocknal_connsock_decref(conn);
-
- if (!rc)
- *nagle = !*nagle;
- else
- *txmem = *rxmem = *nagle = 0;
-
- return rc;
-}
-
-int
-ksocknal_lib_setup_sock(struct socket *sock)
-{
- int rc;
- int option;
- int keep_idle;
- int keep_intvl;
- int keep_count;
- int do_keepalive;
- struct linger linger;
-
- sock->sk->sk_allocation = GFP_NOFS;
-
- /*
- * Ensure this socket aborts active sends immediately when we close
- * it.
- */
- linger.l_onoff = 0;
- linger.l_linger = 0;
-
- rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger,
- sizeof(linger));
- if (rc) {
- CERROR("Can't set SO_LINGER: %d\n", rc);
- return rc;
- }
-
- option = -1;
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option,
- sizeof(option));
- if (rc) {
- CERROR("Can't set SO_LINGER2: %d\n", rc);
- return rc;
- }
-
- if (!*ksocknal_tunables.ksnd_nagle) {
- option = 1;
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char *)&option, sizeof(option));
- if (rc) {
- CERROR("Can't disable nagle: %d\n", rc);
- return rc;
- }
- }
-
- rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size,
- *ksocknal_tunables.ksnd_rx_buffer_size);
- if (rc) {
- CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
- *ksocknal_tunables.ksnd_tx_buffer_size,
- *ksocknal_tunables.ksnd_rx_buffer_size, rc);
- return rc;
- }
-
-/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
-
- /* snapshot tunables */
- keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
- keep_count = *ksocknal_tunables.ksnd_keepalive_count;
- keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
-
- do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
-
- option = (do_keepalive ? 1 : 0);
- rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option,
- sizeof(option));
- if (rc) {
- CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
- return rc;
- }
-
- if (!do_keepalive)
- return 0;
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle,
- sizeof(keep_idle));
- if (rc) {
- CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
- return rc;
- }
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keep_intvl, sizeof(keep_intvl));
- if (rc) {
- CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
- return rc;
- }
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count,
- sizeof(keep_count));
- if (rc) {
- CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-void
-ksocknal_lib_push_conn(struct ksock_conn *conn)
-{
- struct sock *sk;
- struct tcp_sock *tp;
- int nonagle;
- int val = 1;
- int rc;
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) /* being shut down */
- return;
-
- sk = conn->ksnc_sock->sk;
- tp = tcp_sk(sk);
-
- lock_sock(sk);
- nonagle = tp->nonagle;
- tp->nonagle = 1;
- release_sock(sk);
-
- rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
- (char *)&val, sizeof(val));
- LASSERT(!rc);
-
- lock_sock(sk);
- tp->nonagle = nonagle;
- release_sock(sk);
-
- ksocknal_connsock_decref(conn);
-}
-
-/*
- * socket call back in Linux
- */
-static void
-ksocknal_data_ready(struct sock *sk)
-{
- struct ksock_conn *conn;
-
- /* interleave correctly with closing sockets... */
- LASSERT(!in_irq());
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = sk->sk_user_data;
- if (!conn) { /* raced with ksocknal_terminate_conn */
- LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
- sk->sk_data_ready(sk);
- } else {
- ksocknal_read_callback(conn);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-static void
-ksocknal_write_space(struct sock *sk)
-{
- struct ksock_conn *conn;
- int wspace;
- int min_wspace;
-
- /* interleave correctly with closing sockets... */
- LASSERT(!in_irq());
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = sk->sk_user_data;
- wspace = sk_stream_wspace(sk);
- min_wspace = sk_stream_min_wspace(sk);
-
- CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
- sk, wspace, min_wspace, conn,
- !conn ? "" : (conn->ksnc_tx_ready ?
- " ready" : " blocked"),
- !conn ? "" : (conn->ksnc_tx_scheduled ?
- " scheduled" : " idle"),
- !conn ? "" : (list_empty(&conn->ksnc_tx_queue) ?
- " empty" : " queued"));
-
- if (!conn) { /* raced with ksocknal_terminate_conn */
- LASSERT(sk->sk_write_space != &ksocknal_write_space);
- sk->sk_write_space(sk);
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return;
- }
-
- if (wspace >= min_wspace) { /* got enough space */
- ksocknal_write_callback(conn);
-
- /*
- * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
- * ENOMEM check in ksocknal_transmit is race-free (think about
- * it).
- */
- clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-void
-ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn)
-{
- conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
- conn->ksnc_saved_write_space = sock->sk->sk_write_space;
-}
-
-void
-ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn)
-{
- sock->sk->sk_user_data = conn;
- sock->sk->sk_data_ready = ksocknal_data_ready;
- sock->sk->sk_write_space = ksocknal_write_space;
-}
-
-void
-ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn)
-{
- /*
- * Remove conn's network callbacks.
- * NB I _have_ to restore the callback, rather than storing a noop,
- * since the socket could survive past this module being unloaded!!
- */
- sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
- sock->sk->sk_write_space = conn->ksnc_saved_write_space;
-
- /*
- * A callback could be in progress already; they hold a read lock
- * on ksnd_global_lock (to serialise with me) and NOOP if
- * sk_user_data is NULL.
- */
- sock->sk->sk_user_data = NULL;
-}
-
-int
-ksocknal_lib_memory_pressure(struct ksock_conn *conn)
-{
- int rc = 0;
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
- spin_lock_bh(&sched->kss_lock);
-
- if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
- !conn->ksnc_tx_ready) {
- /*
- * SOCK_NOSPACE is set when the socket fills
- * and cleared in the write_space callback
- * (which also sets ksnc_tx_ready). If
- * SOCK_NOSPACE and ksnc_tx_ready are BOTH
- * zero, I didn't fill the socket and
- * write_space won't reschedule me, so I
- * return -ENOMEM to get my caller to retry
- * after a timeout
- */
- rc = -ENOMEM;
- }
-
- spin_unlock_bh(&sched->kss_lock);
-
- return rc;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
deleted file mode 100644
index 5663a4c..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include "socklnd.h"
-
-static int sock_timeout = 50;
-module_param(sock_timeout, int, 0644);
-MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
-
-static int credits = 256;
-module_param(credits, int, 0444);
-MODULE_PARM_DESC(credits, "# concurrent sends");
-
-static int peer_credits = 8;
-module_param(peer_credits, int, 0444);
-MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
-
-static int peer_buffer_credits;
-module_param(peer_buffer_credits, int, 0444);
-MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
-
-static int peer_timeout = 180;
-module_param(peer_timeout, int, 0444);
-MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
-
-/*
- * Number of daemons in each per-CPU-partition (percpt) thread pool;
- * we will estimate a reasonable value based on the CPUs if it's not set.
- */
-static unsigned int nscheds;
-module_param(nscheds, int, 0444);
-MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
-
-static int nconnds = 4;
-module_param(nconnds, int, 0444);
-MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
-
-static int nconnds_max = 64;
-module_param(nconnds_max, int, 0444);
-MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
-
-static int min_reconnectms = 1000;
-module_param(min_reconnectms, int, 0644);
-MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
-
-static int max_reconnectms = 60000;
-module_param(max_reconnectms, int, 0644);
-MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
-
-# define DEFAULT_EAGER_ACK 0
-static int eager_ack = DEFAULT_EAGER_ACK;
-module_param(eager_ack, int, 0644);
-MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
-
-static int typed_conns = 1;
-module_param(typed_conns, int, 0444);
-MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
-
-static int min_bulk = 1 << 10;
-module_param(min_bulk, int, 0644);
-MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
-
-# define DEFAULT_BUFFER_SIZE 0
-static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
-module_param(tx_buffer_size, int, 0644);
-MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
-
-static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
-module_param(rx_buffer_size, int, 0644);
-MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
-
-static int nagle;
-module_param(nagle, int, 0644);
-MODULE_PARM_DESC(nagle, "enable NAGLE?");
-
-static int round_robin = 1;
-module_param(round_robin, int, 0644);
-MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
-
-static int keepalive = 30;
-module_param(keepalive, int, 0644);
-MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
-
-static int keepalive_idle = 30;
-module_param(keepalive_idle, int, 0644);
-MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
-
-#define DEFAULT_KEEPALIVE_COUNT 5
-static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
-module_param(keepalive_count, int, 0644);
-MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
-
-static int keepalive_intvl = 5;
-module_param(keepalive_intvl, int, 0644);
-MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
-
-static int enable_csum;
-module_param(enable_csum, int, 0644);
- MODULE_PARM_DESC(enable_csum, "enable checksums");
-
-static int inject_csum_error;
-module_param(inject_csum_error, int, 0644);
-MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
-
-static int nonblk_zcack = 1;
-module_param(nonblk_zcack, int, 0644);
-MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
-
-static unsigned int zc_min_payload = 16 << 10;
-module_param(zc_min_payload, int, 0644);
-MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
-
-static unsigned int zc_recv;
-module_param(zc_recv, int, 0644);
-MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
-
-static unsigned int zc_recv_min_nfrags = 16;
-module_param(zc_recv_min_nfrags, int, 0644);
-MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
-
-#if SOCKNAL_VERSION_DEBUG
-static int protocol = 3;
-module_param(protocol, int, 0644);
-MODULE_PARM_DESC(protocol, "protocol version");
-#endif
-
-struct ksock_tunables ksocknal_tunables;
-
-int ksocknal_tunables_init(void)
-{
- /* initialize ksocknal_tunables structure */
- ksocknal_tunables.ksnd_timeout = &sock_timeout;
- ksocknal_tunables.ksnd_nscheds = &nscheds;
- ksocknal_tunables.ksnd_nconnds = &nconnds;
- ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
- ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
- ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
- ksocknal_tunables.ksnd_eager_ack = &eager_ack;
- ksocknal_tunables.ksnd_typed_conns = &typed_conns;
- ksocknal_tunables.ksnd_min_bulk = &min_bulk;
- ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
- ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
- ksocknal_tunables.ksnd_nagle = &nagle;
- ksocknal_tunables.ksnd_round_robin = &round_robin;
- ksocknal_tunables.ksnd_keepalive = &keepalive;
- ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
- ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
- ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
- ksocknal_tunables.ksnd_credits = &credits;
- ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
- ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
- ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
- ksocknal_tunables.ksnd_enable_csum = &enable_csum;
- ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
- ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
- ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
- ksocknal_tunables.ksnd_zc_recv = &zc_recv;
- ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
-
-#if SOCKNAL_VERSION_DEBUG
- ksocknal_tunables.ksnd_protocol = &protocol;
-#endif
-
- if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
- *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10;
-
- return 0;
-}
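
Note that ksocknal_tunables_init() above stores pointers to the module parameters rather than copies of their values, so any code that dereferences a ksnd_* field sees the current setting, including runtime changes to the 0644 parameters. A minimal userspace sketch of that indirection pattern follows; the names (keepalive, struct tunables, tunables_init) are invented for illustration and are not socklnd code.

	#include <stdio.h>

	static int keepalive = 30;	/* stands in for a module_param */

	struct tunables {
		int *keepalive;
	};

	static struct tunables tun;

	static void tunables_init(void)
	{
		tun.keepalive = &keepalive;	/* store a pointer, not a copy */
	}

	int main(void)
	{
		tunables_init();
		printf("keepalive = %d\n", *tun.keepalive);	/* 30 */
		keepalive = 60;		/* e.g. a runtime change via sysfs */
		printf("keepalive = %d\n", *tun.keepalive);	/* 60, no stale copy */
		return 0;
	}
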
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
deleted file mode 100644
index 05982da..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
+++ /dev/null
@@ -1,810 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include "socklnd.h"
-
-/*
- * Protocol entries:
- * pro_send_hello    : send hello message
- * pro_recv_hello    : receive hello message
- * pro_pack          : pack message header
- * pro_unpack        : unpack message header
- * pro_queue_tx_zcack: called holding BH lock: kss_lock;
- *                     return 1 if the ACK is piggybacked, otherwise 0
- * pro_queue_tx_msg  : called holding BH lock: kss_lock;
- *                     return the ACK that is piggybacked by my message, or NULL
- * pro_handle_zcreq  : handler of incoming ZC-REQ
- * pro_handle_zcack  : handler of incoming ZC-ACK
- * pro_match_tx      : called holding glock
- */
-
-static struct ksock_tx *
-ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg)
-{
- /* V1.x, just enqueue it */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- return NULL;
-}
-
-void
-ksocknal_next_tx_carrier(struct ksock_conn *conn)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
- /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
- LASSERT(!list_empty(&conn->ksnc_tx_queue));
- LASSERT(tx);
-
- /* Next TX that can carry ZC-ACK or LNet message */
- if (tx->tx_list.next == &conn->ksnc_tx_queue) {
- /* no more packets queued */
- conn->ksnc_tx_carrier = NULL;
- } else {
- conn->ksnc_tx_carrier = list_next_entry(tx, tx_list);
- LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
- }
-}
-
-static int
-ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn,
- struct ksock_tx *tx_ack, __u64 cookie)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
- LASSERT(!tx_ack ||
- tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
-	/*
-	 * Enqueue or piggyback tx_ack / cookie:
-	 * . If no tx can piggyback the cookie of tx_ack (or the bare cookie),
-	 *   just enqueue tx_ack (if tx_ack != NULL) and return 0.
-	 * . If some tx can piggyback the cookie of tx_ack (or the bare cookie),
-	 *   piggyback the cookie onto it and return 1.
-	 */
- if (!tx) {
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_ack;
- }
- return 0;
- }
-
- if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
- /* tx is noop zc-ack, can't piggyback zc-ack cookie */
- if (tx_ack)
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- return 0;
- }
-
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
- LASSERT(!tx->tx_msg.ksm_zc_cookies[1]);
-
- if (tx_ack)
- cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
-
- /* piggyback the zc-ack cookie */
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- /* move on to the next TX which can carry cookie */
- ksocknal_next_tx_carrier(conn);
-
- return 1;
-}
-
-static struct ksock_tx *
-ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
-	/*
-	 * Enqueue tx_msg:
-	 * . If there is no NOOP on the connection, just enqueue
-	 *   tx_msg and return NULL.
-	 * . If there is a NOOP on the connection, piggyback its cookie
-	 *   onto tx_msg, replace the NOOP tx, and return the NOOP tx.
-	 */
- if (!tx) { /* nothing on queue */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_msg;
- return NULL;
- }
-
- if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- return NULL;
- }
-
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
-	/* There is a noop zc-ack whose cookie can be piggybacked */
- tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
- ksocknal_next_tx_carrier(conn);
-
- /* use new_tx to replace the noop zc-ack packet */
- list_add(&tx_msg->tx_list, &tx->tx_list);
- list_del(&tx->tx_list);
-
- return tx;
-}
-
-static int
-ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn,
- struct ksock_tx *tx_ack, __u64 cookie)
-{
- struct ksock_tx *tx;
-
- if (conn->ksnc_type != SOCKLND_CONN_ACK)
- return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
-
- /* non-blocking ZC-ACK (to router) */
- LASSERT(!tx_ack ||
- tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
- tx = conn->ksnc_tx_carrier;
- if (!tx) {
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_ack;
- }
- return 0;
- }
-
- /* conn->ksnc_tx_carrier */
-
- if (tx_ack)
- cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
-
- if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
- return 1;
-
- if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
- /* replace the keepalive PING with a real ACK */
- LASSERT(!tx->tx_msg.ksm_zc_cookies[0]);
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- return 1;
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
- cookie == tx->tx_msg.ksm_zc_cookies[1]) {
- CWARN("%s: duplicated ZC cookie: %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
- return 1; /* XXX return error in the future */
- }
-
- if (!tx->tx_msg.ksm_zc_cookies[0]) {
-		/*
-		 * The NOOP tx carries only one ZC-ACK cookie so far,
-		 * so it can carry at least one more.
-		 */
- if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
- tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- } else {
- tx->tx_msg.ksm_zc_cookies[0] = cookie;
- }
-
- if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
- /*
- * not likely to carry more ACKs, skip it
- * to simplify logic
- */
- ksocknal_next_tx_carrier(conn);
- }
-
- return 1;
- }
-
-	/* the carrier already holds two cookies or a range */
-
- if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
- __u64 tmp = 0;
-
-		/* two separate cookies: (a+2, a) or (a+1, a) */
- LASSERT(tx->tx_msg.ksm_zc_cookies[0] -
- tx->tx_msg.ksm_zc_cookies[1] <= 2);
-
- if (tx->tx_msg.ksm_zc_cookies[0] -
- tx->tx_msg.ksm_zc_cookies[1] == 2) {
- if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
- tmp = cookie;
- } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
- tmp = tx->tx_msg.ksm_zc_cookies[1];
- } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
- tmp = tx->tx_msg.ksm_zc_cookies[0];
- }
-
- if (tmp) {
- /* range of cookies */
- tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
- tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
- return 1;
- }
-
- } else {
-		/*
-		 * ksm_zc_cookies[0] < ksm_zc_cookies[1],
-		 * so it is a range of cookies
-		 */
- if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
- cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
- CWARN("%s: duplicated ZC cookie: %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
- return 1; /* XXX: return error in the future */
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- return 1;
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
- tx->tx_msg.ksm_zc_cookies[0] = cookie;
- return 1;
- }
- }
-
- /* failed to piggyback ZC-ACK */
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
- /* the next tx can piggyback at least 1 ACK */
- ksocknal_next_tx_carrier(conn);
- }
-
- return 0;
-}
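
The cookie bookkeeping above is compact, so a worked model may help: a NOOP carrier starts with a single cookie in ksm_zc_cookies[1], can grow into two separate cookies (slot [0] > slot [1], at most 2 apart), and collapses into an inclusive range (slot [0] < slot [1]) once three consecutive cookies meet. The standalone userspace sketch below mirrors those rules only for illustration; struct carrier, fold_cookie() and the values in main() are invented, and it omits the duplicate-cookie warning and the carrier-advance heuristic of the real function.

	#include <stdio.h>
	#include <stdint.h>

	struct carrier {
		uint64_t c0;	/* mirrors ksm_zc_cookies[0] */
		uint64_t c1;	/* mirrors ksm_zc_cookies[1] */
	};

	/* Try to fold "cookie" into the carrier; return 1 on success, 0 otherwise. */
	static int fold_cookie(struct carrier *t, uint64_t cookie)
	{
		if (t->c0 == 0) {
			/* only one cookie held so far */
			if (t->c1 > cookie) {
				t->c0 = t->c1;
				t->c1 = cookie;
			} else {
				t->c0 = cookie;
			}
			return 1;
		}

		if (t->c0 > t->c1) {
			/* two separate cookies: (a+2, a) or (a+1, a) */
			uint64_t tmp = 0;

			if (t->c0 - t->c1 == 2) {
				if (cookie == t->c1 + 1)
					tmp = cookie;
			} else if (cookie == t->c1 - 1) {
				tmp = t->c1;
			} else if (cookie == t->c0 + 1) {
				tmp = t->c0;
			}

			if (tmp) {
				/* three consecutive cookies become a range */
				t->c0 = tmp - 1;
				t->c1 = tmp + 1;
				return 1;
			}
			return 0;
		}

		/* c0 < c1: an inclusive range; it can only grow at either end */
		if (cookie == t->c1 + 1) {
			t->c1 = cookie;
			return 1;
		}
		if (cookie == t->c0 - 1) {
			t->c0 = cookie;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		struct carrier t = { 0, 10 };	/* carrier currently holds cookie 10 */
		uint64_t in[] = { 12, 11, 13, 9, 20 };
		unsigned int i;

		for (i = 0; i < sizeof(in) / sizeof(in[0]); i++)
			printf("fold %llu -> %s, carrier now [%llu, %llu]\n",
			       (unsigned long long)in[i],
			       fold_cookie(&t, in[i]) ? "piggybacked" : "needs a new NOOP",
			       (unsigned long long)t.c0,
			       (unsigned long long)t.c1);
		return 0;
	}
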
-
-static int
-ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk)
-{
- int nob;
-
-#if SOCKNAL_VERSION_DEBUG
- if (!*ksocknal_tunables.ksnd_typed_conns)
- return SOCKNAL_MATCH_YES;
-#endif
-
- if (!tx || !tx->tx_lnetmsg) {
- /* noop packet */
- nob = offsetof(struct ksock_msg, ksm_u);
- } else {
- nob = tx->tx_lnetmsg->msg_len +
- ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
- sizeof(struct lnet_hdr) : sizeof(struct ksock_msg));
- }
-
- /* default checking for typed connection */
- switch (conn->ksnc_type) {
- default:
- CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
- LBUG();
- case SOCKLND_CONN_ANY:
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_BULK_IN:
- return SOCKNAL_MATCH_MAY;
-
- case SOCKLND_CONN_BULK_OUT:
- if (nob < *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_CONTROL:
- if (nob >= *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
- }
-}
-
-static int
-ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk)
-{
- int nob;
-
- if (!tx || !tx->tx_lnetmsg)
- nob = offsetof(struct ksock_msg, ksm_u);
- else
- nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg);
-
- switch (conn->ksnc_type) {
- default:
- CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
- LBUG();
- case SOCKLND_CONN_ANY:
- return SOCKNAL_MATCH_NO;
-
- case SOCKLND_CONN_ACK:
- if (nonblk)
- return SOCKNAL_MATCH_YES;
- else if (!tx || !tx->tx_lnetmsg)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_NO;
-
- case SOCKLND_CONN_BULK_OUT:
- if (nonblk)
- return SOCKNAL_MATCH_NO;
- else if (nob < *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_CONTROL:
- if (nonblk)
- return SOCKNAL_MATCH_NO;
- else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
- }
-}
-
-/* (Sink) handle incoming ZC request from sender */
-static int
-ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote)
-{
- struct ksock_peer *peer = c->ksnc_peer;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
- int rc;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
- if (conn) {
- struct ksock_sched *sched = conn->ksnc_scheduler;
-
- LASSERT(conn->ksnc_proto->pro_queue_tx_zcack);
-
- spin_lock_bh(&sched->kss_lock);
-
- rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
-
- spin_unlock_bh(&sched->kss_lock);
-
- if (rc) { /* piggybacked */
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return 0;
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- /* ACK connection is not ready, or can't piggyback the ACK */
- tx = ksocknal_alloc_tx_noop(cookie, !!remote);
- if (!tx)
- return -ENOMEM;
-
- rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id);
- if (!rc)
- return 0;
-
- ksocknal_free_tx(tx);
- return rc;
-}
-
-/* (Sender) handle ZC_ACK from sink */
-static int
-ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
- struct ksock_tx *tmp;
- LIST_HEAD(zlist);
- int count;
-
- if (!cookie1)
- cookie1 = cookie2;
-
- count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
-
- if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
- conn->ksnc_proto == &ksocknal_protocol_v3x) {
- /* keepalive PING for V3.x, just ignore it */
- return count == 1 ? 0 : -EPROTO;
- }
-
- spin_lock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list,
- tx_zc_list) {
- __u64 c = tx->tx_msg.ksm_zc_cookies[0];
-
- if (c == cookie1 || c == cookie2 ||
- (cookie1 < c && c < cookie2)) {
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- list_del(&tx->tx_zc_list);
- list_add(&tx->tx_zc_list, &zlist);
-
- if (!--count)
- break;
- }
- }
-
- spin_unlock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) {
- list_del(&tx->tx_zc_list);
- ksocknal_tx_decref(tx);
- }
-
- return !count ? 0 : -EPROTO;
-}
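
The ZC-ACK built by the coalescing code earlier carries either a single cookie (cookie1 == 0), two separate cookies (cookie1 > cookie2), or an inclusive range [cookie1, cookie2]; ksocknal_handle_zcack() above completes every pending zero-copy tx whose cookie it covers. A tiny userspace sketch of that membership test, with acked() and the values in main() invented purely for illustration:

	#include <stdio.h>
	#include <stdint.h>

	/* Does a ZC-ACK carrying (cookie1, cookie2) cover pending cookie c? */
	static int acked(uint64_t c, uint64_t cookie1, uint64_t cookie2)
	{
		if (cookie1 == 0)
			cookie1 = cookie2;	/* single-cookie ACK */

		return c == cookie1 || c == cookie2 ||
		       (cookie1 < c && c < cookie2);
	}

	int main(void)
	{
		uint64_t c;

		/* a range ACK covering cookies 5..8 inclusive */
		for (c = 4; c <= 9; c++)
			printf("cookie %llu: %s\n", (unsigned long long)c,
			       acked(c, 5, 8) ? "completed" : "still pending");
		return 0;
	}
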
-
-static int
-ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello)
-{
- struct socket *sock = conn->ksnc_sock;
- struct lnet_hdr *hdr;
- struct lnet_magicversion *hmv;
- int rc;
- int i;
-
- BUILD_BUG_ON(sizeof(struct lnet_magicversion) != offsetof(struct lnet_hdr, src_nid));
-
- hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
- if (!hdr) {
- CERROR("Can't allocate struct lnet_hdr\n");
- return -ENOMEM;
- }
-
- hmv = (struct lnet_magicversion *)&hdr->dest_nid;
-
-	/*
-	 * Re-organize the V2.x message header into a V1.x header
-	 * (struct lnet_hdr) and send it out
-	 */
- hmv->magic = cpu_to_le32(LNET_PROTO_TCP_MAGIC);
- hmv->version_major = cpu_to_le16(KSOCK_PROTO_V1_MAJOR);
- hmv->version_minor = cpu_to_le16(KSOCK_PROTO_V1_MINOR);
-
- if (the_lnet.ln_testprotocompat) {
- /* single-shot proto check */
- LNET_LOCK();
- if (the_lnet.ln_testprotocompat & 1) {
- hmv->version_major++; /* just different! */
- the_lnet.ln_testprotocompat &= ~1;
- }
- if (the_lnet.ln_testprotocompat & 2) {
- hmv->magic = LNET_PROTO_MAGIC;
- the_lnet.ln_testprotocompat &= ~2;
- }
- LNET_UNLOCK();
- }
-
- hdr->src_nid = cpu_to_le64(hello->kshm_src_nid);
- hdr->src_pid = cpu_to_le32(hello->kshm_src_pid);
- hdr->type = cpu_to_le32(LNET_MSG_HELLO);
- hdr->payload_length = cpu_to_le32(hello->kshm_nips * sizeof(__u32));
- hdr->msg.hello.type = cpu_to_le32(hello->kshm_ctype);
- hdr->msg.hello.incarnation = cpu_to_le64(hello->kshm_src_incarnation);
-
- rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
- rc, &conn->ksnc_ipaddr, conn->ksnc_port);
- goto out;
- }
-
- if (!hello->kshm_nips)
- goto out;
-
- for (i = 0; i < (int)hello->kshm_nips; i++)
- hello->kshm_ips[i] = __cpu_to_le32(hello->kshm_ips[i]);
-
- rc = lnet_sock_write(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
- rc, hello->kshm_nips,
- &conn->ksnc_ipaddr, conn->ksnc_port);
- }
-out:
- kfree(hdr);
-
- return rc;
-}
-
-static int
-ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello)
-{
- struct socket *sock = conn->ksnc_sock;
- int rc;
-
- hello->kshm_magic = LNET_PROTO_MAGIC;
- hello->kshm_version = conn->ksnc_proto->pro_version;
-
- if (the_lnet.ln_testprotocompat) {
- /* single-shot proto check */
- LNET_LOCK();
- if (the_lnet.ln_testprotocompat & 1) {
- hello->kshm_version++; /* just different! */
- the_lnet.ln_testprotocompat &= ~1;
- }
- LNET_UNLOCK();
- }
-
- rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
- rc, &conn->ksnc_ipaddr, conn->ksnc_port);
- return rc;
- }
-
- if (!hello->kshm_nips)
- return 0;
-
- rc = lnet_sock_write(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
- rc, hello->kshm_nips,
- &conn->ksnc_ipaddr, conn->ksnc_port);
- }
-
- return rc;
-}
-
-static int
-ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello,
- int timeout)
-{
- struct socket *sock = conn->ksnc_sock;
- struct lnet_hdr *hdr;
- int rc;
- int i;
-
- hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
- if (!hdr) {
- CERROR("Can't allocate struct lnet_hdr\n");
- return -ENOMEM;
- }
-
- rc = lnet_sock_read(sock, &hdr->src_nid,
- sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid),
- timeout);
- if (rc) {
- CERROR("Error %d reading rest of HELLO hdr from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- goto out;
- }
-
- /* ...and check we got what we expected */
- if (hdr->type != cpu_to_le32(LNET_MSG_HELLO)) {
- CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n",
- le32_to_cpu(hdr->type),
- &conn->ksnc_ipaddr);
- rc = -EPROTO;
- goto out;
- }
-
- hello->kshm_src_nid = le64_to_cpu(hdr->src_nid);
- hello->kshm_src_pid = le32_to_cpu(hdr->src_pid);
- hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation);
- hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type);
- hello->kshm_nips = le32_to_cpu(hdr->payload_length) /
- sizeof(__u32);
-
- if (hello->kshm_nips > LNET_MAX_INTERFACES) {
- CERROR("Bad nips %d from ip %pI4h\n",
- hello->kshm_nips, &conn->ksnc_ipaddr);
- rc = -EPROTO;
- goto out;
- }
-
- if (!hello->kshm_nips)
- goto out;
-
- rc = lnet_sock_read(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32), timeout);
- if (rc) {
- CERROR("Error %d reading IPs from ip %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- goto out;
- }
-
- for (i = 0; i < (int)hello->kshm_nips; i++) {
- hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
-
- if (!hello->kshm_ips[i]) {
- CERROR("Zero IP[%d] from ip %pI4h\n",
- i, &conn->ksnc_ipaddr);
- rc = -EPROTO;
- break;
- }
- }
-out:
- kfree(hdr);
-
- return rc;
-}
-
-static int
-ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello,
- int timeout)
-{
- struct socket *sock = conn->ksnc_sock;
- int rc;
- int i;
-
- if (hello->kshm_magic == LNET_PROTO_MAGIC)
- conn->ksnc_flip = 0;
- else
- conn->ksnc_flip = 1;
-
- rc = lnet_sock_read(sock, &hello->kshm_src_nid,
- offsetof(struct ksock_hello_msg, kshm_ips) -
- offsetof(struct ksock_hello_msg, kshm_src_nid),
- timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- return rc;
- }
-
- if (conn->ksnc_flip) {
- __swab32s(&hello->kshm_src_pid);
- __swab64s(&hello->kshm_src_nid);
- __swab32s(&hello->kshm_dst_pid);
- __swab64s(&hello->kshm_dst_nid);
- __swab64s(&hello->kshm_src_incarnation);
- __swab64s(&hello->kshm_dst_incarnation);
- __swab32s(&hello->kshm_ctype);
- __swab32s(&hello->kshm_nips);
- }
-
- if (hello->kshm_nips > LNET_MAX_INTERFACES) {
- CERROR("Bad nips %d from ip %pI4h\n",
- hello->kshm_nips, &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- if (!hello->kshm_nips)
- return 0;
-
- rc = lnet_sock_read(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32), timeout);
- if (rc) {
- CERROR("Error %d reading IPs from ip %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- return rc;
- }
-
- for (i = 0; i < (int)hello->kshm_nips; i++) {
- if (conn->ksnc_flip)
- __swab32s(&hello->kshm_ips[i]);
-
- if (!hello->kshm_ips[i]) {
- CERROR("Zero IP[%d] from ip %pI4h\n",
- i, &conn->ksnc_ipaddr);
- return -EPROTO;
- }
- }
-
- return 0;
-}
-
-static void
-ksocknal_pack_msg_v1(struct ksock_tx *tx)
-{
- /* V1.x has no KSOCK_MSG_NOOP */
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_lnetmsg);
-
- tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr;
- tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr);
-
- tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr);
- tx->tx_resid = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr);
-}
-
-static void
-ksocknal_pack_msg_v2(struct ksock_tx *tx)
-{
- tx->tx_iov[0].iov_base = &tx->tx_msg;
-
- if (tx->tx_lnetmsg) {
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
-
- tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
- tx->tx_iov[0].iov_len = sizeof(struct ksock_msg);
- tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len;
- tx->tx_resid = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len;
- } else {
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
- tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- tx->tx_resid = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- }
-	/*
-	 * Don't checksum before we start sending, because the packet
-	 * may still have a ZC-ACK piggybacked onto it
-	 */
-}
-
-static void
-ksocknal_unpack_msg_v1(struct ksock_msg *msg)
-{
- msg->ksm_csum = 0;
- msg->ksm_type = KSOCK_MSG_LNET;
- msg->ksm_zc_cookies[0] = 0;
- msg->ksm_zc_cookies[1] = 0;
-}
-
-static void
-ksocknal_unpack_msg_v2(struct ksock_msg *msg)
-{
- return; /* Do nothing */
-}
-
-struct ksock_proto ksocknal_protocol_v1x = {
- .pro_version = KSOCK_PROTO_V1,
- .pro_send_hello = ksocknal_send_hello_v1,
- .pro_recv_hello = ksocknal_recv_hello_v1,
- .pro_pack = ksocknal_pack_msg_v1,
- .pro_unpack = ksocknal_unpack_msg_v1,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1,
- .pro_handle_zcreq = NULL,
- .pro_handle_zcack = NULL,
- .pro_queue_tx_zcack = NULL,
- .pro_match_tx = ksocknal_match_tx
-};
-
-struct ksock_proto ksocknal_protocol_v2x = {
- .pro_version = KSOCK_PROTO_V2,
- .pro_send_hello = ksocknal_send_hello_v2,
- .pro_recv_hello = ksocknal_recv_hello_v2,
- .pro_pack = ksocknal_pack_msg_v2,
- .pro_unpack = ksocknal_unpack_msg_v2,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
- .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2,
- .pro_handle_zcreq = ksocknal_handle_zcreq,
- .pro_handle_zcack = ksocknal_handle_zcack,
- .pro_match_tx = ksocknal_match_tx
-};
-
-struct ksock_proto ksocknal_protocol_v3x = {
- .pro_version = KSOCK_PROTO_V3,
- .pro_send_hello = ksocknal_send_hello_v2,
- .pro_recv_hello = ksocknal_recv_hello_v2,
- .pro_pack = ksocknal_pack_msg_v2,
- .pro_unpack = ksocknal_unpack_msg_v2,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
- .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3,
- .pro_handle_zcreq = ksocknal_handle_zcreq,
- .pro_handle_zcack = ksocknal_handle_zcack,
- .pro_match_tx = ksocknal_match_tx_v3
-};