Diffstat (limited to 'drivers/staging/lustre/lnet/klnds')
-rw-r--r--  drivers/staging/lustre/lnet/klnds/Makefile                           1
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile                   5
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c               2958
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h               1048
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c            3763
-rw-r--r--  drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c      296
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/Makefile                   6
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c               2921
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h                704
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c            2586
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c            534
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c      184
-rw-r--r--  drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c          810
13 files changed, 0 insertions, 15816 deletions
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
deleted file mode 100644
index c23e4f6..0000000
--- a/drivers/staging/lustre/lnet/klnds/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_LNET) += o2iblnd/ socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
deleted file mode 100644
index 4affe1d..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
-ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
deleted file mode 100644
index f0b4eb42..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ /dev/null
@@ -1,2958 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <asm/div64.h>
-#include <asm/page.h>
-#include "o2iblnd.h"
-
-static struct lnet_lnd the_o2iblnd;
-
-struct kib_data kiblnd_data;
-
-static __u32 kiblnd_cksum(void *ptr, int nob)
-{
- char *c = ptr;
- __u32 sum = 0;
-
- while (nob-- > 0)
- sum = ((sum << 1) | (sum >> 31)) + *c++;
-
- /* ensure I don't return 0 (== no checksum) */
- return !sum ? 1 : sum;
-}
-
-static char *kiblnd_msgtype2str(int type)
-{
- switch (type) {
- case IBLND_MSG_CONNREQ:
- return "CONNREQ";
-
- case IBLND_MSG_CONNACK:
- return "CONNACK";
-
- case IBLND_MSG_NOOP:
- return "NOOP";
-
- case IBLND_MSG_IMMEDIATE:
- return "IMMEDIATE";
-
- case IBLND_MSG_PUT_REQ:
- return "PUT_REQ";
-
- case IBLND_MSG_PUT_NAK:
- return "PUT_NAK";
-
- case IBLND_MSG_PUT_ACK:
- return "PUT_ACK";
-
- case IBLND_MSG_PUT_DONE:
- return "PUT_DONE";
-
- case IBLND_MSG_GET_REQ:
- return "GET_REQ";
-
- case IBLND_MSG_GET_DONE:
- return "GET_DONE";
-
- default:
- return "???";
- }
-}
-
-static int kiblnd_msgtype2size(int type)
-{
- const int hdr_size = offsetof(struct kib_msg, ibm_u);
-
- switch (type) {
- case IBLND_MSG_CONNREQ:
- case IBLND_MSG_CONNACK:
- return hdr_size + sizeof(struct kib_connparams);
-
- case IBLND_MSG_NOOP:
- return hdr_size;
-
- case IBLND_MSG_IMMEDIATE:
- return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]);
-
- case IBLND_MSG_PUT_REQ:
- return hdr_size + sizeof(struct kib_putreq_msg);
-
- case IBLND_MSG_PUT_ACK:
- return hdr_size + sizeof(struct kib_putack_msg);
-
- case IBLND_MSG_GET_REQ:
- return hdr_size + sizeof(struct kib_get_msg);
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- return hdr_size + sizeof(struct kib_completion_msg);
- default:
- return -1;
- }
-}
-
-static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
-{
- struct kib_rdma_desc *rd;
- int msg_size;
- int nob;
- int n;
- int i;
-
- LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
- msg->ibm_type == IBLND_MSG_PUT_ACK);
-
- rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
- &msg->ibm_u.get.ibgm_rd :
- &msg->ibm_u.putack.ibpam_rd;
-
- if (flip) {
- __swab32s(&rd->rd_key);
- __swab32s(&rd->rd_nfrags);
- }
-
- n = rd->rd_nfrags;
-
- nob = offsetof(struct kib_msg, ibm_u) +
- kiblnd_rd_msg_size(rd, msg->ibm_type, n);
-
- if (msg->ibm_nob < nob) {
- CERROR("Short %s: %d(%d)\n",
- kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
- return 1;
- }
-
- msg_size = kiblnd_rd_size(rd);
- if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) {
- CERROR("Bad msg_size: %d, should be 0 < n <= %d\n",
- msg_size, LNET_MAX_PAYLOAD);
- return 1;
- }
-
- if (!flip)
- return 0;
-
- for (i = 0; i < n; i++) {
- __swab32s(&rd->rd_frags[i].rf_nob);
- __swab64s(&rd->rd_frags[i].rf_addr);
- }
-
- return 0;
-}
-
-void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
- int credits, lnet_nid_t dstnid, __u64 dststamp)
-{
- struct kib_net *net = ni->ni_data;
-
- /*
- * CAVEAT EMPTOR! all message fields not set here should have been
- * initialised previously.
- */
- msg->ibm_magic = IBLND_MSG_MAGIC;
- msg->ibm_version = version;
- /* ibm_type */
- msg->ibm_credits = credits;
- /* ibm_nob */
- msg->ibm_cksum = 0;
- msg->ibm_srcnid = ni->ni_nid;
- msg->ibm_srcstamp = net->ibn_incarnation;
- msg->ibm_dstnid = dstnid;
- msg->ibm_dststamp = dststamp;
-
- if (*kiblnd_tunables.kib_cksum) {
- /* NB ibm_cksum zero while computing cksum */
- msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
- }
-}
-
-int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
-{
- const int hdr_size = offsetof(struct kib_msg, ibm_u);
- __u32 msg_cksum;
- __u16 version;
- int msg_nob;
- int flip;
-
- /* 6 bytes are enough to have received magic + version */
- if (nob < 6) {
- CERROR("Short message: %d\n", nob);
- return -EPROTO;
- }
-
- if (msg->ibm_magic == IBLND_MSG_MAGIC) {
- flip = 0;
- } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
- flip = 1;
- } else {
- CERROR("Bad magic: %08x\n", msg->ibm_magic);
- return -EPROTO;
- }
-
- version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
- if (version != IBLND_MSG_VERSION &&
- version != IBLND_MSG_VERSION_1) {
- CERROR("Bad version: %x\n", version);
- return -EPROTO;
- }
-
- if (nob < hdr_size) {
- CERROR("Short message: %d\n", nob);
- return -EPROTO;
- }
-
- msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
- if (msg_nob > nob) {
- CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
- return -EPROTO;
- }
-
- /*
- * checksum must be computed with ibm_cksum zero and BEFORE anything
- * gets flipped
- */
- msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
- msg->ibm_cksum = 0;
- if (msg_cksum &&
- msg_cksum != kiblnd_cksum(msg, msg_nob)) {
- CERROR("Bad checksum\n");
- return -EPROTO;
- }
-
- msg->ibm_cksum = msg_cksum;
-
- if (flip) {
- /* leave magic unflipped as a clue to peer endianness */
- msg->ibm_version = version;
- BUILD_BUG_ON(sizeof(msg->ibm_type) != 1);
- BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1);
- msg->ibm_nob = msg_nob;
- __swab64s(&msg->ibm_srcnid);
- __swab64s(&msg->ibm_srcstamp);
- __swab64s(&msg->ibm_dstnid);
- __swab64s(&msg->ibm_dststamp);
- }
-
- if (msg->ibm_srcnid == LNET_NID_ANY) {
- CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
- return -EPROTO;
- }
-
- if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
- CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
- msg_nob, kiblnd_msgtype2size(msg->ibm_type));
- return -EPROTO;
- }
-
- switch (msg->ibm_type) {
- default:
- CERROR("Unknown message type %x\n", msg->ibm_type);
- return -EPROTO;
-
- case IBLND_MSG_NOOP:
- case IBLND_MSG_IMMEDIATE:
- case IBLND_MSG_PUT_REQ:
- break;
-
- case IBLND_MSG_PUT_ACK:
- case IBLND_MSG_GET_REQ:
- if (kiblnd_unpack_rd(msg, flip))
- return -EPROTO;
- break;
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- if (flip)
- __swab32s(&msg->ibm_u.completion.ibcm_status);
- break;
-
- case IBLND_MSG_CONNREQ:
- case IBLND_MSG_CONNACK:
- if (flip) {
- __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
- __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
- __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
- }
- break;
- }
- return 0;
-}
-
-int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp,
- lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct kib_net *net = ni->ni_data;
- int cpt = lnet_cpt_of_nid(nid);
- unsigned long flags;
-
- LASSERT(net);
- LASSERT(nid != LNET_NID_ANY);
-
- peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt);
- if (!peer) {
- CERROR("Cannot allocate peer\n");
- return -ENOMEM;
- }
-
- peer->ibp_ni = ni;
- peer->ibp_nid = nid;
- peer->ibp_error = 0;
- peer->ibp_last_alive = 0;
- peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
- peer->ibp_queue_depth = ni->ni_peertxcredits;
- atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
-
- INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
- INIT_LIST_HEAD(&peer->ibp_conns);
- INIT_LIST_HEAD(&peer->ibp_tx_queue);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- /* always called with a ref on ni, which prevents ni being shutdown */
- LASSERT(!net->ibn_shutdown);
-
- /* npeers only grows with the global lock held */
- atomic_inc(&net->ibn_npeers);
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- *peerp = peer;
- return 0;
-}
-
-void kiblnd_destroy_peer(struct kib_peer *peer)
-{
- struct kib_net *net = peer->ibp_ni->ni_data;
-
- LASSERT(net);
- LASSERT(!atomic_read(&peer->ibp_refcount));
- LASSERT(!kiblnd_peer_active(peer));
- LASSERT(kiblnd_peer_idle(peer));
- LASSERT(list_empty(&peer->ibp_tx_queue));
-
- kfree(peer);
-
- /*
- * NB a peer's connections keep a reference on their peer until
- * they are destroyed, so we can be assured that _all_ state to do
- * with this peer has been cleaned up when its refcount drops to
- * zero.
- */
- atomic_dec(&net->ibn_npeers);
-}
-
-struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid)
-{
- /*
- * the caller is responsible for accounting the additional reference
- * that this creates
- */
- struct list_head *peer_list = kiblnd_nid2peerlist(nid);
- struct list_head *tmp;
- struct kib_peer *peer;
-
- list_for_each(tmp, peer_list) {
- peer = list_entry(tmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_nid != nid)
- continue;
-
- CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
- peer, libcfs_nid2str(nid),
- atomic_read(&peer->ibp_refcount),
- peer->ibp_version);
- return peer;
- }
- return NULL;
-}
-
-void kiblnd_unlink_peer_locked(struct kib_peer *peer)
-{
- LASSERT(list_empty(&peer->ibp_conns));
-
- LASSERT(kiblnd_peer_active(peer));
- list_del_init(&peer->ibp_list);
- /* lose peerlist's ref */
- kiblnd_peer_decref(peer);
-}
-
-static int kiblnd_get_peer_info(struct lnet_ni *ni, int index,
- lnet_nid_t *nidp, int *count)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- int i;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
- list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (index-- > 0)
- continue;
-
- *nidp = peer->ibp_nid;
- *count = atomic_read(&peer->ibp_refcount);
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
- flags);
- return 0;
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- return -ENOENT;
-}
-
-static void kiblnd_del_peer_locked(struct kib_peer *peer)
-{
- struct list_head *ctmp;
- struct list_head *cnxt;
- struct kib_conn *conn;
-
- if (list_empty(&peer->ibp_conns)) {
- kiblnd_unlink_peer_locked(peer);
- } else {
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- kiblnd_close_conn_locked(conn, 0);
- }
- /* NB closing peer's last conn unlinked it. */
- }
- /*
- * NB peer now unlinked; might even be freed if the peer table had the
- * last ref on it.
- */
-}
-
-static int kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid)
-{
- LIST_HEAD(zombies);
- struct list_head *ptmp;
- struct list_head *pnxt;
- struct kib_peer *peer;
- int lo;
- int hi;
- int i;
- unsigned long flags;
- int rc = -ENOENT;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (nid != LNET_NID_ANY) {
- lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- } else {
- lo = 0;
- hi = kiblnd_data.kib_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
- continue;
-
- if (!list_empty(&peer->ibp_tx_queue)) {
- LASSERT(list_empty(&peer->ibp_conns));
-
- list_splice_init(&peer->ibp_tx_queue,
- &zombies);
- }
-
- kiblnd_del_peer_locked(peer);
- rc = 0; /* matched something */
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_txlist_done(ni, &zombies, -EIO);
-
- return rc;
-}
-
-static struct kib_conn *kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- struct kib_conn *conn;
- struct list_head *ctmp;
- int i;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
- list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- list_for_each(ctmp, &peer->ibp_conns) {
- if (index-- > 0)
- continue;
-
- conn = list_entry(ctmp, struct kib_conn,
- ibc_list);
- kiblnd_conn_addref(conn);
- read_unlock_irqrestore(
- &kiblnd_data.kib_global_lock,
- flags);
- return conn;
- }
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- return NULL;
-}
-
-int kiblnd_translate_mtu(int value)
-{
- switch (value) {
- default:
- return -1;
- case 0:
- return 0;
- case 256:
- return IB_MTU_256;
- case 512:
- return IB_MTU_512;
- case 1024:
- return IB_MTU_1024;
- case 2048:
- return IB_MTU_2048;
- case 4096:
- return IB_MTU_4096;
- }
-}
-
-static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
-{
- int mtu;
-
- /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
- if (!cmid->route.path_rec)
- return;
-
- mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
- LASSERT(mtu >= 0);
- if (mtu)
- cmid->route.path_rec->mtu = mtu;
-}
-
-static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
-{
- cpumask_var_t *mask;
- int vectors;
- int off;
- int i;
- lnet_nid_t nid = conn->ibc_peer->ibp_nid;
-
- vectors = conn->ibc_cmid->device->num_comp_vectors;
- if (vectors <= 1)
- return 0;
-
- mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
- if (!mask)
- return 0;
-
- /* hash NID to CPU id in this partition... */
- off = do_div(nid, cpumask_weight(*mask));
- for_each_cpu(i, *mask) {
- if (!off--)
- return i % vectors;
- }
-
- LBUG();
- return 1;
-}
-
-struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid,
- int state, int version)
-{
- /*
- * CAVEAT EMPTOR:
- * If the new conn is created successfully it takes over the caller's
- * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
- * is destroyed. On failure, the caller's ref on 'peer' remains and
- * she must dispose of 'cmid'. (Actually I'd block forever if I tried
- * to destroy 'cmid' here since I'm called from the CM which still has
- * its ref on 'cmid').
- */
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct kib_dev *dev;
- struct ib_qp_init_attr *init_qp_attr;
- struct kib_sched_info *sched;
- struct ib_cq_init_attr cq_attr = {};
- struct kib_conn *conn;
- struct ib_cq *cq;
- unsigned long flags;
- int cpt;
- int rc;
- int i;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
-
- dev = net->ibn_dev;
-
- cpt = lnet_cpt_of_nid(peer->ibp_nid);
- sched = kiblnd_data.kib_scheds[cpt];
-
- LASSERT(sched->ibs_nthreads > 0);
-
- init_qp_attr = kzalloc_cpt(sizeof(*init_qp_attr), GFP_NOFS, cpt);
- if (!init_qp_attr) {
- CERROR("Can't allocate qp_attr for %s\n",
- libcfs_nid2str(peer->ibp_nid));
- goto failed_0;
- }
-
- conn = kzalloc_cpt(sizeof(*conn), GFP_NOFS, cpt);
- if (!conn) {
- CERROR("Can't allocate connection for %s\n",
- libcfs_nid2str(peer->ibp_nid));
- goto failed_1;
- }
-
- conn->ibc_state = IBLND_CONN_INIT;
- conn->ibc_version = version;
- conn->ibc_peer = peer; /* I take the caller's ref */
- cmid->context = conn; /* for future CM callbacks */
- conn->ibc_cmid = cmid;
- conn->ibc_max_frags = peer->ibp_max_frags;
- conn->ibc_queue_depth = peer->ibp_queue_depth;
-
- INIT_LIST_HEAD(&conn->ibc_early_rxs);
- INIT_LIST_HEAD(&conn->ibc_tx_noops);
- INIT_LIST_HEAD(&conn->ibc_tx_queue);
- INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
- INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
- INIT_LIST_HEAD(&conn->ibc_active_txs);
- spin_lock_init(&conn->ibc_lock);
-
- conn->ibc_connvars = kzalloc_cpt(sizeof(*conn->ibc_connvars), GFP_NOFS, cpt);
- if (!conn->ibc_connvars) {
- CERROR("Can't allocate in-progress connection state\n");
- goto failed_2;
- }
-
- write_lock_irqsave(glock, flags);
- if (dev->ibd_failover) {
- write_unlock_irqrestore(glock, flags);
- CERROR("%s: failover in progress\n", dev->ibd_ifname);
- goto failed_2;
- }
-
- if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
- /* wakeup failover thread and teardown connection */
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- wake_up(&kiblnd_data.kib_failover_waitq);
- }
-
- write_unlock_irqrestore(glock, flags);
- CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
- cmid->device->name, dev->ibd_ifname);
- goto failed_2;
- }
-
- kiblnd_hdev_addref_locked(dev->ibd_hdev);
- conn->ibc_hdev = dev->ibd_hdev;
-
- kiblnd_setup_mtu_locked(cmid);
-
- write_unlock_irqrestore(glock, flags);
-
- conn->ibc_rxs = kzalloc_cpt(IBLND_RX_MSGS(conn) * sizeof(struct kib_rx),
- GFP_NOFS, cpt);
- if (!conn->ibc_rxs) {
- CERROR("Cannot allocate RX buffers\n");
- goto failed_2;
- }
-
- rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
- IBLND_RX_MSG_PAGES(conn));
- if (rc)
- goto failed_2;
-
- kiblnd_map_rx_descs(conn);
-
- cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
- cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
- cq = ib_create_cq(cmid->device,
- kiblnd_cq_completion, kiblnd_cq_event, conn,
- &cq_attr);
- if (IS_ERR(cq)) {
- CERROR("Failed to create CQ with %d CQEs: %ld\n",
- IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
- goto failed_2;
- }
-
- conn->ibc_cq = cq;
-
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
- CERROR("Can't request completion notification: %d\n", rc);
- goto failed_2;
- }
-
- init_qp_attr->event_handler = kiblnd_qp_event;
- init_qp_attr->qp_context = conn;
- init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
- init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
- init_qp_attr->cap.max_send_sge = 1;
- init_qp_attr->cap.max_recv_sge = 1;
- init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
- init_qp_attr->qp_type = IB_QPT_RC;
- init_qp_attr->send_cq = cq;
- init_qp_attr->recv_cq = cq;
-
- conn->ibc_sched = sched;
-
- rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
- if (rc) {
- CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
- rc, init_qp_attr->cap.max_send_wr,
- init_qp_attr->cap.max_recv_wr);
- goto failed_2;
- }
-
- kfree(init_qp_attr);
-
- /* 1 ref for caller and each rxmsg */
- atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
- conn->ibc_nrx = IBLND_RX_MSGS(conn);
-
- /* post receives */
- for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
- rc = kiblnd_post_rx(&conn->ibc_rxs[i],
- IBLND_POSTRX_NO_CREDIT);
- if (rc) {
- CERROR("Can't post rxmsg: %d\n", rc);
-
- /* Make posted receives complete */
- kiblnd_abort_receives(conn);
-
- /*
- * correct # of posted buffers
- * NB locking needed now I'm racing with completion
- */
- spin_lock_irqsave(&sched->ibs_lock, flags);
- conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- /*
- * cmid will be destroyed by CM(ofed) after cm_callback
- * returned, so we can't refer it anymore
- * (by kiblnd_connd()->kiblnd_destroy_conn)
- */
- rdma_destroy_qp(conn->ibc_cmid);
- conn->ibc_cmid = NULL;
-
- /* Drop my own and unused rxbuffer refcounts */
- while (i++ <= IBLND_RX_MSGS(conn))
- kiblnd_conn_decref(conn);
-
- return NULL;
- }
- }
-
- /* Init successful! */
- LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
- state == IBLND_CONN_PASSIVE_WAIT);
- conn->ibc_state = state;
-
- /* 1 more conn */
- atomic_inc(&net->ibn_nconns);
- return conn;
-
- failed_2:
- kiblnd_destroy_conn(conn);
- kfree(conn);
- failed_1:
- kfree(init_qp_attr);
- failed_0:
- return NULL;
-}
-
-void kiblnd_destroy_conn(struct kib_conn *conn)
-{
- struct rdma_cm_id *cmid = conn->ibc_cmid;
- struct kib_peer *peer = conn->ibc_peer;
- int rc;
-
- LASSERT(!in_interrupt());
- LASSERT(!atomic_read(&conn->ibc_refcount));
- LASSERT(list_empty(&conn->ibc_early_rxs));
- LASSERT(list_empty(&conn->ibc_tx_noops));
- LASSERT(list_empty(&conn->ibc_tx_queue));
- LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
- LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
- LASSERT(list_empty(&conn->ibc_active_txs));
- LASSERT(!conn->ibc_noops_posted);
- LASSERT(!conn->ibc_nsends_posted);
-
- switch (conn->ibc_state) {
- default:
- /* conn must be completely disengaged from the network */
- LBUG();
-
- case IBLND_CONN_DISCONNECTED:
- /* connvars should have been freed already */
- LASSERT(!conn->ibc_connvars);
- break;
-
- case IBLND_CONN_INIT:
- break;
- }
-
- /* conn->ibc_cmid might be destroyed by CM already */
- if (cmid && cmid->qp)
- rdma_destroy_qp(cmid);
-
- if (conn->ibc_cq) {
- rc = ib_destroy_cq(conn->ibc_cq);
- if (rc)
- CWARN("Error destroying CQ: %d\n", rc);
- }
-
- if (conn->ibc_rx_pages)
- kiblnd_unmap_rx_descs(conn);
-
- kfree(conn->ibc_rxs);
- kfree(conn->ibc_connvars);
-
- if (conn->ibc_hdev)
- kiblnd_hdev_decref(conn->ibc_hdev);
-
- /* See CAVEAT EMPTOR above in kiblnd_create_conn */
- if (conn->ibc_state != IBLND_CONN_INIT) {
- struct kib_net *net = peer->ibp_ni->ni_data;
-
- kiblnd_peer_decref(peer);
- rdma_destroy_id(cmid);
- atomic_dec(&net->ibn_nconns);
- }
-}
-
-int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why)
-{
- struct kib_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_version, why);
-
- kiblnd_close_conn_locked(conn, why);
- count++;
- }
-
- return count;
-}
-
-int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
- int version, __u64 incarnation)
-{
- struct kib_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- if (conn->ibc_version == version &&
- conn->ibc_incarnation == incarnation)
- continue;
-
- CDEBUG(D_NET,
- "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_version, conn->ibc_incarnation,
- version, incarnation);
-
- kiblnd_close_conn_locked(conn, -ESTALE);
- count++;
- }
-
- return count;
-}
-
-static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct list_head *ptmp;
- struct list_head *pnxt;
- int lo;
- int hi;
- int i;
- unsigned long flags;
- int count = 0;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (nid != LNET_NID_ANY) {
- lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
- } else {
- lo = 0;
- hi = kiblnd_data.kib_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
- LASSERT(!kiblnd_peer_idle(peer));
-
- if (peer->ibp_ni != ni)
- continue;
-
- if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
- continue;
-
- count += kiblnd_close_peer_conns_locked(peer, 0);
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /* wildcards always succeed */
- if (nid == LNET_NID_ANY)
- return 0;
-
- return !count ? -ENOENT : 0;
-}
-
-static int kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
-{
- struct libcfs_ioctl_data *data = arg;
- int rc = -EINVAL;
-
- switch (cmd) {
- case IOC_LIBCFS_GET_PEER: {
- lnet_nid_t nid = 0;
- int count = 0;
-
- rc = kiblnd_get_peer_info(ni, data->ioc_count,
- &nid, &count);
- data->ioc_nid = nid;
- data->ioc_count = count;
- break;
- }
-
- case IOC_LIBCFS_DEL_PEER: {
- rc = kiblnd_del_peer(ni, data->ioc_nid);
- break;
- }
- case IOC_LIBCFS_GET_CONN: {
- struct kib_conn *conn;
-
- rc = 0;
- conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
- if (!conn) {
- rc = -ENOENT;
- break;
- }
-
- LASSERT(conn->ibc_cmid);
- data->ioc_nid = conn->ibc_peer->ibp_nid;
- if (!conn->ibc_cmid->route.path_rec)
- data->ioc_u32[0] = 0; /* iWarp has no path MTU */
- else
- data->ioc_u32[0] =
- ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
- kiblnd_conn_decref(conn);
- break;
- }
- case IOC_LIBCFS_CLOSE_CONNECTION: {
- rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
- break;
- }
-
- default:
- break;
- }
-
- return rc;
-}
-
-static void kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid,
- unsigned long *when)
-{
- unsigned long last_alive = 0;
- unsigned long now = jiffies;
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_peer *peer;
- unsigned long flags;
-
- read_lock_irqsave(glock, flags);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer)
- last_alive = peer->ibp_last_alive;
-
- read_unlock_irqrestore(glock, flags);
-
- if (last_alive)
- *when = last_alive;
-
- /*
- * peer is not persistent in hash, trigger peer creation
- * and connection establishment with a NULL tx
- */
- if (!peer)
- kiblnd_launch_tx(ni, NULL, nid);
-
- CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
- libcfs_nid2str(nid), peer,
- last_alive ? (now - last_alive) / HZ : -1);
-}
-
-static void kiblnd_free_pages(struct kib_pages *p)
-{
- int npages = p->ibp_npages;
- int i;
-
- for (i = 0; i < npages; i++) {
- if (p->ibp_pages[i])
- __free_page(p->ibp_pages[i]);
- }
-
- kfree(p);
-}
-
-int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages)
-{
- struct kib_pages *p;
- int i;
-
- p = kzalloc_cpt(offsetof(struct kib_pages, ibp_pages[npages]),
- GFP_NOFS, cpt);
- if (!p) {
- CERROR("Can't allocate descriptor for %d pages\n", npages);
- return -ENOMEM;
- }
-
- p->ibp_npages = npages;
-
- for (i = 0; i < npages; i++) {
- p->ibp_pages[i] = alloc_pages_node(
- cfs_cpt_spread_node(lnet_cpt_table(), cpt),
- GFP_NOFS, 0);
- if (!p->ibp_pages[i]) {
- CERROR("Can't allocate page %d of %d\n", i, npages);
- kiblnd_free_pages(p);
- return -ENOMEM;
- }
- }
-
- *pp = p;
- return 0;
-}
-
-void kiblnd_unmap_rx_descs(struct kib_conn *conn)
-{
- struct kib_rx *rx;
- int i;
-
- LASSERT(conn->ibc_rxs);
- LASSERT(conn->ibc_hdev);
-
- for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
- rx = &conn->ibc_rxs[i];
-
- LASSERT(rx->rx_nob >= 0); /* not posted */
-
- kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
- KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
- rx->rx_msgaddr),
- IBLND_MSG_SIZE, DMA_FROM_DEVICE);
- }
-
- kiblnd_free_pages(conn->ibc_rx_pages);
-
- conn->ibc_rx_pages = NULL;
-}
-
-void kiblnd_map_rx_descs(struct kib_conn *conn)
-{
- struct kib_rx *rx;
- struct page *pg;
- int pg_off;
- int ipg;
- int i;
-
- for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
- pg = conn->ibc_rx_pages->ibp_pages[ipg];
- rx = &conn->ibc_rxs[i];
-
- rx->rx_conn = conn;
- rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off);
-
- rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
- rx->rx_msg,
- IBLND_MSG_SIZE,
- DMA_FROM_DEVICE);
- LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
- rx->rx_msgaddr));
- KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
-
- CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
- i, rx->rx_msg, rx->rx_msgaddr,
- (__u64)(page_to_phys(pg) + pg_off));
-
- pg_off += IBLND_MSG_SIZE;
- LASSERT(pg_off <= PAGE_SIZE);
-
- if (pg_off == PAGE_SIZE) {
- pg_off = 0;
- ipg++;
- LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
- }
- }
-}
-
-static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo)
-{
- struct kib_hca_dev *hdev = tpo->tpo_hdev;
- struct kib_tx *tx;
- int i;
-
- LASSERT(!tpo->tpo_pool.po_allocated);
-
- if (!hdev)
- return;
-
- for (i = 0; i < tpo->tpo_pool.po_size; i++) {
- tx = &tpo->tpo_tx_descs[i];
- kiblnd_dma_unmap_single(hdev->ibh_ibdev,
- KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
- tx->tx_msgaddr),
- IBLND_MSG_SIZE, DMA_TO_DEVICE);
- }
-
- kiblnd_hdev_decref(hdev);
- tpo->tpo_hdev = NULL;
-}
-
-static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev)
-{
- struct kib_hca_dev *hdev;
- unsigned long flags;
- int i = 0;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- while (dev->ibd_failover) {
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- if (!(i++ % 50))
- CDEBUG(D_NET, "%s: Wait for failover\n",
- dev->ibd_ifname);
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ / 100);
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- }
-
- kiblnd_hdev_addref_locked(dev->ibd_hdev);
- hdev = dev->ibd_hdev;
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- return hdev;
-}
-
-static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
-{
- struct kib_pages *txpgs = tpo->tpo_tx_pages;
- struct kib_pool *pool = &tpo->tpo_pool;
- struct kib_net *net = pool->po_owner->ps_net;
- struct kib_dev *dev;
- struct page *page;
- struct kib_tx *tx;
- int page_offset;
- int ipage;
- int i;
-
- LASSERT(net);
-
- dev = net->ibn_dev;
-
- /* pre-mapped messages are not bigger than 1 page */
- BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE);
-
- /* No fancy arithmetic when we do the buffer calculations */
- BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE);
-
- tpo->tpo_hdev = kiblnd_current_hdev(dev);
-
- for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
- page = txpgs->ibp_pages[ipage];
- tx = &tpo->tpo_tx_descs[i];
-
- tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) +
- page_offset);
-
- tx->tx_msgaddr = kiblnd_dma_map_single(
- tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
- IBLND_MSG_SIZE, DMA_TO_DEVICE);
- LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
- tx->tx_msgaddr));
- KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
-
- list_add(&tx->tx_list, &pool->po_free_list);
-
- page_offset += IBLND_MSG_SIZE;
- LASSERT(page_offset <= PAGE_SIZE);
-
- if (page_offset == PAGE_SIZE) {
- page_offset = 0;
- ipage++;
- LASSERT(ipage <= txpgs->ibp_npages);
- }
- }
-}
-
-static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo)
-{
- LASSERT(!fpo->fpo_map_count);
-
- if (fpo->fpo_is_fmr) {
- if (fpo->fmr.fpo_fmr_pool)
- ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
- } else {
- struct kib_fast_reg_descriptor *frd, *tmp;
- int i = 0;
-
- list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
- frd_list) {
- list_del(&frd->frd_list);
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
- i++;
- }
- if (i < fpo->fast_reg.fpo_pool_size)
- CERROR("FastReg pool still has %d regions registered\n",
- fpo->fast_reg.fpo_pool_size - i);
- }
-
- if (fpo->fpo_hdev)
- kiblnd_hdev_decref(fpo->fpo_hdev);
-
- kfree(fpo);
-}
-
-static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
-{
- struct kib_fmr_pool *fpo, *tmp;
-
- list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
- list_del(&fpo->fpo_list);
- kiblnd_destroy_fmr_pool(fpo);
- }
-}
-
-static int
-kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
- int ncpts)
-{
- int size = tunables->lnd_fmr_pool_size / ncpts;
-
- return max(IBLND_FMR_POOL, size);
-}
-
-static int
-kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
- int ncpts)
-{
- int size = tunables->lnd_fmr_flush_trigger / ncpts;
-
- return max(IBLND_FMR_POOL_FLUSH, size);
-}
-
-static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
-{
- struct ib_fmr_pool_param param = {
- .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
- .page_shift = PAGE_SHIFT,
- .access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE),
- .pool_size = fps->fps_pool_size,
- .dirty_watermark = fps->fps_flush_trigger,
- .flush_function = NULL,
- .flush_arg = NULL,
- .cache = !!fps->fps_cache };
- int rc = 0;
-
- fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
- &param);
- if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
- rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
- if (rc != -ENOSYS)
- CERROR("Failed to create FMR pool: %d\n", rc);
- else
- CERROR("FMRs are not supported\n");
- }
-
- return rc;
-}
-
-static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
-{
- struct kib_fast_reg_descriptor *frd, *tmp;
- int i, rc;
-
- INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
- fpo->fast_reg.fpo_pool_size = 0;
- for (i = 0; i < fps->fps_pool_size; i++) {
- frd = kzalloc_cpt(sizeof(*frd), GFP_NOFS, fps->fps_cpt);
- if (!frd) {
- CERROR("Failed to allocate a new fast_reg descriptor\n");
- rc = -ENOMEM;
- goto out;
- }
-
- frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
- IB_MR_TYPE_MEM_REG,
- LNET_MAX_PAYLOAD / PAGE_SIZE);
- if (IS_ERR(frd->frd_mr)) {
- rc = PTR_ERR(frd->frd_mr);
- CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
- frd->frd_mr = NULL;
- goto out_middle;
- }
-
- frd->frd_valid = true;
-
- list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
- fpo->fast_reg.fpo_pool_size++;
- }
-
- return 0;
-
-out_middle:
- if (frd->frd_mr)
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
-
-out:
- list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
- frd_list) {
- list_del(&frd->frd_list);
- ib_dereg_mr(frd->frd_mr);
- kfree(frd);
- }
-
- return rc;
-}
-
-static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps,
- struct kib_fmr_pool **pp_fpo)
-{
- struct kib_dev *dev = fps->fps_net->ibn_dev;
- struct ib_device_attr *dev_attr;
- struct kib_fmr_pool *fpo;
- int rc;
-
- fpo = kzalloc_cpt(sizeof(*fpo), GFP_NOFS, fps->fps_cpt);
- if (!fpo)
- return -ENOMEM;
-
- fpo->fpo_hdev = kiblnd_current_hdev(dev);
- dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
-
- /* Check for FMR or FastReg support */
- fpo->fpo_is_fmr = 0;
- if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
- fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
- fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
- fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
- LCONSOLE_INFO("Using FMR for registration\n");
- fpo->fpo_is_fmr = 1;
- } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
- LCONSOLE_INFO("Using FastReg for registration\n");
- } else {
- rc = -ENOSYS;
- LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n");
- goto out_fpo;
- }
-
- if (fpo->fpo_is_fmr)
- rc = kiblnd_alloc_fmr_pool(fps, fpo);
- else
- rc = kiblnd_alloc_freg_pool(fps, fpo);
- if (rc)
- goto out_fpo;
-
- fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- fpo->fpo_owner = fps;
- *pp_fpo = fpo;
-
- return 0;
-
-out_fpo:
- kiblnd_hdev_decref(fpo->fpo_hdev);
- kfree(fpo);
- return rc;
-}
-
-static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps,
- struct list_head *zombies)
-{
- if (!fps->fps_net) /* initialized? */
- return;
-
- spin_lock(&fps->fps_lock);
-
- while (!list_empty(&fps->fps_pool_list)) {
- struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next,
- struct kib_fmr_pool, fpo_list);
- fpo->fpo_failed = 1;
- list_del(&fpo->fpo_list);
- if (!fpo->fpo_map_count)
- list_add(&fpo->fpo_list, zombies);
- else
- list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
- }
-
- spin_unlock(&fps->fps_lock);
-}
-
-static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
-{
- if (fps->fps_net) { /* initialized? */
- kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
- kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
- }
-}
-
-static int
-kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
- struct kib_net *net,
- struct lnet_ioctl_config_o2iblnd_tunables *tunables)
-{
- struct kib_fmr_pool *fpo;
- int rc;
-
- memset(fps, 0, sizeof(*fps));
-
- fps->fps_net = net;
- fps->fps_cpt = cpt;
-
- fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
- fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
- fps->fps_cache = tunables->lnd_fmr_cache;
-
- spin_lock_init(&fps->fps_lock);
- INIT_LIST_HEAD(&fps->fps_pool_list);
- INIT_LIST_HEAD(&fps->fps_failed_pool_list);
-
- rc = kiblnd_create_fmr_pool(fps, &fpo);
- if (!rc)
- list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
-
- return rc;
-}
-
-static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now)
-{
- if (fpo->fpo_map_count) /* still in use */
- return 0;
- if (fpo->fpo_failed)
- return 1;
- return time_after_eq(now, fpo->fpo_deadline);
-}
-
-static int
-kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
-{
- __u64 *pages = tx->tx_pages;
- struct kib_hca_dev *hdev;
- int npages;
- int size;
- int i;
-
- hdev = tx->tx_pool->tpo_hdev;
-
- for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
- for (size = 0; size < rd->rd_frags[i].rf_nob;
- size += hdev->ibh_page_size) {
- pages[npages++] = (rd->rd_frags[i].rf_addr &
- hdev->ibh_page_mask) + size;
- }
- }
-
- return npages;
-}
-
-void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
-{
- LIST_HEAD(zombies);
- struct kib_fmr_pool *fpo = fmr->fmr_pool;
- struct kib_fmr_poolset *fps;
- unsigned long now = jiffies;
- struct kib_fmr_pool *tmp;
- int rc;
-
- if (!fpo)
- return;
-
- fps = fpo->fpo_owner;
- if (fpo->fpo_is_fmr) {
- if (fmr->fmr_pfmr) {
- rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
- LASSERT(!rc);
- fmr->fmr_pfmr = NULL;
- }
-
- if (status) {
- rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
- LASSERT(!rc);
- }
- } else {
- struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
-
- if (frd) {
- frd->frd_valid = false;
- spin_lock(&fps->fps_lock);
- list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
- spin_unlock(&fps->fps_lock);
- fmr->fmr_frd = NULL;
- }
- }
- fmr->fmr_pool = NULL;
-
- spin_lock(&fps->fps_lock);
- fpo->fpo_map_count--; /* decref the pool */
-
- list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
- /* the first pool is persistent */
- if (fps->fps_pool_list.next == &fpo->fpo_list)
- continue;
-
- if (kiblnd_fmr_pool_is_idle(fpo, now)) {
- list_move(&fpo->fpo_list, &zombies);
- fps->fps_version++;
- }
- }
- spin_unlock(&fps->fps_lock);
-
- if (!list_empty(&zombies))
- kiblnd_destroy_fmr_pool_list(&zombies);
-}
-
-int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
- struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
- struct kib_fmr *fmr)
-{
- __u64 *pages = tx->tx_pages;
- bool is_rx = (rd != tx->tx_rd);
- bool tx_pages_mapped = false;
- struct kib_fmr_pool *fpo;
- int npages = 0;
- __u64 version;
- int rc;
-
- again:
- spin_lock(&fps->fps_lock);
- version = fps->fps_version;
- list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
- fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- fpo->fpo_map_count++;
-
- if (fpo->fpo_is_fmr) {
- struct ib_pool_fmr *pfmr;
-
- spin_unlock(&fps->fps_lock);
-
- if (!tx_pages_mapped) {
- npages = kiblnd_map_tx_pages(tx, rd);
- tx_pages_mapped = 1;
- }
-
- pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
- pages, npages, iov);
- if (likely(!IS_ERR(pfmr))) {
- fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
- pfmr->fmr->lkey;
- fmr->fmr_frd = NULL;
- fmr->fmr_pfmr = pfmr;
- fmr->fmr_pool = fpo;
- return 0;
- }
- rc = PTR_ERR(pfmr);
- } else {
- if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
- struct kib_fast_reg_descriptor *frd;
- struct ib_reg_wr *wr;
- struct ib_mr *mr;
- int n;
-
- frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
- struct kib_fast_reg_descriptor,
- frd_list);
- list_del(&frd->frd_list);
- spin_unlock(&fps->fps_lock);
-
- mr = frd->frd_mr;
-
- if (!frd->frd_valid) {
- __u32 key = is_rx ? mr->rkey : mr->lkey;
- struct ib_send_wr *inv_wr;
-
- inv_wr = &frd->frd_inv_wr;
- memset(inv_wr, 0, sizeof(*inv_wr));
- inv_wr->opcode = IB_WR_LOCAL_INV;
- inv_wr->wr_id = IBLND_WID_MR;
- inv_wr->ex.invalidate_rkey = key;
-
- /* Bump the key */
- key = ib_inc_rkey(key);
- ib_update_fast_reg_key(mr, key);
- }
-
- n = ib_map_mr_sg(mr, tx->tx_frags,
- tx->tx_nfrags, NULL, PAGE_SIZE);
- if (unlikely(n != tx->tx_nfrags)) {
- CERROR("Failed to map mr %d/%d elements\n",
- n, tx->tx_nfrags);
- return n < 0 ? n : -EINVAL;
- }
-
- mr->iova = iov;
-
- /* Prepare FastReg WR */
- wr = &frd->frd_fastreg_wr;
- memset(wr, 0, sizeof(*wr));
- wr->wr.opcode = IB_WR_REG_MR;
- wr->wr.wr_id = IBLND_WID_MR;
- wr->wr.num_sge = 0;
- wr->wr.send_flags = 0;
- wr->mr = mr;
- wr->key = is_rx ? mr->rkey : mr->lkey;
- wr->access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE);
-
- fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
- fmr->fmr_frd = frd;
- fmr->fmr_pfmr = NULL;
- fmr->fmr_pool = fpo;
- return 0;
- }
- spin_unlock(&fps->fps_lock);
- rc = -EAGAIN;
- }
-
- spin_lock(&fps->fps_lock);
- fpo->fpo_map_count--;
- if (rc != -EAGAIN) {
- spin_unlock(&fps->fps_lock);
- return rc;
- }
-
- /* EAGAIN and ... */
- if (version != fps->fps_version) {
- spin_unlock(&fps->fps_lock);
- goto again;
- }
- }
-
- if (fps->fps_increasing) {
- spin_unlock(&fps->fps_lock);
- CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n");
- schedule();
- goto again;
- }
-
- if (time_before(jiffies, fps->fps_next_retry)) {
- /* someone failed recently */
- spin_unlock(&fps->fps_lock);
- return -EAGAIN;
- }
-
- fps->fps_increasing = 1;
- spin_unlock(&fps->fps_lock);
-
- CDEBUG(D_NET, "Allocate new FMR pool\n");
- rc = kiblnd_create_fmr_pool(fps, &fpo);
- spin_lock(&fps->fps_lock);
- fps->fps_increasing = 0;
- if (!rc) {
- fps->fps_version++;
- list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
- } else {
- fps->fps_next_retry = jiffies + IBLND_POOL_RETRY * HZ;
- }
- spin_unlock(&fps->fps_lock);
-
- goto again;
-}
-
-static void kiblnd_fini_pool(struct kib_pool *pool)
-{
- LASSERT(list_empty(&pool->po_free_list));
- LASSERT(!pool->po_allocated);
-
- CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
-}
-
-static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size)
-{
- CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
-
- memset(pool, 0, sizeof(*pool));
- INIT_LIST_HEAD(&pool->po_free_list);
- pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- pool->po_owner = ps;
- pool->po_size = size;
-}
-
-static void kiblnd_destroy_pool_list(struct list_head *head)
-{
- struct kib_pool *pool;
-
- while (!list_empty(head)) {
- pool = list_entry(head->next, struct kib_pool, po_list);
- list_del(&pool->po_list);
-
- LASSERT(pool->po_owner);
- pool->po_owner->ps_pool_destroy(pool);
- }
-}
-
-static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
-{
- if (!ps->ps_net) /* initialized? */
- return;
-
- spin_lock(&ps->ps_lock);
- while (!list_empty(&ps->ps_pool_list)) {
- struct kib_pool *po = list_entry(ps->ps_pool_list.next,
- struct kib_pool, po_list);
- po->po_failed = 1;
- list_del(&po->po_list);
- if (!po->po_allocated)
- list_add(&po->po_list, zombies);
- else
- list_add(&po->po_list, &ps->ps_failed_pool_list);
- }
- spin_unlock(&ps->ps_lock);
-}
-
-static void kiblnd_fini_poolset(struct kib_poolset *ps)
-{
- if (ps->ps_net) { /* initialized? */
- kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
- kiblnd_destroy_pool_list(&ps->ps_pool_list);
- }
-}
-
-static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt,
- struct kib_net *net, char *name, int size,
- kib_ps_pool_create_t po_create,
- kib_ps_pool_destroy_t po_destroy,
- kib_ps_node_init_t nd_init,
- kib_ps_node_fini_t nd_fini)
-{
- struct kib_pool *pool;
- int rc;
-
- memset(ps, 0, sizeof(*ps));
-
- ps->ps_cpt = cpt;
- ps->ps_net = net;
- ps->ps_pool_create = po_create;
- ps->ps_pool_destroy = po_destroy;
- ps->ps_node_init = nd_init;
- ps->ps_node_fini = nd_fini;
- ps->ps_pool_size = size;
- if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
- >= sizeof(ps->ps_name))
- return -E2BIG;
- spin_lock_init(&ps->ps_lock);
- INIT_LIST_HEAD(&ps->ps_pool_list);
- INIT_LIST_HEAD(&ps->ps_failed_pool_list);
-
- rc = ps->ps_pool_create(ps, size, &pool);
- if (!rc)
- list_add(&pool->po_list, &ps->ps_pool_list);
- else
- CERROR("Failed to create the first pool for %s\n", ps->ps_name);
-
- return rc;
-}
-
-static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now)
-{
- if (pool->po_allocated) /* still in use */
- return 0;
- if (pool->po_failed)
- return 1;
- return time_after_eq(now, pool->po_deadline);
-}
-
-void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
-{
- LIST_HEAD(zombies);
- struct kib_poolset *ps = pool->po_owner;
- struct kib_pool *tmp;
- unsigned long now = jiffies;
-
- spin_lock(&ps->ps_lock);
-
- if (ps->ps_node_fini)
- ps->ps_node_fini(pool, node);
-
- LASSERT(pool->po_allocated > 0);
- list_add(node, &pool->po_free_list);
- pool->po_allocated--;
-
- list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
- /* the first pool is persistent */
- if (ps->ps_pool_list.next == &pool->po_list)
- continue;
-
- if (kiblnd_pool_is_idle(pool, now))
- list_move(&pool->po_list, &zombies);
- }
- spin_unlock(&ps->ps_lock);
-
- if (!list_empty(&zombies))
- kiblnd_destroy_pool_list(&zombies);
-}
-
-struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps)
-{
- struct list_head *node;
- struct kib_pool *pool;
- unsigned int interval = 1;
- unsigned long time_before;
- unsigned int trips = 0;
- int rc;
-
- again:
- spin_lock(&ps->ps_lock);
- list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
- if (list_empty(&pool->po_free_list))
- continue;
-
- pool->po_allocated++;
- pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ;
- node = pool->po_free_list.next;
- list_del(node);
-
- if (ps->ps_node_init) {
- /* still hold the lock */
- ps->ps_node_init(pool, node);
- }
- spin_unlock(&ps->ps_lock);
- return node;
- }
-
- /* no available tx pool and ... */
- if (ps->ps_increasing) {
- /* another thread is allocating a new pool */
- spin_unlock(&ps->ps_lock);
- trips++;
- CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d HZs for her to complete. trips = %d\n",
- ps->ps_name, interval, trips);
-
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(interval);
- if (interval < HZ)
- interval *= 2;
-
- goto again;
- }
-
- if (time_before(jiffies, ps->ps_next_retry)) {
- /* someone failed recently */
- spin_unlock(&ps->ps_lock);
- return NULL;
- }
-
- ps->ps_increasing = 1;
- spin_unlock(&ps->ps_lock);
-
- CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
- time_before = jiffies;
- rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
- CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete",
- jiffies - time_before);
-
- spin_lock(&ps->ps_lock);
- ps->ps_increasing = 0;
- if (!rc) {
- list_add_tail(&pool->po_list, &ps->ps_pool_list);
- } else {
- ps->ps_next_retry = jiffies + IBLND_POOL_RETRY * HZ;
- CERROR("Can't allocate new %s pool because out of memory\n",
- ps->ps_name);
- }
- spin_unlock(&ps->ps_lock);
-
- goto again;
-}
-
-static void kiblnd_destroy_tx_pool(struct kib_pool *pool)
-{
- struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool);
- int i;
-
- LASSERT(!pool->po_allocated);
-
- if (tpo->tpo_tx_pages) {
- kiblnd_unmap_tx_pool(tpo);
- kiblnd_free_pages(tpo->tpo_tx_pages);
- }
-
- if (!tpo->tpo_tx_descs)
- goto out;
-
- for (i = 0; i < pool->po_size; i++) {
- struct kib_tx *tx = &tpo->tpo_tx_descs[i];
-
- list_del(&tx->tx_list);
- kfree(tx->tx_pages);
- kfree(tx->tx_frags);
- kfree(tx->tx_wrq);
- kfree(tx->tx_sge);
- kfree(tx->tx_rd);
- }
-
- kfree(tpo->tpo_tx_descs);
-out:
- kiblnd_fini_pool(pool);
- kfree(tpo);
-}
-
-static int kiblnd_tx_pool_size(int ncpts)
-{
- int ntx = *kiblnd_tunables.kib_ntx / ncpts;
-
- return max(IBLND_TX_POOL, ntx);
-}
-
-static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size,
- struct kib_pool **pp_po)
-{
- int i;
- int npg;
- struct kib_pool *pool;
- struct kib_tx_pool *tpo;
-
- tpo = kzalloc_cpt(sizeof(*tpo), GFP_NOFS, ps->ps_cpt);
- if (!tpo) {
- CERROR("Failed to allocate TX pool\n");
- return -ENOMEM;
- }
-
- pool = &tpo->tpo_pool;
- kiblnd_init_pool(ps, pool, size);
- tpo->tpo_tx_descs = NULL;
- tpo->tpo_tx_pages = NULL;
-
- npg = DIV_ROUND_UP(size * IBLND_MSG_SIZE, PAGE_SIZE);
- if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
- CERROR("Can't allocate tx pages: %d\n", npg);
- kfree(tpo);
- return -ENOMEM;
- }
-
- tpo->tpo_tx_descs = kzalloc_cpt(size * sizeof(struct kib_tx),
- GFP_NOFS, ps->ps_cpt);
- if (!tpo->tpo_tx_descs) {
- CERROR("Can't allocate %d tx descriptors\n", size);
- ps->ps_pool_destroy(pool);
- return -ENOMEM;
- }
-
- memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx));
-
- for (i = 0; i < size; i++) {
- struct kib_tx *tx = &tpo->tpo_tx_descs[i];
-
- tx->tx_pool = tpo;
- if (ps->ps_net->ibn_fmr_ps) {
- tx->tx_pages = kzalloc_cpt(LNET_MAX_IOV * sizeof(*tx->tx_pages),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_pages)
- break;
- }
-
- tx->tx_frags = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_frags),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_frags)
- break;
-
- sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
-
- tx->tx_wrq = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_wrq),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_wrq)
- break;
-
- tx->tx_sge = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) *
- sizeof(*tx->tx_sge),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_sge)
- break;
-
- tx->tx_rd = kzalloc_cpt(offsetof(struct kib_rdma_desc,
- rd_frags[IBLND_MAX_RDMA_FRAGS]),
- GFP_NOFS, ps->ps_cpt);
- if (!tx->tx_rd)
- break;
- }
-
- if (i == size) {
- kiblnd_map_tx_pool(tpo);
- *pp_po = pool;
- return 0;
- }
-
- ps->ps_pool_destroy(pool);
- return -ENOMEM;
-}
-
-static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node)
-{
- struct kib_tx_poolset *tps = container_of(pool->po_owner,
- struct kib_tx_poolset,
- tps_poolset);
- struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list);
-
- tx->tx_cookie = tps->tps_next_tx_cookie++;
-}
-
-static void kiblnd_net_fini_pools(struct kib_net *net)
-{
- int i;
-
- cfs_cpt_for_each(i, lnet_cpt_table()) {
- struct kib_tx_poolset *tps;
- struct kib_fmr_poolset *fps;
-
- if (net->ibn_tx_ps) {
- tps = net->ibn_tx_ps[i];
- kiblnd_fini_poolset(&tps->tps_poolset);
- }
-
- if (net->ibn_fmr_ps) {
- fps = net->ibn_fmr_ps[i];
- kiblnd_fini_fmr_poolset(fps);
- }
- }
-
- if (net->ibn_tx_ps) {
- cfs_percpt_free(net->ibn_tx_ps);
- net->ibn_tx_ps = NULL;
- }
-
- if (net->ibn_fmr_ps) {
- cfs_percpt_free(net->ibn_fmr_ps);
- net->ibn_fmr_ps = NULL;
- }
-}
-
-static int kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni,
- __u32 *cpts, int ncpts)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int cpt;
- int rc;
- int i;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
- CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
- tunables->lnd_fmr_pool_size,
- *kiblnd_tunables.kib_ntx / 4);
- rc = -EINVAL;
- goto failed;
- }
-
- /*
- * TX pool must be created later than FMR, see LU-2268
- * for details
- */
- LASSERT(!net->ibn_tx_ps);
-
- /*
- * premapping can fail if ibd_nmr > 1, so we always create
- * FMR pool and map-on-demand if premapping failed
- *
- * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset
- * The number of struct kib_fmr_poolsets create is equal to the
- * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt].
- */
- net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(struct kib_fmr_poolset));
- if (!net->ibn_fmr_ps) {
- CERROR("Failed to allocate FMR pool array\n");
- rc = -ENOMEM;
- goto failed;
- }
-
- for (i = 0; i < ncpts; i++) {
- cpt = !cpts ? i : cpts[i];
- rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
- net, tunables);
- if (rc) {
- CERROR("Can't initialize FMR pool for CPT %d: %d\n",
- cpt, rc);
- goto failed;
- }
- }
-
- if (i > 0)
- LASSERT(i == ncpts);
-
- /*
- * cfs_precpt_alloc is creating an array of struct kib_tx_poolset
- * The number of struct kib_tx_poolsets create is equal to the
- * number of CPTs that exist, i.e net->ibn_tx_ps[cpt].
- */
- net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(struct kib_tx_poolset));
- if (!net->ibn_tx_ps) {
- CERROR("Failed to allocate tx pool array\n");
- rc = -ENOMEM;
- goto failed;
- }
-
- for (i = 0; i < ncpts; i++) {
- cpt = !cpts ? i : cpts[i];
- rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
- cpt, net, "TX",
- kiblnd_tx_pool_size(ncpts),
- kiblnd_create_tx_pool,
- kiblnd_destroy_tx_pool,
- kiblnd_tx_init, NULL);
- if (rc) {
- CERROR("Can't initialize TX pool for CPT %d: %d\n",
- cpt, rc);
- goto failed;
- }
- }
-
- return 0;
- failed:
- kiblnd_net_fini_pools(net);
- LASSERT(rc);
- return rc;
-}
-
-static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
-{
- /*
- * It's safe to assume a HCA can handle a page size
- * matching that of the native system
- */
- hdev->ibh_page_shift = PAGE_SHIFT;
- hdev->ibh_page_size = 1 << PAGE_SHIFT;
- hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
-
- hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
- if (hdev->ibh_mr_size == ~0ULL) {
- hdev->ibh_mr_shift = 64;
- return 0;
- }
-
- CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
- return -EINVAL;
-}
-
-void kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
-{
- if (hdev->ibh_pd)
- ib_dealloc_pd(hdev->ibh_pd);
-
- if (hdev->ibh_cmid)
- rdma_destroy_id(hdev->ibh_cmid);
-
- kfree(hdev);
-}
-
-/* DUMMY */
-static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
- struct rdma_cm_event *event)
-{
- return 0;
-}
-
-static int kiblnd_dev_need_failover(struct kib_dev *dev)
-{
- struct rdma_cm_id *cmid;
- struct sockaddr_in srcaddr;
- struct sockaddr_in dstaddr;
- int rc;
-
- if (!dev->ibd_hdev || /* initializing */
- !dev->ibd_hdev->ibh_cmid || /* listener is dead */
- *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
- return 1;
-
- /*
- * XXX: it's UGLY, but I don't have better way to find
- * ib-bonding HCA failover because:
- *
- * a. no reliable CM event for HCA failover...
- * b. no OFED API to get ib_device for current net_device...
- *
- * We have only two choices at this point:
- *
- * a. rdma_bind_addr(), it will conflict with listener cmid
- * b. rdma_resolve_addr() to zero addr
- */
- cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
- IB_QPT_RC);
- if (IS_ERR(cmid)) {
- rc = PTR_ERR(cmid);
- CERROR("Failed to create cmid for failover: %d\n", rc);
- return rc;
- }
-
- memset(&srcaddr, 0, sizeof(srcaddr));
- srcaddr.sin_family = AF_INET;
- srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
-
- memset(&dstaddr, 0, sizeof(dstaddr));
- dstaddr.sin_family = AF_INET;
- rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr, 1);
- if (rc || !cmid->device) {
- CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
- dev->ibd_ifname, &dev->ibd_ifip,
- cmid->device, rc);
- rdma_destroy_id(cmid);
- return rc;
- }
-
- rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
- rdma_destroy_id(cmid);
-
- return rc;
-}
-
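-/*
- * Tear down the old listener (if any), create a fresh cmid bound to the
- * interface address, allocate a new kib_hca_dev and PD on whatever HCA
- * the bind resolved to, swap it in under kib_global_lock, and fail the
- * per-CPT TX/FMR pools so they get rebuilt against the new device.
- */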
-int kiblnd_dev_failover(struct kib_dev *dev)
-{
- LIST_HEAD(zombie_tpo);
- LIST_HEAD(zombie_ppo);
- LIST_HEAD(zombie_fpo);
- struct rdma_cm_id *cmid = NULL;
- struct kib_hca_dev *hdev = NULL;
- struct ib_pd *pd;
- struct kib_net *net;
- struct sockaddr_in addr;
- unsigned long flags;
- int rc = 0;
- int i;
-
- LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
- dev->ibd_can_failover || !dev->ibd_hdev);
-
- rc = kiblnd_dev_need_failover(dev);
- if (rc <= 0)
- goto out;
-
- if (dev->ibd_hdev &&
- dev->ibd_hdev->ibh_cmid) {
- /*
- * XXX it's not good to close the old listener here, because
- * creating the new listener may still fail. But we have to
- * close it now, otherwise rdma_bind_addr() below will return
- * EADDRINUSE.
- */
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- cmid = dev->ibd_hdev->ibh_cmid;
- /*
- * make the next call to kiblnd_dev_need_failover()
- * return 1 for this device
- */
- dev->ibd_hdev->ibh_cmid = NULL;
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- rdma_destroy_id(cmid);
- }
-
- cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
- IB_QPT_RC);
- if (IS_ERR(cmid)) {
- rc = PTR_ERR(cmid);
- CERROR("Failed to create cmid for failover: %d\n", rc);
- goto out;
- }
-
- memset(&addr, 0, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(dev->ibd_ifip);
- addr.sin_port = htons(*kiblnd_tunables.kib_service);
-
- /* Bind to failover device or port */
- rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
- if (rc || !cmid->device) {
- CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
- dev->ibd_ifname, &dev->ibd_ifip,
- cmid->device, rc);
- rdma_destroy_id(cmid);
- goto out;
- }
-
- hdev = kzalloc(sizeof(*hdev), GFP_NOFS);
- if (!hdev) {
- CERROR("Failed to allocate kib_hca_dev\n");
- rdma_destroy_id(cmid);
- rc = -ENOMEM;
- goto out;
- }
-
- atomic_set(&hdev->ibh_ref, 1);
- hdev->ibh_dev = dev;
- hdev->ibh_cmid = cmid;
- hdev->ibh_ibdev = cmid->device;
-
- pd = ib_alloc_pd(cmid->device, 0);
- if (IS_ERR(pd)) {
- rc = PTR_ERR(pd);
- CERROR("Can't allocate PD: %d\n", rc);
- goto out;
- }
-
- hdev->ibh_pd = pd;
-
- rc = rdma_listen(cmid, 0);
- if (rc) {
- CERROR("Can't start new listener: %d\n", rc);
- goto out;
- }
-
- rc = kiblnd_hdev_get_attr(hdev);
- if (rc) {
- CERROR("Can't get device attributes: %d\n", rc);
- goto out;
- }
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- swap(dev->ibd_hdev, hdev); /* take over the refcount */
-
- list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
- cfs_cpt_for_each(i, lnet_cpt_table()) {
- kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
- &zombie_tpo);
-
- if (net->ibn_fmr_ps)
- kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
- &zombie_fpo);
- }
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- out:
- if (!list_empty(&zombie_tpo))
- kiblnd_destroy_pool_list(&zombie_tpo);
- if (!list_empty(&zombie_ppo))
- kiblnd_destroy_pool_list(&zombie_ppo);
- if (!list_empty(&zombie_fpo))
- kiblnd_destroy_fmr_pool_list(&zombie_fpo);
- if (hdev)
- kiblnd_hdev_decref(hdev);
-
- if (rc)
- dev->ibd_failed_failover++;
- else
- dev->ibd_failed_failover = 0;
-
- return rc;
-}
-
-void kiblnd_destroy_dev(struct kib_dev *dev)
-{
- LASSERT(!dev->ibd_nnets);
- LASSERT(list_empty(&dev->ibd_nets));
-
- list_del(&dev->ibd_fail_list);
- list_del(&dev->ibd_list);
-
- if (dev->ibd_hdev)
- kiblnd_hdev_decref(dev->ibd_hdev);
-
- kfree(dev);
-}
-
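-/*
- * Look up the named IPoIB interface (which must be up), remember its IP
- * and whether it is a bonding master (IFF_MASTER => failover capable),
- * then run kiblnd_dev_failover() once to bring up the initial listener
- * and HCA before adding the device to kib_devs.
- */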
-static struct kib_dev *kiblnd_create_dev(char *ifname)
-{
- struct net_device *netdev;
- struct kib_dev *dev;
- __u32 netmask;
- __u32 ip;
- int up;
- int rc;
-
- rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
- if (rc) {
- CERROR("Can't query IPoIB interface %s: %d\n",
- ifname, rc);
- return NULL;
- }
-
- if (!up) {
- CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
- return NULL;
- }
-
- dev = kzalloc(sizeof(*dev), GFP_NOFS);
- if (!dev)
- return NULL;
-
- netdev = dev_get_by_name(&init_net, ifname);
- if (!netdev) {
- dev->ibd_can_failover = 0;
- } else {
- dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
- dev_put(netdev);
- }
-
- INIT_LIST_HEAD(&dev->ibd_nets);
- INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
- INIT_LIST_HEAD(&dev->ibd_fail_list);
- dev->ibd_ifip = ip;
- strcpy(&dev->ibd_ifname[0], ifname);
-
- /* initialize the device */
- rc = kiblnd_dev_failover(dev);
- if (rc) {
- CERROR("Can't initialize device: %d\n", rc);
- kfree(dev);
- return NULL;
- }
-
- list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
- return dev;
-}
-
-static void kiblnd_base_shutdown(void)
-{
- struct kib_sched_info *sched;
- int i;
-
- LASSERT(list_empty(&kiblnd_data.kib_devs));
-
- switch (kiblnd_data.kib_init) {
- default:
- LBUG();
-
- case IBLND_INIT_ALL:
- case IBLND_INIT_DATA:
- LASSERT(kiblnd_data.kib_peers);
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
- LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
- LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
- LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
- LASSERT(list_empty(&kiblnd_data.kib_reconn_list));
- LASSERT(list_empty(&kiblnd_data.kib_reconn_wait));
-
- /* flag threads to terminate; wake and wait for them to die */
- kiblnd_data.kib_shutdown = 1;
-
- /*
- * NB: we really want to stop scheduler threads net by net
- * instead of for the whole module; this should be improved
- * once LNet supports dynamic configuration
- */
- cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
- wake_up_all(&sched->ibs_waitq);
-
- wake_up_all(&kiblnd_data.kib_connd_waitq);
- wake_up_all(&kiblnd_data.kib_failover_waitq);
-
- i = 2;
- while (atomic_read(&kiblnd_data.kib_nthreads)) {
- i++;
- /* power of 2 ? */
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
- "Waiting for %d threads to terminate\n",
- atomic_read(&kiblnd_data.kib_nthreads));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- }
-
- /* fall through */
-
- case IBLND_INIT_NOTHING:
- break;
- }
-
- kvfree(kiblnd_data.kib_peers);
-
- if (kiblnd_data.kib_scheds)
- cfs_percpt_free(kiblnd_data.kib_scheds);
-
- kiblnd_data.kib_init = IBLND_INIT_NOTHING;
- module_put(THIS_MODULE);
-}
-
-static void kiblnd_shutdown(struct lnet_ni *ni)
-{
- struct kib_net *net = ni->ni_data;
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- int i;
- unsigned long flags;
-
- LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
-
- if (!net)
- goto out;
-
- write_lock_irqsave(g_lock, flags);
- net->ibn_shutdown = 1;
- write_unlock_irqrestore(g_lock, flags);
-
- switch (net->ibn_init) {
- default:
- LBUG();
-
- case IBLND_INIT_ALL:
- /* nuke all existing peers within this net */
- kiblnd_del_peer(ni, LNET_NID_ANY);
-
- /* Wait for all peer state to clean up */
- i = 2;
- while (atomic_read(&net->ibn_npeers)) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
- "%s: waiting for %d peers to disconnect\n",
- libcfs_nid2str(ni->ni_nid),
- atomic_read(&net->ibn_npeers));
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- }
-
- kiblnd_net_fini_pools(net);
-
- write_lock_irqsave(g_lock, flags);
- LASSERT(net->ibn_dev->ibd_nnets > 0);
- net->ibn_dev->ibd_nnets--;
- list_del(&net->ibn_list);
- write_unlock_irqrestore(g_lock, flags);
-
- /* fall through */
-
- case IBLND_INIT_NOTHING:
- LASSERT(!atomic_read(&net->ibn_nconns));
-
- if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
- kiblnd_destroy_dev(net->ibn_dev);
-
- break;
- }
-
- net->ibn_init = IBLND_INIT_NOTHING;
- ni->ni_data = NULL;
-
- kfree(net);
-
-out:
- if (list_empty(&kiblnd_data.kib_devs))
- kiblnd_base_shutdown();
-}
-
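-/*
- * One-time module-wide initialisation: global lock, peer hash table,
- * connd/failover wait queues and per-CPT scheduler info, then the connd
- * and (optional) failover threads.  kib_init advances from
- * IBLND_INIT_NOTHING to IBLND_INIT_DATA to IBLND_INIT_ALL; any failure
- * unwinds through kiblnd_base_shutdown().
- */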
-static int kiblnd_base_startup(void)
-{
- struct kib_sched_info *sched;
- int rc;
- int i;
-
- LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
-
- try_module_get(THIS_MODULE);
- /* zero pointers, flags etc */
- memset(&kiblnd_data, 0, sizeof(kiblnd_data));
-
- rwlock_init(&kiblnd_data.kib_global_lock);
-
- INIT_LIST_HEAD(&kiblnd_data.kib_devs);
- INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
-
- kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
- kiblnd_data.kib_peers = kvmalloc_array(kiblnd_data.kib_peer_hash_size,
- sizeof(struct list_head),
- GFP_KERNEL);
- if (!kiblnd_data.kib_peers)
- goto failed;
- for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
- INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
-
- spin_lock_init(&kiblnd_data.kib_connd_lock);
- INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
- INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
- INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
- INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
-
- init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
- init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
-
- kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(*sched));
- if (!kiblnd_data.kib_scheds)
- goto failed;
-
- cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
- int nthrs;
-
- spin_lock_init(&sched->ibs_lock);
- INIT_LIST_HEAD(&sched->ibs_conns);
- init_waitqueue_head(&sched->ibs_waitq);
-
- nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
- if (*kiblnd_tunables.kib_nscheds > 0) {
- nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
- } else {
- /*
- * use at most half of the CPUs; the other half is reserved
- * for upper-layer modules
- */
- nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
- }
-
- sched->ibs_nthreads_max = nthrs;
- sched->ibs_cpt = i;
- }
-
- kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
-
- /* lists/ptrs/locks initialised */
- kiblnd_data.kib_init = IBLND_INIT_DATA;
- /*****************************************************/
-
- rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
- if (rc) {
- CERROR("Can't spawn o2iblnd connd: %d\n", rc);
- goto failed;
- }
-
- if (*kiblnd_tunables.kib_dev_failover)
- rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
- "kiblnd_failover");
-
- if (rc) {
- CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
- goto failed;
- }
-
- /* flag everything initialised */
- kiblnd_data.kib_init = IBLND_INIT_ALL;
- /*****************************************************/
-
- return 0;
-
- failed:
- kiblnd_base_shutdown();
- return -ENETDOWN;
-}
-
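-/*
- * Start scheduler threads bound to this CPT.  On the first call the
- * thread count comes from the nscheds tunable or defaults to roughly
- * half of the CPT's CPUs (at least IBLND_N_SCHED, capped at both the
- * CPT weight and IBLND_N_SCHED_HIGH); later calls add at most one
- * thread for a newly added interface.
- */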
-static int kiblnd_start_schedulers(struct kib_sched_info *sched)
-{
- int rc = 0;
- int nthrs;
- int i;
-
- if (!sched->ibs_nthreads) {
- if (*kiblnd_tunables.kib_nscheds > 0) {
- nthrs = sched->ibs_nthreads_max;
- } else {
- nthrs = cfs_cpt_weight(lnet_cpt_table(),
- sched->ibs_cpt);
- nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
- nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
- }
- } else {
- LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
- /* add one more thread if a new interface has been added */
- nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
- }
-
- for (i = 0; i < nthrs; i++) {
- long id;
- char name[20];
-
- id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
- snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
- KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
- rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
- if (!rc)
- continue;
-
- CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
- sched->ibs_cpt, sched->ibs_nthreads + i, rc);
- break;
- }
-
- sched->ibs_nthreads += i;
- return rc;
-}
-
-static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts,
- int ncpts)
-{
- int cpt;
- int rc;
- int i;
-
- for (i = 0; i < ncpts; i++) {
- struct kib_sched_info *sched;
-
- cpt = !cpts ? i : cpts[i];
- sched = kiblnd_data.kib_scheds[cpt];
-
- if (!newdev && sched->ibs_nthreads > 0)
- continue;
-
- rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
- if (rc) {
- CERROR("Failed to start scheduler threads for %s\n",
- dev->ibd_ifname);
- return rc;
- }
- }
- return 0;
-}
-
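-/*
- * Find an existing kib_dev by interface name.  An exact match wins;
- * otherwise an alias match (same base name once any ":<alias>" suffix
- * is stripped from either side) is remembered and returned.
- */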
-static struct kib_dev *kiblnd_dev_search(char *ifname)
-{
- struct kib_dev *alias = NULL;
- struct kib_dev *dev;
- char *colon;
- char *colon2;
-
- colon = strchr(ifname, ':');
- list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
- if (!strcmp(&dev->ibd_ifname[0], ifname))
- return dev;
-
- if (alias)
- continue;
-
- colon2 = strchr(dev->ibd_ifname, ':');
- if (colon)
- *colon = 0;
- if (colon2)
- *colon2 = 0;
-
- if (!strcmp(&dev->ibd_ifname[0], ifname))
- alias = dev;
-
- if (colon)
- *colon = ':';
- if (colon2)
- *colon2 = ':';
- }
- return alias;
-}
-
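-/*
- * Per-NI bring-up: make sure the module-wide state exists, set up the
- * tunables, pick the IPoIB interface (from "networks=" or the default
- * tunable), find or create the kib_dev for it, start scheduler threads
- * on the NI's CPTs, initialise the TX/FMR pools, and finally link the
- * net onto the device.
- */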
-static int kiblnd_startup(struct lnet_ni *ni)
-{
- char *ifname;
- struct kib_dev *ibdev = NULL;
- struct kib_net *net;
- struct timespec64 tv;
- unsigned long flags;
- int rc;
- int newdev;
-
- LASSERT(ni->ni_lnd == &the_o2iblnd);
-
- if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
- rc = kiblnd_base_startup();
- if (rc)
- return rc;
- }
-
- net = kzalloc(sizeof(*net), GFP_NOFS);
- ni->ni_data = net;
- if (!net)
- goto net_failed;
-
- ktime_get_real_ts64(&tv);
- net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
- tv.tv_nsec / NSEC_PER_USEC;
-
- rc = kiblnd_tunables_setup(ni);
- if (rc)
- goto net_failed;
-
- if (ni->ni_interfaces[0]) {
- /* Use the IPoIB interface specified in 'networks=' */
-
- BUILD_BUG_ON(LNET_MAX_INTERFACES <= 1);
- if (ni->ni_interfaces[1]) {
- CERROR("Multiple interfaces not supported\n");
- goto failed;
- }
-
- ifname = ni->ni_interfaces[0];
- } else {
- ifname = *kiblnd_tunables.kib_default_ipif;
- }
-
- if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
- CERROR("IPoIB interface name too long: %s\n", ifname);
- goto failed;
- }
-
- ibdev = kiblnd_dev_search(ifname);
-
- newdev = !ibdev;
- /* hmm...create kib_dev even for alias */
- if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
- ibdev = kiblnd_create_dev(ifname);
-
- if (!ibdev)
- goto failed;
-
- net->ibn_dev = ibdev;
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
-
- rc = kiblnd_dev_start_threads(ibdev, newdev,
- ni->ni_cpts, ni->ni_ncpts);
- if (rc)
- goto failed;
-
- rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
- if (rc) {
- CERROR("Failed to initialize NI pools: %d\n", rc);
- goto failed;
- }
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- ibdev->ibd_nnets++;
- list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- net->ibn_init = IBLND_INIT_ALL;
-
- return 0;
-
-failed:
- if (!net->ibn_dev && ibdev)
- kiblnd_destroy_dev(ibdev);
-
-net_failed:
- kiblnd_shutdown(ni);
-
- CDEBUG(D_NET, "%s failed\n", __func__);
- return -ENETDOWN;
-}
-
-static struct lnet_lnd the_o2iblnd = {
- .lnd_type = O2IBLND,
- .lnd_startup = kiblnd_startup,
- .lnd_shutdown = kiblnd_shutdown,
- .lnd_ctl = kiblnd_ctl,
- .lnd_query = kiblnd_query,
- .lnd_send = kiblnd_send,
- .lnd_recv = kiblnd_recv,
-};
-
-static void __exit ko2iblnd_exit(void)
-{
- lnet_unregister_lnd(&the_o2iblnd);
-}
-
-static int __init ko2iblnd_init(void)
-{
- int rc;
-
- BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE);
- BUILD_BUG_ON(offsetof(struct kib_msg,
- ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
- > IBLND_MSG_SIZE);
- BUILD_BUG_ON(offsetof(struct kib_msg,
- ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
- > IBLND_MSG_SIZE);
-
- kiblnd_tunables_init();
-
- rc = libcfs_setup();
- if (rc)
- return rc;
-
- lnet_register_lnd(&the_o2iblnd);
-
- return 0;
-}
-
-MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
-MODULE_VERSION("2.7.0");
-MODULE_LICENSE("GPL");
-
-module_init(ko2iblnd_init);
-module_exit(ko2iblnd_exit);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
deleted file mode 100644
index 217503f..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ /dev/null
@@ -1,1048 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd.h
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/unistd.h>
-#include <linux/uio.h>
-#include <linux/uaccess.h>
-
-#include <linux/io.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/list.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-#include <linux/pci.h>
-
-#include <net/sock.h>
-#include <linux/in.h>
-
-#include <rdma/rdma_cm.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_fmr_pool.h>
-
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <linux/lnet/lib-lnet.h>
-
-#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */
-/* # scheduler loops before reschedule */
-#define IBLND_RESCHED 100
-
-#define IBLND_N_SCHED 2
-#define IBLND_N_SCHED_HIGH 4
-
-struct kib_tunables {
- int *kib_dev_failover; /* HCA failover */
- unsigned int *kib_service; /* IB service number */
- int *kib_min_reconnect_interval; /* first failed connection retry... */
- int *kib_max_reconnect_interval; /* exponentially increasing to this */
- int *kib_cksum; /* checksum struct kib_msg? */
- int *kib_timeout; /* comms timeout (seconds) */
- int *kib_keepalive; /* keepalive timeout (seconds) */
- int *kib_ntx; /* # tx descs */
- char **kib_default_ipif; /* default IPoIB interface */
- int *kib_retry_count;
- int *kib_rnr_retry_count;
- int *kib_ib_mtu; /* IB MTU */
- int *kib_require_priv_port; /* accept only privileged ports */
- int *kib_use_priv_port; /* use privileged port for active connect */
- int *kib_nscheds; /* # threads on each CPT */
-};
-
-extern struct kib_tunables kiblnd_tunables;
-
-#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */
-#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when to eagerly return credits */
-
-#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */
-#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *)0)->ibm_credits)) - 1) /* Max # of peer credits */
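-/* NB: "(typeof(ibm_credits)) - 1" is -1 cast to the unsigned credits field type, i.e. its maximum value */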
-
-/* when to eagerly return credits */
-#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
- IBLND_CREDIT_HIGHWATER_V1 : \
- (t)->lnd_peercredits_hiw)
-
-#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \
- cb, dev, \
- ps, qpt)
-
-/* 2 OOB messages suffice for 1 keepalive and 1 credit return */
-#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
-#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
-
-#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */
-#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */
-#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12) /* max # of 4K fragments supported */
-
-/************************/
-/* derived constants... */
-/* Pools (shared by connections on each CPT) */
-/* These pools can grow at runtime, so the initial size need not be large */
-#define IBLND_TX_POOL 256
-#define IBLND_FMR_POOL 256
-#define IBLND_FMR_POOL_FLUSH 192
-
-#define IBLND_RX_MSGS(c) \
- ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version))
-#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE)
-#define IBLND_RX_MSG_PAGES(c) \
- ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE)
-
-/* WRs and CQEs (per connection) */
-#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c)
-#define IBLND_SEND_WRS(c) \
- (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \
- kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni))
-#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
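-/*
- * Per-connection sizing: a connection posts IBLND_RX_MSGS receive
- * buffers (twice the queue depth plus any OOB slots) and needs enough
- * send WRs for (max_frags + 1) work requests per RDMA, scaled by
- * IBLND_FRAG_SHIFT, times the number of concurrent sends; the CQ must
- * hold completions from both.
- */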
-
-struct kib_hca_dev;
-
-/* o2iblnd can run over aliased interface */
-#ifdef IFALIASZ
-#define KIB_IFNAME_SIZE IFALIASZ
-#else
-#define KIB_IFNAME_SIZE 256
-#endif
-
-struct kib_dev {
- struct list_head ibd_list; /* chain on kib_devs */
- struct list_head ibd_fail_list; /* chain on kib_failed_devs */
- __u32 ibd_ifip; /* IPoIB interface IP */
-
- /* IPoIB interface name */
- char ibd_ifname[KIB_IFNAME_SIZE];
- int ibd_nnets; /* # nets extant */
-
- unsigned long ibd_next_failover;
- int ibd_failed_failover; /* # failover failures */
- unsigned int ibd_failover; /* failover in progress */
- unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */
- struct list_head ibd_nets;
- struct kib_hca_dev *ibd_hdev;
-};
-
-struct kib_hca_dev {
- struct rdma_cm_id *ibh_cmid; /* listener cmid */
- struct ib_device *ibh_ibdev; /* IB device */
- int ibh_page_shift; /* page shift of current HCA */
- int ibh_page_size; /* page size of current HCA */
- __u64 ibh_page_mask; /* page mask of current HCA */
- int ibh_mr_shift; /* bits shift of max MR size */
- __u64 ibh_mr_size; /* size of MR */
- struct ib_pd *ibh_pd; /* PD */
- struct kib_dev *ibh_dev; /* owner */
- atomic_t ibh_ref; /* refcount */
-};
-
-/** # of seconds to keep pool alive */
-#define IBLND_POOL_DEADLINE 300
-/** # of seconds to retry if allocation failed */
-#define IBLND_POOL_RETRY 1
-
-struct kib_pages {
- int ibp_npages; /* # pages */
- struct page *ibp_pages[0]; /* page array */
-};
-
-struct kib_pool;
-struct kib_poolset;
-
-typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps,
- int inc, struct kib_pool **pp_po);
-typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
-typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
-typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
-
-struct kib_net;
-
-#define IBLND_POOL_NAME_LEN 32
-
-struct kib_poolset {
- spinlock_t ps_lock; /* serialize */
- struct kib_net *ps_net; /* network it belongs to */
- char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
- struct list_head ps_pool_list; /* list of pools */
- struct list_head ps_failed_pool_list;/* failed pool list */
- unsigned long ps_next_retry; /* time stamp for retry if */
- /* failed to allocate */
- int ps_increasing; /* is allocating new pool */
- int ps_pool_size; /* new pool size */
- int ps_cpt; /* CPT id */
-
- kib_ps_pool_create_t ps_pool_create; /* create a new pool */
- kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */
- kib_ps_node_init_t ps_node_init; /* initialize new allocated node */
- kib_ps_node_fini_t ps_node_fini; /* finalize node */
-};
-
-struct kib_pool {
- struct list_head po_list; /* chain on pool list */
- struct list_head po_free_list; /* pre-allocated node */
- struct kib_poolset *po_owner; /* pool_set of this pool */
- unsigned long po_deadline; /* deadline of this pool */
- int po_allocated; /* # of elements in use */
- int po_failed; /* pool is created on failed HCA */
- int po_size; /* # of pre-allocated elements */
-};
-
-struct kib_tx_poolset {
- struct kib_poolset tps_poolset; /* pool-set */
- __u64 tps_next_tx_cookie; /* cookie of TX */
-};
-
-struct kib_tx_pool {
- struct kib_pool tpo_pool; /* pool */
- struct kib_hca_dev *tpo_hdev; /* device for this pool */
- struct kib_tx *tpo_tx_descs; /* all the tx descriptors */
- struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */
-};
-
-struct kib_fmr_poolset {
- spinlock_t fps_lock; /* serialize */
- struct kib_net *fps_net; /* IB network */
- struct list_head fps_pool_list; /* FMR pool list */
- struct list_head fps_failed_pool_list;/* FMR pool list */
- __u64 fps_version; /* validity stamp */
- int fps_cpt; /* CPT id */
- int fps_pool_size;
- int fps_flush_trigger;
- int fps_cache;
- int fps_increasing; /* is allocating new pool */
- unsigned long fps_next_retry; /* time stamp for retry if*/
- /* failed to allocate */
-};
-
-struct kib_fast_reg_descriptor { /* For fast registration */
- struct list_head frd_list;
- struct ib_send_wr frd_inv_wr;
- struct ib_reg_wr frd_fastreg_wr;
- struct ib_mr *frd_mr;
- bool frd_valid;
-};
-
-struct kib_fmr_pool {
- struct list_head fpo_list; /* chain on pool list */
- struct kib_hca_dev *fpo_hdev; /* device for this pool */
- struct kib_fmr_poolset *fpo_owner; /* owner of this pool */
- union {
- struct {
- struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
- } fmr;
- struct { /* For fast registration */
- struct list_head fpo_pool_list;
- int fpo_pool_size;
- } fast_reg;
- };
- unsigned long fpo_deadline; /* deadline of this pool */
- int fpo_failed; /* fmr pool is failed */
- int fpo_map_count; /* # of mapped FMR */
- int fpo_is_fmr;
-};
-
-struct kib_fmr {
- struct kib_fmr_pool *fmr_pool; /* pool of FMR */
- struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
- struct kib_fast_reg_descriptor *fmr_frd;
- u32 fmr_key;
-};
-
-struct kib_net {
- struct list_head ibn_list; /* chain on struct kib_dev::ibd_nets */
- __u64 ibn_incarnation;/* my epoch */
- int ibn_init; /* initialisation state */
- int ibn_shutdown; /* shutting down? */
-
- atomic_t ibn_npeers; /* # peers extant */
- atomic_t ibn_nconns; /* # connections extant */
-
- struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */
- struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */
-
- struct kib_dev *ibn_dev; /* underlying IB device */
-};
-
-#define KIB_THREAD_SHIFT 16
-#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
-#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT)
-#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
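-/* A scheduler thread id packs the CPT above KIB_THREAD_SHIFT and the */
-/* per-CPT thread index in the low bits */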
-
-struct kib_sched_info {
- spinlock_t ibs_lock; /* serialise */
- wait_queue_head_t ibs_waitq; /* schedulers sleep here */
- struct list_head ibs_conns; /* conns to check for rx completions */
- int ibs_nthreads; /* number of scheduler threads */
- int ibs_nthreads_max; /* max allowed scheduler threads */
- int ibs_cpt; /* CPT id */
-};
-
-struct kib_data {
- int kib_init; /* initialisation state */
- int kib_shutdown; /* shut down? */
- struct list_head kib_devs; /* IB devices extant */
- struct list_head kib_failed_devs; /* list head of failed devices */
- wait_queue_head_t kib_failover_waitq; /* schedulers sleep here */
- atomic_t kib_nthreads; /* # live threads */
- rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */
- struct list_head *kib_peers; /* hash table of all my known peers */
- int kib_peer_hash_size; /* size of kib_peers */
- void *kib_connd; /* the connd task (serialisation assertions) */
- struct list_head kib_connd_conns; /* connections to setup/teardown */
- struct list_head kib_connd_zombies; /* connections with zero refcount */
- /* connections to reconnect */
- struct list_head kib_reconn_list;
- /* peers wait for reconnection */
- struct list_head kib_reconn_wait;
- /**
- * The time (in seconds) at which peers are pulled off
- * \a kib_reconn_wait for reconnection.
- */
- time64_t kib_reconn_sec;
-
- wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */
- spinlock_t kib_connd_lock; /* serialise */
- struct ib_qp_attr kib_error_qpa; /* QP->ERROR */
- struct kib_sched_info **kib_scheds; /* percpt data for schedulers */
-};
-
-#define IBLND_INIT_NOTHING 0
-#define IBLND_INIT_DATA 1
-#define IBLND_INIT_ALL 2
-
-/************************************************************************
- * IB Wire message format.
- * These are sent in sender's byte order (i.e. receiver flips).
- */
-
-struct kib_connparams {
- __u16 ibcp_queue_depth;
- __u16 ibcp_max_frags;
- __u32 ibcp_max_msg_size;
-} WIRE_ATTR;
-
-struct kib_immediate_msg {
- struct lnet_hdr ibim_hdr; /* portals header */
- char ibim_payload[0]; /* piggy-backed payload */
-} WIRE_ATTR;
-
-struct kib_rdma_frag {
- __u32 rf_nob; /* # bytes this frag */
- __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */
-} WIRE_ATTR;
-
-struct kib_rdma_desc {
- __u32 rd_key; /* local/remote key */
- __u32 rd_nfrags; /* # fragments */
- struct kib_rdma_frag rd_frags[0]; /* buffer frags */
-} WIRE_ATTR;
-
-struct kib_putreq_msg {
- struct lnet_hdr ibprm_hdr; /* portals header */
- __u64 ibprm_cookie; /* opaque completion cookie */
-} WIRE_ATTR;
-
-struct kib_putack_msg {
- __u64 ibpam_src_cookie; /* reflected completion cookie */
- __u64 ibpam_dst_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */
-} WIRE_ATTR;
-
-struct kib_get_msg {
- struct lnet_hdr ibgm_hdr; /* portals header */
- __u64 ibgm_cookie; /* opaque completion cookie */
- struct kib_rdma_desc ibgm_rd; /* rdma descriptor */
-} WIRE_ATTR;
-
-struct kib_completion_msg {
- __u64 ibcm_cookie; /* opaque completion cookie */
- __s32 ibcm_status; /* < 0 failure: >= 0 length */
-} WIRE_ATTR;
-
-struct kib_msg {
- /* First 2 fields fixed FOR ALL TIME */
- __u32 ibm_magic; /* I'm an ibnal message */
- __u16 ibm_version; /* this is my version number */
-
- __u8 ibm_type; /* msg type */
- __u8 ibm_credits; /* returned credits */
- __u32 ibm_nob; /* # bytes in whole message */
- __u32 ibm_cksum; /* checksum (0 == no checksum) */
- __u64 ibm_srcnid; /* sender's NID */
- __u64 ibm_srcstamp; /* sender's incarnation */
- __u64 ibm_dstnid; /* destination's NID */
- __u64 ibm_dststamp; /* destination's incarnation */
-
- union {
- struct kib_connparams connparams;
- struct kib_immediate_msg immediate;
- struct kib_putreq_msg putreq;
- struct kib_putack_msg putack;
- struct kib_get_msg get;
- struct kib_completion_msg completion;
- } WIRE_ATTR ibm_u;
-} WIRE_ATTR;
-
-#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */
-
-#define IBLND_MSG_VERSION_1 0x11
-#define IBLND_MSG_VERSION_2 0x12
-#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2
-
-#define IBLND_MSG_CONNREQ 0xc0 /* connection request */
-#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */
-#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */
-#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */
-#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */
-#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */
-#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */
-#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
-#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
-#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
-
-struct kib_rej {
- __u32 ibr_magic; /* sender's magic */
- __u16 ibr_version; /* sender's version */
- __u8 ibr_why; /* reject reason */
- __u8 ibr_padding; /* padding */
- __u64 ibr_incarnation; /* incarnation of peer */
- struct kib_connparams ibr_cp; /* connection parameters */
-} WIRE_ATTR;
-
-/* connection rejection reasons */
-#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */
-#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */
-#define IBLND_REJECT_FATAL 3 /* Anything else */
-#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */
-#define IBLND_REJECT_CONN_STALE 5 /* stale peer */
-/* peer's rdma frags don't match mine */
-#define IBLND_REJECT_RDMA_FRAGS 6
-/* peer's msg queue size doesn't match mine */
-#define IBLND_REJECT_MSG_QUEUE_SIZE 7
-
-/***********************************************************************/
-
-struct kib_rx { /* receive message */
- struct list_head rx_list; /* queue for attention */
- struct kib_conn *rx_conn; /* owning conn */
- int rx_nob; /* # bytes received (-1 while posted) */
- enum ib_wc_status rx_status; /* completion status */
- struct kib_msg *rx_msg; /* message buffer (host vaddr) */
- __u64 rx_msgaddr; /* message buffer (I/O addr) */
- DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); /* for dma_unmap_single() */
- struct ib_recv_wr rx_wrq; /* receive work item... */
- struct ib_sge rx_sge; /* ...and its memory */
-};
-
-#define IBLND_POSTRX_DONT_POST 0 /* don't post */
-#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */
-#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
-#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give self back 1 reserved credit */
-
-struct kib_tx { /* transmit message */
- struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
- struct kib_tx_pool *tx_pool; /* pool I'm from */
- struct kib_conn *tx_conn; /* owning conn */
- short tx_sending; /* # tx callbacks outstanding */
- short tx_queued; /* queued for sending */
- short tx_waiting; /* waiting for peer */
- int tx_status; /* LNET completion status */
- unsigned long tx_deadline; /* completion deadline */
- __u64 tx_cookie; /* completion cookie */
- struct lnet_msg *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
- struct kib_msg *tx_msg; /* message buffer (host vaddr) */
- __u64 tx_msgaddr; /* message buffer (I/O addr) */
- DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); /* for dma_unmap_single() */
- int tx_nwrq; /* # send work items */
- struct ib_rdma_wr *tx_wrq; /* send work items... */
- struct ib_sge *tx_sge; /* ...and their memory */
- struct kib_rdma_desc *tx_rd; /* rdma descriptor */
- int tx_nfrags; /* # entries in... */
- struct scatterlist *tx_frags; /* dma_map_sg descriptor */
- __u64 *tx_pages; /* rdma phys page addrs */
- struct kib_fmr fmr; /* FMR */
- int tx_dmadir; /* dma direction */
-};
-
-struct kib_connvars {
- struct kib_msg cv_msg; /* connection-in-progress variables */
-};
-
-struct kib_conn {
- struct kib_sched_info *ibc_sched; /* scheduler information */
- struct kib_peer *ibc_peer; /* owning peer */
- struct kib_hca_dev *ibc_hdev; /* HCA bound on */
- struct list_head ibc_list; /* stash on peer's conn list */
- struct list_head ibc_sched_list; /* schedule for attention */
- __u16 ibc_version; /* version of connection */
- /* reconnect later */
- __u16 ibc_reconnect:1;
- __u64 ibc_incarnation; /* which instance of the peer */
- atomic_t ibc_refcount; /* # users */
- int ibc_state; /* what's happening */
- int ibc_nsends_posted; /* # uncompleted sends */
- int ibc_noops_posted; /* # uncompleted NOOPs */
- int ibc_credits; /* # credits I have */
- int ibc_outstanding_credits; /* # credits to return */
- int ibc_reserved_credits; /* # ACK/DONE msg credits */
- int ibc_comms_error; /* set on comms error */
- /* connection's queue depth */
- __u16 ibc_queue_depth;
- /* connection's max frags */
- __u16 ibc_max_frags;
- unsigned int ibc_nrx:16; /* receive buffers owned */
- unsigned int ibc_scheduled:1; /* scheduled for attention */
- unsigned int ibc_ready:1; /* CQ callback fired */
- unsigned long ibc_last_send; /* time of last send */
- struct list_head ibc_connd_list; /* link chain for */
- /* kiblnd_check_conns only */
- struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */
- struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs for */
- /* IBLND_MSG_VERSION_1 */
- struct list_head ibc_tx_queue; /* sends that need a credit */
- struct list_head ibc_tx_queue_nocred; /* sends that don't need a */
- /* credit */
- struct list_head ibc_tx_queue_rsrvd; /* sends that need to */
- /* reserve an ACK/DONE msg */
- struct list_head ibc_active_txs; /* active tx awaiting completion */
- spinlock_t ibc_lock; /* serialise */
- struct kib_rx *ibc_rxs; /* the rx descs */
- struct kib_pages *ibc_rx_pages; /* premapped rx msg pages */
-
- struct rdma_cm_id *ibc_cmid; /* CM id */
- struct ib_cq *ibc_cq; /* completion queue */
-
- struct kib_connvars *ibc_connvars; /* in-progress connection state */
-};
-
-#define IBLND_CONN_INIT 0 /* being initialised */
-#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */
-#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */
-#define IBLND_CONN_ESTABLISHED 3 /* connection established */
-#define IBLND_CONN_CLOSING 4 /* being closed */
-#define IBLND_CONN_DISCONNECTED 5 /* disconnected */
-
-struct kib_peer {
- struct list_head ibp_list; /* stash on global peer list */
- lnet_nid_t ibp_nid; /* who's on the other end(s) */
- struct lnet_ni *ibp_ni; /* LNet interface */
- struct list_head ibp_conns; /* all active connections */
- struct kib_conn *ibp_next_conn; /* next connection to send on for
- * round robin */
- struct list_head ibp_tx_queue; /* msgs waiting for a conn */
- __u64 ibp_incarnation; /* incarnation of peer */
- /* when (in jiffies) I was last alive */
- unsigned long ibp_last_alive;
- /* # users */
- atomic_t ibp_refcount;
- /* version of peer */
- __u16 ibp_version;
- /* current passive connection attempts */
- unsigned short ibp_accepting;
- /* current active connection attempts */
- unsigned short ibp_connecting;
- /* reconnect this peer later */
- unsigned char ibp_reconnecting;
- /* counter of how many times we triggered a conn race */
- unsigned char ibp_races;
- /* # consecutive reconnection attempts to this peer */
- unsigned int ibp_reconnected;
- /* errno on closing this peer */
- int ibp_error;
- /* max map_on_demand */
- __u16 ibp_max_frags;
- /* max_peer_credits */
- __u16 ibp_queue_depth;
-};
-
-extern struct kib_data kiblnd_data;
-
-void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);
-
-int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
-
-/* max # of fragments configured by user */
-static inline int
-kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int mod;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- mod = tunables->lnd_map_on_demand;
- return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT;
-}
-
-static inline int
-kiblnd_rdma_frags(int version, struct lnet_ni *ni)
-{
- return version == IBLND_MSG_VERSION_1 ?
- (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) :
- kiblnd_cfg_rdma_frags(ni);
-}
-
-static inline int
-kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- int concurrent_sends;
-
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- concurrent_sends = tunables->lnd_concurrent_sends;
-
- if (version == IBLND_MSG_VERSION_1) {
- if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
- return IBLND_MSG_QUEUE_SIZE_V1 * 2;
-
- if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
- return IBLND_MSG_QUEUE_SIZE_V1 / 2;
- }
-
- return concurrent_sends;
-}
-
-static inline void
-kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev)
-{
- LASSERT(atomic_read(&hdev->ibh_ref) > 0);
- atomic_inc(&hdev->ibh_ref);
-}
-
-static inline void
-kiblnd_hdev_decref(struct kib_hca_dev *hdev)
-{
- LASSERT(atomic_read(&hdev->ibh_ref) > 0);
- if (atomic_dec_and_test(&hdev->ibh_ref))
- kiblnd_hdev_destroy(hdev);
-}
-
-static inline int
-kiblnd_dev_can_failover(struct kib_dev *dev)
-{
- if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
- return 0;
-
- if (!*kiblnd_tunables.kib_dev_failover) /* disabled */
- return 0;
-
- if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
- return 1;
-
- return dev->ibd_can_failover;
-}
-
-#define kiblnd_conn_addref(conn) \
-do { \
- CDEBUG(D_NET, "conn[%p] (%d)++\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- atomic_inc(&(conn)->ibc_refcount); \
-} while (0)
-
-#define kiblnd_conn_decref(conn) \
-do { \
- unsigned long flags; \
- \
- CDEBUG(D_NET, "conn[%p] (%d)--\n", \
- (conn), atomic_read(&(conn)->ibc_refcount)); \
- LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \
- if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \
- spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \
- list_add_tail(&(conn)->ibc_list, \
- &kiblnd_data.kib_connd_zombies); \
- wake_up(&kiblnd_data.kib_connd_waitq); \
- spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
- } \
-} while (0)
-
-#define kiblnd_peer_addref(peer) \
-do { \
- CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \
- (peer), libcfs_nid2str((peer)->ibp_nid), \
- atomic_read(&(peer)->ibp_refcount)); \
- atomic_inc(&(peer)->ibp_refcount); \
-} while (0)
-
-#define kiblnd_peer_decref(peer) \
-do { \
- CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \
- (peer), libcfs_nid2str((peer)->ibp_nid), \
- atomic_read(&(peer)->ibp_refcount)); \
- LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \
- if (atomic_dec_and_test(&(peer)->ibp_refcount)) \
- kiblnd_destroy_peer(peer); \
-} while (0)
-
-static inline bool
-kiblnd_peer_connecting(struct kib_peer *peer)
-{
- return peer->ibp_connecting ||
- peer->ibp_reconnecting ||
- peer->ibp_accepting;
-}
-
-static inline bool
-kiblnd_peer_idle(struct kib_peer *peer)
-{
- return !kiblnd_peer_connecting(peer) && list_empty(&peer->ibp_conns);
-}
-
-static inline struct list_head *
-kiblnd_nid2peerlist(lnet_nid_t nid)
-{
- unsigned int hash =
- ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
-
- return &kiblnd_data.kib_peers[hash];
-}
-
-static inline int
-kiblnd_peer_active(struct kib_peer *peer)
-{
- /* Am I in the peer hash table? */
- return !list_empty(&peer->ibp_list);
-}
-
-static inline struct kib_conn *
-kiblnd_get_conn_locked(struct kib_peer *peer)
-{
- struct list_head *next;
-
- LASSERT(!list_empty(&peer->ibp_conns));
-
- /* Advance to next connection, be sure to skip the head node */
- if (!peer->ibp_next_conn ||
- peer->ibp_next_conn->ibc_list.next == &peer->ibp_conns)
- next = peer->ibp_conns.next;
- else
- next = peer->ibp_next_conn->ibc_list.next;
- peer->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list);
-
- return peer->ibp_next_conn;
-}
-
-static inline int
-kiblnd_send_keepalive(struct kib_conn *conn)
-{
- return (*kiblnd_tunables.kib_keepalive > 0) &&
- time_after(jiffies, conn->ibc_last_send +
- msecs_to_jiffies(*kiblnd_tunables.kib_keepalive *
- MSEC_PER_SEC));
-}
-
-static inline int
-kiblnd_need_noop(struct kib_conn *conn)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
-
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- if (conn->ibc_outstanding_credits <
- IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
- !kiblnd_send_keepalive(conn))
- return 0; /* No need to send NOOP */
-
- if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
- if (!list_empty(&conn->ibc_tx_queue_nocred))
- return 0; /* NOOP can be piggybacked */
-
- /* No tx to piggyback NOOP onto or no credit to send a tx */
- return (list_empty(&conn->ibc_tx_queue) ||
- !conn->ibc_credits);
- }
-
- if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
- !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
- !conn->ibc_credits) /* no credit */
- return 0;
-
- if (conn->ibc_credits == 1 && /* last credit reserved for */
- !conn->ibc_outstanding_credits) /* giving back credits */
- return 0;
-
- /* No tx to piggyback NOOP onto or no credit to send a tx */
- return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
-}
-
-static inline void
-kiblnd_abort_receives(struct kib_conn *conn)
-{
- ib_modify_qp(conn->ibc_cmid->qp,
- &kiblnd_data.kib_error_qpa, IB_QP_STATE);
-}
-
-static inline const char *
-kiblnd_queue2str(struct kib_conn *conn, struct list_head *q)
-{
- if (q == &conn->ibc_tx_queue)
- return "tx_queue";
-
- if (q == &conn->ibc_tx_queue_rsrvd)
- return "tx_queue_rsrvd";
-
- if (q == &conn->ibc_tx_queue_nocred)
- return "tx_queue_nocred";
-
- if (q == &conn->ibc_active_txs)
- return "active_txs";
-
- LBUG();
- return NULL;
-}
-
-/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */
-/* lowest bits of the work request id to stash the work item type. */
-
-#define IBLND_WID_INVAL 0
-#define IBLND_WID_TX 1
-#define IBLND_WID_RX 2
-#define IBLND_WID_RDMA 3
-#define IBLND_WID_MR 4
-#define IBLND_WID_MASK 7UL
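-/* kiblnd_ptr2wreqid() below packs a descriptor pointer and one of the */
-/* IBLND_WID_* tags into the low 3 bits of a wr_id; the helpers after */
-/* it unpack them. */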
-
-static inline __u64
-kiblnd_ptr2wreqid(void *ptr, int type)
-{
- unsigned long lptr = (unsigned long)ptr;
-
- LASSERT(!(lptr & IBLND_WID_MASK));
- LASSERT(!(type & ~IBLND_WID_MASK));
- return (__u64)(lptr | type);
-}
-
-static inline void *
-kiblnd_wreqid2ptr(__u64 wreqid)
-{
- return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
-}
-
-static inline int
-kiblnd_wreqid2type(__u64 wreqid)
-{
- return wreqid & IBLND_WID_MASK;
-}
-
-static inline void
-kiblnd_set_conn_state(struct kib_conn *conn, int state)
-{
- conn->ibc_state = state;
- mb();
-}
-
-static inline void
-kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob)
-{
- msg->ibm_type = type;
- msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob;
-}
-
-static inline int
-kiblnd_rd_size(struct kib_rdma_desc *rd)
-{
- int i;
- int size;
-
- for (i = size = 0; i < rd->rd_nfrags; i++)
- size += rd->rd_frags[i].rf_nob;
-
- return size;
-}
-
-static inline __u64
-kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_frags[index].rf_addr;
-}
-
-static inline __u32
-kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_frags[index].rf_nob;
-}
-
-static inline __u32
-kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index)
-{
- return rd->rd_key;
-}
-
-static inline int
-kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
-{
- if (nob < rd->rd_frags[index].rf_nob) {
- rd->rd_frags[index].rf_addr += nob;
- rd->rd_frags[index].rf_nob -= nob;
- } else {
- index++;
- }
-
- return index;
-}
-
-static inline int
-kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
-{
- LASSERT(msgtype == IBLND_MSG_GET_REQ ||
- msgtype == IBLND_MSG_PUT_ACK);
-
- return msgtype == IBLND_MSG_GET_REQ ?
- offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
- offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
-}
-
-static inline __u64
-kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
- return ib_dma_mapping_error(dev, dma_addr);
-}
-
-static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
- void *msg, size_t size,
- enum dma_data_direction direction)
-{
- return ib_dma_map_single(dev, msg, size, direction);
-}
-
-static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
- __u64 addr, size_t size,
- enum dma_data_direction direction)
-{
- ib_dma_unmap_single(dev, addr, size, direction);
-}
-
-#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
-#define KIBLND_UNMAP_ADDR(p, m, a) (a)
-
-static inline int kiblnd_dma_map_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
-{
- return ib_dma_map_sg(dev, sg, nents, direction);
-}
-
-static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
-{
- ib_dma_unmap_sg(dev, sg, nents, direction);
-}
-
-static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
- struct scatterlist *sg)
-{
- return ib_sg_dma_address(dev, sg);
-}
-
-static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
- struct scatterlist *sg)
-{
- return ib_sg_dma_len(dev, sg);
-}
-
-/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer. That's not strictly */
-/* right because OFED 1.2 declares it const, so we have to add a (void *) */
-/* cast to overcome the "const". */
-
-#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data)
-#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
-
-void kiblnd_map_rx_descs(struct kib_conn *conn);
-void kiblnd_unmap_rx_descs(struct kib_conn *conn);
-void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node);
-struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps);
-
-int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
- struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
- struct kib_fmr *fmr);
-void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status);
-
-int kiblnd_tunables_setup(struct lnet_ni *ni);
-void kiblnd_tunables_init(void);
-
-int kiblnd_connd(void *arg);
-int kiblnd_scheduler(void *arg);
-int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
-int kiblnd_failover_thread(void *arg);
-
-int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages);
-
-int kiblnd_cm_callback(struct rdma_cm_id *cmid,
- struct rdma_cm_event *event);
-int kiblnd_translate_mtu(int value);
-
-int kiblnd_dev_failover(struct kib_dev *dev);
-int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp,
- lnet_nid_t nid);
-void kiblnd_destroy_peer(struct kib_peer *peer);
-bool kiblnd_reconnect_peer(struct kib_peer *peer);
-void kiblnd_destroy_dev(struct kib_dev *dev);
-void kiblnd_unlink_peer_locked(struct kib_peer *peer);
-struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid);
-int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
- int version, __u64 incarnation);
-int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why);
-
-struct kib_conn *kiblnd_create_conn(struct kib_peer *peer,
- struct rdma_cm_id *cmid,
- int state, int version);
-void kiblnd_destroy_conn(struct kib_conn *conn);
-void kiblnd_close_conn(struct kib_conn *conn, int error);
-void kiblnd_close_conn_locked(struct kib_conn *conn, int error);
-
-void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid);
-void kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist,
- int status);
-
-void kiblnd_qp_event(struct ib_event *event, void *arg);
-void kiblnd_cq_event(struct ib_event *event, void *arg);
-void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
-
-void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version,
- int credits, lnet_nid_t dstnid, __u64 dststamp);
-int kiblnd_unpack_msg(struct kib_msg *msg, int nob);
-int kiblnd_post_rx(struct kib_rx *rx, int credit);
-
-int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
-int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
deleted file mode 100644
index 65b7a62..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ /dev/null
@@ -1,3763 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd_cb.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <linux/highmem.h>
-#include "o2iblnd.h"
-
-#define MAX_CONN_RACES_BEFORE_ABORT 20
-
-static void kiblnd_peer_alive(struct kib_peer *peer);
-static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error);
-static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx,
- int type, int body_nob);
-static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
- int resid, struct kib_rdma_desc *dstrd,
- __u64 dstcookie);
-static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn);
-static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn);
-static void kiblnd_unmap_tx(struct kib_tx *tx);
-static void kiblnd_check_sends_locked(struct kib_conn *conn);
-
-static void
-kiblnd_tx_done(struct lnet_ni *ni, struct kib_tx *tx)
-{
- struct lnet_msg *lntmsg[2];
- struct kib_net *net = ni->ni_data;
- int rc;
- int i;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
- LASSERT(!tx->tx_queued); /* mustn't be queued for sending */
- LASSERT(!tx->tx_sending); /* mustn't be awaiting sent callback */
- LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */
- LASSERT(tx->tx_pool);
-
- kiblnd_unmap_tx(tx);
-
- /* tx may have up to 2 lnet msgs to finalise */
- lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
- lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
- rc = tx->tx_status;
-
- if (tx->tx_conn) {
- LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);
-
- kiblnd_conn_decref(tx->tx_conn);
- tx->tx_conn = NULL;
- }
-
- tx->tx_nwrq = 0;
- tx->tx_status = 0;
-
- kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
-
- /* delay finalize until my descs have been freed */
- for (i = 0; i < 2; i++) {
- if (!lntmsg[i])
- continue;
-
- lnet_finalize(ni, lntmsg[i], rc);
- }
-}
-
-void
-kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int status)
-{
- struct kib_tx *tx;
-
- while (!list_empty(txlist)) {
- tx = list_entry(txlist->next, struct kib_tx, tx_list);
-
- list_del(&tx->tx_list);
- /* complete now */
- tx->tx_waiting = 0;
- tx->tx_status = status;
- kiblnd_tx_done(ni, tx);
- }
-}
-
-static struct kib_tx *
-kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target)
-{
- struct kib_net *net = (struct kib_net *)ni->ni_data;
- struct list_head *node;
- struct kib_tx *tx;
- struct kib_tx_poolset *tps;
-
- tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
- node = kiblnd_pool_alloc_node(&tps->tps_poolset);
- if (!node)
- return NULL;
- tx = list_entry(node, struct kib_tx, tx_list);
-
- LASSERT(!tx->tx_nwrq);
- LASSERT(!tx->tx_queued);
- LASSERT(!tx->tx_sending);
- LASSERT(!tx->tx_waiting);
- LASSERT(!tx->tx_status);
- LASSERT(!tx->tx_conn);
- LASSERT(!tx->tx_lntmsg[0]);
- LASSERT(!tx->tx_lntmsg[1]);
- LASSERT(!tx->tx_nfrags);
-
- return tx;
-}
-
-static void
-kiblnd_drop_rx(struct kib_rx *rx)
-{
- struct kib_conn *conn = rx->rx_conn;
- struct kib_sched_info *sched = conn->ibc_sched;
- unsigned long flags;
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- LASSERT(conn->ibc_nrx > 0);
- conn->ibc_nrx--;
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- kiblnd_conn_decref(conn);
-}
-
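-/*
- * Repost a receive buffer.  @credit says what to do with the credit the
- * buffer represents: don't return it (IBLND_POSTRX_NO_CREDIT), queue it
- * for the peer (PEER_CREDIT) or put it back on the reserved pool
- * (RSRVD_CREDIT); in the latter two cases kiblnd_check_sends_locked()
- * is kicked so the credit can actually be sent back.
- */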
-int
-kiblnd_post_rx(struct kib_rx *rx, int credit)
-{
- struct kib_conn *conn = rx->rx_conn;
- struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data;
- struct ib_recv_wr *bad_wrq = NULL;
- int rc;
-
- LASSERT(net);
- LASSERT(!in_interrupt());
- LASSERT(credit == IBLND_POSTRX_NO_CREDIT ||
- credit == IBLND_POSTRX_PEER_CREDIT ||
- credit == IBLND_POSTRX_RSRVD_CREDIT);
-
- rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey;
- rx->rx_sge.addr = rx->rx_msgaddr;
- rx->rx_sge.length = IBLND_MSG_SIZE;
-
- rx->rx_wrq.next = NULL;
- rx->rx_wrq.sg_list = &rx->rx_sge;
- rx->rx_wrq.num_sge = 1;
- rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
-
- LASSERT(conn->ibc_state >= IBLND_CONN_INIT);
- LASSERT(rx->rx_nob >= 0); /* not posted */
-
- if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
- kiblnd_drop_rx(rx); /* No more posts for this rx */
- return 0;
- }
-
- rx->rx_nob = -1; /* flag posted */
-
- /* NB: need an extra reference after ib_post_recv because we don't
- * own this rx (and rx::rx_conn) anymore, LU-5678.
- */
- kiblnd_conn_addref(conn);
- rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
- if (unlikely(rc)) {
- CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
- rx->rx_nob = 0;
- }
-
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
- goto out;
-
- if (unlikely(rc)) {
- kiblnd_close_conn(conn, rc);
- kiblnd_drop_rx(rx); /* No more posts for this rx */
- goto out;
- }
-
- if (credit == IBLND_POSTRX_NO_CREDIT)
- goto out;
-
- spin_lock(&conn->ibc_lock);
- if (credit == IBLND_POSTRX_PEER_CREDIT)
- conn->ibc_outstanding_credits++;
- else
- conn->ibc_reserved_credits++;
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
-out:
- kiblnd_conn_decref(conn);
- return rc;
-}
-
-static struct kib_tx *
-kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, __u64 cookie)
-{
- struct list_head *tmp;
-
- list_for_each(tmp, &conn->ibc_active_txs) {
- struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list);
-
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_sending || tx->tx_waiting);
-
- if (tx->tx_cookie != cookie)
- continue;
-
- if (tx->tx_waiting &&
- tx->tx_msg->ibm_type == txtype)
- return tx;
-
- CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
- tx->tx_waiting ? "" : "NOT ",
- tx->tx_msg->ibm_type, txtype);
- }
- return NULL;
-}
-
-static void
-kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, __u64 cookie)
-{
- struct kib_tx *tx;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- int idle;
-
- spin_lock(&conn->ibc_lock);
-
- tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
- if (!tx) {
- spin_unlock(&conn->ibc_lock);
-
- CWARN("Unmatched completion type %x cookie %#llx from %s\n",
- txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_close_conn(conn, -EPROTO);
- return;
- }
-
- if (!tx->tx_status) { /* success so far */
- if (status < 0) /* failed? */
- tx->tx_status = status;
- else if (txtype == IBLND_MSG_GET_REQ)
- lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
- }
-
- tx->tx_waiting = 0;
-
- idle = !tx->tx_queued && !tx->tx_sending;
- if (idle)
- list_del(&tx->tx_list);
-
- spin_unlock(&conn->ibc_lock);
-
- if (idle)
- kiblnd_tx_done(ni, tx);
-}
-
-static void
-kiblnd_send_completion(struct kib_conn *conn, int type, int status, __u64 cookie)
-{
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
-
- if (!tx) {
- CERROR("Can't get tx for completion %x for %s\n",
- type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- return;
- }
-
- tx->tx_msg->ibm_u.completion.ibcm_status = status;
- tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
- kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg));
-
- kiblnd_queue_tx(tx, conn);
-}
-
-static void
-kiblnd_handle_rx(struct kib_rx *rx)
-{
- struct kib_msg *msg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- int credits = msg->ibm_credits;
- struct kib_tx *tx;
- int rc = 0;
- int rc2;
- int post_credit;
-
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- CDEBUG(D_NET, "Received %x[%d] from %s\n",
- msg->ibm_type, credits,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
- if (credits) {
- /* Have I received credits that will let me send? */
- spin_lock(&conn->ibc_lock);
-
- if (conn->ibc_credits + credits >
- conn->ibc_queue_depth) {
- rc2 = conn->ibc_credits;
- spin_unlock(&conn->ibc_lock);
-
- CERROR("Bad credits from %s: %d + %d > %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc2, credits, conn->ibc_queue_depth);
-
- kiblnd_close_conn(conn, -EPROTO);
- kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
- return;
- }
-
- conn->ibc_credits += credits;
-
- /* This ensures the credit taken by NOOP can be returned */
- if (msg->ibm_type == IBLND_MSG_NOOP &&
- !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
- conn->ibc_outstanding_credits++;
-
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
- }
-
- switch (msg->ibm_type) {
- default:
- CERROR("Bad IBLND message type %x from %s\n",
- msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- post_credit = IBLND_POSTRX_NO_CREDIT;
- rc = -EPROTO;
- break;
-
- case IBLND_MSG_NOOP:
- if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
- post_credit = IBLND_POSTRX_NO_CREDIT;
- break;
- }
-
- if (credits) /* credit already posted */
- post_credit = IBLND_POSTRX_NO_CREDIT;
- else /* a keepalive NOOP */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_IMMEDIATE:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
- msg->ibm_srcnid, rx, 0);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_PUT_REQ:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
- msg->ibm_srcnid, rx, 1);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_PUT_NAK:
- CWARN("PUT_NACK from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
-
- case IBLND_MSG_PUT_ACK:
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
-
- spin_lock(&conn->ibc_lock);
- tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
- msg->ibm_u.putack.ibpam_src_cookie);
- if (tx)
- list_del(&tx->tx_list);
- spin_unlock(&conn->ibc_lock);
-
- if (!tx) {
- CERROR("Unmatched PUT_ACK from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- rc = -EPROTO;
- break;
- }
-
- LASSERT(tx->tx_waiting);
- /*
- * CAVEAT EMPTOR: I could be racing with tx_complete, but...
- * (a) I can overwrite tx_msg since my peer has received it!
- * (b) tx_waiting set tells tx_complete() it's not done.
- */
- tx->tx_nwrq = 0; /* overwrite PUT_REQ */
-
- rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
- kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
- &msg->ibm_u.putack.ibpam_rd,
- msg->ibm_u.putack.ibpam_dst_cookie);
- if (rc2 < 0)
- CERROR("Can't setup rdma for PUT to %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
-
- spin_lock(&conn->ibc_lock);
- tx->tx_waiting = 0; /* clear waiting and queue atomically */
- kiblnd_queue_tx_locked(tx, conn);
- spin_unlock(&conn->ibc_lock);
- break;
-
- case IBLND_MSG_PUT_DONE:
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
-
- case IBLND_MSG_GET_REQ:
- post_credit = IBLND_POSTRX_DONT_POST;
- rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
- msg->ibm_srcnid, rx, 1);
- if (rc < 0) /* repost on error */
- post_credit = IBLND_POSTRX_PEER_CREDIT;
- break;
-
- case IBLND_MSG_GET_DONE:
- post_credit = IBLND_POSTRX_RSRVD_CREDIT;
- kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
- msg->ibm_u.completion.ibcm_status,
- msg->ibm_u.completion.ibcm_cookie);
- break;
- }
-
- if (rc < 0) /* protocol error */
- kiblnd_close_conn(conn, rc);
-
- if (post_credit != IBLND_POSTRX_DONT_POST)
- kiblnd_post_rx(rx, post_credit);
-}
-
-static void
-kiblnd_rx_complete(struct kib_rx *rx, int status, int nob)
-{
- struct kib_msg *msg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_net *net = ni->ni_data;
- int rc;
- int err = -EIO;
-
- LASSERT(net);
- LASSERT(rx->rx_nob < 0); /* was posted */
- rx->rx_nob = 0; /* isn't now */
-
- if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
- goto ignore;
-
- if (status != IB_WC_SUCCESS) {
- CNETERR("Rx from %s failed: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
- goto failed;
- }
-
- LASSERT(nob >= 0);
- rx->rx_nob = nob;
-
- rc = kiblnd_unpack_msg(msg, rx->rx_nob);
- if (rc) {
- CERROR("Error %d unpacking rx from %s\n",
- rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- goto failed;
- }
-
- if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
- msg->ibm_dstnid != ni->ni_nid ||
- msg->ibm_srcstamp != conn->ibc_incarnation ||
- msg->ibm_dststamp != net->ibn_incarnation) {
- CERROR("Stale rx from %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- err = -ESTALE;
- goto failed;
- }
-
- /* set time last known alive */
- kiblnd_peer_alive(conn->ibc_peer);
-
- /* racing with connection establishment/teardown! */
-
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- unsigned long flags;
-
- write_lock_irqsave(g_lock, flags);
- /* must re-check while holding the global lock to eliminate the race */
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
- write_unlock_irqrestore(g_lock, flags);
- return;
- }
- write_unlock_irqrestore(g_lock, flags);
- }
- kiblnd_handle_rx(rx);
- return;
-
- failed:
- CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- kiblnd_close_conn(conn, err);
- ignore:
- kiblnd_drop_rx(rx); /* Don't re-post rx. */
-}
-
-static struct page *
-kiblnd_kvaddr_to_page(unsigned long vaddr)
-{
- struct page *page;
-
- if (is_vmalloc_addr((void *)vaddr)) {
- page = vmalloc_to_page((void *)vaddr);
- LASSERT(page);
- return page;
- }
-#ifdef CONFIG_HIGHMEM
- if (vaddr >= PKMAP_BASE &&
- vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
- /* No highmem page expected here; highmem pages are only used for bulk (kiov) I/O */
- CERROR("find page for address in highmem\n");
- LBUG();
- }
-#endif
- page = virt_to_page(vaddr);
- LASSERT(page);
- return page;
-}
-
-static int
-kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, struct kib_rdma_desc *rd, __u32 nob)
-{
- struct kib_hca_dev *hdev;
- struct kib_fmr_poolset *fps;
- int cpt;
- int rc;
-
- LASSERT(tx->tx_pool);
- LASSERT(tx->tx_pool->tpo_pool.po_owner);
-
- hdev = tx->tx_pool->tpo_hdev;
- cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
-
- fps = net->ibn_fmr_ps[cpt];
- rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
- if (rc) {
- CERROR("Can't map %u bytes: %d\n", nob, rc);
- return rc;
- }
-
- /*
- * If rd is not tx_rd, it's going to get sent to a peer, who will need
- * the rkey
- */
- rd->rd_key = tx->fmr.fmr_key;
- rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
- rd->rd_frags[0].rf_nob = nob;
- rd->rd_nfrags = 1;
-
- return 0;
-}
-
-static void kiblnd_unmap_tx(struct kib_tx *tx)
-{
- if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd)
- kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
-
- if (tx->tx_nfrags) {
- kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
- tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
- tx->tx_nfrags = 0;
- }
-}
-
-static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, int nfrags)
-{
- struct kib_net *net = ni->ni_data;
- struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev;
- __u32 nob;
- int i;
-
- /*
- * If rd is not tx_rd, it's going to get sent to a peer and I'm the
- * RDMA sink
- */
- tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
- tx->tx_nfrags = nfrags;
-
- rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
- tx->tx_nfrags, tx->tx_dmadir);
-
- for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
- rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
- hdev->ibh_ibdev, &tx->tx_frags[i]);
- rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
- hdev->ibh_ibdev, &tx->tx_frags[i]);
- nob += rd->rd_frags[i].rf_nob;
- }
-
- if (net->ibn_fmr_ps)
- return kiblnd_fmr_map_tx(net, tx, rd, nob);
-
- return -EINVAL;
-}
-
-static int
-kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, unsigned int niov,
- const struct kvec *iov, int offset, int nob)
-{
- struct kib_net *net = ni->ni_data;
- struct page *page;
- struct scatterlist *sg;
- unsigned long vaddr;
- int fragnob;
- int page_offset;
-
- LASSERT(nob > 0);
- LASSERT(niov > 0);
- LASSERT(net);
-
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- niov--;
- iov++;
- LASSERT(niov > 0);
- }
-
- sg = tx->tx_frags;
- do {
- LASSERT(niov > 0);
-
- vaddr = ((unsigned long)iov->iov_base) + offset;
- page_offset = vaddr & (PAGE_SIZE - 1);
- page = kiblnd_kvaddr_to_page(vaddr);
- if (!page) {
- CERROR("Can't find page\n");
- return -EFAULT;
- }
-
- fragnob = min((int)(iov->iov_len - offset), nob);
- fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
-
- sg_set_page(sg, page, fragnob, page_offset);
- sg = sg_next(sg);
- if (!sg) {
- CERROR("lacking enough sg entries to map tx\n");
- return -EFAULT;
- }
-
- if (offset + fragnob < iov->iov_len) {
- offset += fragnob;
- } else {
- offset = 0;
- iov++;
- niov--;
- }
- nob -= fragnob;
- } while (nob > 0);
-
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
-}
-
-static int
-kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx,
- struct kib_rdma_desc *rd, int nkiov,
- const struct bio_vec *kiov, int offset, int nob)
-{
- struct kib_net *net = ni->ni_data;
- struct scatterlist *sg;
- int fragnob;
-
- CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
-
- LASSERT(nob > 0);
- LASSERT(nkiov > 0);
- LASSERT(net);
-
- while (offset >= kiov->bv_len) {
- offset -= kiov->bv_len;
- nkiov--;
- kiov++;
- LASSERT(nkiov > 0);
- }
-
- sg = tx->tx_frags;
- do {
- LASSERT(nkiov > 0);
-
- fragnob = min((int)(kiov->bv_len - offset), nob);
-
- sg_set_page(sg, kiov->bv_page, fragnob,
- kiov->bv_offset + offset);
- sg = sg_next(sg);
- if (!sg) {
- CERROR("lacking enough sg entries to map tx\n");
- return -EFAULT;
- }
-
- offset = 0;
- kiov++;
- nkiov--;
- nob -= fragnob;
- } while (nob > 0);
-
- return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
-}
-
-static int
-kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit)
- __must_hold(&conn->ibc_lock)
-{
- struct kib_msg *msg = tx->tx_msg;
- struct kib_peer *peer = conn->ibc_peer;
- struct lnet_ni *ni = peer->ibp_ni;
- int ver = conn->ibc_version;
- int rc;
- int done;
-
- LASSERT(tx->tx_queued);
- /* We rely on this for QP sizing */
- LASSERT(tx->tx_nwrq > 0);
-
- LASSERT(!credit || credit == 1);
- LASSERT(conn->ibc_outstanding_credits >= 0);
- LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth);
- LASSERT(conn->ibc_credits >= 0);
- LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
-
- if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) {
- /* tx completions outstanding... */
- CDEBUG(D_NET, "%s: posted enough\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- if (credit && !conn->ibc_credits) { /* no credits */
- CDEBUG(D_NET, "%s: no credits\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- if (credit && !IBLND_OOB_CAPABLE(ver) &&
- conn->ibc_credits == 1 && /* last credit reserved */
- msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */
- CDEBUG(D_NET, "%s: not using last credit\n",
- libcfs_nid2str(peer->ibp_nid));
- return -EAGAIN;
- }
-
- /* NB don't drop ibc_lock before bumping tx_sending */
- list_del(&tx->tx_list);
- tx->tx_queued = 0;
-
- if (msg->ibm_type == IBLND_MSG_NOOP &&
- (!kiblnd_need_noop(conn) || /* redundant NOOP */
- (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
- conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
- /*
- * OK to drop when posted enough NOOPs, since
- * kiblnd_check_sends_locked will queue NOOP again when
- * posted NOOPs complete
- */
- spin_unlock(&conn->ibc_lock);
- kiblnd_tx_done(peer->ibp_ni, tx);
- spin_lock(&conn->ibc_lock);
- CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
- libcfs_nid2str(peer->ibp_nid),
- conn->ibc_noops_posted);
- return 0;
- }
-
- kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
- peer->ibp_nid, conn->ibc_incarnation);
-
- conn->ibc_credits -= credit;
- conn->ibc_outstanding_credits = 0;
- conn->ibc_nsends_posted++;
- if (msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted++;
-
- /*
- * CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
- * PUT. If so, it was first queued here as a PUT_REQ, sent and
- * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
- * and then re-queued here. It's (just) possible that
- * tx_sending is non-zero if we've not done the tx_complete()
- * from the first send; hence the ++ rather than = below.
- */
- tx->tx_sending++;
- list_add(&tx->tx_list, &conn->ibc_active_txs);
-
- /* I'm still holding ibc_lock! */
- if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
- rc = -ECONNABORTED;
- } else if (tx->tx_pool->tpo_pool.po_failed ||
- conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
- /* close_conn will launch failover */
- rc = -ENETDOWN;
- } else {
- struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
- struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
- struct ib_send_wr *wrq = &tx->tx_wrq[0].wr;
-
- if (frd) {
- if (!frd->frd_valid) {
- wrq = &frd->frd_inv_wr;
- wrq->next = &frd->frd_fastreg_wr.wr;
- } else {
- wrq = &frd->frd_fastreg_wr.wr;
- }
- frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
- }
-
- LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
- "bad wr_id %llx, opc %d, flags %d, peer: %s\n",
- bad->wr_id, bad->opcode, bad->send_flags,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- bad = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
- }
-
- conn->ibc_last_send = jiffies;
-
- if (!rc)
- return 0;
-
- /*
- * NB credits are transferred in the actual
- * message, which can only be the last work item
- */
- conn->ibc_credits += credit;
- conn->ibc_outstanding_credits += msg->ibm_credits;
- conn->ibc_nsends_posted--;
- if (msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted--;
-
- tx->tx_status = rc;
- tx->tx_waiting = 0;
- tx->tx_sending--;
-
- done = !tx->tx_sending;
- if (done)
- list_del(&tx->tx_list);
-
- spin_unlock(&conn->ibc_lock);
-
- if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
- CERROR("Error %d posting transmit to %s\n",
- rc, libcfs_nid2str(peer->ibp_nid));
- else
- CDEBUG(D_NET, "Error %d posting transmit to %s\n",
- rc, libcfs_nid2str(peer->ibp_nid));
-
- kiblnd_close_conn(conn, rc);
-
- if (done)
- kiblnd_tx_done(peer->ibp_ni, tx);
-
- spin_lock(&conn->ibc_lock);
-
- return -EIO;
-}
-
-static void
-kiblnd_check_sends_locked(struct kib_conn *conn)
-{
- int ver = conn->ibc_version;
- struct lnet_ni *ni = conn->ibc_peer->ibp_ni;
- struct kib_tx *tx;
-
- /* Don't send anything until after the connection is established */
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- CDEBUG(D_NET, "%s too soon\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- return;
- }
-
- LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
- LASSERT(!IBLND_OOB_CAPABLE(ver) ||
- conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
- LASSERT(conn->ibc_reserved_credits >= 0);
-
- while (conn->ibc_reserved_credits > 0 &&
- !list_empty(&conn->ibc_tx_queue_rsrvd)) {
- tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
- struct kib_tx, tx_list);
- list_del(&tx->tx_list);
- list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
- conn->ibc_reserved_credits--;
- }
-
- if (kiblnd_need_noop(conn)) {
- spin_unlock(&conn->ibc_lock);
-
- tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
- if (tx)
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
-
- spin_lock(&conn->ibc_lock);
- if (tx)
- kiblnd_queue_tx_locked(tx, conn);
- }
-
- for (;;) {
- int credit;
-
- if (!list_empty(&conn->ibc_tx_queue_nocred)) {
- credit = 0;
- tx = list_entry(conn->ibc_tx_queue_nocred.next,
- struct kib_tx, tx_list);
- } else if (!list_empty(&conn->ibc_tx_noops)) {
- LASSERT(!IBLND_OOB_CAPABLE(ver));
- credit = 1;
- tx = list_entry(conn->ibc_tx_noops.next,
- struct kib_tx, tx_list);
- } else if (!list_empty(&conn->ibc_tx_queue)) {
- credit = 1;
- tx = list_entry(conn->ibc_tx_queue.next,
- struct kib_tx, tx_list);
- } else {
- break;
- }
-
- if (kiblnd_post_tx_locked(conn, tx, credit))
- break;
- }
-}
-
-static void
-kiblnd_tx_complete(struct kib_tx *tx, int status)
-{
- int failed = (status != IB_WC_SUCCESS);
- struct kib_conn *conn = tx->tx_conn;
- int idle;
-
- LASSERT(tx->tx_sending > 0);
-
- if (failed) {
- if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
- CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
- status);
-
- kiblnd_close_conn(conn, -EIO);
- } else {
- kiblnd_peer_alive(conn->ibc_peer);
- }
-
- spin_lock(&conn->ibc_lock);
-
- /*
- * I could be racing with rdma completion. Whoever makes 'tx' idle
- * gets to free it, which also drops its ref on 'conn'.
- */
- tx->tx_sending--;
- conn->ibc_nsends_posted--;
- if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
- conn->ibc_noops_posted--;
-
- if (failed) {
- tx->tx_waiting = 0; /* don't wait for peer */
- tx->tx_status = -EIO;
- }
-
- idle = !tx->tx_sending && /* This is the final callback */
- !tx->tx_waiting && /* Not waiting for peer */
- !tx->tx_queued; /* Not re-queued (PUT_DONE) */
- if (idle)
- list_del(&tx->tx_list);
-
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- if (idle)
- kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
-}
-
-static void
-kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type,
- int body_nob)
-{
- struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev;
- struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
- struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
- int nob = offsetof(struct kib_msg, ibm_u) + body_nob;
-
- LASSERT(tx->tx_nwrq >= 0);
- LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
- LASSERT(nob <= IBLND_MSG_SIZE);
-
- kiblnd_init_msg(tx->tx_msg, type, body_nob);
-
- sge->lkey = hdev->ibh_pd->local_dma_lkey;
- sge->addr = tx->tx_msgaddr;
- sge->length = nob;
-
- memset(wrq, 0, sizeof(*wrq));
-
- wrq->wr.next = NULL;
- wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
- wrq->wr.sg_list = sge;
- wrq->wr.num_sge = 1;
- wrq->wr.opcode = IB_WR_SEND;
- wrq->wr.send_flags = IB_SEND_SIGNALED;
-
- tx->tx_nwrq++;
-}
-
-static int
-kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
- int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie)
-{
- struct kib_msg *ibmsg = tx->tx_msg;
- struct kib_rdma_desc *srcrd = tx->tx_rd;
- struct ib_sge *sge = &tx->tx_sge[0];
- struct ib_rdma_wr *wrq, *next;
- int rc = resid;
- int srcidx = 0;
- int dstidx = 0;
- int wrknob;
-
- LASSERT(!in_interrupt());
- LASSERT(!tx->tx_nwrq);
- LASSERT(type == IBLND_MSG_GET_DONE ||
- type == IBLND_MSG_PUT_DONE);
-
- if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
- CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- conn->ibc_max_frags << PAGE_SHIFT,
- kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
- rc = -EMSGSIZE;
- goto too_big;
- }
-
- while (resid > 0) {
- if (srcidx >= srcrd->rd_nfrags) {
- CERROR("Src buffer exhausted: %d frags\n", srcidx);
- rc = -EPROTO;
- break;
- }
-
- if (dstidx == dstrd->rd_nfrags) {
- CERROR("Dst buffer exhausted: %d frags\n", dstidx);
- rc = -EPROTO;
- break;
- }
-
- if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) {
- CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- IBLND_MAX_RDMA_FRAGS,
- srcidx, srcrd->rd_nfrags,
- dstidx, dstrd->rd_nfrags);
- rc = -EMSGSIZE;
- break;
- }
-
- wrknob = min3(kiblnd_rd_frag_size(srcrd, srcidx),
- kiblnd_rd_frag_size(dstrd, dstidx),
- (__u32)resid);
-
- sge = &tx->tx_sge[tx->tx_nwrq];
- sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx);
- sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx);
- sge->length = wrknob;
-
- wrq = &tx->tx_wrq[tx->tx_nwrq];
- next = wrq + 1;
-
- wrq->wr.next = &next->wr;
- wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
- wrq->wr.sg_list = sge;
- wrq->wr.num_sge = 1;
- wrq->wr.opcode = IB_WR_RDMA_WRITE;
- wrq->wr.send_flags = 0;
-
- wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
- wrq->rkey = kiblnd_rd_frag_key(dstrd, dstidx);
-
- srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
- dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
-
- resid -= wrknob;
-
- tx->tx_nwrq++;
- wrq++;
- sge++;
- }
-too_big:
- if (rc < 0) /* no RDMA if completing with failure */
- tx->tx_nwrq = 0;
-
- ibmsg->ibm_u.completion.ibcm_status = rc;
- ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
- kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
- type, sizeof(struct kib_completion_msg));
-
- return rc;
-}
-
-static void
-kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn)
-{
- struct list_head *q;
-
- LASSERT(tx->tx_nwrq > 0); /* work items set up */
- LASSERT(!tx->tx_queued); /* not queued for sending already */
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- tx->tx_queued = 1;
- tx->tx_deadline = jiffies +
- msecs_to_jiffies(*kiblnd_tunables.kib_timeout *
- MSEC_PER_SEC);
-
- if (!tx->tx_conn) {
- kiblnd_conn_addref(conn);
- tx->tx_conn = conn;
- LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
- } else {
- /* PUT_DONE first attached to conn as a PUT_REQ */
- LASSERT(tx->tx_conn == conn);
- LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
- }
-
- switch (tx->tx_msg->ibm_type) {
- default:
- LBUG();
-
- case IBLND_MSG_PUT_REQ:
- case IBLND_MSG_GET_REQ:
- q = &conn->ibc_tx_queue_rsrvd;
- break;
-
- case IBLND_MSG_PUT_NAK:
- case IBLND_MSG_PUT_ACK:
- case IBLND_MSG_PUT_DONE:
- case IBLND_MSG_GET_DONE:
- q = &conn->ibc_tx_queue_nocred;
- break;
-
- case IBLND_MSG_NOOP:
- if (IBLND_OOB_CAPABLE(conn->ibc_version))
- q = &conn->ibc_tx_queue_nocred;
- else
- q = &conn->ibc_tx_noops;
- break;
-
- case IBLND_MSG_IMMEDIATE:
- q = &conn->ibc_tx_queue;
- break;
- }
-
- list_add_tail(&tx->tx_list, q);
-}
-
-static void
-kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn)
-{
- spin_lock(&conn->ibc_lock);
- kiblnd_queue_tx_locked(tx, conn);
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-}
-
-static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
- struct sockaddr_in *srcaddr,
- struct sockaddr_in *dstaddr,
- int timeout_ms)
-{
- unsigned short port;
- int rc;
-
- /* allow the port to be reused */
- rc = rdma_set_reuseaddr(cmid, 1);
- if (rc) {
- CERROR("Unable to set reuse on cmid: %d\n", rc);
- return rc;
- }
-
- /* look for a free privileged port */
- for (port = PROT_SOCK - 1; port > 0; port--) {
- srcaddr->sin_port = htons(port);
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)srcaddr,
- (struct sockaddr *)dstaddr,
- timeout_ms);
- if (!rc) {
- CDEBUG(D_NET, "bound to port %hu\n", port);
- return 0;
- } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
- CDEBUG(D_NET, "bind to port %hu failed: %d\n",
- port, rc);
- } else {
- return rc;
- }
- }
-
- CERROR("Failed to bind to a free privileged port\n");
- return rc;
-}
-
-static void
-kiblnd_connect_peer(struct kib_peer *peer)
-{
- struct rdma_cm_id *cmid;
- struct kib_dev *dev;
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct sockaddr_in srcaddr;
- struct sockaddr_in dstaddr;
- int rc;
-
- LASSERT(net);
- LASSERT(peer->ibp_connecting > 0);
-
- cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
- IB_QPT_RC);
-
- if (IS_ERR(cmid)) {
- CERROR("Can't create CMID for %s: %ld\n",
- libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
- rc = PTR_ERR(cmid);
- goto failed;
- }
-
- dev = net->ibn_dev;
- memset(&srcaddr, 0, sizeof(srcaddr));
- srcaddr.sin_family = AF_INET;
- srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
-
- memset(&dstaddr, 0, sizeof(dstaddr));
- dstaddr.sin_family = AF_INET;
- dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
- dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
-
- kiblnd_peer_addref(peer); /* cmid's ref */
-
- if (*kiblnd_tunables.kib_use_priv_port) {
- rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- } else {
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- }
- if (rc) {
- /* Can't initiate address resolution: */
- CERROR("Can't resolve addr for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- goto failed2;
- }
-
- return;
-
- failed2:
- kiblnd_peer_connect_failed(peer, 1, rc);
- kiblnd_peer_decref(peer); /* cmid's ref */
- rdma_destroy_id(cmid);
- return;
- failed:
- kiblnd_peer_connect_failed(peer, 1, rc);
-}
-
-bool
-kiblnd_reconnect_peer(struct kib_peer *peer)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- char *reason = NULL;
- struct list_head txs;
- unsigned long flags;
-
- INIT_LIST_HEAD(&txs);
-
- write_lock_irqsave(glock, flags);
- if (!peer->ibp_reconnecting) {
- if (peer->ibp_accepting)
- reason = "accepting";
- else if (peer->ibp_connecting)
- reason = "connecting";
- else if (!list_empty(&peer->ibp_conns))
- reason = "connected";
- else /* connected then closed */
- reason = "closed";
-
- goto no_reconnect;
- }
-
- LASSERT(!peer->ibp_accepting && !peer->ibp_connecting &&
- list_empty(&peer->ibp_conns));
- peer->ibp_reconnecting--;
-
- if (!kiblnd_peer_active(peer)) {
- list_splice_init(&peer->ibp_tx_queue, &txs);
- reason = "unlinked";
- goto no_reconnect;
- }
-
- peer->ibp_connecting++;
- peer->ibp_reconnected++;
- write_unlock_irqrestore(glock, flags);
-
- kiblnd_connect_peer(peer);
- return true;
-
-no_reconnect:
- write_unlock_irqrestore(glock, flags);
-
- CWARN("Abort reconnection of %s: %s\n",
- libcfs_nid2str(peer->ibp_nid), reason);
- kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED);
- return false;
-}
-
-void
-kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
-{
- struct kib_peer *peer;
- struct kib_peer *peer2;
- struct kib_conn *conn;
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- unsigned long flags;
- int rc;
- int i;
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-
- /*
- * If I get here, I've committed to send, so I complete the tx with
- * failure on any problems
- */
- LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */
- LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */
-
- /*
- * First time, just use a read lock since I expect to find my peer
- * connected
- */
- read_lock_irqsave(g_lock, flags);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer && !list_empty(&peer->ibp_conns)) {
- /* Found a peer with an established connection */
- conn = kiblnd_get_conn_locked(peer);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- read_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- return;
- }
-
- read_unlock(g_lock);
- /* Re-try with a write lock */
- write_lock(g_lock);
-
- peer = kiblnd_find_peer_locked(nid);
- if (peer) {
- if (list_empty(&peer->ibp_conns)) {
- /* found a peer, but it's still connecting... */
- LASSERT(kiblnd_peer_connecting(peer));
- if (tx)
- list_add_tail(&tx->tx_list,
- &peer->ibp_tx_queue);
- write_unlock_irqrestore(g_lock, flags);
- } else {
- conn = kiblnd_get_conn_locked(peer);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- write_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- }
- return;
- }
-
- write_unlock_irqrestore(g_lock, flags);
-
- /* Allocate a peer ready to add to the peer table and retry */
- rc = kiblnd_create_peer(ni, &peer, nid);
- if (rc) {
- CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
- if (tx) {
- tx->tx_status = -EHOSTUNREACH;
- tx->tx_waiting = 0;
- kiblnd_tx_done(ni, tx);
- }
- return;
- }
-
- write_lock_irqsave(g_lock, flags);
-
- peer2 = kiblnd_find_peer_locked(nid);
- if (peer2) {
- if (list_empty(&peer2->ibp_conns)) {
- /* found a peer, but it's still connecting... */
- LASSERT(kiblnd_peer_connecting(peer2));
- if (tx)
- list_add_tail(&tx->tx_list,
- &peer2->ibp_tx_queue);
- write_unlock_irqrestore(g_lock, flags);
- } else {
- conn = kiblnd_get_conn_locked(peer2);
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- write_unlock_irqrestore(g_lock, flags);
-
- if (tx)
- kiblnd_queue_tx(tx, conn);
- kiblnd_conn_decref(conn); /* ...to here */
- }
-
- kiblnd_peer_decref(peer);
- return;
- }
-
- /* Brand new peer */
- LASSERT(!peer->ibp_connecting);
- tunables = &peer->ibp_ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
- peer->ibp_connecting = tunables->lnd_conns_per_peer;
-
- /* always called with a ref on ni, which prevents ni being shutdown */
- LASSERT(!((struct kib_net *)ni->ni_data)->ibn_shutdown);
-
- if (tx)
- list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
-
- kiblnd_peer_addref(peer);
- list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
-
- write_unlock_irqrestore(g_lock, flags);
-
- for (i = 0; i < tunables->lnd_conns_per_peer; i++)
- kiblnd_connect_peer(peer);
- kiblnd_peer_decref(peer);
-}
-
-int
-kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
-{
- struct lnet_hdr *hdr = &lntmsg->msg_hdr;
- int type = lntmsg->msg_type;
- struct lnet_process_id target = lntmsg->msg_target;
- int target_is_router = lntmsg->msg_target_is_router;
- int routing = lntmsg->msg_routing;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct kvec *payload_iov = lntmsg->msg_iov;
- struct bio_vec *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- struct iov_iter from;
- struct kib_msg *ibmsg;
- struct kib_rdma_desc *rd;
- struct kib_tx *tx;
- int nob;
- int rc;
-
- /* NB 'private' is different depending on what we're sending.... */
-
- CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT(!payload_nob || payload_niov > 0);
- LASSERT(payload_niov <= LNET_MAX_IOV);
-
- /* Thread context */
- LASSERT(!in_interrupt());
- /* payload is either all vaddrs or all pages */
- LASSERT(!(payload_kiov && payload_iov));
-
- if (payload_kiov)
- iov_iter_bvec(&from, ITER_BVEC | WRITE,
- payload_kiov, payload_niov,
- payload_nob + payload_offset);
- else
- iov_iter_kvec(&from, ITER_KVEC | WRITE,
- payload_iov, payload_niov,
- payload_nob + payload_offset);
-
- iov_iter_advance(&from, payload_offset);
-
- switch (type) {
- default:
- LBUG();
- return -EIO;
-
- case LNET_MSG_ACK:
- LASSERT(!payload_nob);
- break;
-
- case LNET_MSG_GET:
- if (routing || target_is_router)
- break; /* send IMMEDIATE */
-
- /* is the REPLY message too small for RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
- if (nob <= IBLND_MSG_SIZE)
- break; /* send IMMEDIATE */
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't allocate txd for GET to %s\n",
- libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- ibmsg = tx->tx_msg;
- rd = &ibmsg->ibm_u.get.ibgm_rd;
- if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV))
- rc = kiblnd_setup_rd_iov(ni, tx, rd,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.iov,
- 0, lntmsg->msg_md->md_length);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, rd,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.kiov,
- 0, lntmsg->msg_md->md_length);
- if (rc) {
- CERROR("Can't setup GET sink for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]);
- ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
- ibmsg->ibm_u.get.ibgm_hdr = *hdr;
-
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
-
- tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
- if (!tx->tx_lntmsg[1]) {
- CERROR("Can't create reply for GET -> %s\n",
- libcfs_nid2str(target.nid));
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
- tx->tx_waiting = 1; /* waiting for GET_DONE */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
-
- case LNET_MSG_REPLY:
- case LNET_MSG_PUT:
- /* Is the payload small enough not to need RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob <= IBLND_MSG_SIZE)
- break; /* send IMMEDIATE */
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't allocate %s txd for %s\n",
- type == LNET_MSG_PUT ? "PUT" : "REPLY",
- libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- if (!payload_kiov)
- rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
- payload_niov, payload_iov,
- payload_offset, payload_nob);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
- if (rc) {
- CERROR("Can't setup PUT src for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- kiblnd_tx_done(ni, tx);
- return -EIO;
- }
-
- ibmsg = tx->tx_msg;
- ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
- ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(struct kib_putreq_msg));
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
- }
-
- /* send IMMEDIATE */
-
- LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob])
- <= IBLND_MSG_SIZE);
-
- tx = kiblnd_get_idle_tx(ni, target.nid);
- if (!tx) {
- CERROR("Can't send %d to %s: tx descs exhausted\n",
- type, libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
-
- ibmsg = tx->tx_msg;
- ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
-
- rc = copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, payload_nob,
- &from);
- if (rc != payload_nob) {
- kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
- return -EFAULT;
- }
-
- nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]);
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- kiblnd_launch_tx(ni, tx, target.nid);
- return 0;
-}
-
-static void
-kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
-{
- struct lnet_process_id target = lntmsg->msg_target;
- unsigned int niov = lntmsg->msg_niov;
- struct kvec *iov = lntmsg->msg_iov;
- struct bio_vec *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- struct kib_tx *tx;
- int rc;
-
- tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
- if (!tx) {
- CERROR("Can't get tx for REPLY to %s\n",
- libcfs_nid2str(target.nid));
- goto failed_0;
- }
-
- if (!nob)
- rc = 0;
- else if (!kiov)
- rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
- niov, iov, offset, nob);
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
- niov, kiov, offset, nob);
-
- if (rc) {
- CERROR("Can't setup GET src for %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- goto failed_1;
- }
-
- rc = kiblnd_init_rdma(rx->rx_conn, tx,
- IBLND_MSG_GET_DONE, nob,
- &rx->rx_msg->ibm_u.get.ibgm_rd,
- rx->rx_msg->ibm_u.get.ibgm_cookie);
- if (rc < 0) {
- CERROR("Can't setup rdma for GET from %s: %d\n",
- libcfs_nid2str(target.nid), rc);
- goto failed_1;
- }
-
- if (!nob) {
- /* No RDMA: local completion may happen now! */
- lnet_finalize(ni, lntmsg, 0);
- } else {
- /* RDMA: lnet_finalize(lntmsg) when it completes */
- tx->tx_lntmsg[0] = lntmsg;
- }
-
- kiblnd_queue_tx(tx, rx->rx_conn);
- return;
-
- failed_1:
- kiblnd_tx_done(ni, tx);
- failed_0:
- lnet_finalize(ni, lntmsg, -EIO);
-}
-
-int
-kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen)
-{
- struct kib_rx *rx = private;
- struct kib_msg *rxmsg = rx->rx_msg;
- struct kib_conn *conn = rx->rx_conn;
- struct kib_tx *tx;
- int nob;
- int post_credit = IBLND_POSTRX_PEER_CREDIT;
- int rc = 0;
-
- LASSERT(iov_iter_count(to) <= rlen);
- LASSERT(!in_interrupt());
- /* Either all pages or all vaddrs */
-
- switch (rxmsg->ibm_type) {
- default:
- LBUG();
-
- case IBLND_MSG_IMMEDIATE:
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]);
- if (nob > rx->rx_nob) {
- CERROR("Immediate message from %s too big: %d(%d)\n",
- libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
- nob, rx->rx_nob);
- rc = -EPROTO;
- break;
- }
-
- rc = copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, rlen,
- to);
- if (rc != rlen) {
- rc = -EFAULT;
- break;
- }
-
- rc = 0;
- lnet_finalize(ni, lntmsg, 0);
- break;
-
- case IBLND_MSG_PUT_REQ: {
- struct kib_msg *txmsg;
- struct kib_rdma_desc *rd;
-
- if (!iov_iter_count(to)) {
- lnet_finalize(ni, lntmsg, 0);
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
- rxmsg->ibm_u.putreq.ibprm_cookie);
- break;
- }
-
- tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
- if (!tx) {
- CERROR("Can't allocate tx for %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- /* Not replying will break the connection */
- rc = -ENOMEM;
- break;
- }
-
- txmsg = tx->tx_msg;
- rd = &txmsg->ibm_u.putack.ibpam_rd;
- if (!(to->type & ITER_BVEC))
- rc = kiblnd_setup_rd_iov(ni, tx, rd,
- to->nr_segs, to->kvec,
- to->iov_offset,
- iov_iter_count(to));
- else
- rc = kiblnd_setup_rd_kiov(ni, tx, rd,
- to->nr_segs, to->bvec,
- to->iov_offset,
- iov_iter_count(to));
- if (rc) {
- CERROR("Can't setup PUT sink for %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- kiblnd_tx_done(ni, tx);
- /* tell peer it's over */
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
- rxmsg->ibm_u.putreq.ibprm_cookie);
- break;
- }
-
- nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]);
- txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
- txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
-
- kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
-
- tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
- tx->tx_waiting = 1; /* waiting for PUT_DONE */
- kiblnd_queue_tx(tx, conn);
-
- /* reposted buffer reserved for PUT_DONE */
- post_credit = IBLND_POSTRX_NO_CREDIT;
- break;
- }
-
- case IBLND_MSG_GET_REQ:
- if (lntmsg) {
- /* Optimized GET; RDMA lntmsg's payload */
- kiblnd_reply(ni, rx, lntmsg);
- } else {
- /* GET didn't match anything */
- kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
- -ENODATA,
- rxmsg->ibm_u.get.ibgm_cookie);
- }
- break;
- }
-
- kiblnd_post_rx(rx, post_credit);
- return rc;
-}
-
-int
-kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = kthread_run(fn, arg, "%s", name);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- atomic_inc(&kiblnd_data.kib_nthreads);
- return 0;
-}
-
-static void
-kiblnd_thread_fini(void)
-{
- atomic_dec(&kiblnd_data.kib_nthreads);
-}
-
-static void
-kiblnd_peer_alive(struct kib_peer *peer)
-{
- /* This is racy, but everyone's only writing jiffies */
- peer->ibp_last_alive = jiffies;
- mb();
-}
-
-static void
-kiblnd_peer_notify(struct kib_peer *peer)
-{
- int error = 0;
- unsigned long last_alive = 0;
- unsigned long flags;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (kiblnd_peer_idle(peer) && peer->ibp_error) {
- error = peer->ibp_error;
- peer->ibp_error = 0;
-
- last_alive = peer->ibp_last_alive;
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- if (error)
- lnet_notify(peer->ibp_ni,
- peer->ibp_nid, 0, last_alive);
-}
-
-void
-kiblnd_close_conn_locked(struct kib_conn *conn, int error)
-{
- /*
- * This just does the immediate housekeeping. 'error' is zero for a
- * normal shutdown which can happen only after the connection has been
- * established. If the connection is established, schedule the
- * connection to be finished off by the connd. Otherwise the connd is
- * already dealing with it (either to set it up or tear it down).
- * Caller holds kib_global_lock exclusively in irq context
- */
- struct kib_peer *peer = conn->ibc_peer;
- struct kib_dev *dev;
- unsigned long flags;
-
- LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- if (error && !conn->ibc_comms_error)
- conn->ibc_comms_error = error;
-
- if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
- return; /* already being handled */
-
- if (!error &&
- list_empty(&conn->ibc_tx_noops) &&
- list_empty(&conn->ibc_tx_queue) &&
- list_empty(&conn->ibc_tx_queue_rsrvd) &&
- list_empty(&conn->ibc_tx_queue_nocred) &&
- list_empty(&conn->ibc_active_txs)) {
- CDEBUG(D_NET, "closing conn to %s\n",
- libcfs_nid2str(peer->ibp_nid));
- } else {
- CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
- libcfs_nid2str(peer->ibp_nid), error,
- list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
- list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
- list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
- list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
- list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
- }
-
- dev = ((struct kib_net *)peer->ibp_ni->ni_data)->ibn_dev;
- if (peer->ibp_next_conn == conn)
- /* clear next_conn so it won't be used */
- peer->ibp_next_conn = NULL;
- list_del(&conn->ibc_list);
- /* connd (see below) takes over ibc_list's ref */
-
- if (list_empty(&peer->ibp_conns) && /* no more conns */
- kiblnd_peer_active(peer)) { /* still in peer table */
- kiblnd_unlink_peer_locked(peer);
-
- /* set/clear error on last conn */
- peer->ibp_error = conn->ibc_comms_error;
- }
-
- kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
-
- if (error &&
- kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- wake_up(&kiblnd_data.kib_failover_waitq);
- }
-
- spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
-
- list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
- wake_up(&kiblnd_data.kib_connd_waitq);
-
- spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
-}
-
-void
-kiblnd_close_conn(struct kib_conn *conn, int error)
-{
- unsigned long flags;
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_close_conn_locked(conn, error);
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-}
-
-static void
-kiblnd_handle_early_rxs(struct kib_conn *conn)
-{
- unsigned long flags;
- struct kib_rx *rx;
-
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- while (!list_empty(&conn->ibc_early_rxs)) {
- rx = list_entry(conn->ibc_early_rxs.next,
- struct kib_rx, rx_list);
- list_del(&rx->rx_list);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_handle_rx(rx);
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- }
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-}
-
-static void
-kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs)
-{
- LIST_HEAD(zombies);
- struct list_head *tmp;
- struct list_head *nxt;
- struct kib_tx *tx;
-
- spin_lock(&conn->ibc_lock);
-
- list_for_each_safe(tmp, nxt, txs) {
- tx = list_entry(tmp, struct kib_tx, tx_list);
-
- if (txs == &conn->ibc_active_txs) {
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_waiting || tx->tx_sending);
- } else {
- LASSERT(tx->tx_queued);
- }
-
- tx->tx_status = -ECONNABORTED;
- tx->tx_waiting = 0;
-
- if (!tx->tx_sending) {
- tx->tx_queued = 0;
- list_del(&tx->tx_list);
- list_add(&tx->tx_list, &zombies);
- }
- }
-
- spin_unlock(&conn->ibc_lock);
-
- kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
-}
-
-static void
-kiblnd_finalise_conn(struct kib_conn *conn)
-{
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state > IBLND_CONN_INIT);
-
- kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
-
- /*
- * abort_receives moves QP state to IB_QPS_ERR. This is only required
- * for connections that didn't get as far as being connected, because
- * rdma_disconnect() does this for free.
- */
- kiblnd_abort_receives(conn);
-
- /*
- * Complete all tx descs not waiting for sends to complete.
- * NB we should be safe from RDMA now that the QP has changed state
- */
- kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
- kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
- kiblnd_abort_txs(conn, &conn->ibc_active_txs);
-
- kiblnd_handle_early_rxs(conn);
-}
-
-static void
-kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error)
-{
- LIST_HEAD(zombies);
- unsigned long flags;
-
- LASSERT(error);
- LASSERT(!in_interrupt());
-
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- if (active) {
- LASSERT(peer->ibp_connecting > 0);
- peer->ibp_connecting--;
- } else {
- LASSERT(peer->ibp_accepting > 0);
- peer->ibp_accepting--;
- }
-
- if (kiblnd_peer_connecting(peer)) {
- /* another connection attempt under way... */
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
- flags);
- return;
- }
-
- peer->ibp_reconnected = 0;
- if (list_empty(&peer->ibp_conns)) {
- /* Take peer's blocked transmits to complete with error */
- list_add(&zombies, &peer->ibp_tx_queue);
- list_del_init(&peer->ibp_tx_queue);
-
- if (kiblnd_peer_active(peer))
- kiblnd_unlink_peer_locked(peer);
-
- peer->ibp_error = error;
- } else {
- /* Can't have blocked transmits if there are connections */
- LASSERT(list_empty(&peer->ibp_tx_queue));
- }
-
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_peer_notify(peer);
-
- if (list_empty(&zombies))
- return;
-
- CNETERR("Deleting messages for %s: connection failed\n",
- libcfs_nid2str(peer->ibp_nid));
-
- kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
-}
-
-static void
-kiblnd_connreq_done(struct kib_conn *conn, int status)
-{
- struct kib_peer *peer = conn->ibc_peer;
- struct kib_tx *tx;
- struct kib_tx *tmp;
- struct list_head txs;
- unsigned long flags;
- int active;
-
- active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-
- CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
- libcfs_nid2str(peer->ibp_nid), active,
- conn->ibc_version, status);
-
- LASSERT(!in_interrupt());
- LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
- peer->ibp_connecting > 0) ||
- (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
- peer->ibp_accepting > 0));
-
- kfree(conn->ibc_connvars);
- conn->ibc_connvars = NULL;
-
- if (status) {
- /* failed to establish connection */
- kiblnd_peer_connect_failed(peer, active, status);
- kiblnd_finalise_conn(conn);
- return;
- }
-
- /* connection established */
- write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- conn->ibc_last_send = jiffies;
- kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
- kiblnd_peer_alive(peer);
-
- /*
- * Add conn to peer's list and nuke any dangling conns from a different
- * peer instance...
- */
- kiblnd_conn_addref(conn); /* +1 ref for ibc_list */
- list_add(&conn->ibc_list, &peer->ibp_conns);
- peer->ibp_reconnected = 0;
- if (active)
- peer->ibp_connecting--;
- else
- peer->ibp_accepting--;
-
- if (!peer->ibp_version) {
- peer->ibp_version = conn->ibc_version;
- peer->ibp_incarnation = conn->ibc_incarnation;
- }
-
- if (peer->ibp_version != conn->ibc_version ||
- peer->ibp_incarnation != conn->ibc_incarnation) {
- kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
- conn->ibc_incarnation);
- peer->ibp_version = conn->ibc_version;
- peer->ibp_incarnation = conn->ibc_incarnation;
- }
-
- /* grab pending txs while I have the lock */
- list_add(&txs, &peer->ibp_tx_queue);
- list_del_init(&peer->ibp_tx_queue);
-
- if (!kiblnd_peer_active(peer) || /* peer has been deleted */
- conn->ibc_comms_error) { /* error has happened already */
- struct lnet_ni *ni = peer->ibp_ni;
-
- /* start to shut down connection */
- kiblnd_close_conn_locked(conn, -ECONNABORTED);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
-
- return;
- }
-
- /*
- * +1 ref for myself, this connection is visible to other threads
- * now, refcount of peer:ibp_conns can be released by connection
- * close from either a different thread, or the calling of
- * kiblnd_check_sends_locked() below. See bz21911 for details.
- */
- kiblnd_conn_addref(conn);
- write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /* Schedule blocked txs
- * Note: if we are running with conns_per_peer > 1, these blocked
- * txs will all get scheduled to the first connection which gets
- * scheduled. We won't be using round robin on this first batch.
- */
- spin_lock(&conn->ibc_lock);
- list_for_each_entry_safe(tx, tmp, &txs, tx_list) {
- list_del(&tx->tx_list);
-
- kiblnd_queue_tx_locked(tx, conn);
- }
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- /* schedule blocked rxs */
- kiblnd_handle_early_rxs(conn);
-
- kiblnd_conn_decref(conn);
-}
-
-static void
-kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej)
-{
- int rc;
-
- rc = rdma_reject(cmid, rej, sizeof(*rej));
-
- if (rc)
- CWARN("Error %d sending reject\n", rc);
-}
-
-static int
-kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
-{
- rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
- struct kib_msg *reqmsg = priv;
- struct kib_msg *ackmsg;
- struct kib_dev *ibdev;
- struct kib_peer *peer;
- struct kib_peer *peer2;
- struct kib_conn *conn;
- struct lnet_ni *ni = NULL;
- struct kib_net *net = NULL;
- lnet_nid_t nid;
- struct rdma_conn_param cp;
- struct kib_rej rej;
- int version = IBLND_MSG_VERSION;
- unsigned long flags;
- int max_frags;
- int rc;
- struct sockaddr_in *peer_addr;
-
- LASSERT(!in_interrupt());
-
- /* cmid inherits 'context' from the corresponding listener id */
- ibdev = (struct kib_dev *)cmid->context;
- LASSERT(ibdev);
-
- memset(&rej, 0, sizeof(rej));
- rej.ibr_magic = IBLND_MSG_MAGIC;
- rej.ibr_why = IBLND_REJECT_FATAL;
- rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr;
- if (*kiblnd_tunables.kib_require_priv_port &&
- ntohs(peer_addr->sin_port) >= PROT_SOCK) {
- __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
-
- CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
- &ip, ntohs(peer_addr->sin_port));
- goto failed;
- }
-
- if (priv_nob < offsetof(struct kib_msg, ibm_type)) {
- CERROR("Short connection request\n");
- goto failed;
- }
-
- /*
- * Future protocol version compatibility support! If the
- * o2iblnd-specific protocol changes, or when LNET unifies
- * protocols over all LNDs, the initial connection will
- * negotiate a protocol version. I trap this here to avoid
- * console errors; the reject tells the peer which protocol I
- * speak.
- */
- if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
- reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
- goto failed;
- if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
- reqmsg->ibm_version != IBLND_MSG_VERSION &&
- reqmsg->ibm_version != IBLND_MSG_VERSION_1)
- goto failed;
- if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
- reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
- reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
- goto failed;
-
- rc = kiblnd_unpack_msg(reqmsg, priv_nob);
- if (rc) {
- CERROR("Can't parse connection request: %d\n", rc);
- goto failed;
- }
-
- nid = reqmsg->ibm_srcnid;
- ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
-
- if (ni) {
- net = (struct kib_net *)ni->ni_data;
- rej.ibr_incarnation = net->ibn_incarnation;
- }
-
- if (!ni || /* no matching net */
- ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */
- net->ibn_dev != ibdev) { /* wrong device */
- CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n",
- libcfs_nid2str(nid),
- !ni ? "NA" : libcfs_nid2str(ni->ni_nid),
- ibdev->ibd_ifname, ibdev->ibd_nnets,
- &ibdev->ibd_ifip,
- libcfs_nid2str(reqmsg->ibm_dstnid));
-
- goto failed;
- }
-
- /* check time stamp as soon as possible */
- if (reqmsg->ibm_dststamp &&
- reqmsg->ibm_dststamp != net->ibn_incarnation) {
- CWARN("Stale connection request\n");
- rej.ibr_why = IBLND_REJECT_CONN_STALE;
- goto failed;
- }
-
- /* I can accept peer's version */
- version = reqmsg->ibm_version;
-
- if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
- CERROR("Unexpected connreq msg type: %x from %s\n",
- reqmsg->ibm_type, libcfs_nid2str(nid));
- goto failed;
- }
-
- if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
- kiblnd_msg_queue_size(version, ni)) {
- CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n",
- libcfs_nid2str(nid),
- reqmsg->ibm_u.connparams.ibcp_queue_depth,
- kiblnd_msg_queue_size(version, ni));
-
- if (version == IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
-
- goto failed;
- }
-
- max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
- if (max_frags > kiblnd_rdma_frags(version, ni)) {
- CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n",
- libcfs_nid2str(nid), version, max_frags,
- kiblnd_rdma_frags(version, ni));
-
- if (version >= IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
-
- goto failed;
- } else if (max_frags < kiblnd_rdma_frags(version, ni) &&
- !net->ibn_fmr_ps) {
- CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n",
- libcfs_nid2str(nid), version, max_frags,
- kiblnd_rdma_frags(version, ni));
-
- if (version == IBLND_MSG_VERSION)
- rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
-
- goto failed;
- }
-
- if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
- CERROR("Can't accept %s: message size %d too big (%d max)\n",
- libcfs_nid2str(nid),
- reqmsg->ibm_u.connparams.ibcp_max_msg_size,
- IBLND_MSG_SIZE);
- goto failed;
- }
-
- /* assume 'nid' is a new peer; create */
- rc = kiblnd_create_peer(ni, &peer, nid);
- if (rc) {
- CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
- rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
- goto failed;
- }
-
- /* We have validated the peer's parameters so use those */
- peer->ibp_max_frags = max_frags;
- peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
-
- write_lock_irqsave(g_lock, flags);
-
- peer2 = kiblnd_find_peer_locked(nid);
- if (peer2) {
- if (!peer2->ibp_version) {
- peer2->ibp_version = version;
- peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
- }
-
- /* not the guy I've talked with */
- if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
- peer2->ibp_version != version) {
- kiblnd_close_peer_conns_locked(peer2, -ESTALE);
-
- if (kiblnd_peer_active(peer2)) {
- peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
- peer2->ibp_version = version;
- }
- write_unlock_irqrestore(g_lock, flags);
-
- CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n",
- libcfs_nid2str(nid), peer2->ibp_version, version,
- peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
-
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_CONN_STALE;
- goto failed;
- }
-
- /*
- * Tie-break connection race in favour of the higher NID.
- * If we keep running into a race condition multiple times,
- * we have to assume that the connection attempt with the
- * higher NID is stuck in a connecting state and will never
- * recover. As such, we pass through this if-block and let
- * the lower NID connection win so we can move forward.
- */
- if (peer2->ibp_connecting &&
- nid < ni->ni_nid && peer2->ibp_races <
- MAX_CONN_RACES_BEFORE_ABORT) {
- peer2->ibp_races++;
- write_unlock_irqrestore(g_lock, flags);
-
- CDEBUG(D_NET, "Conn race %s\n",
- libcfs_nid2str(peer2->ibp_nid));
-
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_CONN_RACE;
- goto failed;
- }
- if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT)
- CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n",
- libcfs_nid2str(peer2->ibp_nid),
- MAX_CONN_RACES_BEFORE_ABORT);
-		/*
-		 * A passive connection is allowed even when this peer is
-		 * waiting for reconnection.
-		 */
- peer2->ibp_reconnecting = 0;
- peer2->ibp_races = 0;
- peer2->ibp_accepting++;
- kiblnd_peer_addref(peer2);
-
-		/*
-		 * We raced with kiblnd_launch_tx (active connect) to create
-		 * the peer, so copy the validated parameters now that we know
-		 * what the peer's limits are.
-		 */
- peer2->ibp_max_frags = peer->ibp_max_frags;
- peer2->ibp_queue_depth = peer->ibp_queue_depth;
-
- write_unlock_irqrestore(g_lock, flags);
- kiblnd_peer_decref(peer);
- peer = peer2;
- } else {
- /* Brand new peer */
- LASSERT(!peer->ibp_accepting);
- LASSERT(!peer->ibp_version &&
- !peer->ibp_incarnation);
-
- peer->ibp_accepting = 1;
- peer->ibp_version = version;
- peer->ibp_incarnation = reqmsg->ibm_srcstamp;
-
- /* I have a ref on ni that prevents it being shutdown */
- LASSERT(!net->ibn_shutdown);
-
- kiblnd_peer_addref(peer);
- list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
-
- write_unlock_irqrestore(g_lock, flags);
- }
-
- conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT,
- version);
- if (!conn) {
- kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
- kiblnd_peer_decref(peer);
- rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
- goto failed;
- }
-
- /*
- * conn now "owns" cmid, so I return success from here on to ensure the
- * CM callback doesn't destroy cmid.
- */
- conn->ibc_incarnation = reqmsg->ibm_srcstamp;
- conn->ibc_credits = conn->ibc_queue_depth;
- conn->ibc_reserved_credits = conn->ibc_queue_depth;
- LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
- IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn));
-
- ackmsg = &conn->ibc_connvars->cv_msg;
- memset(ackmsg, 0, sizeof(*ackmsg));
-
- kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
- sizeof(ackmsg->ibm_u.connparams));
- ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
- ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
- ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
-
- memset(&cp, 0, sizeof(cp));
- cp.private_data = ackmsg;
- cp.private_data_len = ackmsg->ibm_nob;
- cp.responder_resources = 0; /* No atomic ops or RDMA reads */
- cp.initiator_depth = 0;
- cp.flow_control = 1;
- cp.retry_count = *kiblnd_tunables.kib_retry_count;
- cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
-
- CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
-
- rc = rdma_accept(cmid, &cp);
- if (rc) {
- CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
- rej.ibr_version = version;
- rej.ibr_why = IBLND_REJECT_FATAL;
-
- kiblnd_reject(cmid, &rej);
- kiblnd_connreq_done(conn, rc);
- kiblnd_conn_decref(conn);
- }
-
- lnet_ni_decref(ni);
- return 0;
-
- failed:
- if (ni) {
- rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
- rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
- lnet_ni_decref(ni);
- }
-
- rej.ibr_version = version;
- kiblnd_reject(cmid, &rej);
-
- return -ECONNREFUSED;
-}
-
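-/*
- * Decide whether a failed active connect should be retried.  If so, mark the
- * conn for reconnection and record the negotiated version, incarnation and
- * any reduced queue depth / max frags on the peer.
- */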
-static void
-kiblnd_check_reconnect(struct kib_conn *conn, int version,
- __u64 incarnation, int why, struct kib_connparams *cp)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_peer *peer = conn->ibc_peer;
- char *reason;
- int msg_size = IBLND_MSG_SIZE;
- int frag_num = -1;
- int queue_dep = -1;
- bool reconnect;
- unsigned long flags;
-
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
- LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */
-
- if (cp) {
- msg_size = cp->ibcp_max_msg_size;
- frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT;
- queue_dep = cp->ibcp_queue_depth;
- }
-
- write_lock_irqsave(glock, flags);
-	/*
-	 * Retry the connection if it's still needed and no other connection
-	 * attempts (active or passive) are in progress.
-	 * NB: a reconnect is still needed even when ibp_tx_queue is empty
-	 * if ibp_version != version, because a reconnect may be initiated
-	 * by kiblnd_query().
-	 */
- reconnect = (!list_empty(&peer->ibp_tx_queue) ||
- peer->ibp_version != version) &&
- peer->ibp_connecting &&
- !peer->ibp_accepting;
- if (!reconnect) {
- reason = "no need";
- goto out;
- }
-
- switch (why) {
- default:
- reason = "Unknown";
- break;
-
- case IBLND_REJECT_RDMA_FRAGS: {
- struct lnet_ioctl_config_lnd_tunables *tunables;
-
- if (!cp) {
- reason = "can't negotiate max frags";
- goto out;
- }
- tunables = peer->ibp_ni->ni_lnd_tunables;
- if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
- reason = "map_on_demand must be enabled";
- goto out;
- }
- if (conn->ibc_max_frags <= frag_num) {
- reason = "unsupported max frags";
- goto out;
- }
-
- peer->ibp_max_frags = frag_num;
- reason = "rdma fragments";
- break;
- }
- case IBLND_REJECT_MSG_QUEUE_SIZE:
- if (!cp) {
- reason = "can't negotiate queue depth";
- goto out;
- }
- if (conn->ibc_queue_depth <= queue_dep) {
- reason = "unsupported queue depth";
- goto out;
- }
-
- peer->ibp_queue_depth = queue_dep;
- reason = "queue depth";
- break;
-
- case IBLND_REJECT_CONN_STALE:
- reason = "stale";
- break;
-
- case IBLND_REJECT_CONN_RACE:
- reason = "conn race";
- break;
-
- case IBLND_REJECT_CONN_UNCOMPAT:
- reason = "version negotiation";
- break;
- }
-
- conn->ibc_reconnect = 1;
- peer->ibp_reconnecting++;
- peer->ibp_version = version;
- if (incarnation)
- peer->ibp_incarnation = incarnation;
-out:
- write_unlock_irqrestore(glock, flags);
-
- CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
- libcfs_nid2str(peer->ibp_nid),
- reconnect ? "reconnect" : "don't reconnect",
- reason, IBLND_MSG_VERSION, version, msg_size,
- conn->ibc_queue_depth, queue_dep,
- conn->ibc_max_frags, frag_num);
-	/*
-	 * If conn::ibc_reconnect is TRUE, connd will reconnect to the peer
-	 * while destroying the zombie.
-	 */
-}
-
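-/*
- * Handle a REJECT received on an active connect: decode any o2iblnd
- * rejection (struct kib_rej) carried in the private data and schedule a
- * reconnect when the reason is recoverable (stale conn, connection race,
- * version or parameter mismatch).
- */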
-static void
-kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
-{
- struct kib_peer *peer = conn->ibc_peer;
-
- LASSERT(!in_interrupt());
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-
- switch (reason) {
- case IB_CM_REJ_STALE_CONN:
- kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
- IBLND_REJECT_CONN_STALE, NULL);
- break;
-
- case IB_CM_REJ_INVALID_SERVICE_ID:
- CNETERR("%s rejected: no listener at %d\n",
- libcfs_nid2str(peer->ibp_nid),
- *kiblnd_tunables.kib_service);
- break;
-
- case IB_CM_REJ_CONSUMER_DEFINED:
- if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
- struct kib_rej *rej = priv;
- struct kib_connparams *cp = NULL;
- int flip = 0;
- __u64 incarnation = -1;
-
-			/* NB: the default incarnation is -1 because:
-			 * a) V1 ignores the dst incarnation in the connreq.
-			 * b) V2 provides an incarnation when rejecting me, so
-			 *    the -1 will be overwritten.
-			 *
-			 * If I try to connect to a V1 peer with the V2
-			 * protocol and it rejects me and then upgrades to V2,
-			 * I know nothing about the upgrade and retry with V1.
-			 * The upgraded V2 peer can then tell I'm talking the
-			 * old protocol and rejects me (incarnation is -1).
-			 */
-
- if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
- rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
- __swab32s(&rej->ibr_magic);
- __swab16s(&rej->ibr_version);
- flip = 1;
- }
-
- if (priv_nob >= sizeof(struct kib_rej) &&
- rej->ibr_version > IBLND_MSG_VERSION_1) {
-				/*
-				 * priv_nob is always 148 (the define of
-				 * IB_CM_REJ_PRIVATE_DATA_SIZE) in the current
-				 * version of OFED, so we still need to check
-				 * the version.
-				 */
- cp = &rej->ibr_cp;
-
- if (flip) {
- __swab64s(&rej->ibr_incarnation);
- __swab16s(&cp->ibcp_queue_depth);
- __swab16s(&cp->ibcp_max_frags);
- __swab32s(&cp->ibcp_max_msg_size);
- }
-
- incarnation = rej->ibr_incarnation;
- }
-
- if (rej->ibr_magic != IBLND_MSG_MAGIC &&
- rej->ibr_magic != LNET_PROTO_MAGIC) {
- CERROR("%s rejected: consumer defined fatal error\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
- }
-
- if (rej->ibr_version != IBLND_MSG_VERSION &&
- rej->ibr_version != IBLND_MSG_VERSION_1) {
- CERROR("%s rejected: o2iblnd version %x error\n",
- libcfs_nid2str(peer->ibp_nid),
- rej->ibr_version);
- break;
- }
-
- if (rej->ibr_why == IBLND_REJECT_FATAL &&
- rej->ibr_version == IBLND_MSG_VERSION_1) {
- CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
- libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
-
- if (conn->ibc_version != IBLND_MSG_VERSION_1)
- rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
- }
-
- switch (rej->ibr_why) {
- case IBLND_REJECT_CONN_RACE:
- case IBLND_REJECT_CONN_STALE:
- case IBLND_REJECT_CONN_UNCOMPAT:
- case IBLND_REJECT_MSG_QUEUE_SIZE:
- case IBLND_REJECT_RDMA_FRAGS:
- kiblnd_check_reconnect(conn, rej->ibr_version,
- incarnation,
- rej->ibr_why, cp);
- break;
-
- case IBLND_REJECT_NO_RESOURCES:
- CERROR("%s rejected: o2iblnd no resources\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
-
- case IBLND_REJECT_FATAL:
- CERROR("%s rejected: o2iblnd fatal error\n",
- libcfs_nid2str(peer->ibp_nid));
- break;
-
- default:
- CERROR("%s rejected: o2iblnd reason %d\n",
- libcfs_nid2str(peer->ibp_nid),
- rej->ibr_why);
- break;
- }
- break;
- }
- /* fall through */
- default:
- CNETERR("%s rejected: reason %d, size %d\n",
- libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
- break;
- }
-
- kiblnd_connreq_done(conn, -ECONNREFUSED);
-}
-
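-/*
- * Validate the CONNACK received on an active connect; on success adopt the
- * peer's queue depth and max frags and complete the connection, otherwise
- * record a comms error so the already-established QP is torn down.
- */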
-static void
-kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
-{
- struct kib_peer *peer = conn->ibc_peer;
- struct lnet_ni *ni = peer->ibp_ni;
- struct kib_net *net = ni->ni_data;
- struct kib_msg *msg = priv;
- int ver = conn->ibc_version;
- int rc = kiblnd_unpack_msg(msg, priv_nob);
- unsigned long flags;
-
- LASSERT(net);
-
- if (rc) {
- CERROR("Can't unpack connack from %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- goto failed;
- }
-
- if (msg->ibm_type != IBLND_MSG_CONNACK) {
- CERROR("Unexpected message %d from %s\n",
- msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
- rc = -EPROTO;
- goto failed;
- }
-
- if (ver != msg->ibm_version) {
- CERROR("%s replied version %x is different with requested version %x\n",
- libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
- rc = -EPROTO;
- goto failed;
- }
-
- if (msg->ibm_u.connparams.ibcp_queue_depth >
- conn->ibc_queue_depth) {
- CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_queue_depth,
- conn->ibc_queue_depth);
- rc = -EPROTO;
- goto failed;
- }
-
- if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
- conn->ibc_max_frags) {
- CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
- conn->ibc_max_frags);
- rc = -EPROTO;
- goto failed;
- }
-
- if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
- CERROR("%s max message size %d too big (%d max)\n",
- libcfs_nid2str(peer->ibp_nid),
- msg->ibm_u.connparams.ibcp_max_msg_size,
- IBLND_MSG_SIZE);
- rc = -EPROTO;
- goto failed;
- }
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- if (msg->ibm_dstnid == ni->ni_nid &&
- msg->ibm_dststamp == net->ibn_incarnation)
- rc = 0;
- else
- rc = -ESTALE;
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- if (rc) {
- CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc,
- msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
- goto failed;
- }
-
- conn->ibc_incarnation = msg->ibm_srcstamp;
- conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth;
- conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
- LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
- IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
-
- kiblnd_connreq_done(conn, 0);
- return;
-
- failed:
- /*
- * NB My QP has already established itself, so I handle anything going
- * wrong here by setting ibc_comms_error.
- * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
- * immediately tears it down.
- */
- LASSERT(rc);
- conn->ibc_comms_error = rc;
- kiblnd_connreq_done(conn, 0);
-}
-
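-/*
- * Start an active connect on a route-resolved cmid: create the conn, pack a
- * CONNREQ into the private data and call rdma_connect().  The conn owns the
- * cmid from here on.
- */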
-static int
-kiblnd_active_connect(struct rdma_cm_id *cmid)
-{
- struct kib_peer *peer = (struct kib_peer *)cmid->context;
- struct kib_conn *conn;
- struct kib_msg *msg;
- struct rdma_conn_param cp;
- int version;
- __u64 incarnation;
- unsigned long flags;
- int rc;
-
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- incarnation = peer->ibp_incarnation;
- version = !peer->ibp_version ? IBLND_MSG_VERSION :
- peer->ibp_version;
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT,
- version);
- if (!conn) {
- kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
- kiblnd_peer_decref(peer); /* lose cmid's ref */
- return -ENOMEM;
- }
-
- /*
- * conn "owns" cmid now, so I return success from here on to ensure the
- * CM callback doesn't destroy cmid. conn also takes over cmid's ref
- * on peer
- */
- msg = &conn->ibc_connvars->cv_msg;
-
- memset(msg, 0, sizeof(*msg));
- kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
- msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth;
- msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
- msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
-
- kiblnd_pack_msg(peer->ibp_ni, msg, version,
- 0, peer->ibp_nid, incarnation);
-
- memset(&cp, 0, sizeof(cp));
- cp.private_data = msg;
- cp.private_data_len = msg->ibm_nob;
- cp.responder_resources = 0; /* No atomic ops or RDMA reads */
- cp.initiator_depth = 0;
- cp.flow_control = 1;
- cp.retry_count = *kiblnd_tunables.kib_retry_count;
- cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
-
- LASSERT(cmid->context == (void *)conn);
- LASSERT(conn->ibc_cmid == cmid);
-
- rc = rdma_connect(cmid, &cp);
- if (rc) {
- CERROR("Can't connect to %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- kiblnd_connreq_done(conn, rc);
- kiblnd_conn_decref(conn);
- }
-
- return 0;
-}
-
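-/* RDMA CM event handler shared by the listener and active/passive conns */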
-int
-kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
-{
- struct kib_peer *peer;
- struct kib_conn *conn;
- int rc;
-
- switch (event->event) {
- default:
- CERROR("Unexpected event: %d, status: %d\n",
- event->event, event->status);
- LBUG();
-
- case RDMA_CM_EVENT_CONNECT_REQUEST:
- /* destroy cmid on failure */
- rc = kiblnd_passive_connect(cmid,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- CDEBUG(D_NET, "connreq: %d\n", rc);
- return rc;
-
- case RDMA_CM_EVENT_ADDR_ERROR:
- peer = (struct kib_peer *)cmid->context;
- CNETERR("%s: ADDR ERROR %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
- kiblnd_peer_decref(peer);
- return -EHOSTUNREACH; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ADDR_RESOLVED:
- peer = (struct kib_peer *)cmid->context;
-
- CDEBUG(D_NET, "%s Addr resolved: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
-
- if (event->status) {
- CNETERR("Can't resolve address for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- rc = event->status;
- } else {
- rc = rdma_resolve_route(
- cmid, *kiblnd_tunables.kib_timeout * 1000);
- if (!rc) {
- struct kib_net *net = peer->ibp_ni->ni_data;
- struct kib_dev *dev = net->ibn_dev;
-
- CDEBUG(D_NET, "%s: connection bound to "\
- "%s:%pI4h:%s\n",
- libcfs_nid2str(peer->ibp_nid),
- dev->ibd_ifname,
- &dev->ibd_ifip, cmid->device->name);
-
- return 0;
- }
-
- /* Can't initiate route resolution */
- CERROR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
- }
- kiblnd_peer_connect_failed(peer, 1, rc);
- kiblnd_peer_decref(peer);
- return rc; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ROUTE_ERROR:
- peer = (struct kib_peer *)cmid->context;
- CNETERR("%s: ROUTE ERROR %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
- kiblnd_peer_decref(peer);
- return -EHOSTUNREACH; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_ROUTE_RESOLVED:
- peer = (struct kib_peer *)cmid->context;
- CDEBUG(D_NET, "%s Route resolved: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
-
- if (!event->status)
- return kiblnd_active_connect(cmid);
-
- CNETERR("Can't resolve route for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), event->status);
- kiblnd_peer_connect_failed(peer, 1, event->status);
- kiblnd_peer_decref(peer);
- return event->status; /* rc destroys cmid */
-
- case RDMA_CM_EVENT_UNREACHABLE:
- conn = (struct kib_conn *)cmid->context;
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
- conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
- CNETERR("%s: UNREACHABLE %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
- kiblnd_connreq_done(conn, -ENETDOWN);
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_CONNECT_ERROR:
- conn = (struct kib_conn *)cmid->context;
- LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
- conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
- CNETERR("%s: CONNECT ERROR %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
- kiblnd_connreq_done(conn, -ENOTCONN);
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_REJECTED:
- conn = (struct kib_conn *)cmid->context;
- switch (conn->ibc_state) {
- default:
- LBUG();
-
- case IBLND_CONN_PASSIVE_WAIT:
- CERROR("%s: REJECTED %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- event->status);
- kiblnd_connreq_done(conn, -ECONNRESET);
- break;
-
- case IBLND_CONN_ACTIVE_CONNECT:
- kiblnd_rejected(conn, event->status,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- break;
- }
- kiblnd_conn_decref(conn);
- return 0;
-
- case RDMA_CM_EVENT_ESTABLISHED:
- conn = (struct kib_conn *)cmid->context;
- switch (conn->ibc_state) {
- default:
- LBUG();
-
- case IBLND_CONN_PASSIVE_WAIT:
- CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_connreq_done(conn, 0);
- break;
-
- case IBLND_CONN_ACTIVE_CONNECT:
- CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_check_connreply(conn,
- (void *)KIBLND_CONN_PARAM(event),
- KIBLND_CONN_PARAM_LEN(event));
- break;
- }
- /* net keeps its ref on conn! */
- return 0;
-
- case RDMA_CM_EVENT_TIMEWAIT_EXIT:
- CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
- return 0;
- case RDMA_CM_EVENT_DISCONNECTED:
- conn = (struct kib_conn *)cmid->context;
- if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
- CERROR("%s DISCONNECTED\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- kiblnd_connreq_done(conn, -ECONNRESET);
- } else {
- kiblnd_close_conn(conn, 0);
- }
- kiblnd_conn_decref(conn);
- cmid->context = NULL;
- return 0;
-
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- LCONSOLE_ERROR_MSG(0x131,
- "Received notification of device removal\n"
- "Please shutdown LNET to allow this to proceed\n");
- /*
- * Can't remove network from underneath LNET for now, so I have
- * to ignore this
- */
- return 0;
-
- case RDMA_CM_EVENT_ADDR_CHANGE:
- LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
- return 0;
- }
-}
-
-static int
-kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
-{
- struct kib_tx *tx;
- struct list_head *ttmp;
-
- list_for_each(ttmp, txs) {
- tx = list_entry(ttmp, struct kib_tx, tx_list);
-
- if (txs != &conn->ibc_active_txs) {
- LASSERT(tx->tx_queued);
- } else {
- LASSERT(!tx->tx_queued);
- LASSERT(tx->tx_waiting || tx->tx_sending);
- }
-
- if (time_after_eq(jiffies, tx->tx_deadline)) {
- CERROR("Timed out tx: %s, %lu seconds\n",
- kiblnd_queue2str(conn, txs),
- (jiffies - tx->tx_deadline) / HZ);
- return 1;
- }
- }
-
- return 0;
-}
-
-static int
-kiblnd_conn_timed_out_locked(struct kib_conn *conn)
-{
- return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
- kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
-}
-
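-/*
- * Scan one peer hash bucket for established connections that have either
- * timed out (close them) or need a credit-returning NOOP (check their sends).
- */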
-static void
-kiblnd_check_conns(int idx)
-{
- LIST_HEAD(closes);
- LIST_HEAD(checksends);
- struct list_head *peers = &kiblnd_data.kib_peers[idx];
- struct list_head *ptmp;
- struct kib_peer *peer;
- struct kib_conn *conn;
- struct kib_conn *temp;
- struct kib_conn *tmp;
- struct list_head *ctmp;
- unsigned long flags;
-
- /*
- * NB. We expect to have a look at all the peers and not find any
- * RDMAs to time out, so we just use a shared lock while we
- * take a look...
- */
- read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-
- list_for_each(ptmp, peers) {
- peer = list_entry(ptmp, struct kib_peer, ibp_list);
-
- list_for_each(ctmp, &peer->ibp_conns) {
- int timedout;
- int sendnoop;
-
- conn = list_entry(ctmp, struct kib_conn, ibc_list);
-
- LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
-
- spin_lock(&conn->ibc_lock);
-
- sendnoop = kiblnd_need_noop(conn);
- timedout = kiblnd_conn_timed_out_locked(conn);
- if (!sendnoop && !timedout) {
- spin_unlock(&conn->ibc_lock);
- continue;
- }
-
- if (timedout) {
- CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n",
- libcfs_nid2str(peer->ibp_nid),
- (jiffies - peer->ibp_last_alive) / HZ,
- conn->ibc_credits,
- conn->ibc_outstanding_credits,
- conn->ibc_reserved_credits);
- list_add(&conn->ibc_connd_list, &closes);
- } else {
- list_add(&conn->ibc_connd_list, &checksends);
- }
- /* +ref for 'closes' or 'checksends' */
- kiblnd_conn_addref(conn);
-
- spin_unlock(&conn->ibc_lock);
- }
- }
-
- read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
-
- /*
- * Handle timeout by closing the whole
- * connection. We can only be sure RDMA activity
- * has ceased once the QP has been modified.
- */
- list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) {
- list_del(&conn->ibc_connd_list);
- kiblnd_close_conn(conn, -ETIMEDOUT);
- kiblnd_conn_decref(conn);
- }
-
- /*
- * In case we have enough credits to return via a
- * NOOP, but there were no non-blocking tx descs
- * free to do it last time...
- */
- list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) {
- list_del(&conn->ibc_connd_list);
-
- spin_lock(&conn->ibc_lock);
- kiblnd_check_sends_locked(conn);
- spin_unlock(&conn->ibc_lock);
-
- kiblnd_conn_decref(conn);
- }
-}
-
-static void
-kiblnd_disconnect_conn(struct kib_conn *conn)
-{
- LASSERT(!in_interrupt());
- LASSERT(current == kiblnd_data.kib_connd);
- LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
-
- rdma_disconnect(conn->ibc_cmid);
- kiblnd_finalise_conn(conn);
-
- kiblnd_peer_notify(conn->ibc_peer);
-}
-
-/*
- * High-water mark for reconnections to the same peer: reconnection attempts
- * should be delayed after more than KIB_RECONN_HIGH_RACE tries.
- */
-#define KIB_RECONN_HIGH_RACE 10
-/*
- * Allow connd to take a break and handle other things after consecutive
- * reconnection attempts.
- */
-#define KIB_RECONN_BREAK 100
-
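-/*
- * Connection daemon: reaps zombie conns, issues disconnects, retries
- * reconnections and periodically checks peers for RDMA timeouts.
- */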
-int
-kiblnd_connd(void *arg)
-{
- spinlock_t *lock = &kiblnd_data.kib_connd_lock;
- wait_queue_entry_t wait;
- unsigned long flags;
- struct kib_conn *conn;
- int timeout;
- int i;
- int dropped_lock;
- int peer_index = 0;
- unsigned long deadline = jiffies;
-
- init_waitqueue_entry(&wait, current);
- kiblnd_data.kib_connd = current;
-
- spin_lock_irqsave(lock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- int reconn = 0;
-
- dropped_lock = 0;
-
- if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
- struct kib_peer *peer = NULL;
-
- conn = list_entry(kiblnd_data.kib_connd_zombies.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
- if (conn->ibc_reconnect) {
- peer = conn->ibc_peer;
- kiblnd_peer_addref(peer);
- }
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- kiblnd_destroy_conn(conn);
-
- spin_lock_irqsave(lock, flags);
- if (!peer) {
- kfree(conn);
- continue;
- }
-
- conn->ibc_peer = peer;
- if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE)
- list_add_tail(&conn->ibc_list,
- &kiblnd_data.kib_reconn_list);
- else
- list_add_tail(&conn->ibc_list,
- &kiblnd_data.kib_reconn_wait);
- }
-
- if (!list_empty(&kiblnd_data.kib_connd_conns)) {
- conn = list_entry(kiblnd_data.kib_connd_conns.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- kiblnd_disconnect_conn(conn);
- kiblnd_conn_decref(conn);
-
- spin_lock_irqsave(lock, flags);
- }
-
- while (reconn < KIB_RECONN_BREAK) {
- if (kiblnd_data.kib_reconn_sec !=
- ktime_get_real_seconds()) {
- kiblnd_data.kib_reconn_sec = ktime_get_real_seconds();
- list_splice_init(&kiblnd_data.kib_reconn_wait,
- &kiblnd_data.kib_reconn_list);
- }
-
- if (list_empty(&kiblnd_data.kib_reconn_list))
- break;
-
- conn = list_entry(kiblnd_data.kib_reconn_list.next,
- struct kib_conn, ibc_list);
- list_del(&conn->ibc_list);
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- reconn += kiblnd_reconnect_peer(conn->ibc_peer);
- kiblnd_peer_decref(conn->ibc_peer);
- kfree(conn);
-
- spin_lock_irqsave(lock, flags);
- }
-
- /* careful with the jiffy wrap... */
- timeout = (int)(deadline - jiffies);
- if (timeout <= 0) {
- const int n = 4;
- const int p = 1;
- int chunk = kiblnd_data.kib_peer_hash_size;
-
- spin_unlock_irqrestore(lock, flags);
- dropped_lock = 1;
-
- /*
- * Time to check for RDMA timeouts on a few more
- * peers: I do checks every 'p' seconds on a
- * proportion of the peer table and I need to check
- * every connection 'n' times within a timeout
- * interval, to ensure I detect a timeout on any
- * connection within (n+1)/n times the timeout
- * interval.
- */
- if (*kiblnd_tunables.kib_timeout > n * p)
- chunk = (chunk * n * p) /
- *kiblnd_tunables.kib_timeout;
- if (!chunk)
- chunk = 1;
-
- for (i = 0; i < chunk; i++) {
- kiblnd_check_conns(peer_index);
- peer_index = (peer_index + 1) %
- kiblnd_data.kib_peer_hash_size;
- }
-
- deadline += msecs_to_jiffies(p * MSEC_PER_SEC);
- spin_lock_irqsave(lock, flags);
- }
-
- if (dropped_lock)
- continue;
-
- /* Nothing to do for 'timeout' */
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
- spin_unlock_irqrestore(lock, flags);
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
- spin_lock_irqsave(lock, flags);
- }
-
- spin_unlock_irqrestore(lock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
-
-void
-kiblnd_qp_event(struct ib_event *event, void *arg)
-{
- struct kib_conn *conn = arg;
-
- switch (event->event) {
- case IB_EVENT_COMM_EST:
- CDEBUG(D_NET, "%s established\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
-		/*
-		 * We received a packet but the connection isn't established
-		 * yet; the handshake packet was probably lost, so it's safe
-		 * to force the connection into the established state.
-		 */
- rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
- return;
-
- default:
- CERROR("%s: Async QP event type %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
- return;
- }
-}
-
-static void
-kiblnd_complete(struct ib_wc *wc)
-{
- switch (kiblnd_wreqid2type(wc->wr_id)) {
- default:
- LBUG();
-
- case IBLND_WID_MR:
- if (wc->status != IB_WC_SUCCESS &&
- wc->status != IB_WC_WR_FLUSH_ERR)
- CNETERR("FastReg failed: %d\n", wc->status);
- break;
-
- case IBLND_WID_RDMA:
- /*
- * We only get RDMA completion notification if it fails. All
- * subsequent work items, including the final SEND will fail
- * too. However we can't print out any more info about the
- * failing RDMA because 'tx' might be back on the idle list or
- * even reused already if we didn't manage to post all our work
- * items
- */
- CNETERR("RDMA (tx: %p) failed: %d\n",
- kiblnd_wreqid2ptr(wc->wr_id), wc->status);
- return;
-
- case IBLND_WID_TX:
- kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
- return;
-
- case IBLND_WID_RX:
- kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
- wc->byte_len);
- return;
- }
-}
-
-void
-kiblnd_cq_completion(struct ib_cq *cq, void *arg)
-{
- /*
- * NB I'm not allowed to schedule this conn once its refcount has
- * reached 0. Since fundamentally I'm racing with scheduler threads
- * consuming my CQ I could be called after all completions have
- * occurred. But in this case, !ibc_nrx && !ibc_nsends_posted
- * and this CQ is about to be destroyed so I NOOP.
- */
- struct kib_conn *conn = arg;
- struct kib_sched_info *sched = conn->ibc_sched;
- unsigned long flags;
-
- LASSERT(cq == conn->ibc_cq);
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- conn->ibc_ready = 1;
-
- if (!conn->ibc_scheduled &&
- (conn->ibc_nrx > 0 ||
- conn->ibc_nsends_posted > 0)) {
- kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
- conn->ibc_scheduled = 1;
- list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
-
- if (waitqueue_active(&sched->ibs_waitq))
- wake_up(&sched->ibs_waitq);
- }
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-}
-
-void
-kiblnd_cq_event(struct ib_event *event, void *arg)
-{
- struct kib_conn *conn = arg;
-
- CERROR("%s: async CQ event type %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
-}
-
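-/* Per-CPT scheduler thread: polls conn CQs and dispatches work completions */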
-int
-kiblnd_scheduler(void *arg)
-{
- long id = (long)arg;
- struct kib_sched_info *sched;
- struct kib_conn *conn;
- wait_queue_entry_t wait;
- unsigned long flags;
- struct ib_wc wc;
- int did_something;
- int busy_loops = 0;
- int rc;
-
- init_waitqueue_entry(&wait, current);
-
- sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
-
- rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
- if (rc) {
- CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n",
- sched->ibs_cpt);
- }
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- if (busy_loops++ >= IBLND_RESCHED) {
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- cond_resched();
- busy_loops = 0;
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- did_something = 0;
-
- if (!list_empty(&sched->ibs_conns)) {
- conn = list_entry(sched->ibs_conns.next, struct kib_conn,
- ibc_sched_list);
- /* take over kib_sched_conns' ref on conn... */
- LASSERT(conn->ibc_scheduled);
- list_del(&conn->ibc_sched_list);
- conn->ibc_ready = 0;
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- wc.wr_id = IBLND_WID_INVAL;
-
- rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
- if (!rc) {
- rc = ib_req_notify_cq(conn->ibc_cq,
- IB_CQ_NEXT_COMP);
- if (rc < 0) {
- CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- kiblnd_close_conn(conn, -EIO);
- kiblnd_conn_decref(conn);
- spin_lock_irqsave(&sched->ibs_lock,
- flags);
- continue;
- }
-
- rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
- }
-
- if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
- LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n",
- rc, wc.opcode, wc.status,
- wc.vendor_err,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- conn->ibc_state);
- rc = -EINVAL;
- }
-
- if (rc < 0) {
- CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc);
- kiblnd_close_conn(conn, -EIO);
- kiblnd_conn_decref(conn);
- spin_lock_irqsave(&sched->ibs_lock, flags);
- continue;
- }
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
-
- if (rc || conn->ibc_ready) {
- /*
- * There may be another completion waiting; get
- * another scheduler to check while I handle
- * this one...
- */
- /* +1 ref for sched_conns */
- kiblnd_conn_addref(conn);
- list_add_tail(&conn->ibc_sched_list,
- &sched->ibs_conns);
- if (waitqueue_active(&sched->ibs_waitq))
- wake_up(&sched->ibs_waitq);
- } else {
- conn->ibc_scheduled = 0;
- }
-
- if (rc) {
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
- kiblnd_complete(&wc);
-
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- kiblnd_conn_decref(conn); /* ...drop my ref from above */
- did_something = 1;
- }
-
- if (did_something)
- continue;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- schedule();
- busy_loops = 0;
-
- remove_wait_queue(&sched->ibs_waitq, &wait);
- spin_lock_irqsave(&sched->ibs_lock, flags);
- }
-
- spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
-
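-/*
- * Watches kib_failed_devs and retries device failover for bonding
- * configurations.
- */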
-int
-kiblnd_failover_thread(void *arg)
-{
- rwlock_t *glock = &kiblnd_data.kib_global_lock;
- struct kib_dev *dev;
- wait_queue_entry_t wait;
- unsigned long flags;
- int rc;
-
- LASSERT(*kiblnd_tunables.kib_dev_failover);
-
- init_waitqueue_entry(&wait, current);
- write_lock_irqsave(glock, flags);
-
- while (!kiblnd_data.kib_shutdown) {
- int do_failover = 0;
- int long_sleep;
-
- list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
- ibd_fail_list) {
- if (time_before(jiffies,
- dev->ibd_next_failover))
- continue;
- do_failover = 1;
- break;
- }
-
- if (do_failover) {
- list_del_init(&dev->ibd_fail_list);
- dev->ibd_failover = 1;
- write_unlock_irqrestore(glock, flags);
-
- rc = kiblnd_dev_failover(dev);
-
- write_lock_irqsave(glock, flags);
-
- LASSERT(dev->ibd_failover);
- dev->ibd_failover = 0;
- if (rc >= 0) { /* Device is OK or failover succeed */
- dev->ibd_next_failover = jiffies + 3 * HZ;
- continue;
- }
-
- /* failed to failover, retry later */
- dev->ibd_next_failover =
- jiffies + min(dev->ibd_failed_failover, 10) * HZ;
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- }
-
- continue;
- }
-
- /* long sleep if no more pending failover */
- long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
- write_unlock_irqrestore(glock, flags);
-
- rc = schedule_timeout(long_sleep ? 10 * HZ :
- HZ);
- remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
- write_lock_irqsave(glock, flags);
-
- if (!long_sleep || rc)
- continue;
-
-		/*
-		 * After a long sleep, routinely check all active devices.
-		 * We need a check like this because, with no active
-		 * connection on the device and no local SENDs, we could keep
-		 * listening on the wrong HCA forever across a bonding
-		 * failover.
-		 */
- list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
- if (kiblnd_dev_can_failover(dev)) {
- list_add_tail(&dev->ibd_fail_list,
- &kiblnd_data.kib_failed_devs);
- }
- }
- }
-
- write_unlock_irqrestore(glock, flags);
-
- kiblnd_thread_fini();
- return 0;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
deleted file mode 100644
index 39d0792..0000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/o2iblnd/o2iblnd_modparams.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "o2iblnd.h"
-
-static int service = 987;
-module_param(service, int, 0444);
-MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)");
-
-static int cksum;
-module_param(cksum, int, 0644);
-MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums");
-
-static int timeout = 50;
-module_param(timeout, int, 0644);
-MODULE_PARM_DESC(timeout, "timeout (seconds)");
-
-/*
- * Number of threads in each scheduler pool (which is per-CPT);
- * if set to zero, a reasonable value is estimated from the number of CPUs.
- */
-static int nscheds;
-module_param(nscheds, int, 0444);
-MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool");
-
-static unsigned int conns_per_peer = 1;
-module_param(conns_per_peer, uint, 0444);
-MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int ntx = 512;
-module_param(ntx, int, 0444);
-MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool");
-
-/* NB: this value is shared by all CPTs */
-static int credits = 256;
-module_param(credits, int, 0444);
-MODULE_PARM_DESC(credits, "# concurrent sends");
-
-static int peer_credits = 8;
-module_param(peer_credits, int, 0444);
-MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
-
-static int peer_credits_hiw;
-module_param(peer_credits_hiw, int, 0444);
-MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits");
-
-static int peer_buffer_credits;
-module_param(peer_buffer_credits, int, 0444);
-MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
-
-static int peer_timeout = 180;
-module_param(peer_timeout, int, 0444);
-MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
-
-static char *ipif_name = "ib0";
-module_param(ipif_name, charp, 0444);
-MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
-
-static int retry_count = 5;
-module_param(retry_count, int, 0644);
-MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
-
-static int rnr_retry_count = 6;
-module_param(rnr_retry_count, int, 0644);
-MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions");
-
-static int keepalive = 100;
-module_param(keepalive, int, 0644);
-MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive");
-
-static int ib_mtu;
-module_param(ib_mtu, int, 0444);
-MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096");
-
-static int concurrent_sends;
-module_param(concurrent_sends, int, 0444);
-MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
-
-#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS
-static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
-module_param(map_on_demand, int, 0444);
-MODULE_PARM_DESC(map_on_demand, "map on demand");
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int fmr_pool_size = 512;
-module_param(fmr_pool_size, int, 0444);
-MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)");
-
-/* NB: this value is shared by all CPTs, it can grow at runtime */
-static int fmr_flush_trigger = 384;
-module_param(fmr_flush_trigger, int, 0444);
-MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush");
-
-static int fmr_cache = 1;
-module_param(fmr_cache, int, 0444);
-MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching");
-
-/*
- * 0: disable failover
- * 1: enable failover if necessary
- * 2: force to failover (for debug)
- */
-static int dev_failover;
-module_param(dev_failover, int, 0444);
-MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)");
-
-static int require_privileged_port;
-module_param(require_privileged_port, int, 0644);
-MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection");
-
-static int use_privileged_port = 1;
-module_param(use_privileged_port, int, 0644);
-MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
-
-struct kib_tunables kiblnd_tunables = {
- .kib_dev_failover = &dev_failover,
- .kib_service = &service,
- .kib_cksum = &cksum,
- .kib_timeout = &timeout,
- .kib_keepalive = &keepalive,
- .kib_ntx = &ntx,
- .kib_default_ipif = &ipif_name,
- .kib_retry_count = &retry_count,
- .kib_rnr_retry_count = &rnr_retry_count,
- .kib_ib_mtu = &ib_mtu,
- .kib_require_priv_port = &require_privileged_port,
- .kib_use_priv_port = &use_privileged_port,
- .kib_nscheds = &nscheds
-};
-
-static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
-
-/* # messages/RDMAs in-flight */
-int kiblnd_msg_queue_size(int version, struct lnet_ni *ni)
-{
- if (version == IBLND_MSG_VERSION_1)
- return IBLND_MSG_QUEUE_SIZE_V1;
- else if (ni)
- return ni->ni_peertxcredits;
- else
- return peer_credits;
-}
-
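-/*
- * Fill in per-NI tunables from module parameters where unset and clamp them
- * to supported ranges.
- */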
-int kiblnd_tunables_setup(struct lnet_ni *ni)
-{
- struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-
-	/*
-	 * If no tunables were specified, set them up with the defaults.
-	 */
- if (!ni->ni_lnd_tunables) {
- ni->ni_lnd_tunables = kzalloc(sizeof(*ni->ni_lnd_tunables),
- GFP_NOFS);
- if (!ni->ni_lnd_tunables)
- return -ENOMEM;
-
- memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib,
- &default_tunables, sizeof(*tunables));
- }
- tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
-
- /* Current API version */
- tunables->lnd_version = 0;
-
- if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
- CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
- *kiblnd_tunables.kib_ib_mtu);
- return -EINVAL;
- }
-
- if (!ni->ni_peertimeout)
- ni->ni_peertimeout = peer_timeout;
-
- if (!ni->ni_maxtxcredits)
- ni->ni_maxtxcredits = credits;
-
- if (!ni->ni_peertxcredits)
- ni->ni_peertxcredits = peer_credits;
-
- if (!ni->ni_peerrtrcredits)
- ni->ni_peerrtrcredits = peer_buffer_credits;
-
- if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT)
- ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT;
-
- if (ni->ni_peertxcredits > IBLND_CREDITS_MAX)
- ni->ni_peertxcredits = IBLND_CREDITS_MAX;
-
- if (ni->ni_peertxcredits > credits)
- ni->ni_peertxcredits = credits;
-
- if (!tunables->lnd_peercredits_hiw)
- tunables->lnd_peercredits_hiw = peer_credits_hiw;
-
- if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2)
- tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2;
-
- if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits)
- tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1;
-
- if (tunables->lnd_map_on_demand <= 0 ||
- tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
- /* Use the default */
- CWARN("Invalid map_on_demand (%d), expects 1 - %d. Using default of %d\n",
- tunables->lnd_map_on_demand,
- IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND);
- tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
- }
-
- if (tunables->lnd_map_on_demand == 1) {
-		/* it doesn't make sense to create a map for only one fragment */
- tunables->lnd_map_on_demand = 2;
- }
-
- if (!tunables->lnd_concurrent_sends) {
- if (tunables->lnd_map_on_demand > 0 &&
- tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
- tunables->lnd_concurrent_sends =
- ni->ni_peertxcredits * 2;
- } else {
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits;
- }
- }
-
- if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2)
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2;
-
- if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2)
- tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2;
-
- if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) {
- CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n",
- tunables->lnd_concurrent_sends, ni->ni_peertxcredits);
- }
-
- if (!tunables->lnd_fmr_pool_size)
- tunables->lnd_fmr_pool_size = fmr_pool_size;
- if (!tunables->lnd_fmr_flush_trigger)
- tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
- if (!tunables->lnd_fmr_cache)
- tunables->lnd_fmr_cache = fmr_cache;
- if (!tunables->lnd_conns_per_peer) {
- tunables->lnd_conns_per_peer = (conns_per_peer) ?
- conns_per_peer : 1;
- }
-
- return 0;
-}
-
-void kiblnd_tunables_init(void)
-{
- default_tunables.lnd_version = 0;
-	default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
- default_tunables.lnd_map_on_demand = map_on_demand;
- default_tunables.lnd_concurrent_sends = concurrent_sends;
- default_tunables.lnd_fmr_pool_size = fmr_pool_size;
- default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
- default_tunables.lnd_fmr_cache = fmr_cache;
- default_tunables.lnd_conns_per_peer = conns_per_peer;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
deleted file mode 100644
index a7da1ab..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LNET) += ksocklnd.o
-
-ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib.o
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
deleted file mode 100644
index f01b34a..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ /dev/null
@@ -1,2921 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/socklnd/socklnd.c
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "socklnd.h"
-
-static struct lnet_lnd the_ksocklnd;
-struct ksock_nal_data ksocknal_data;
-
-static struct ksock_interface *
-ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- struct ksock_interface *iface;
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- LASSERT(i < LNET_MAX_INTERFACES);
- iface = &net->ksnn_interfaces[i];
-
- if (iface->ksni_ipaddr == ip)
- return iface;
- }
-
- return NULL;
-}
-
-static struct ksock_route *
-ksocknal_create_route(__u32 ipaddr, int port)
-{
- struct ksock_route *route;
-
- route = kzalloc(sizeof(*route), GFP_NOFS);
- if (!route)
- return NULL;
-
- atomic_set(&route->ksnr_refcount, 1);
- route->ksnr_peer = NULL;
- route->ksnr_retry_interval = 0; /* OK to connect at any time */
- route->ksnr_ipaddr = ipaddr;
- route->ksnr_port = port;
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
- route->ksnr_connected = 0;
- route->ksnr_deleted = 0;
- route->ksnr_conn_count = 0;
- route->ksnr_share_count = 0;
-
- return route;
-}
-
-void
-ksocknal_destroy_route(struct ksock_route *route)
-{
- LASSERT(!atomic_read(&route->ksnr_refcount));
-
- if (route->ksnr_peer)
- ksocknal_peer_decref(route->ksnr_peer);
-
- kfree(route);
-}
-
-static int
-ksocknal_create_peer(struct ksock_peer **peerp, struct lnet_ni *ni,
- struct lnet_process_id id)
-{
- int cpt = lnet_cpt_of_nid(id.nid);
- struct ksock_net *net = ni->ni_data;
- struct ksock_peer *peer;
-
- LASSERT(id.nid != LNET_NID_ANY);
- LASSERT(id.pid != LNET_PID_ANY);
- LASSERT(!in_interrupt());
-
- peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt);
- if (!peer)
- return -ENOMEM;
-
- peer->ksnp_ni = ni;
- peer->ksnp_id = id;
- atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */
- peer->ksnp_closing = 0;
- peer->ksnp_accepting = 0;
- peer->ksnp_proto = NULL;
- peer->ksnp_last_alive = 0;
- peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
-
- INIT_LIST_HEAD(&peer->ksnp_conns);
- INIT_LIST_HEAD(&peer->ksnp_routes);
- INIT_LIST_HEAD(&peer->ksnp_tx_queue);
- INIT_LIST_HEAD(&peer->ksnp_zc_req_list);
- spin_lock_init(&peer->ksnp_lock);
-
- spin_lock_bh(&net->ksnn_lock);
-
- if (net->ksnn_shutdown) {
- spin_unlock_bh(&net->ksnn_lock);
-
- kfree(peer);
- CERROR("Can't create peer: network shutdown\n");
- return -ESHUTDOWN;
- }
-
- net->ksnn_npeers++;
-
- spin_unlock_bh(&net->ksnn_lock);
-
- *peerp = peer;
- return 0;
-}
-
-void
-ksocknal_destroy_peer(struct ksock_peer *peer)
-{
- struct ksock_net *net = peer->ksnp_ni->ni_data;
-
- CDEBUG(D_NET, "peer %s %p deleted\n",
- libcfs_id2str(peer->ksnp_id), peer);
-
- LASSERT(!atomic_read(&peer->ksnp_refcount));
- LASSERT(!peer->ksnp_accepting);
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
- LASSERT(list_empty(&peer->ksnp_tx_queue));
- LASSERT(list_empty(&peer->ksnp_zc_req_list));
-
- kfree(peer);
-
- /*
- * NB a peer's connections and routes keep a reference on their peer
- * until they are destroyed, so we can be assured that _all_ state to
- * do with this peer has been cleaned up when its refcount drops to
- * zero.
- */
- spin_lock_bh(&net->ksnn_lock);
- net->ksnn_npeers--;
- spin_unlock_bh(&net->ksnn_lock);
-}
-
-struct ksock_peer *
-ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
- struct ksock_peer *peer;
-
- list_for_each_entry(peer, peer_list, ksnp_list) {
- LASSERT(!peer->ksnp_closing);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (peer->ksnp_id.nid != id.nid ||
- peer->ksnp_id.pid != id.pid)
- continue;
-
- CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
- peer, libcfs_id2str(id),
- atomic_read(&peer->ksnp_refcount));
- return peer;
- }
- return NULL;
-}
-
-struct ksock_peer *
-ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct ksock_peer *peer;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) /* +1 ref for caller? */
- ksocknal_peer_addref(peer);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- return peer;
-}
-
-static void
-ksocknal_unlink_peer_locked(struct ksock_peer *peer)
-{
- int i;
- __u32 ip;
- struct ksock_interface *iface;
-
- for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
- LASSERT(i < LNET_MAX_INTERFACES);
- ip = peer->ksnp_passive_ips[i];
-
- iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
- /*
- * All IPs in peer->ksnp_passive_ips[] come from the
- * interface list, therefore the call must succeed.
- */
- LASSERT(iface);
-
- CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
- peer, iface, iface->ksni_nroutes);
- iface->ksni_npeers--;
- }
-
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
- LASSERT(!peer->ksnp_closing);
- peer->ksnp_closing = 1;
- list_del(&peer->ksnp_list);
- /* lose peerlist's ref */
- ksocknal_peer_decref(peer);
-}
-
-static int
-ksocknal_get_peer_info(struct lnet_ni *ni, int index,
- struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip,
- int *port, int *conn_count, int *share_count)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct ksock_route *route;
- struct list_head *rtmp;
- int i;
- int j;
- int rc = -ENOENT;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (!peer->ksnp_n_passive_ips &&
- list_empty(&peer->ksnp_routes)) {
- if (index-- > 0)
- continue;
-
- *id = peer->ksnp_id;
- *myip = 0;
- *peer_ip = 0;
- *port = 0;
- *conn_count = 0;
- *share_count = 0;
- rc = 0;
- goto out;
- }
-
- for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
- if (index-- > 0)
- continue;
-
- *id = peer->ksnp_id;
- *myip = peer->ksnp_passive_ips[j];
- *peer_ip = 0;
- *port = 0;
- *conn_count = 0;
- *share_count = 0;
- rc = 0;
- goto out;
- }
-
- list_for_each(rtmp, &peer->ksnp_routes) {
- if (index-- > 0)
- continue;
-
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- *id = peer->ksnp_id;
- *myip = route->ksnr_myipaddr;
- *peer_ip = route->ksnr_ipaddr;
- *port = route->ksnr_port;
- *conn_count = route->ksnr_conn_count;
- *share_count = route->ksnr_share_count;
- rc = 0;
- goto out;
- }
- }
- }
- out:
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return rc;
-}
-
-static void
-ksocknal_associate_route_conn_locked(struct ksock_route *route,
- struct ksock_conn *conn)
-{
- struct ksock_peer *peer = route->ksnr_peer;
- int type = conn->ksnc_type;
- struct ksock_interface *iface;
-
- conn->ksnc_route = route;
- ksocknal_route_addref(route);
-
- if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
- if (!route->ksnr_myipaddr) {
- /* route wasn't bound locally yet (the initial route) */
- CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &conn->ksnc_myipaddr);
- } else {
- CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &route->ksnr_myipaddr,
- &conn->ksnc_myipaddr);
-
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes--;
- }
- route->ksnr_myipaddr = conn->ksnc_myipaddr;
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes++;
- }
-
- route->ksnr_connected |= (1 << type);
- route->ksnr_conn_count++;
-
- /*
- * Successful connection => further attempts can
- * proceed immediately
- */
- route->ksnr_retry_interval = 0;
-}
-
-static void
-ksocknal_add_route_locked(struct ksock_peer *peer, struct ksock_route *route)
-{
- struct list_head *tmp;
- struct ksock_conn *conn;
- struct ksock_route *route2;
-
- LASSERT(!peer->ksnp_closing);
- LASSERT(!route->ksnr_peer);
- LASSERT(!route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
- LASSERT(!route->ksnr_connected);
-
- /* LASSERT(unique) */
- list_for_each(tmp, &peer->ksnp_routes) {
- route2 = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
- CERROR("Duplicate route %s %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr);
- LBUG();
- }
- }
-
- route->ksnr_peer = peer;
- ksocknal_peer_addref(peer);
- /* peer's routelist takes over my ref on 'route' */
- list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
-
- list_for_each(tmp, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
- continue;
-
- ksocknal_associate_route_conn_locked(route, conn);
- /* keep going (typed routes) */
- }
-}
-
-static void
-ksocknal_del_route_locked(struct ksock_route *route)
-{
- struct ksock_peer *peer = route->ksnr_peer;
- struct ksock_interface *iface;
- struct ksock_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
-
- LASSERT(!route->ksnr_deleted);
-
- /* Close associated conns */
- list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_route != route)
- continue;
-
- ksocknal_close_conn_locked(conn, 0);
- }
-
- if (route->ksnr_myipaddr) {
- iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
- route->ksnr_myipaddr);
- if (iface)
- iface->ksni_nroutes--;
- }
-
- route->ksnr_deleted = 1;
- list_del(&route->ksnr_list);
- ksocknal_route_decref(route); /* drop peer's ref */
-
- if (list_empty(&peer->ksnp_routes) &&
- list_empty(&peer->ksnp_conns)) {
- /*
- * I've just removed the last route to a peer with no active
- * connections
- */
- ksocknal_unlink_peer_locked(peer);
- }
-}
-
-int
-ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr,
- int port)
-{
- struct ksock_peer *peer;
- struct ksock_peer *peer2;
- struct ksock_route *route;
- struct ksock_route *route2;
- int rc;
-
- if (id.nid == LNET_NID_ANY ||
- id.pid == LNET_PID_ANY)
- return -EINVAL;
-
- /* Have a brand new peer ready... */
- rc = ksocknal_create_peer(&peer, ni, id);
- if (rc)
- return rc;
-
- route = ksocknal_create_route(ipaddr, port);
- if (!route) {
- ksocknal_peer_decref(peer);
- return -ENOMEM;
- }
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- /* always called with a ref on ni, so shutdown can't have started */
- LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown);
-
- peer2 = ksocknal_find_peer_locked(ni, id);
- if (peer2) {
- ksocknal_peer_decref(peer);
- peer = peer2;
- } else {
- /* peer table takes my ref on peer */
- list_add_tail(&peer->ksnp_list,
- ksocknal_nid2peerlist(id.nid));
- }
-
- list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) {
- if (route2->ksnr_ipaddr == ipaddr) {
- /* Route already exists, use the old one */
- ksocknal_route_decref(route);
- route2->ksnr_share_count++;
- goto out;
- }
- }
- /* Route doesn't already exist, add the new one */
- ksocknal_add_route_locked(peer, route);
- route->ksnr_share_count++;
-out:
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return 0;
-}
-
-static void
-ksocknal_del_peer_locked(struct ksock_peer *peer, __u32 ip)
-{
- struct ksock_conn *conn;
- struct ksock_route *route;
- struct list_head *tmp;
- struct list_head *nxt;
- int nshared;
-
- LASSERT(!peer->ksnp_closing);
-
- /* Extra ref prevents peer disappearing until I'm done with it */
- ksocknal_peer_addref(peer);
-
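-	/* ip == 0 is a wildcard: delete every route, not just one peer IP */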
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- /* no match */
- if (!(!ip || route->ksnr_ipaddr == ip))
- continue;
-
- route->ksnr_share_count = 0;
- /* This deletes associated conns too */
- ksocknal_del_route_locked(route);
- }
-
- nshared = 0;
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
- nshared += route->ksnr_share_count;
- }
-
- if (!nshared) {
- /*
- * remove everything else if there are no explicit entries
- * left
- */
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- /* we should only be removing auto-entries */
- LASSERT(!route->ksnr_share_count);
- ksocknal_del_route_locked(route);
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- ksocknal_close_conn_locked(conn, 0);
- }
- }
-
- ksocknal_peer_decref(peer);
- /* NB peer unlinks itself when last conn/route is removed */
-}
-
-static int
-ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
-{
- LIST_HEAD(zombies);
- struct list_head *ptmp;
- struct list_head *pnxt;
- struct ksock_peer *peer;
- int lo;
- int hi;
- int i;
- int rc = -ENOENT;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (id.nid != LNET_NID_ANY) {
- lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- } else {
- lo = 0;
- hi = ksocknal_data.ksnd_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
- (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
- continue;
-
- ksocknal_peer_addref(peer); /* a ref for me... */
-
- ksocknal_del_peer_locked(peer, ip);
-
- if (peer->ksnp_closing &&
- !list_empty(&peer->ksnp_tx_queue)) {
- LASSERT(list_empty(&peer->ksnp_conns));
- LASSERT(list_empty(&peer->ksnp_routes));
-
- list_splice_init(&peer->ksnp_tx_queue,
- &zombies);
- }
-
- ksocknal_peer_decref(peer); /* ...till here */
-
- rc = 0; /* matched! */
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_txlist_done(ni, &zombies, 1);
-
- return rc;
-}
-
-static struct ksock_conn *
-ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct ksock_conn *conn;
- struct list_head *ctmp;
- int i;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- LASSERT(!peer->ksnp_closing);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- list_for_each(ctmp, &peer->ksnp_conns) {
- if (index-- > 0)
- continue;
-
- conn = list_entry(ctmp, struct ksock_conn,
- ksnc_list);
- ksocknal_conn_addref(conn);
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return conn;
- }
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return NULL;
-}
-
-static struct ksock_sched *
-ksocknal_choose_scheduler_locked(unsigned int cpt)
-{
- struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
- struct ksock_sched *sched;
- int i;
-
- LASSERT(info->ksi_nthreads > 0);
-
- sched = &info->ksi_scheds[0];
- /*
- * NB: it's safe so far, but info->ksi_nthreads could be changed
- * at runtime when we have dynamic LNet configuration, then we
- * need to take care of this.
- */
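-	/* pick the scheduler with the fewest connections to balance load */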
- for (i = 1; i < info->ksi_nthreads; i++) {
- if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
- sched = &info->ksi_scheds[i];
- }
-
- return sched;
-}
-
-static int
-ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- int nip;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- nip = net->ksnn_ninterfaces;
- LASSERT(nip <= LNET_MAX_INTERFACES);
-
- /*
- * Only offer interfaces for additional connections if I have
- * more than one.
- */
- if (nip < 2) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return 0;
- }
-
- for (i = 0; i < nip; i++) {
- ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
- LASSERT(ipaddrs[i]);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return nip;
-}
-
-static int
-ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips)
-{
- int best_netmatch = 0;
- int best_xor = 0;
- int best = -1;
- int this_xor;
- int this_netmatch;
- int i;
-
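-	/*
-	 * Prefer a peer IP on the same subnet as this interface; break
-	 * ties with the smallest XOR distance from the interface address.
-	 */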
- for (i = 0; i < nips; i++) {
- if (!ips[i])
- continue;
-
- this_xor = ips[i] ^ iface->ksni_ipaddr;
- this_netmatch = !(this_xor & iface->ksni_netmask) ? 1 : 0;
-
- if (!(best < 0 ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_xor > this_xor)))
- continue;
-
- best = i;
- best_netmatch = this_netmatch;
- best_xor = this_xor;
- }
-
- LASSERT(best >= 0);
- return best;
-}
-
-static int
-ksocknal_select_ips(struct ksock_peer *peer, __u32 *peerips, int n_peerips)
-{
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- struct ksock_net *net = peer->ksnp_ni->ni_data;
- struct ksock_interface *iface;
- struct ksock_interface *best_iface;
- int n_ips;
- int i;
- int j;
- int k;
- __u32 ip;
- __u32 xor;
- int this_netmatch;
- int best_netmatch;
- int best_npeers;
-
- /*
- * CAVEAT EMPTOR: We do all our interface matching with an
- * exclusive hold of global lock at IRQ priority. We're only
- * expecting to be dealing with small numbers of interfaces, so the
- * O(n**3)-ness shouldn't matter
- */
- /*
- * Also note that I'm not going to return more than n_peerips
- * interfaces, even if I have more myself
- */
- write_lock_bh(global_lock);
-
- LASSERT(n_peerips <= LNET_MAX_INTERFACES);
- LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
-
- /*
- * Only match interfaces for additional connections
- * if I have > 1 interface
- */
- n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
- min(n_peerips, net->ksnn_ninterfaces);
-
- for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
- /* ^ yes really... */
-
- /*
- * If we have any new interfaces, first tick off all the
- * peer IPs that match old interfaces, then choose new
- * interfaces to match the remaining peer IPS.
- * We don't forget interfaces we've stopped using; we might
- * start using them again...
- */
- if (i < peer->ksnp_n_passive_ips) {
- /* Old interface. */
- ip = peer->ksnp_passive_ips[i];
- best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
-
- /* peer passive ips are kept up to date */
- LASSERT(best_iface);
- } else {
- /* choose a new interface */
- LASSERT(i == peer->ksnp_n_passive_ips);
-
- best_iface = NULL;
- best_netmatch = 0;
- best_npeers = 0;
-
- for (j = 0; j < net->ksnn_ninterfaces; j++) {
- iface = &net->ksnn_interfaces[j];
- ip = iface->ksni_ipaddr;
-
- for (k = 0; k < peer->ksnp_n_passive_ips; k++)
- if (peer->ksnp_passive_ips[k] == ip)
- break;
-
- if (k < peer->ksnp_n_passive_ips) /* using it already */
- continue;
-
- k = ksocknal_match_peerip(iface, peerips,
- n_peerips);
- xor = ip ^ peerips[k];
- this_netmatch = !(xor & iface->ksni_netmask) ? 1 : 0;
-
- if (!(!best_iface ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_npeers > iface->ksni_npeers)))
- continue;
-
- best_iface = iface;
- best_netmatch = this_netmatch;
- best_npeers = iface->ksni_npeers;
- }
-
- LASSERT(best_iface);
-
- best_iface->ksni_npeers++;
- ip = best_iface->ksni_ipaddr;
- peer->ksnp_passive_ips[i] = ip;
- peer->ksnp_n_passive_ips = i + 1;
- }
-
- /* mark the best matching peer IP used */
- j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
- peerips[j] = 0;
- }
-
- /* Overwrite input peer IP addresses */
- memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
-
- write_unlock_bh(global_lock);
-
- return n_ips;
-}
-
-static void
-ksocknal_create_routes(struct ksock_peer *peer, int port,
- __u32 *peer_ipaddrs, int npeer_ipaddrs)
-{
- struct ksock_route *newroute = NULL;
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- struct lnet_ni *ni = peer->ksnp_ni;
- struct ksock_net *net = ni->ni_data;
- struct list_head *rtmp;
- struct ksock_route *route;
- struct ksock_interface *iface;
- struct ksock_interface *best_iface;
- int best_netmatch;
- int this_netmatch;
- int best_nroutes;
- int i;
- int j;
-
- /*
- * CAVEAT EMPTOR: We do all our interface matching with an
- * exclusive hold of global lock at IRQ priority. We're only
- * expecting to be dealing with small numbers of interfaces, so the
- * O(n**3)-ness here shouldn't matter
- */
- write_lock_bh(global_lock);
-
- if (net->ksnn_ninterfaces < 2) {
- /*
- * Only create additional connections
- * if I have > 1 interface
- */
- write_unlock_bh(global_lock);
- return;
- }
-
- LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES);
-
- for (i = 0; i < npeer_ipaddrs; i++) {
- if (newroute) {
- newroute->ksnr_ipaddr = peer_ipaddrs[i];
- } else {
- write_unlock_bh(global_lock);
-
- newroute = ksocknal_create_route(peer_ipaddrs[i], port);
- if (!newroute)
- return;
-
- write_lock_bh(global_lock);
- }
-
- if (peer->ksnp_closing) {
- /* peer got closed under me */
- break;
- }
-
- /* Already got a route? */
- route = NULL;
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
- break;
-
- route = NULL;
- }
- if (route)
- continue;
-
- best_iface = NULL;
- best_nroutes = 0;
- best_netmatch = 0;
-
- LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
-
- /* Select interface to connect from */
- for (j = 0; j < net->ksnn_ninterfaces; j++) {
- iface = &net->ksnn_interfaces[j];
-
- /* Using this interface already? */
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- if (route->ksnr_myipaddr == iface->ksni_ipaddr)
- break;
-
- route = NULL;
- }
- if (route)
- continue;
-
- this_netmatch = (!((iface->ksni_ipaddr ^
- newroute->ksnr_ipaddr) &
- iface->ksni_netmask)) ? 1 : 0;
-
- if (!(!best_iface ||
- best_netmatch < this_netmatch ||
- (best_netmatch == this_netmatch &&
- best_nroutes > iface->ksni_nroutes)))
- continue;
-
- best_iface = iface;
- best_netmatch = this_netmatch;
- best_nroutes = iface->ksni_nroutes;
- }
-
- if (!best_iface)
- continue;
-
- newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
- best_iface->ksni_nroutes++;
-
- ksocknal_add_route_locked(peer, newroute);
- newroute = NULL;
- }
-
- write_unlock_bh(global_lock);
- if (newroute)
- ksocknal_route_decref(newroute);
-}
-
-int
-ksocknal_accept(struct lnet_ni *ni, struct socket *sock)
-{
- struct ksock_connreq *cr;
- int rc;
- __u32 peer_ip;
- int peer_port;
-
- rc = lnet_sock_getaddr(sock, 1, &peer_ip, &peer_port);
- LASSERT(!rc); /* we succeeded before */
-
- cr = kzalloc(sizeof(*cr), GFP_NOFS);
- if (!cr) {
- LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n",
- &peer_ip);
- return -ENOMEM;
- }
-
- lnet_ni_addref(ni);
- cr->ksncr_ni = ni;
- cr->ksncr_sock = sock;
-
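-	/*
-	 * Hand the new socket to the connd threads (woken below) rather
-	 * than processing the connection request in this context.
-	 */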
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
-
- list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
- wake_up(&ksocknal_data.ksnd_connd_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
- return 0;
-}
-
-static int
-ksocknal_connecting(struct ksock_peer *peer, __u32 ipaddr)
-{
- struct ksock_route *route;
-
- list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) {
- if (route->ksnr_ipaddr == ipaddr)
- return route->ksnr_connecting;
- }
- return 0;
-}
-
-int
-ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route,
- struct socket *sock, int type)
-{
- rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
- LIST_HEAD(zombies);
- struct lnet_process_id peerid;
- struct list_head *tmp;
- __u64 incarnation;
- struct ksock_conn *conn;
- struct ksock_conn *conn2;
- struct ksock_peer *peer = NULL;
- struct ksock_peer *peer2;
- struct ksock_sched *sched;
- struct ksock_hello_msg *hello;
- int cpt;
- struct ksock_tx *tx;
- struct ksock_tx *txtmp;
- int rc;
- int active;
- char *warn = NULL;
-
- active = !!route;
-
- LASSERT(active == (type != SOCKLND_CONN_NONE));
-
- conn = kzalloc(sizeof(*conn), GFP_NOFS);
- if (!conn) {
- rc = -ENOMEM;
- goto failed_0;
- }
-
- conn->ksnc_peer = NULL;
- conn->ksnc_route = NULL;
- conn->ksnc_sock = sock;
- /*
-	 * 2 refs: 1 for the conn, plus an extra ref that prevents the
-	 * socket being closed before the connection is established
- */
- atomic_set(&conn->ksnc_sock_refcount, 2);
- conn->ksnc_type = type;
- ksocknal_lib_save_callback(sock, conn);
- atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
-
- conn->ksnc_rx_ready = 0;
- conn->ksnc_rx_scheduled = 0;
-
- INIT_LIST_HEAD(&conn->ksnc_tx_queue);
- conn->ksnc_tx_ready = 0;
- conn->ksnc_tx_scheduled = 0;
- conn->ksnc_tx_carrier = NULL;
- atomic_set(&conn->ksnc_tx_nob, 0);
-
- hello = kvzalloc(offsetof(struct ksock_hello_msg,
- kshm_ips[LNET_MAX_INTERFACES]),
- GFP_KERNEL);
- if (!hello) {
- rc = -ENOMEM;
- goto failed_1;
- }
-
- /* stash conn's local and remote addrs */
- rc = ksocknal_lib_get_conn_addrs(conn);
- if (rc)
- goto failed_1;
-
- /*
- * Find out/confirm peer's NID and connection type and get the
- * vector of interfaces she's willing to let me connect to.
- * Passive connections use the listener timeout since the peer sends
- * eagerly
- */
- if (active) {
- peer = route->ksnr_peer;
- LASSERT(ni == peer->ksnp_ni);
-
- /* Active connection sends HELLO eagerly */
- hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
- peerid = peer->ksnp_id;
-
- write_lock_bh(global_lock);
- conn->ksnc_proto = peer->ksnp_proto;
- write_unlock_bh(global_lock);
-
- if (!conn->ksnc_proto) {
- conn->ksnc_proto = &ksocknal_protocol_v3x;
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 2)
- conn->ksnc_proto = &ksocknal_protocol_v2x;
- else if (*ksocknal_tunables.ksnd_protocol == 1)
- conn->ksnc_proto = &ksocknal_protocol_v1x;
-#endif
- }
-
- rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
- if (rc)
- goto failed_1;
- } else {
- peerid.nid = LNET_NID_ANY;
- peerid.pid = LNET_PID_ANY;
-
- /* Passive, get protocol from peer */
- conn->ksnc_proto = NULL;
- }
-
- rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation);
- if (rc < 0)
- goto failed_1;
-
- LASSERT(!rc || active);
- LASSERT(conn->ksnc_proto);
- LASSERT(peerid.nid != LNET_NID_ANY);
-
- cpt = lnet_cpt_of_nid(peerid.nid);
-
- if (active) {
- ksocknal_peer_addref(peer);
- write_lock_bh(global_lock);
- } else {
- rc = ksocknal_create_peer(&peer, ni, peerid);
- if (rc)
- goto failed_1;
-
- write_lock_bh(global_lock);
-
- /* called with a ref on ni, so shutdown can't have started */
- LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown);
-
- peer2 = ksocknal_find_peer_locked(ni, peerid);
- if (!peer2) {
- /*
- * NB this puts an "empty" peer in the peer
- * table (which takes my ref)
- */
- list_add_tail(&peer->ksnp_list,
- ksocknal_nid2peerlist(peerid.nid));
- } else {
- ksocknal_peer_decref(peer);
- peer = peer2;
- }
-
- /* +1 ref for me */
- ksocknal_peer_addref(peer);
- peer->ksnp_accepting++;
-
- /*
- * Am I already connecting to this guy? Resolve in
- * favour of higher NID...
- */
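-		/*
-		 * i.e. refuse this passive attempt from a lower-NID peer
-		 * while my own active connect to it is still in flight.
-		 */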
- if (peerid.nid < ni->ni_nid &&
- ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
- rc = EALREADY;
- warn = "connection race resolution";
- goto failed_2;
- }
- }
-
- if (peer->ksnp_closing ||
- (active && route->ksnr_deleted)) {
- /* peer/route got closed under me */
- rc = -ESTALE;
- warn = "peer/route removed";
- goto failed_2;
- }
-
- if (!peer->ksnp_proto) {
- /*
- * Never connected before.
- * NB recv_hello may have returned EPROTO to signal my peer
- * wants a different protocol than the one I asked for.
- */
- LASSERT(list_empty(&peer->ksnp_conns));
-
- peer->ksnp_proto = conn->ksnc_proto;
- peer->ksnp_incarnation = incarnation;
- }
-
- if (peer->ksnp_proto != conn->ksnc_proto ||
- peer->ksnp_incarnation != incarnation) {
- /* Peer rebooted or I've got the wrong protocol version */
- ksocknal_close_peer_conns_locked(peer, 0, 0);
-
- peer->ksnp_proto = NULL;
- rc = ESTALE;
- warn = peer->ksnp_incarnation != incarnation ?
- "peer rebooted" :
- "wrong proto version";
- goto failed_2;
- }
-
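-	/*
-	 * A non-zero rc can only come from an active connect (see the
-	 * LASSERT above): EALREADY means we lost a connection race,
-	 * EPROTO means the peer wants a different protocol version.
-	 */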
- switch (rc) {
- default:
- LBUG();
- case 0:
- break;
- case EALREADY:
- warn = "lost conn race";
- goto failed_2;
- case EPROTO:
- warn = "retry with different protocol version";
- goto failed_2;
- }
-
- /*
- * Refuse to duplicate an existing connection, unless this is a
- * loopback connection
- */
- if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
- list_for_each(tmp, &peer->ksnp_conns) {
- conn2 = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
- conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
- conn2->ksnc_type != conn->ksnc_type)
- continue;
-
- /*
- * Reply on a passive connection attempt so the peer
- * realises we're connected.
- */
- LASSERT(!rc);
- if (!active)
- rc = EALREADY;
-
- warn = "duplicate";
- goto failed_2;
- }
- }
-
- /*
- * If the connection created by this route didn't bind to the IP
- * address the route connected to, the connection/route matching
- * code below probably isn't going to work.
- */
- if (active &&
- route->ksnr_ipaddr != conn->ksnc_ipaddr) {
- CERROR("Route %s %pI4h connected to %pI4h\n",
- libcfs_id2str(peer->ksnp_id),
- &route->ksnr_ipaddr,
- &conn->ksnc_ipaddr);
- }
-
- /*
- * Search for a route corresponding to the new connection and
- * create an association. This allows incoming connections created
- * by routes in my peer to match my own route entries so I don't
- * continually create duplicate routes.
- */
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
- continue;
-
- ksocknal_associate_route_conn_locked(route, conn);
- break;
- }
-
- conn->ksnc_peer = peer; /* conn takes my ref on peer */
- peer->ksnp_last_alive = jiffies;
- peer->ksnp_send_keepalive = 0;
- peer->ksnp_error = 0;
-
- sched = ksocknal_choose_scheduler_locked(cpt);
- sched->kss_nconns++;
- conn->ksnc_scheduler = sched;
-
- conn->ksnc_tx_last_post = jiffies;
- /* Set the deadline for the outgoing HELLO to drain */
- conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued;
- conn->ksnc_tx_deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- mb(); /* order with adding to peer's conn list */
-
- list_add(&conn->ksnc_list, &peer->ksnp_conns);
- ksocknal_conn_addref(conn);
-
- ksocknal_new_packet(conn, 0);
-
- conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
-
- /* Take packets blocking for this connection. */
- list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
- int match = conn->ksnc_proto->pro_match_tx(conn, tx,
- tx->tx_nonblk);
-
- if (match == SOCKNAL_MATCH_NO)
- continue;
-
- list_del(&tx->tx_list);
- ksocknal_queue_tx_locked(tx, conn);
- }
-
- write_unlock_bh(global_lock);
-
- /*
- * We've now got a new connection. Any errors from here on are just
- * like "normal" comms errors and we close the connection normally.
- * NB (a) we still have to send the reply HELLO for passive
- * connections,
- * (b) normal I/O on the conn is blocked until I setup and call the
- * socket callbacks.
- */
- CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n",
- libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
- &conn->ksnc_myipaddr, &conn->ksnc_ipaddr,
- conn->ksnc_port, incarnation, cpt,
- (int)(sched - &sched->kss_info->ksi_scheds[0]));
-
- if (active) {
- /* additional routes after interface exchange? */
- ksocknal_create_routes(peer, conn->ksnc_port,
- hello->kshm_ips, hello->kshm_nips);
- } else {
- hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
- hello->kshm_nips);
- rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
- }
-
- kvfree(hello);
-
- /*
- * setup the socket AFTER I've received hello (it disables
- * SO_LINGER). I might call back to the acceptor who may want
- * to send a protocol version response and then close the
- * socket; this ensures the socket only tears down after the
- * response has been sent.
- */
- if (!rc)
- rc = ksocknal_lib_setup_sock(sock);
-
- write_lock_bh(global_lock);
-
- /* NB my callbacks block while I hold ksnd_global_lock */
- ksocknal_lib_set_callback(sock, conn);
-
- if (!active)
- peer->ksnp_accepting--;
-
- write_unlock_bh(global_lock);
-
- if (rc) {
- write_lock_bh(global_lock);
- if (!conn->ksnc_closing) {
- /* could be closed by another thread */
- ksocknal_close_conn_locked(conn, rc);
- }
- write_unlock_bh(global_lock);
- } else if (!ksocknal_connsock_addref(conn)) {
- /* Allow I/O to proceed. */
- ksocknal_read_callback(conn);
- ksocknal_write_callback(conn);
- ksocknal_connsock_decref(conn);
- }
-
- ksocknal_connsock_decref(conn);
- ksocknal_conn_decref(conn);
- return rc;
-
- failed_2:
- if (!peer->ksnp_closing &&
- list_empty(&peer->ksnp_conns) &&
- list_empty(&peer->ksnp_routes)) {
- list_add(&zombies, &peer->ksnp_tx_queue);
- list_del_init(&peer->ksnp_tx_queue);
- ksocknal_unlink_peer_locked(peer);
- }
-
- write_unlock_bh(global_lock);
-
- if (warn) {
- if (rc < 0)
- CERROR("Not creating conn %s type %d: %s\n",
- libcfs_id2str(peerid), conn->ksnc_type, warn);
- else
- CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
- libcfs_id2str(peerid), conn->ksnc_type, warn);
- }
-
- if (!active) {
- if (rc > 0) {
- /*
-			 * Request retry by replying with CONN_NONE;
-			 * ksnc_proto has been set already
- */
- conn->ksnc_type = SOCKLND_CONN_NONE;
- hello->kshm_nips = 0;
- ksocknal_send_hello(ni, conn, peerid.nid, hello);
- }
-
- write_lock_bh(global_lock);
- peer->ksnp_accepting--;
- write_unlock_bh(global_lock);
- }
-
- ksocknal_txlist_done(ni, &zombies, 1);
- ksocknal_peer_decref(peer);
-
-failed_1:
- kvfree(hello);
-
- kfree(conn);
-
-failed_0:
- sock_release(sock);
- return rc;
-}
-
-void
-ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
-{
- /*
-	 * This just does the immediate housekeeping, and queues the
- * connection for the reaper to terminate.
- * Caller holds ksnd_global_lock exclusively in irq context
- */
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_route *route;
- struct ksock_conn *conn2;
- struct list_head *tmp;
-
- LASSERT(!peer->ksnp_error);
- LASSERT(!conn->ksnc_closing);
- conn->ksnc_closing = 1;
-
- /* ksnd_deathrow_conns takes over peer's ref */
- list_del(&conn->ksnc_list);
-
- route = conn->ksnc_route;
- if (route) {
- /* dissociate conn from route... */
- LASSERT(!route->ksnr_deleted);
- LASSERT(route->ksnr_connected & (1 << conn->ksnc_type));
-
- conn2 = NULL;
- list_for_each(tmp, &peer->ksnp_conns) {
- conn2 = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn2->ksnc_route == route &&
- conn2->ksnc_type == conn->ksnc_type)
- break;
-
- conn2 = NULL;
- }
- if (!conn2)
- route->ksnr_connected &= ~(1 << conn->ksnc_type);
-
- conn->ksnc_route = NULL;
-
- ksocknal_route_decref(route); /* drop conn's ref on route */
- }
-
- if (list_empty(&peer->ksnp_conns)) {
- /* No more connections to this peer */
-
- if (!list_empty(&peer->ksnp_tx_queue)) {
- struct ksock_tx *tx;
-
- LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
-
- /*
- * throw them to the last connection...,
-			 * these TXs will be sent to /dev/null by the scheduler
- */
- list_for_each_entry(tx, &peer->ksnp_tx_queue,
- tx_list)
- ksocknal_tx_prep(conn, tx);
-
- spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
- list_splice_init(&peer->ksnp_tx_queue,
- &conn->ksnc_tx_queue);
- spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
- }
-
- peer->ksnp_proto = NULL; /* renegotiate protocol version */
- peer->ksnp_error = error; /* stash last conn close reason */
-
- if (list_empty(&peer->ksnp_routes)) {
- /*
- * I've just closed last conn belonging to a
- * peer with no routes to it
- */
- ksocknal_unlink_peer_locked(peer);
- }
- }
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- list_add_tail(&conn->ksnc_list,
- &ksocknal_data.ksnd_deathrow_conns);
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-}
-
-void
-ksocknal_peer_failed(struct ksock_peer *peer)
-{
- int notify = 0;
- unsigned long last_alive = 0;
-
- /*
- * There has been a connection failure or comms error; but I'll only
- * tell LNET I think the peer is dead if it's to another kernel and
- * there are no connections or connection attempts in existence.
- */
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- if (!(peer->ksnp_id.pid & LNET_PID_USERFLAG) &&
- list_empty(&peer->ksnp_conns) &&
- !peer->ksnp_accepting &&
- !ksocknal_find_connecting_route_locked(peer)) {
- notify = 1;
- last_alive = peer->ksnp_last_alive;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (notify)
- lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0,
- last_alive);
-}
-
-void
-ksocknal_finalize_zcreq(struct ksock_conn *conn)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
- struct ksock_tx *tmp;
- LIST_HEAD(zlist);
-
- /*
- * NB safe to finalize TXs because closing of socket will
- * abort all buffered data
- */
- LASSERT(!conn->ksnc_sock);
-
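-	/*
-	 * Move this conn's pending zero-copy requests onto a private list
-	 * under ksnp_lock, then drop their refs outside the lock.
-	 */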
- spin_lock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
- if (tx->tx_conn != conn)
- continue;
-
- LASSERT(tx->tx_msg.ksm_zc_cookies[0]);
-
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_zc_aborted = 1; /* mark it as not-acked */
- list_del(&tx->tx_zc_list);
- list_add(&tx->tx_zc_list, &zlist);
- }
-
- spin_unlock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) {
- list_del(&tx->tx_zc_list);
- ksocknal_tx_decref(tx);
- }
-}
-
-void
-ksocknal_terminate_conn(struct ksock_conn *conn)
-{
- /*
- * This gets called by the reaper (guaranteed thread context) to
- * disengage the socket from its callbacks and close it.
- * ksnc_refcount will eventually hit zero, and then the reaper will
- * destroy it.
- */
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_sched *sched = conn->ksnc_scheduler;
- int failed = 0;
-
- LASSERT(conn->ksnc_closing);
-
- /* wake up the scheduler to "send" all remaining packets to /dev/null */
- spin_lock_bh(&sched->kss_lock);
-
- /* a closing conn is always ready to tx */
- conn->ksnc_tx_ready = 1;
-
- if (!conn->ksnc_tx_scheduled &&
- !list_empty(&conn->ksnc_tx_queue)) {
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-
- /* serialise with callbacks */
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
-
- /*
- * OK, so this conn may not be completely disengaged from its
- * scheduler yet, but it _has_ committed to terminate...
- */
- conn->ksnc_scheduler->kss_nconns--;
-
- if (peer->ksnp_error) {
- /* peer's last conn closed in error */
- LASSERT(list_empty(&peer->ksnp_conns));
- failed = 1;
- peer->ksnp_error = 0; /* avoid multiple notifications */
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (failed)
- ksocknal_peer_failed(peer);
-
- /*
- * The socket is closed on the final put; either here, or in
- * ksocknal_{send,recv}msg(). Since we set up the linger2 option
- * when the connection was established, this will close the socket
- * immediately, aborting anything buffered in it. Any hung
- * zero-copy transmits will therefore complete in finite time.
- */
- ksocknal_connsock_decref(conn);
-}
-
-void
-ksocknal_queue_zombie_conn(struct ksock_conn *conn)
-{
- /* Queue the conn for the reaper to destroy */
-
- LASSERT(!atomic_read(&conn->ksnc_conn_refcount));
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-}
-
-void
-ksocknal_destroy_conn(struct ksock_conn *conn)
-{
- unsigned long last_rcv;
-
- /* Final coup-de-grace of the reaper */
- CDEBUG(D_NET, "connection %p\n", conn);
-
- LASSERT(!atomic_read(&conn->ksnc_conn_refcount));
- LASSERT(!atomic_read(&conn->ksnc_sock_refcount));
- LASSERT(!conn->ksnc_sock);
- LASSERT(!conn->ksnc_route);
- LASSERT(!conn->ksnc_tx_scheduled);
- LASSERT(!conn->ksnc_rx_scheduled);
- LASSERT(list_empty(&conn->ksnc_tx_queue));
-
- /* complete current receive if any */
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_LNET_PAYLOAD:
- last_rcv = conn->ksnc_rx_deadline -
- *ksocknal_tunables.ksnd_timeout * HZ;
- CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %zd, left: %d, last alive is %ld secs ago\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
- &conn->ksnc_ipaddr, conn->ksnc_port,
- iov_iter_count(&conn->ksnc_rx_to), conn->ksnc_rx_nob_left,
- (jiffies - last_rcv) / HZ);
- lnet_finalize(conn->ksnc_peer->ksnp_ni,
- conn->ksnc_cookie, -EIO);
- break;
- case SOCKNAL_RX_LNET_HEADER:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port,
- conn->ksnc_proto->pro_version);
- break;
- case SOCKNAL_RX_KSM_HEADER:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of ksock message from %s, ip %pI4h:%d, with error, protocol: %d.x.\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port,
- conn->ksnc_proto->pro_version);
- break;
- case SOCKNAL_RX_SLOP:
- if (conn->ksnc_rx_started)
- CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port);
- break;
- default:
- LBUG();
- break;
- }
-
- ksocknal_peer_decref(conn->ksnc_peer);
-
- kfree(conn);
-}
-
-int
-ksocknal_close_peer_conns_locked(struct ksock_peer *peer, __u32 ipaddr, int why)
-{
- struct ksock_conn *conn;
- struct list_head *ctmp;
- struct list_head *cnxt;
- int count = 0;
-
- list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) {
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- if (!ipaddr || conn->ksnc_ipaddr == ipaddr) {
- count++;
- ksocknal_close_conn_locked(conn, why);
- }
- }
-
- return count;
-}
-
-int
-ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- __u32 ipaddr = conn->ksnc_ipaddr;
- int count;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- count = ksocknal_close_peer_conns_locked(peer, ipaddr, why);
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return count;
-}
-
-int
-ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr)
-{
- struct ksock_peer *peer;
- struct list_head *ptmp;
- struct list_head *pnxt;
- int lo;
- int hi;
- int i;
- int count = 0;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (id.nid != LNET_NID_ANY) {
- lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
- } else {
- lo = 0;
- hi = ksocknal_data.ksnd_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- list_for_each_safe(ptmp, pnxt,
- &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer, ksnp_list);
-
- if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
- (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
- continue;
-
- count += ksocknal_close_peer_conns_locked(peer, ipaddr,
- 0);
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- /* wildcards always succeed */
- if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || !ipaddr)
- return 0;
-
- if (!count)
- return -ENOENT;
- else
- return 0;
-}
-
-void
-ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive)
-{
- /*
- * The router is telling me she's been notified of a change in
- * gateway state....
- */
- struct lnet_process_id id = {0};
-
- id.nid = gw_nid;
- id.pid = LNET_PID_ANY;
-
- CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
- alive ? "up" : "down");
-
- if (!alive) {
- /* If the gateway crashed, close all open connections... */
- ksocknal_close_matching_conns(id, 0);
- return;
- }
-
- /*
- * ...otherwise do nothing. We can only establish new connections
-	 * if we have autoroutes, and these connect on demand.
- */
-}
-
-void
-ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when)
-{
- int connect = 1;
- unsigned long last_alive = 0;
- unsigned long now = jiffies;
- struct ksock_peer *peer = NULL;
- rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
- struct lnet_process_id id = {
- .nid = nid,
- .pid = LNET_PID_LUSTRE,
- };
-
- read_lock(glock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) {
- struct ksock_conn *conn;
- int bufnob;
-
- list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) {
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
-
- if (bufnob < conn->ksnc_tx_bufnob) {
- /* something got ACKed */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- peer->ksnp_last_alive = now;
- conn->ksnc_tx_bufnob = bufnob;
- }
- }
-
- last_alive = peer->ksnp_last_alive;
- if (!ksocknal_find_connectable_route_locked(peer))
- connect = 0;
- }
-
- read_unlock(glock);
-
- if (last_alive)
- *when = last_alive;
-
- CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
- libcfs_nid2str(nid), peer,
- last_alive ? (now - last_alive) / HZ : -1,
- connect);
-
- if (!connect)
- return;
-
- ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
-
- write_lock_bh(glock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer)
- ksocknal_launch_all_connections_locked(peer);
-
- write_unlock_bh(glock);
-}
-
-static void
-ksocknal_push_peer(struct ksock_peer *peer)
-{
- int index;
- int i;
- struct list_head *tmp;
- struct ksock_conn *conn;
-
- for (index = 0; ; index++) {
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- i = 0;
- conn = NULL;
-
- list_for_each(tmp, &peer->ksnp_conns) {
- if (i++ == index) {
- conn = list_entry(tmp, struct ksock_conn,
- ksnc_list);
- ksocknal_conn_addref(conn);
- break;
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (!conn)
- break;
-
- ksocknal_lib_push_conn(conn);
- ksocknal_conn_decref(conn);
- }
-}
-
-static int ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id)
-{
- struct list_head *start;
- struct list_head *end;
- struct list_head *tmp;
- int rc = -ENOENT;
- unsigned int hsize = ksocknal_data.ksnd_peer_hash_size;
-
- if (id.nid == LNET_NID_ANY) {
- start = &ksocknal_data.ksnd_peers[0];
- end = &ksocknal_data.ksnd_peers[hsize - 1];
- } else {
- start = ksocknal_nid2peerlist(id.nid);
- end = ksocknal_nid2peerlist(id.nid);
- }
-
- for (tmp = start; tmp <= end; tmp++) {
- int peer_off; /* searching offset in peer hash table */
-
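-		/*
-		 * The global lock is dropped before each peer is pushed, so
-		 * restart the hash-chain walk from 'peer_off' on every pass
-		 * rather than holding a cursor across the unlock.
-		 */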
- for (peer_off = 0; ; peer_off++) {
- struct ksock_peer *peer;
- int i = 0;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- list_for_each_entry(peer, tmp, ksnp_list) {
- if (!((id.nid == LNET_NID_ANY ||
- id.nid == peer->ksnp_id.nid) &&
- (id.pid == LNET_PID_ANY ||
- id.pid == peer->ksnp_id.pid)))
- continue;
-
- if (i++ == peer_off) {
- ksocknal_peer_addref(peer);
- break;
- }
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- if (!i) /* no match */
- break;
-
- rc = 0;
- ksocknal_push_peer(peer);
- ksocknal_peer_decref(peer);
- }
- }
- return rc;
-}
-
-static int
-ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask)
-{
- struct ksock_net *net = ni->ni_data;
- struct ksock_interface *iface;
- int rc;
- int i;
- int j;
- struct list_head *ptmp;
- struct ksock_peer *peer;
- struct list_head *rtmp;
- struct ksock_route *route;
-
- if (!ipaddress || !netmask)
- return -EINVAL;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- iface = ksocknal_ip2iface(ni, ipaddress);
- if (iface) {
- /* silently ignore dups */
- rc = 0;
- } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
- rc = -ENOSPC;
- } else {
- iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
-
- iface->ksni_ipaddr = ipaddress;
- iface->ksni_netmask = netmask;
- iface->ksni_nroutes = 0;
- iface->ksni_npeers = 0;
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(ptmp, struct ksock_peer,
- ksnp_list);
-
- for (j = 0; j < peer->ksnp_n_passive_ips; j++)
- if (peer->ksnp_passive_ips[j] == ipaddress)
- iface->ksni_npeers++;
-
- list_for_each(rtmp, &peer->ksnp_routes) {
- route = list_entry(rtmp, struct ksock_route,
- ksnr_list);
-
- if (route->ksnr_myipaddr == ipaddress)
- iface->ksni_nroutes++;
- }
- }
- }
-
- rc = 0;
- /*
- * NB only new connections will pay attention to the
- * new interface!
- */
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-static void
-ksocknal_peer_del_interface_locked(struct ksock_peer *peer, __u32 ipaddr)
-{
- struct list_head *tmp;
- struct list_head *nxt;
- struct ksock_route *route;
- struct ksock_conn *conn;
- int i;
- int j;
-
- for (i = 0; i < peer->ksnp_n_passive_ips; i++)
- if (peer->ksnp_passive_ips[i] == ipaddr) {
- for (j = i + 1; j < peer->ksnp_n_passive_ips; j++)
- peer->ksnp_passive_ips[j - 1] =
- peer->ksnp_passive_ips[j];
- peer->ksnp_n_passive_ips--;
- break;
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- if (route->ksnr_myipaddr != ipaddr)
- continue;
-
- if (route->ksnr_share_count) {
- /* Manually created; keep, but unbind */
- route->ksnr_myipaddr = 0;
- } else {
- ksocknal_del_route_locked(route);
- }
- }
-
- list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
-
- if (conn->ksnc_myipaddr == ipaddr)
- ksocknal_close_conn_locked(conn, 0);
- }
-}
-
-static int
-ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress)
-{
- struct ksock_net *net = ni->ni_data;
- int rc = -ENOENT;
- struct list_head *tmp;
- struct list_head *nxt;
- struct ksock_peer *peer;
- __u32 this_ip;
- int i;
- int j;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
-
- if (!(!ipaddress || ipaddress == this_ip))
- continue;
-
- rc = 0;
-
- for (j = i + 1; j < net->ksnn_ninterfaces; j++)
- net->ksnn_interfaces[j - 1] =
- net->ksnn_interfaces[j];
-
- net->ksnn_ninterfaces--;
-
- for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
- list_for_each_safe(tmp, nxt,
- &ksocknal_data.ksnd_peers[j]) {
- peer = list_entry(tmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni != ni)
- continue;
-
- ksocknal_peer_del_interface_locked(peer, this_ip);
- }
- }
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-int
-ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
-{
- struct lnet_process_id id = {0};
- struct libcfs_ioctl_data *data = arg;
- int rc;
-
- switch (cmd) {
- case IOC_LIBCFS_GET_INTERFACE: {
- struct ksock_net *net = ni->ni_data;
- struct ksock_interface *iface;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
- rc = -ENOENT;
- } else {
- rc = 0;
- iface = &net->ksnn_interfaces[data->ioc_count];
-
- data->ioc_u32[0] = iface->ksni_ipaddr;
- data->ioc_u32[1] = iface->ksni_netmask;
- data->ioc_u32[2] = iface->ksni_npeers;
- data->ioc_u32[3] = iface->ksni_nroutes;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return rc;
- }
-
- case IOC_LIBCFS_ADD_INTERFACE:
- return ksocknal_add_interface(ni,
- data->ioc_u32[0], /* IP address */
- data->ioc_u32[1]); /* net mask */
-
- case IOC_LIBCFS_DEL_INTERFACE:
- return ksocknal_del_interface(ni,
- data->ioc_u32[0]); /* IP address */
-
- case IOC_LIBCFS_GET_PEER: {
- __u32 myip = 0;
- __u32 ip = 0;
- int port = 0;
- int conn_count = 0;
- int share_count = 0;
-
- rc = ksocknal_get_peer_info(ni, data->ioc_count,
- &id, &myip, &ip, &port,
- &conn_count, &share_count);
- if (rc)
- return rc;
-
- data->ioc_nid = id.nid;
- data->ioc_count = share_count;
- data->ioc_u32[0] = ip;
- data->ioc_u32[1] = port;
- data->ioc_u32[2] = myip;
- data->ioc_u32[3] = conn_count;
- data->ioc_u32[4] = id.pid;
- return 0;
- }
-
- case IOC_LIBCFS_ADD_PEER:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_LUSTRE;
- return ksocknal_add_peer(ni, id,
- data->ioc_u32[0], /* IP */
- data->ioc_u32[1]); /* port */
-
- case IOC_LIBCFS_DEL_PEER:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_del_peer(ni, id,
- data->ioc_u32[0]); /* IP */
-
- case IOC_LIBCFS_GET_CONN: {
- int txmem;
- int rxmem;
- int nagle;
- struct ksock_conn *conn;
-
- conn = ksocknal_get_conn_by_idx(ni, data->ioc_count);
- if (!conn)
- return -ENOENT;
-
- ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
-
- data->ioc_count = txmem;
- data->ioc_nid = conn->ksnc_peer->ksnp_id.nid;
- data->ioc_flags = nagle;
- data->ioc_u32[0] = conn->ksnc_ipaddr;
- data->ioc_u32[1] = conn->ksnc_port;
- data->ioc_u32[2] = conn->ksnc_myipaddr;
- data->ioc_u32[3] = conn->ksnc_type;
- data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
- data->ioc_u32[5] = rxmem;
- data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
- ksocknal_conn_decref(conn);
- return 0;
- }
-
- case IOC_LIBCFS_CLOSE_CONNECTION:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_close_matching_conns(id,
- data->ioc_u32[0]);
-
- case IOC_LIBCFS_REGISTER_MYNID:
- /* Ignore if this is a noop */
- if (data->ioc_nid == ni->ni_nid)
- return 0;
-
- CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
- libcfs_nid2str(data->ioc_nid),
- libcfs_nid2str(ni->ni_nid));
- return -EINVAL;
-
- case IOC_LIBCFS_PUSH_CONNECTION:
- id.nid = data->ioc_nid;
- id.pid = LNET_PID_ANY;
- return ksocknal_push(ni, id);
-
- default:
- return -EINVAL;
- }
- /* not reached */
-}
-
-static void
-ksocknal_free_buffers(void)
-{
- LASSERT(!atomic_read(&ksocknal_data.ksnd_nactive_txs));
-
- if (ksocknal_data.ksnd_sched_info) {
- struct ksock_sched_info *info;
- int i;
-
- cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info)
- kfree(info->ksi_scheds);
- cfs_percpt_free(ksocknal_data.ksnd_sched_info);
- }
-
- kvfree(ksocknal_data.ksnd_peers);
-
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
- struct list_head zlist;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
-
- list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
- list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_list) {
- list_del(&tx->tx_list);
- kfree(tx);
- }
- } else {
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- }
-}
-
-static void
-ksocknal_base_shutdown(void)
-{
- struct ksock_sched_info *info;
- struct ksock_sched *sched;
- int i;
- int j;
-
- LASSERT(!ksocknal_data.ksnd_nnets);
-
- switch (ksocknal_data.ksnd_init) {
- default:
- LASSERT(0);
- /* fall through */
- case SOCKNAL_INIT_ALL:
- case SOCKNAL_INIT_DATA:
- LASSERT(ksocknal_data.ksnd_peers);
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
- LASSERT(list_empty(&ksocknal_data.ksnd_peers[i]));
-
- LASSERT(list_empty(&ksocknal_data.ksnd_nets));
- LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns));
- LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns));
- LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs));
- LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes));
-
- if (ksocknal_data.ksnd_sched_info) {
- cfs_percpt_for_each(info, i,
- ksocknal_data.ksnd_sched_info) {
- if (!info->ksi_scheds)
- continue;
-
- for (j = 0; j < info->ksi_nthreads_max; j++) {
- sched = &info->ksi_scheds[j];
- LASSERT(list_empty(
- &sched->kss_tx_conns));
- LASSERT(list_empty(
- &sched->kss_rx_conns));
- LASSERT(list_empty(
- &sched->kss_zombie_noop_txs));
- LASSERT(!sched->kss_nconns);
- }
- }
- }
-
- /* flag threads to terminate; wake and wait for them to die */
- ksocknal_data.ksnd_shuttingdown = 1;
- wake_up_all(&ksocknal_data.ksnd_connd_waitq);
- wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
-
- if (ksocknal_data.ksnd_sched_info) {
- cfs_percpt_for_each(info, i,
- ksocknal_data.ksnd_sched_info) {
- if (!info->ksi_scheds)
- continue;
-
- for (j = 0; j < info->ksi_nthreads_max; j++) {
- sched = &info->ksi_scheds[j];
- wake_up_all(&sched->kss_waitq);
- }
- }
- }
-
- i = 4;
- read_lock(&ksocknal_data.ksnd_global_lock);
- while (ksocknal_data.ksnd_nthreads) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "waiting for %d threads to terminate\n",
- ksocknal_data.ksnd_nthreads);
- read_unlock(&ksocknal_data.ksnd_global_lock);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- read_lock(&ksocknal_data.ksnd_global_lock);
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_free_buffers();
-
- ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
- break;
- }
-
- module_put(THIS_MODULE);
-}
-
-static __u64
-ksocknal_new_incarnation(void)
-{
- /* The incarnation number is the time this module loaded and it
- * identifies this particular instance of the socknal.
- */
- return ktime_get_ns();
-}
-
-static int
-ksocknal_base_startup(void)
-{
- struct ksock_sched_info *info;
- int rc;
- int i;
-
- LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
- LASSERT(!ksocknal_data.ksnd_nnets);
-
- memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */
-
- ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
- ksocknal_data.ksnd_peers = kvmalloc_array(ksocknal_data.ksnd_peer_hash_size,
- sizeof(struct list_head),
- GFP_KERNEL);
- if (!ksocknal_data.ksnd_peers)
- return -ENOMEM;
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
- INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
-
- rwlock_init(&ksocknal_data.ksnd_global_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
-
- spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns);
- init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_lock_init(&ksocknal_data.ksnd_connd_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes);
- init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
-
- spin_lock_init(&ksocknal_data.ksnd_tx_lock);
- INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs);
-
- /* NB memset above zeros whole of ksocknal_data */
-
- /* flag lists/ptrs/locks initialised */
- ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
- try_module_get(THIS_MODULE);
-
- ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(*info));
- if (!ksocknal_data.ksnd_sched_info)
- goto failed;
-
- cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
- struct ksock_sched *sched;
- int nthrs;
-
- nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
- if (*ksocknal_tunables.ksnd_nscheds > 0) {
- nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
- } else {
- /*
-			 * cap at half of the CPUs; assume the other half is
-			 * reserved for upper layer modules
- */
- nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
- }
-
- info->ksi_nthreads_max = nthrs;
- info->ksi_cpt = i;
-
- info->ksi_scheds = kzalloc_cpt(info->ksi_nthreads_max * sizeof(*sched),
- GFP_NOFS, i);
- if (!info->ksi_scheds)
- goto failed;
-
- for (; nthrs > 0; nthrs--) {
- sched = &info->ksi_scheds[nthrs - 1];
-
- sched->kss_info = info;
- spin_lock_init(&sched->kss_lock);
- INIT_LIST_HEAD(&sched->kss_rx_conns);
- INIT_LIST_HEAD(&sched->kss_tx_conns);
- INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
- init_waitqueue_head(&sched->kss_waitq);
- }
- }
-
- ksocknal_data.ksnd_connd_starting = 0;
- ksocknal_data.ksnd_connd_failed_stamp = 0;
- ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds();
- /*
- * must have at least 2 connds to remain responsive to accepts while
- * connecting
- */
- if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
- *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
-
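-	/*
-	 * NB the assignment below repoints ksnd_nconnds_max at ksnd_nconnds,
-	 * so the effective maximum simply tracks the configured connd count.
-	 */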
- if (*ksocknal_tunables.ksnd_nconnds_max <
- *ksocknal_tunables.ksnd_nconnds) {
- ksocknal_tunables.ksnd_nconnds_max =
- ksocknal_tunables.ksnd_nconnds;
- }
-
- for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
- char name[16];
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- ksocknal_data.ksnd_connd_starting++;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-
- snprintf(name, sizeof(name), "socknal_cd%02d", i);
- rc = ksocknal_thread_start(ksocknal_connd,
- (void *)((uintptr_t)i), name);
- if (rc) {
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- ksocknal_data.ksnd_connd_starting--;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
- CERROR("Can't spawn socknal connd: %d\n", rc);
- goto failed;
- }
- }
-
- rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
- if (rc) {
- CERROR("Can't spawn socknal reaper: %d\n", rc);
- goto failed;
- }
-
- /* flag everything initialised */
- ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
-
- return 0;
-
- failed:
- ksocknal_base_shutdown();
- return -ENETDOWN;
-}
-
-static void
-ksocknal_debug_peerhash(struct lnet_ni *ni)
-{
- struct ksock_peer *peer = NULL;
- struct list_head *tmp;
- int i;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
- list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) {
- peer = list_entry(tmp, struct ksock_peer, ksnp_list);
-
- if (peer->ksnp_ni == ni)
- break;
-
- peer = NULL;
- }
- }
-
- if (peer) {
- struct ksock_route *route;
- struct ksock_conn *conn;
-
- CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n",
- libcfs_id2str(peer->ksnp_id),
- atomic_read(&peer->ksnp_refcount),
- peer->ksnp_sharecount, peer->ksnp_closing,
- peer->ksnp_accepting, peer->ksnp_error,
- peer->ksnp_zc_next_cookie,
- !list_empty(&peer->ksnp_tx_queue),
- !list_empty(&peer->ksnp_zc_req_list));
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
- CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n",
- atomic_read(&route->ksnr_refcount),
- route->ksnr_scheduled, route->ksnr_connecting,
- route->ksnr_connected, route->ksnr_deleted);
- }
-
- list_for_each(tmp, &peer->ksnp_conns) {
- conn = list_entry(tmp, struct ksock_conn, ksnc_list);
- CWARN("Conn: ref %d, sref %d, t %d, c %d\n",
- atomic_read(&conn->ksnc_conn_refcount),
- atomic_read(&conn->ksnc_sock_refcount),
- conn->ksnc_type, conn->ksnc_closing);
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-void
-ksocknal_shutdown(struct lnet_ni *ni)
-{
- struct ksock_net *net = ni->ni_data;
- int i;
- struct lnet_process_id anyid = {0};
-
- anyid.nid = LNET_NID_ANY;
- anyid.pid = LNET_PID_ANY;
-
- LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
- LASSERT(ksocknal_data.ksnd_nnets > 0);
-
- spin_lock_bh(&net->ksnn_lock);
- net->ksnn_shutdown = 1; /* prevent new peers */
- spin_unlock_bh(&net->ksnn_lock);
-
- /* Delete all peers */
- ksocknal_del_peer(ni, anyid, 0);
-
- /* Wait for all peer state to clean up */
- i = 2;
- spin_lock_bh(&net->ksnn_lock);
- while (net->ksnn_npeers) {
- spin_unlock_bh(&net->ksnn_lock);
-
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "waiting for %d peers to disconnect\n",
- net->ksnn_npeers);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
-
- ksocknal_debug_peerhash(ni);
-
- spin_lock_bh(&net->ksnn_lock);
- }
- spin_unlock_bh(&net->ksnn_lock);
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- LASSERT(!net->ksnn_interfaces[i].ksni_npeers);
- LASSERT(!net->ksnn_interfaces[i].ksni_nroutes);
- }
-
- list_del(&net->ksnn_list);
- kfree(net);
-
- ksocknal_data.ksnd_nnets--;
- if (!ksocknal_data.ksnd_nnets)
- ksocknal_base_shutdown();
-}
-
-static int
-ksocknal_enumerate_interfaces(struct ksock_net *net)
-{
- char **names;
- int i;
- int j;
- int rc;
- int n;
-
- n = lnet_ipif_enumerate(&names);
- if (n <= 0) {
- CERROR("Can't enumerate interfaces: %d\n", n);
- return n;
- }
-
- for (i = j = 0; i < n; i++) {
- int up;
- __u32 ip;
- __u32 mask;
-
- if (!strcmp(names[i], "lo")) /* skip the loopback IF */
- continue;
-
- rc = lnet_ipif_query(names[i], &up, &ip, &mask);
- if (rc) {
- CWARN("Can't get interface %s info: %d\n",
- names[i], rc);
- continue;
- }
-
- if (!up) {
- CWARN("Ignoring interface %s (down)\n",
- names[i]);
- continue;
- }
-
- if (j == LNET_MAX_INTERFACES) {
- CWARN("Ignoring interface %s (too many interfaces)\n",
- names[i]);
- continue;
- }
-
- net->ksnn_interfaces[j].ksni_ipaddr = ip;
- net->ksnn_interfaces[j].ksni_netmask = mask;
- strlcpy(net->ksnn_interfaces[j].ksni_name,
- names[i], sizeof(net->ksnn_interfaces[j].ksni_name));
- j++;
- }
-
- lnet_ipif_free_enumeration(names, n);
-
- if (!j)
- CERROR("Can't find any usable interfaces\n");
-
- return j;
-}
-
-static int
-ksocknal_search_new_ipif(struct ksock_net *net)
-{
- int new_ipif = 0;
- int i;
-
- for (i = 0; i < net->ksnn_ninterfaces; i++) {
- char *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
- char *colon = strchr(ifnam, ':');
- int found = 0;
- struct ksock_net *tmp;
- int j;
-
- if (colon) /* ignore alias device */
- *colon = 0;
-
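-		/*
-		 * Compare base device names (alias suffix stripped above)
-		 * against the interfaces of every net already configured.
-		 */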
- list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) {
- for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
- char *ifnam2 =
- &tmp->ksnn_interfaces[j].ksni_name[0];
- char *colon2 = strchr(ifnam2, ':');
-
- if (colon2)
- *colon2 = 0;
-
- found = !strcmp(ifnam, ifnam2);
- if (colon2)
- *colon2 = ':';
- }
- if (found)
- break;
- }
-
- new_ipif += !found;
- if (colon)
- *colon = ':';
- }
-
- return new_ipif;
-}
-
-static int
-ksocknal_start_schedulers(struct ksock_sched_info *info)
-{
- int nthrs;
- int rc = 0;
- int i;
-
- if (!info->ksi_nthreads) {
- if (*ksocknal_tunables.ksnd_nscheds > 0) {
- nthrs = info->ksi_nthreads_max;
- } else {
- nthrs = cfs_cpt_weight(lnet_cpt_table(),
- info->ksi_cpt);
- nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
- nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
- }
- nthrs = min(nthrs, info->ksi_nthreads_max);
- } else {
- LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
-		/* start at most two more threads when a new interface appears */
- nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
- }
-
- for (i = 0; i < nthrs; i++) {
- long id;
- char name[20];
- struct ksock_sched *sched;
-
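-		/*
-		 * The thread id packs (cpt, scheduler index); KSOCK_THREAD_SID
-		 * recovers the index of this thread's scheduler.
-		 */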
- id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
- sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
- snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
- info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
-
- rc = ksocknal_thread_start(ksocknal_scheduler,
- (void *)id, name);
- if (!rc)
- continue;
-
- CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
- info->ksi_cpt, info->ksi_nthreads + i, rc);
- break;
- }
-
- info->ksi_nthreads += i;
- return rc;
-}
-
-static int
-ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts)
-{
- int newif = ksocknal_search_new_ipif(net);
- int rc;
- int i;
-
- LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
-
- for (i = 0; i < ncpts; i++) {
- struct ksock_sched_info *info;
- int cpt = !cpts ? i : cpts[i];
-
- LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
- info = ksocknal_data.ksnd_sched_info[cpt];
-
- if (!newif && info->ksi_nthreads > 0)
- continue;
-
- rc = ksocknal_start_schedulers(info);
- if (rc)
- return rc;
- }
- return 0;
-}
-
-int
-ksocknal_startup(struct lnet_ni *ni)
-{
- struct ksock_net *net;
- int rc;
- int i;
-
- LASSERT(ni->ni_lnd == &the_ksocklnd);
-
- if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
- rc = ksocknal_base_startup();
- if (rc)
- return rc;
- }
-
- net = kzalloc(sizeof(*net), GFP_NOFS);
- if (!net)
- goto fail_0;
-
- spin_lock_init(&net->ksnn_lock);
- net->ksnn_incarnation = ksocknal_new_incarnation();
- ni->ni_data = net;
- ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout;
- ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
- ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits;
- ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
-
- if (!ni->ni_interfaces[0]) {
- rc = ksocknal_enumerate_interfaces(net);
- if (rc <= 0)
- goto fail_1;
-
- net->ksnn_ninterfaces = 1;
- } else {
- for (i = 0; i < LNET_MAX_INTERFACES; i++) {
- int up;
-
- if (!ni->ni_interfaces[i])
- break;
-
- rc = lnet_ipif_query(ni->ni_interfaces[i], &up,
- &net->ksnn_interfaces[i].ksni_ipaddr,
- &net->ksnn_interfaces[i].ksni_netmask);
-
- if (rc) {
- CERROR("Can't get interface %s info: %d\n",
- ni->ni_interfaces[i], rc);
- goto fail_1;
- }
-
- if (!up) {
- CERROR("Interface %s is down\n",
- ni->ni_interfaces[i]);
- goto fail_1;
- }
-
- strlcpy(net->ksnn_interfaces[i].ksni_name,
- ni->ni_interfaces[i],
- sizeof(net->ksnn_interfaces[i].ksni_name));
- }
- net->ksnn_ninterfaces = i;
- }
-
- /* must be called before adding the net to ksocknal_data.ksnd_nets */
- rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
- if (rc)
- goto fail_1;
-
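- /* the NID address part is taken from the IP of the first configured interface */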
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
- net->ksnn_interfaces[0].ksni_ipaddr);
- list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
-
- ksocknal_data.ksnd_nnets++;
-
- return 0;
-
- fail_1:
- kfree(net);
- fail_0:
- if (!ksocknal_data.ksnd_nnets)
- ksocknal_base_shutdown();
-
- return -ENETDOWN;
-}
-
-static void __exit ksocklnd_exit(void)
-{
- lnet_unregister_lnd(&the_ksocklnd);
-}
-
-static int __init ksocklnd_init(void)
-{
- int rc;
-
- /* check that the ksnr_connected/connecting bitfields are large enough */
- BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4);
- BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN);
-
- /* initialize the_ksocklnd */
- the_ksocklnd.lnd_type = SOCKLND;
- the_ksocklnd.lnd_startup = ksocknal_startup;
- the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
- the_ksocklnd.lnd_ctl = ksocknal_ctl;
- the_ksocklnd.lnd_send = ksocknal_send;
- the_ksocklnd.lnd_recv = ksocknal_recv;
- the_ksocklnd.lnd_notify = ksocknal_notify;
- the_ksocklnd.lnd_query = ksocknal_query;
- the_ksocklnd.lnd_accept = ksocknal_accept;
-
- rc = ksocknal_tunables_init();
- if (rc)
- return rc;
-
- rc = libcfs_setup();
- if (rc)
- return rc;
-
- lnet_register_lnd(&the_ksocklnd);
-
- return 0;
-}
-
-MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("TCP Socket LNet Network Driver");
-MODULE_VERSION("2.7.0");
-MODULE_LICENSE("GPL");
-
-module_init(ksocklnd_init);
-module_exit(ksocklnd_exit);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
deleted file mode 100644
index 4e5c89a..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
+++ /dev/null
@@ -1,704 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Lustre, http://www.lustre.org
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _SOCKLND_SOCKLND_H_
-#define _SOCKLND_SOCKLND_H_
-
-#define DEBUG_PORTAL_ALLOC
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <linux/crc32.h>
-#include <linux/errno.h>
-#include <linux/if.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/sysctl.h>
-#include <linux/uio.h>
-#include <linux/unistd.h>
-#include <asm/irq.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-
-#include <linux/lnet/lib-lnet.h>
-#include <linux/lnet/socklnd.h>
-
-/* assume one thread for each connection type */
-#define SOCKNAL_NSCHEDS 3
-#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1)
-
-#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
-#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */
-#define SOCKNAL_INSANITY_RECONN 5000 /* connd appears to be retrying reconnection endlessly */
-#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */
-
-#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */
-#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */
-
-#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */
-
-/*
- * risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
- * no risk if we're not running on a CONFIG_HIGHMEM platform.
- */
-#ifdef CONFIG_HIGHMEM
-# define SOCKNAL_RISK_KMAP_DEADLOCK 0
-#else
-# define SOCKNAL_RISK_KMAP_DEADLOCK 1
-#endif
-
-struct ksock_sched_info;
-
-struct ksock_sched { /* per scheduler state */
- spinlock_t kss_lock; /* serialise */
- struct list_head kss_rx_conns; /* conn waiting to be read */
- struct list_head kss_tx_conns; /* conn waiting to be written */
- struct list_head kss_zombie_noop_txs; /* zombie noop tx list */
- wait_queue_head_t kss_waitq; /* where scheduler sleeps */
- int kss_nconns; /* # connections assigned to
- * this scheduler
- */
- struct ksock_sched_info *kss_info; /* owner of it */
-};
-
-struct ksock_sched_info {
- int ksi_nthreads_max; /* max allowed threads */
- int ksi_nthreads; /* number of threads */
- int ksi_cpt; /* CPT id */
- struct ksock_sched *ksi_scheds; /* array of schedulers */
-};
-
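-/* scheduler thread id: CPT number in the high bits, scheduler index in the low bits */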
-#define KSOCK_CPT_SHIFT 16
-#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid))
-#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT)
-#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
-
-struct ksock_interface { /* in-use interface */
- __u32 ksni_ipaddr; /* interface's IP address */
- __u32 ksni_netmask; /* interface's network mask */
- int ksni_nroutes; /* # routes using (active) */
- int ksni_npeers; /* # peers using (passive) */
- char ksni_name[IFNAMSIZ]; /* interface name */
-};
-
-struct ksock_tunables {
- int *ksnd_timeout; /* "stuck" socket timeout
- * (seconds)
- */
- int *ksnd_nscheds; /* # scheduler threads in each
- * pool while starting
- */
- int *ksnd_nconnds; /* # connection daemons */
- int *ksnd_nconnds_max; /* max # connection daemons */
- int *ksnd_min_reconnectms; /* first connection retry after
- * (ms)...
- */
- int *ksnd_max_reconnectms; /* ...exponentially increasing to
- * this
- */
- int *ksnd_eager_ack; /* make TCP ack eagerly? */
- int *ksnd_typed_conns; /* drive sockets by type? */
- int *ksnd_min_bulk; /* smallest "large" message */
- int *ksnd_tx_buffer_size; /* socket tx buffer size */
- int *ksnd_rx_buffer_size; /* socket rx buffer size */
- int *ksnd_nagle; /* enable NAGLE? */
- int *ksnd_round_robin; /* round robin for multiple
- * interfaces
- */
- int *ksnd_keepalive; /* # secs for sending keepalive
- * NOOP
- */
- int *ksnd_keepalive_idle; /* # idle secs before 1st probe
- */
- int *ksnd_keepalive_count; /* # probes */
- int *ksnd_keepalive_intvl; /* time between probes */
- int *ksnd_credits; /* # concurrent sends */
- int *ksnd_peertxcredits; /* # concurrent sends to 1 peer
- */
- int *ksnd_peerrtrcredits; /* # per-peer router buffer
- * credits
- */
- int *ksnd_peertimeout; /* seconds to consider peer dead
- */
- int *ksnd_enable_csum; /* enable checksum */
- int *ksnd_inject_csum_error; /* set non-zero to inject
- * checksum error
- */
- int *ksnd_nonblk_zcack; /* always send zc-ack on
- * non-blocking connection
- */
- unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload
- * size
- */
- int *ksnd_zc_recv; /* enable ZC receive (for
- * Chelsio TOE)
- */
- int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to
- * enable ZC receive
- */
-};
-
-struct ksock_net {
- __u64 ksnn_incarnation; /* my epoch */
- spinlock_t ksnn_lock; /* serialise */
- struct list_head ksnn_list; /* chain on global list */
- int ksnn_npeers; /* # peers */
- int ksnn_shutdown; /* shutting down? */
- int ksnn_ninterfaces; /* # IP interfaces */
- struct ksock_interface ksnn_interfaces[LNET_MAX_INTERFACES];
-};
-
-/** connd timeout */
-#define SOCKNAL_CONND_TIMEOUT 120
-/** reserved thread for accepting & creating new connd */
-#define SOCKNAL_CONND_RESV 1
-
-struct ksock_nal_data {
- int ksnd_init; /* initialisation state
- */
- int ksnd_nnets; /* # networks set up */
- struct list_head ksnd_nets; /* list of nets */
- rwlock_t ksnd_global_lock; /* stabilize peer/conn
- * ops
- */
- struct list_head *ksnd_peers; /* hash table of all my
- * known peers
- */
- int ksnd_peer_hash_size; /* size of ksnd_peers */
-
- int ksnd_nthreads; /* # live threads */
- int ksnd_shuttingdown; /* tell threads to exit
- */
- struct ksock_sched_info **ksnd_sched_info; /* schedulers info */
-
- atomic_t ksnd_nactive_txs; /* #active txs */
-
- struct list_head ksnd_deathrow_conns; /* conns to close:
- * reaper_lock
- */
- struct list_head ksnd_zombie_conns; /* conns to free:
- * reaper_lock
- */
- struct list_head ksnd_enomem_conns; /* conns to retry:
- * reaper_lock
- */
- wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */
- unsigned long ksnd_reaper_waketime; /* when reaper will wake
- */
- spinlock_t ksnd_reaper_lock; /* serialise */
-
- int ksnd_enomem_tx; /* test ENOMEM sender */
- int ksnd_stall_tx; /* test sluggish sender
- */
- int ksnd_stall_rx; /* test sluggish
- * receiver
- */
- struct list_head ksnd_connd_connreqs; /* incoming connection
- * requests
- */
- struct list_head ksnd_connd_routes; /* routes waiting to be
- * connected
- */
- wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */
- int ksnd_connd_connecting; /* # connds connecting
- */
- time64_t ksnd_connd_failed_stamp;/* time stamp of the
- * last failed
- * connecting attempt
- */
- time64_t ksnd_connd_starting_stamp;/* time stamp of the
- * last starting connd
- */
- unsigned int ksnd_connd_starting; /* # starting connd */
- unsigned int ksnd_connd_running; /* # running connd */
- spinlock_t ksnd_connd_lock; /* serialise */
-
- struct list_head ksnd_idle_noop_txs; /* list head for freed
- * noop tx
- */
- spinlock_t ksnd_tx_lock; /* serialise, g_lock
- * unsafe
- */
-};
-
-#define SOCKNAL_INIT_NOTHING 0
-#define SOCKNAL_INIT_DATA 1
-#define SOCKNAL_INIT_ALL 2
-
-/*
- * A packet just assembled for transmission is represented by 1 or more
- * struct iovec fragments (the first frag contains the portals header),
- * followed by 0 or more struct bio_vec fragments.
- *
- * On the receive side, initially 1 struct iovec fragment is posted for
- * receive (the header). Once the header has been received, the payload is
- * received into either struct iovec or struct bio_vec fragments, depending on
- * what the header matched or whether the message needs forwarding.
- */
-struct ksock_conn; /* forward ref */
-struct ksock_peer; /* forward ref */
-struct ksock_route; /* forward ref */
-struct ksock_proto; /* forward ref */
-
-struct ksock_tx { /* transmit packet */
- struct list_head tx_list; /* queue on conn for transmission etc
- */
- struct list_head tx_zc_list; /* queue on peer for ZC request */
- atomic_t tx_refcount; /* tx reference count */
- int tx_nob; /* # packet bytes */
- int tx_resid; /* residual bytes */
- int tx_niov; /* # packet iovec frags */
- struct kvec *tx_iov; /* packet iovec frags */
- int tx_nkiov; /* # packet page frags */
- unsigned short tx_zc_aborted; /* aborted ZC request */
- unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
- unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
- unsigned short tx_nonblk:1; /* it's a non-blocking ACK */
- struct bio_vec *tx_kiov; /* packet page frags */
- struct ksock_conn *tx_conn; /* owning conn */
- struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize()
- */
- unsigned long tx_deadline; /* when (in jiffies) tx times out */
- struct ksock_msg tx_msg; /* socklnd message buffer */
- int tx_desc_size; /* size of this descriptor */
- union {
- struct {
- struct kvec iov; /* virt hdr */
- struct bio_vec kiov[0]; /* paged payload */
- } paged;
- struct {
- struct kvec iov[1]; /* virt hdr + payload */
- } virt;
- } tx_frags;
-};
-
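-/* size of a tx descriptor carrying only the message header (NOOP/ZC-ACK, no payload frags) */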
-#define KSOCK_NOOP_TX_SIZE (offsetof(struct ksock_tx, tx_frags.paged.kiov[0]))
-
-/* network zero copy callback descriptor embedded in struct ksock_tx */
-
-#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */
-#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */
-#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */
-#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */
-#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */
-#define SOCKNAL_RX_SLOP 6 /* skipping body */
-
-struct ksock_conn {
- struct ksock_peer *ksnc_peer; /* owning peer */
- struct ksock_route *ksnc_route; /* owning route */
- struct list_head ksnc_list; /* stash on peer's conn list */
- struct socket *ksnc_sock; /* actual socket */
- void *ksnc_saved_data_ready; /* socket's original
- * data_ready() callback
- */
- void *ksnc_saved_write_space; /* socket's original
- * write_space() callback
- */
- atomic_t ksnc_conn_refcount;/* conn refcount */
- atomic_t ksnc_sock_refcount;/* sock refcount */
- struct ksock_sched *ksnc_scheduler; /* who schedules this connection
- */
- __u32 ksnc_myipaddr; /* my IP */
- __u32 ksnc_ipaddr; /* peer's IP */
- int ksnc_port; /* peer's port */
- signed int ksnc_type:3; /* type of connection, should be
- * signed value
- */
- unsigned int ksnc_closing:1; /* being shut down */
- unsigned int ksnc_flip:1; /* byte-swap incoming msgs? (V2.x only) */
- unsigned int ksnc_zc_capable:1; /* enabled for zero copy */
- struct ksock_proto *ksnc_proto; /* protocol for the connection */
-
- /* reader */
- struct list_head ksnc_rx_list; /* where I enq waiting input or a
- * forwarding descriptor
- */
- unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times
- * out
- */
- __u8 ksnc_rx_started; /* started receiving a message */
- __u8 ksnc_rx_ready; /* data ready to read */
- __u8 ksnc_rx_scheduled; /* being progressed */
- __u8 ksnc_rx_state; /* what is being read */
- int ksnc_rx_nob_left; /* # bytes to next hdr/body */
- struct iov_iter ksnc_rx_to; /* copy destination */
- struct kvec ksnc_rx_iov_space[LNET_MAX_IOV]; /* space for frag descriptors */
- __u32 ksnc_rx_csum; /* partial checksum for incoming
- * data
- */
- void *ksnc_cookie; /* rx lnet_finalize passthru arg
- */
- struct ksock_msg ksnc_msg; /* incoming message buffer:
- * V2.x message takes the
- * whole struct
- * V1.x message is a bare
- * struct lnet_hdr, it's stored in
- * ksnc_msg.ksm_u.lnetmsg
- */
- /* WRITER */
- struct list_head ksnc_tx_list; /* where I enq waiting for output
- * space
- */
- struct list_head ksnc_tx_queue; /* packets waiting to be sent */
- struct ksock_tx *ksnc_tx_carrier; /* next TX that can carry a LNet
- * message or ZC-ACK
- */
- unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out
- */
- int ksnc_tx_bufnob; /* send buffer marker */
- atomic_t ksnc_tx_nob; /* # bytes queued */
- int ksnc_tx_ready; /* write space */
- int ksnc_tx_scheduled; /* being progressed */
- unsigned long ksnc_tx_last_post; /* time stamp of the last posted
- * TX
- */
-};
-
-struct ksock_route {
- struct list_head ksnr_list; /* chain on peer route list */
- struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */
- struct ksock_peer *ksnr_peer; /* owning peer */
- atomic_t ksnr_refcount; /* # users */
- unsigned long ksnr_timeout; /* when (in jiffies) reconnection
- * can happen next
- */
- long ksnr_retry_interval; /* how long between retries */
- __u32 ksnr_myipaddr; /* my IP */
- __u32 ksnr_ipaddr; /* IP address to connect to */
- int ksnr_port; /* port to connect to */
- unsigned int ksnr_scheduled:1; /* scheduled for attention */
- unsigned int ksnr_connecting:1; /* connection establishment in
- * progress
- */
- unsigned int ksnr_connected:4; /* connections established by
- * type
- */
- unsigned int ksnr_deleted:1; /* been removed from peer? */
- unsigned int ksnr_share_count; /* created explicitly? */
- int ksnr_conn_count; /* # conns established by this
- * route
- */
-};
-
-#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */
-
-struct ksock_peer {
- struct list_head ksnp_list; /* stash on global peer list */
- unsigned long ksnp_last_alive; /* when (in jiffies) I was last
- * alive
- */
- struct lnet_process_id ksnp_id; /* who's on the other end(s) */
- atomic_t ksnp_refcount; /* # users */
- int ksnp_sharecount; /* lconf usage counter */
- int ksnp_closing; /* being closed */
- int ksnp_accepting; /* # passive connections pending
- */
- int ksnp_error; /* errno on closing last conn */
- __u64 ksnp_zc_next_cookie; /* ZC completion cookie */
- __u64 ksnp_incarnation; /* latest known peer incarnation
- */
- struct ksock_proto *ksnp_proto; /* latest known peer protocol */
- struct list_head ksnp_conns; /* all active connections */
- struct list_head ksnp_routes; /* routes */
- struct list_head ksnp_tx_queue; /* waiting packets */
- spinlock_t ksnp_lock; /* serialize, g_lock unsafe */
- struct list_head ksnp_zc_req_list; /* zero copy requests wait for
- * ACK
- */
- unsigned long ksnp_send_keepalive; /* time to send keepalive */
- struct lnet_ni *ksnp_ni; /* which network */
- int ksnp_n_passive_ips; /* # of... */
-
- /* preferred local interfaces */
- __u32 ksnp_passive_ips[LNET_MAX_INTERFACES];
-};
-
-struct ksock_connreq {
- struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */
- struct lnet_ni *ksncr_ni; /* chosen NI */
- struct socket *ksncr_sock; /* accepted socket */
-};
-
-extern struct ksock_nal_data ksocknal_data;
-extern struct ksock_tunables ksocknal_tunables;
-
-#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */
-#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */
-#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not
- * preferred
- */
-
-struct ksock_proto {
- /* version number of protocol */
- int pro_version;
-
- /* handshake: send hello to the peer */
- int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *);
-
- /* handshake: receive hello from the peer */
- int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int);
-
- /* message pack */
- void (*pro_pack)(struct ksock_tx *);
-
- /* message unpack */
- void (*pro_unpack)(struct ksock_msg *);
-
- /* queue tx on the connection */
- struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *);
-
- /* queue ZC ack on the connection */
- int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64);
-
- /* handle ZC request */
- int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int);
-
- /* handle ZC ACK */
- int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64);
-
- /*
- * Does the msg type match the connection type?
- * Return value:
- * MATCH_NO : no
- * MATCH_YES : matching type
- * MATCH_MAY : can be used as a backup
- */
- int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int);
-};
-
-extern struct ksock_proto ksocknal_protocol_v1x;
-extern struct ksock_proto ksocknal_protocol_v2x;
-extern struct ksock_proto ksocknal_protocol_v3x;
-
-#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR
-#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR
-#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR
-
-#ifndef CPU_MASK_NONE
-#define CPU_MASK_NONE 0UL
-#endif
-
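-/* bitmask of connection types that a fully-connected route must have established */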
-static inline int
-ksocknal_route_mask(void)
-{
- if (!*ksocknal_tunables.ksnd_typed_conns)
- return (1 << SOCKLND_CONN_ANY);
-
- return ((1 << SOCKLND_CONN_CONTROL) |
- (1 << SOCKLND_CONN_BULK_IN) |
- (1 << SOCKLND_CONN_BULK_OUT));
-}
-
-static inline struct list_head *
-ksocknal_nid2peerlist(lnet_nid_t nid)
-{
- unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
-
- return &ksocknal_data.ksnd_peers[hash];
-}
-
-static inline void
-ksocknal_conn_addref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
- atomic_inc(&conn->ksnc_conn_refcount);
-}
-
-void ksocknal_queue_zombie_conn(struct ksock_conn *conn);
-void ksocknal_finalize_zcreq(struct ksock_conn *conn);
-
-static inline void
-ksocknal_conn_decref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
- if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
- ksocknal_queue_zombie_conn(conn);
-}
-
-static inline int
-ksocknal_connsock_addref(struct ksock_conn *conn)
-{
- int rc = -ESHUTDOWN;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
- if (!conn->ksnc_closing) {
- LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
- atomic_inc(&conn->ksnc_sock_refcount);
- rc = 0;
- }
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- return rc;
-}
-
-static inline void
-ksocknal_connsock_decref(struct ksock_conn *conn)
-{
- LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
- if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
- LASSERT(conn->ksnc_closing);
- sock_release(conn->ksnc_sock);
- conn->ksnc_sock = NULL;
- ksocknal_finalize_zcreq(conn);
- }
-}
-
-static inline void
-ksocknal_tx_addref(struct ksock_tx *tx)
-{
- LASSERT(atomic_read(&tx->tx_refcount) > 0);
- atomic_inc(&tx->tx_refcount);
-}
-
-void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx);
-void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx);
-
-static inline void
-ksocknal_tx_decref(struct ksock_tx *tx)
-{
- LASSERT(atomic_read(&tx->tx_refcount) > 0);
- if (atomic_dec_and_test(&tx->tx_refcount))
- ksocknal_tx_done(NULL, tx);
-}
-
-static inline void
-ksocknal_route_addref(struct ksock_route *route)
-{
- LASSERT(atomic_read(&route->ksnr_refcount) > 0);
- atomic_inc(&route->ksnr_refcount);
-}
-
-void ksocknal_destroy_route(struct ksock_route *route);
-
-static inline void
-ksocknal_route_decref(struct ksock_route *route)
-{
- LASSERT(atomic_read(&route->ksnr_refcount) > 0);
- if (atomic_dec_and_test(&route->ksnr_refcount))
- ksocknal_destroy_route(route);
-}
-
-static inline void
-ksocknal_peer_addref(struct ksock_peer *peer)
-{
- LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
- atomic_inc(&peer->ksnp_refcount);
-}
-
-void ksocknal_destroy_peer(struct ksock_peer *peer);
-
-static inline void
-ksocknal_peer_decref(struct ksock_peer *peer)
-{
- LASSERT(atomic_read(&peer->ksnp_refcount) > 0);
- if (atomic_dec_and_test(&peer->ksnp_refcount))
- ksocknal_destroy_peer(peer);
-}
-
-int ksocknal_startup(struct lnet_ni *ni);
-void ksocknal_shutdown(struct lnet_ni *ni);
-int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg);
-int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg);
-int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
- int delayed, struct iov_iter *to, unsigned int rlen);
-int ksocknal_accept(struct lnet_ni *ni, struct socket *sock);
-
-int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip,
- int port);
-struct ksock_peer *ksocknal_find_peer_locked(struct lnet_ni *ni,
- struct lnet_process_id id);
-struct ksock_peer *ksocknal_find_peer(struct lnet_ni *ni,
- struct lnet_process_id id);
-void ksocknal_peer_failed(struct ksock_peer *peer);
-int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route,
- struct socket *sock, int type);
-void ksocknal_close_conn_locked(struct ksock_conn *conn, int why);
-void ksocknal_terminate_conn(struct ksock_conn *conn);
-void ksocknal_destroy_conn(struct ksock_conn *conn);
-int ksocknal_close_peer_conns_locked(struct ksock_peer *peer,
- __u32 ipaddr, int why);
-int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why);
-int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr);
-struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer,
- struct ksock_tx *tx, int nonblk);
-
-int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
- struct lnet_process_id id);
-struct ksock_tx *ksocknal_alloc_tx(int type, int size);
-void ksocknal_free_tx(struct ksock_tx *tx);
-struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
-void ksocknal_next_tx_carrier(struct ksock_conn *conn);
-void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn);
-void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error);
-void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive);
-void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when);
-int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
-void ksocknal_thread_fini(void);
-void ksocknal_launch_all_connections_locked(struct ksock_peer *peer);
-struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer *peer);
-struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer *peer);
-int ksocknal_new_packet(struct ksock_conn *conn, int skip);
-int ksocknal_scheduler(void *arg);
-int ksocknal_connd(void *arg);
-int ksocknal_reaper(void *arg);
-int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- lnet_nid_t peer_nid, struct ksock_hello_msg *hello);
-int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- struct ksock_hello_msg *hello,
- struct lnet_process_id *id,
- __u64 *incarnation);
-void ksocknal_read_callback(struct ksock_conn *conn);
-void ksocknal_write_callback(struct ksock_conn *conn);
-
-int ksocknal_lib_zc_capable(struct ksock_conn *conn);
-void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn);
-void ksocknal_lib_push_conn(struct ksock_conn *conn);
-int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn);
-int ksocknal_lib_setup_sock(struct socket *so);
-int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx);
-int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx);
-void ksocknal_lib_eager_ack(struct ksock_conn *conn);
-int ksocknal_lib_recv(struct ksock_conn *conn);
-int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem,
- int *rxmem, int *nagle);
-
-
-int ksocknal_tunables_init(void);
-
-void ksocknal_lib_csum_tx(struct ksock_tx *tx);
-
-int ksocknal_lib_memory_pressure(struct ksock_conn *conn);
-int ksocknal_lib_bind_thread_to_cpu(int id);
-
-#endif /* _SOCKLND_SOCKLND_H_ */
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
deleted file mode 100644
index 01b31a6..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ /dev/null
@@ -1,2586 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/sched/mm.h>
-#include "socklnd.h"
-
-struct ksock_tx *
-ksocknal_alloc_tx(int type, int size)
-{
- struct ksock_tx *tx = NULL;
-
- if (type == KSOCK_MSG_NOOP) {
- LASSERT(size == KSOCK_NOOP_TX_SIZE);
-
- /* searching for a noop tx in free list */
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
- tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
- struct ksock_tx, tx_list);
- LASSERT(tx->tx_desc_size == size);
- list_del(&tx->tx_list);
- }
-
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- }
-
- if (!tx)
- tx = kzalloc(size, GFP_NOFS);
-
- if (!tx)
- return NULL;
-
- atomic_set(&tx->tx_refcount, 1);
- tx->tx_zc_aborted = 0;
- tx->tx_zc_capable = 0;
- tx->tx_zc_checked = 0;
- tx->tx_desc_size = size;
-
- atomic_inc(&ksocknal_data.ksnd_nactive_txs);
-
- return tx;
-}
-
-struct ksock_tx *
-ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
-{
- struct ksock_tx *tx;
-
- tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
- if (!tx) {
- CERROR("Can't allocate noop tx desc\n");
- return NULL;
- }
-
- tx->tx_conn = NULL;
- tx->tx_lnetmsg = NULL;
- tx->tx_kiov = NULL;
- tx->tx_nkiov = 0;
- tx->tx_iov = tx->tx_frags.virt.iov;
- tx->tx_niov = 1;
- tx->tx_nonblk = nonblk;
-
- tx->tx_msg.ksm_csum = 0;
- tx->tx_msg.ksm_type = KSOCK_MSG_NOOP;
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
-
- return tx;
-}
-
-void
-ksocknal_free_tx(struct ksock_tx *tx)
-{
- atomic_dec(&ksocknal_data.ksnd_nactive_txs);
-
- if (!tx->tx_lnetmsg && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
- /* it's a noop tx */
- spin_lock(&ksocknal_data.ksnd_tx_lock);
-
- list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
-
- spin_unlock(&ksocknal_data.ksnd_tx_lock);
- } else {
- kfree(tx);
- }
-}
-
-static int
-ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct kvec *iov = tx->tx_iov;
- int nob;
- int rc;
-
- LASSERT(tx->tx_niov > 0);
-
- /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
- rc = ksocknal_lib_send_iov(conn, tx);
-
- if (rc <= 0) /* sent nothing? */
- return rc;
-
- nob = rc;
- LASSERT(nob <= tx->tx_resid);
- tx->tx_resid -= nob;
-
- /* "consume" iov */
- do {
- LASSERT(tx->tx_niov > 0);
-
- if (nob < (int)iov->iov_len) {
- iov->iov_base = (void *)((char *)iov->iov_base + nob);
- iov->iov_len -= nob;
- return rc;
- }
-
- nob -= iov->iov_len;
- tx->tx_iov = ++iov;
- tx->tx_niov--;
- } while (nob);
-
- return rc;
-}
-
-static int
-ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct bio_vec *kiov = tx->tx_kiov;
- int nob;
- int rc;
-
- LASSERT(!tx->tx_niov);
- LASSERT(tx->tx_nkiov > 0);
-
- /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
- rc = ksocknal_lib_send_kiov(conn, tx);
-
- if (rc <= 0) /* sent nothing? */
- return rc;
-
- nob = rc;
- LASSERT(nob <= tx->tx_resid);
- tx->tx_resid -= nob;
-
- /* "consume" kiov */
- do {
- LASSERT(tx->tx_nkiov > 0);
-
- if (nob < (int)kiov->bv_len) {
- kiov->bv_offset += nob;
- kiov->bv_len -= nob;
- return rc;
- }
-
- nob -= (int)kiov->bv_len;
- tx->tx_kiov = ++kiov;
- tx->tx_nkiov--;
- } while (nob);
-
- return rc;
-}
-
-static int
-ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- int rc;
- int bufnob;
-
- if (ksocknal_data.ksnd_stall_tx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ksocknal_data.ksnd_stall_tx * HZ);
- }
-
- LASSERT(tx->tx_resid);
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- return -ESHUTDOWN;
- }
-
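- /* keep sending until the whole tx is consumed, we hit EAGAIN/ENOMEM, or an error occurs */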
- do {
- if (ksocknal_data.ksnd_enomem_tx > 0) {
- /* testing... */
- ksocknal_data.ksnd_enomem_tx--;
- rc = -EAGAIN;
- } else if (tx->tx_niov) {
- rc = ksocknal_send_iov(conn, tx);
- } else {
- rc = ksocknal_send_kiov(conn, tx);
- }
-
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
- if (rc > 0) /* sent something? */
- conn->ksnc_tx_bufnob += rc; /* account it */
-
- if (bufnob < conn->ksnc_tx_bufnob) {
- /*
- * allocated send buffer bytes < computed; infer
- * something got ACKed
- */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_tx_bufnob = bufnob;
- mb();
- }
-
- if (rc <= 0) { /* Didn't write anything? */
-
- if (!rc) /* some stacks return 0 instead of -EAGAIN */
- rc = -EAGAIN;
-
- /* Check if EAGAIN is due to memory pressure */
- if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
- rc = -ENOMEM;
-
- break;
- }
-
- /* socket's wmem_queued now includes 'rc' bytes */
- atomic_sub(rc, &conn->ksnc_tx_nob);
- rc = 0;
-
- } while (tx->tx_resid);
-
- ksocknal_connsock_decref(conn);
- return rc;
-}
-
-static int
-ksocknal_recv_iter(struct ksock_conn *conn)
-{
- int nob;
- int rc;
-
- /*
- * Never touch conn->ksnc_rx_to or change connection
- * status inside ksocknal_lib_recv
- */
- rc = ksocknal_lib_recv(conn);
-
- if (rc <= 0)
- return rc;
-
- /* received something... */
- nob = rc;
-
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_rx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- mb(); /* order with setting rx_started */
- conn->ksnc_rx_started = 1;
-
- conn->ksnc_rx_nob_left -= nob;
-
- iov_iter_advance(&conn->ksnc_rx_to, nob);
- if (iov_iter_count(&conn->ksnc_rx_to))
- return -EAGAIN;
-
- return 1;
-}
-
-static int
-ksocknal_receive(struct ksock_conn *conn)
-{
- /*
- * Return 1 on success, 0 on EOF, < 0 on error.
- * Caller checks ksnc_rx_to to determine
- * progress/completion.
- */
- int rc;
-
- if (ksocknal_data.ksnd_stall_rx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(ksocknal_data.ksnd_stall_rx * HZ);
- }
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- return -ESHUTDOWN;
- }
-
- for (;;) {
- rc = ksocknal_recv_iter(conn);
- if (rc <= 0) {
- /* error/EOF or partial receive */
- if (rc == -EAGAIN) {
- rc = 1;
- } else if (!rc && conn->ksnc_rx_started) {
- /* EOF in the middle of a message */
- rc = -EPROTO;
- }
- break;
- }
-
- /* Completed a fragment */
-
- if (!iov_iter_count(&conn->ksnc_rx_to)) {
- rc = 1;
- break;
- }
- }
-
- ksocknal_connsock_decref(conn);
- return rc;
-}
-
-void
-ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx)
-{
- struct lnet_msg *lnetmsg = tx->tx_lnetmsg;
- int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 0 : -EIO;
-
- LASSERT(ni || tx->tx_conn);
-
- if (tx->tx_conn)
- ksocknal_conn_decref(tx->tx_conn);
-
- if (!ni && tx->tx_conn)
- ni = tx->tx_conn->ksnc_peer->ksnp_ni;
-
- ksocknal_free_tx(tx);
- if (lnetmsg) /* KSOCK_MSG_NOOP goes without lnetmsg */
- lnet_finalize(ni, lnetmsg, rc);
-}
-
-void
-ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error)
-{
- struct ksock_tx *tx;
-
- while (!list_empty(txlist)) {
- tx = list_entry(txlist->next, struct ksock_tx, tx_list);
-
- if (error && tx->tx_lnetmsg) {
- CNETERR("Deleting packet type %d len %d %s->%s\n",
- le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type),
- le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length),
- libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
- libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
- } else if (error) {
- CNETERR("Deleting noop packet\n");
- }
-
- list_del(&tx->tx_list);
-
- LASSERT(atomic_read(&tx->tx_refcount) == 1);
- ksocknal_tx_done(ni, tx);
- }
-}
-
-static void
-ksocknal_check_zc_req(struct ksock_tx *tx)
-{
- struct ksock_conn *conn = tx->tx_conn;
- struct ksock_peer *peer = conn->ksnc_peer;
-
- /*
- * Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
- * to ksnp_zc_req_list if some fragment of this message should be sent
- * zero-copy. Our peer will send an ACK containing this cookie when
- * it has received this message, to tell us we can signal completion.
- * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
- * ksnp_zc_req_list.
- */
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_zc_capable);
-
- tx->tx_zc_checked = 1;
-
- if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
- !conn->ksnc_zc_capable)
- return;
-
- /*
- * assign cookie and queue tx to pending list, it will be released when
- * a matching ack is received. See ksocknal_handle_zcack()
- */
- ksocknal_tx_addref(tx);
-
- spin_lock(&peer->ksnp_lock);
-
- /* ZC_REQ is going to be pinned to the peer */
- tx->tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- LASSERT(!tx->tx_msg.ksm_zc_cookies[0]);
-
- tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
-
- if (!peer->ksnp_zc_next_cookie)
- peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
-
- list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
-
- spin_unlock(&peer->ksnp_lock);
-}
-
-static void
-ksocknal_uncheck_zc_req(struct ksock_tx *tx)
-{
- struct ksock_peer *peer = tx->tx_conn->ksnc_peer;
-
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_zc_capable);
-
- tx->tx_zc_checked = 0;
-
- spin_lock(&peer->ksnp_lock);
-
- if (!tx->tx_msg.ksm_zc_cookies[0]) {
- /* Not waiting for an ACK */
- spin_unlock(&peer->ksnp_lock);
- return;
- }
-
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- list_del(&tx->tx_zc_list);
-
- spin_unlock(&peer->ksnp_lock);
-
- ksocknal_tx_decref(tx);
-}
-
-static int
-ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- int rc;
-
- if (tx->tx_zc_capable && !tx->tx_zc_checked)
- ksocknal_check_zc_req(tx);
-
- rc = ksocknal_transmit(conn, tx);
-
- CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc);
-
- if (!tx->tx_resid) {
- /* Sent everything OK */
- LASSERT(!rc);
-
- return 0;
- }
-
- if (rc == -EAGAIN)
- return rc;
-
- if (rc == -ENOMEM) {
- static int counter;
-
- counter++; /* exponential backoff warnings */
- if ((counter & (-counter)) == counter)
- CWARN("%u ENOMEM tx %p\n", counter, conn);
-
- /* Queue on ksnd_enomem_conns for retry after a timeout */
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- /* enomem list takes over scheduler's ref... */
- LASSERT(conn->ksnc_tx_scheduled);
- list_add_tail(&conn->ksnc_tx_list,
- &ksocknal_data.ksnd_enomem_conns);
- if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY,
- ksocknal_data.ksnd_reaper_waketime))
- wake_up(&ksocknal_data.ksnd_reaper_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
- return rc;
- }
-
- /* Actual error */
- LASSERT(rc < 0);
-
- if (!conn->ksnc_closing) {
- switch (rc) {
- case -ECONNRESET:
- LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n",
- &conn->ksnc_ipaddr);
- break;
- default:
- LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n",
- &conn->ksnc_ipaddr, rc);
- break;
- }
- CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n",
- conn, rc,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- }
-
- if (tx->tx_zc_checked)
- ksocknal_uncheck_zc_req(tx);
-
- /* it's not an error if conn is being closed */
- ksocknal_close_conn_and_siblings(conn, (conn->ksnc_closing) ? 0 : rc);
-
- return rc;
-}
-
-static void
-ksocknal_launch_connection_locked(struct ksock_route *route)
-{
- /* called holding write lock on ksnd_global_lock */
-
- LASSERT(!route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
- LASSERT(ksocknal_route_mask() & ~route->ksnr_connected);
-
- route->ksnr_scheduled = 1; /* scheduling conn for connd */
- ksocknal_route_addref(route); /* extra ref for connd */
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
-
- list_add_tail(&route->ksnr_connd_list,
- &ksocknal_data.ksnd_connd_routes);
- wake_up(&ksocknal_data.ksnd_connd_waitq);
-
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-}
-
-void
-ksocknal_launch_all_connections_locked(struct ksock_peer *peer)
-{
- struct ksock_route *route;
-
- /* called holding write lock on ksnd_global_lock */
- for (;;) {
- /* launch any/all connections that need it */
- route = ksocknal_find_connectable_route_locked(peer);
- if (!route)
- return;
-
- ksocknal_launch_connection_locked(route);
- }
-}
-
-struct ksock_conn *
-ksocknal_find_conn_locked(struct ksock_peer *peer, struct ksock_tx *tx,
- int nonblk)
-{
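- /* choose the least-loaded matching conn; typed matches (MATCH_YES) are preferred over MATCH_MAY fallbacks */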
- struct list_head *tmp;
- struct ksock_conn *conn;
- struct ksock_conn *typed = NULL;
- struct ksock_conn *fallback = NULL;
- int tnob = 0;
- int fnob = 0;
-
- list_for_each(tmp, &peer->ksnp_conns) {
- struct ksock_conn *c;
- int nob, rc;
-
- c = list_entry(tmp, struct ksock_conn, ksnc_list);
- nob = atomic_read(&c->ksnc_tx_nob) +
- c->ksnc_sock->sk->sk_wmem_queued;
-
- LASSERT(!c->ksnc_closing);
- LASSERT(c->ksnc_proto &&
- c->ksnc_proto->pro_match_tx);
-
- rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
-
- switch (rc) {
- default:
- LBUG();
- case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
- continue;
-
- case SOCKNAL_MATCH_YES: /* typed connection */
- if (!typed || tnob > nob ||
- (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
- time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
- typed = c;
- tnob = nob;
- }
- break;
-
- case SOCKNAL_MATCH_MAY: /* fallback connection */
- if (!fallback || fnob > nob ||
- (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
- time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
- fallback = c;
- fnob = nob;
- }
- break;
- }
- }
-
- /* prefer the typed selection */
- conn = (typed) ? typed : fallback;
-
- if (conn)
- conn->ksnc_tx_last_post = jiffies;
-
- return conn;
-}
-
-void
-ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- conn->ksnc_proto->pro_pack(tx);
-
- atomic_add(tx->tx_nob, &conn->ksnc_tx_nob);
- ksocknal_conn_addref(conn); /* +1 ref for tx */
- tx->tx_conn = conn;
-}
-
-void
-ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn)
-{
- struct ksock_sched *sched = conn->ksnc_scheduler;
- struct ksock_msg *msg = &tx->tx_msg;
- struct ksock_tx *ztx = NULL;
- int bufnob = 0;
-
- /*
- * called holding global lock (read or irq-write) and caller may
- * not have dropped this lock between finding conn and calling me,
- * so we don't need the {get,put}connsock dance to deref
- * ksnc_sock...
- */
- LASSERT(!conn->ksnc_closing);
-
- CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr, conn->ksnc_port);
-
- ksocknal_tx_prep(conn, tx);
-
- /*
- * Ensure the frags we've been given EXACTLY match the number of
- * bytes we want to send. Many TCP/IP stacks disregard any total
- * size parameters passed to them and just look at the frags.
- *
- * We always expect at least 1 mapped fragment containing the
- * complete ksocknal message header.
- */
- LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) +
- lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
- (unsigned int)tx->tx_nob);
- LASSERT(tx->tx_niov >= 1);
- LASSERT(tx->tx_resid == tx->tx_nob);
-
- CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
- tx, (tx->tx_lnetmsg) ? tx->tx_lnetmsg->msg_hdr.type :
- KSOCK_MSG_NOOP,
- tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
-
- /*
- * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
- * but they're used inside spinlocks a lot.
- */
- bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
- spin_lock_bh(&sched->kss_lock);
-
- if (list_empty(&conn->ksnc_tx_queue) && !bufnob) {
- /* First packet starts the timeout */
- conn->ksnc_tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
- if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
- conn->ksnc_peer->ksnp_last_alive = jiffies;
- conn->ksnc_tx_bufnob = 0;
- mb(); /* order with adding to tx_queue */
- }
-
- if (msg->ksm_type == KSOCK_MSG_NOOP) {
- /*
- * The packet is noop ZC ACK, try to piggyback the ack_cookie
- * on a normal packet so I don't need to send it
- */
- LASSERT(msg->ksm_zc_cookies[1]);
- LASSERT(conn->ksnc_proto->pro_queue_tx_zcack);
-
- /* ZC ACK piggybacked on ztx; release tx later */
- if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
- ztx = tx;
- } else {
- /*
- * It's a normal packet - can it piggyback a noop zc-ack that
- * has been queued already?
- */
- LASSERT(!msg->ksm_zc_cookies[1]);
- LASSERT(conn->ksnc_proto->pro_queue_tx_msg);
-
- ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
- /* ztx will be released later */
- }
-
- if (ztx) {
- atomic_sub(ztx->tx_nob, &conn->ksnc_tx_nob);
- list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
- }
-
- if (conn->ksnc_tx_ready && /* able to send */
- !conn->ksnc_tx_scheduled) { /* not scheduled to send */
- /* +1 ref for scheduler */
- ksocknal_conn_addref(conn);
- list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-}
-
-struct ksock_route *
-ksocknal_find_connectable_route_locked(struct ksock_peer *peer)
-{
- unsigned long now = jiffies;
- struct list_head *tmp;
- struct ksock_route *route;
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- LASSERT(!route->ksnr_connecting || route->ksnr_scheduled);
-
- /* connections being established */
- if (route->ksnr_scheduled)
- continue;
-
- /* all route types connected ? */
- if (!(ksocknal_route_mask() & ~route->ksnr_connected))
- continue;
-
- if (!(!route->ksnr_retry_interval || /* first attempt */
- time_after_eq(now, route->ksnr_timeout))) {
- CDEBUG(D_NET,
- "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n",
- &route->ksnr_ipaddr,
- route->ksnr_connected,
- route->ksnr_retry_interval,
- (route->ksnr_timeout - now) / HZ);
- continue;
- }
-
- return route;
- }
-
- return NULL;
-}
-
-struct ksock_route *
-ksocknal_find_connecting_route_locked(struct ksock_peer *peer)
-{
- struct list_head *tmp;
- struct ksock_route *route;
-
- list_for_each(tmp, &peer->ksnp_routes) {
- route = list_entry(tmp, struct ksock_route, ksnr_list);
-
- LASSERT(!route->ksnr_connecting || route->ksnr_scheduled);
-
- if (route->ksnr_scheduled)
- return route;
- }
-
- return NULL;
-}
-
-int
-ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx,
- struct lnet_process_id id)
-{
- struct ksock_peer *peer;
- struct ksock_conn *conn;
- rwlock_t *g_lock;
- int retry;
- int rc;
-
- LASSERT(!tx->tx_conn);
-
- g_lock = &ksocknal_data.ksnd_global_lock;
-
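- /* look up the peer under the read lock first; if it doesn't exist, take the write lock, create it, and retry the lookup once */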
- for (retry = 0;; retry = 1) {
- read_lock(g_lock);
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer) {
- if (!ksocknal_find_connectable_route_locked(peer)) {
- conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
- if (conn) {
- /*
- * I've got no routes that need to be
- * connecting and I do have an actual
- * connection...
- */
- ksocknal_queue_tx_locked(tx, conn);
- read_unlock(g_lock);
- return 0;
- }
- }
- }
-
- /* I'll need a write lock... */
- read_unlock(g_lock);
-
- write_lock_bh(g_lock);
-
- peer = ksocknal_find_peer_locked(ni, id);
- if (peer)
- break;
-
- write_unlock_bh(g_lock);
-
- if (id.pid & LNET_PID_USERFLAG) {
- CERROR("Refusing to create a connection to userspace process %s\n",
- libcfs_id2str(id));
- return -EHOSTUNREACH;
- }
-
- if (retry) {
- CERROR("Can't find peer %s\n", libcfs_id2str(id));
- return -EHOSTUNREACH;
- }
-
- rc = ksocknal_add_peer(ni, id,
- LNET_NIDADDR(id.nid),
- lnet_acceptor_port());
- if (rc) {
- CERROR("Can't add peer %s: %d\n",
- libcfs_id2str(id), rc);
- return rc;
- }
- }
-
- ksocknal_launch_all_connections_locked(peer);
-
- conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
- if (conn) {
- /* Connection exists; queue message on it */
- ksocknal_queue_tx_locked(tx, conn);
- write_unlock_bh(g_lock);
- return 0;
- }
-
- if (peer->ksnp_accepting > 0 ||
- ksocknal_find_connecting_route_locked(peer)) {
- /* the message is going to be pinned to the peer */
- tx->tx_deadline =
- jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- /* Queue the message until a connection is established */
- list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue);
- write_unlock_bh(g_lock);
- return 0;
- }
-
- write_unlock_bh(g_lock);
-
- /* NB Routes may be ignored if connections to them failed recently */
- CNETERR("No usable routes to %s\n", libcfs_id2str(id));
- return -EHOSTUNREACH;
-}
-
-int
-ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
-{
- unsigned int mpflag = 0;
- int type = lntmsg->msg_type;
- struct lnet_process_id target = lntmsg->msg_target;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct kvec *payload_iov = lntmsg->msg_iov;
- struct bio_vec *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- struct ksock_tx *tx;
- int desc_size;
- int rc;
-
- /*
- * NB 'private' is different depending on what we're sending.
- * Just ignore it...
- */
- CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT(!payload_nob || payload_niov > 0);
- LASSERT(payload_niov <= LNET_MAX_IOV);
- /* payload is either all vaddrs or all pages */
- LASSERT(!(payload_kiov && payload_iov));
- LASSERT(!in_interrupt());
-
- if (payload_iov)
- desc_size = offsetof(struct ksock_tx,
- tx_frags.virt.iov[1 + payload_niov]);
- else
- desc_size = offsetof(struct ksock_tx,
- tx_frags.paged.kiov[payload_niov]);
-
- if (lntmsg->msg_vmflush)
- mpflag = memalloc_noreclaim_save();
- tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
- if (!tx) {
- CERROR("Can't allocate tx desc type %d size %d\n",
- type, desc_size);
- if (lntmsg->msg_vmflush)
- memalloc_noreclaim_restore(mpflag);
- return -ENOMEM;
- }
-
- tx->tx_conn = NULL; /* set when assigned a conn */
- tx->tx_lnetmsg = lntmsg;
-
- if (payload_iov) {
- tx->tx_kiov = NULL;
- tx->tx_nkiov = 0;
- tx->tx_iov = tx->tx_frags.virt.iov;
- tx->tx_niov = 1 +
- lnet_extract_iov(payload_niov, &tx->tx_iov[1],
- payload_niov, payload_iov,
- payload_offset, payload_nob);
- } else {
- tx->tx_niov = 1;
- tx->tx_iov = &tx->tx_frags.paged.iov;
- tx->tx_kiov = tx->tx_frags.paged.kiov;
- tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
-
- if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
- tx->tx_zc_capable = 1;
- }
-
- tx->tx_msg.ksm_csum = 0;
- tx->tx_msg.ksm_type = KSOCK_MSG_LNET;
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- tx->tx_msg.ksm_zc_cookies[1] = 0;
-
- /* The first fragment will be set later in pro_pack */
- rc = ksocknal_launch_packet(ni, tx, target);
- if (mpflag)
- memalloc_noreclaim_restore(mpflag);
-
- if (!rc)
- return 0;
-
- ksocknal_free_tx(tx);
- return -EIO;
-}
-
-int
-ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = kthread_run(fn, arg, "%s", name);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_data.ksnd_nthreads++;
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- return 0;
-}
-
-void
-ksocknal_thread_fini(void)
-{
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- ksocknal_data.ksnd_nthreads--;
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-}
-
-int
-ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip)
-{
- static char ksocknal_slop_buffer[4096];
- struct kvec *kvec = conn->ksnc_rx_iov_space;
-
- int nob;
- unsigned int niov;
- int skipped;
-
- LASSERT(conn->ksnc_proto);
-
- if (*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) {
- /* Remind the socket to ack eagerly... */
- ksocknal_lib_eager_ack(conn);
- }
-
- if (!nob_to_skip) { /* right at next packet boundary now */
- conn->ksnc_rx_started = 0;
- mb(); /* racing with timeout thread */
-
- switch (conn->ksnc_proto->pro_version) {
- case KSOCK_PROTO_V2:
- case KSOCK_PROTO_V3:
- conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
- kvec->iov_base = &conn->ksnc_msg;
- kvec->iov_len = offsetof(struct ksock_msg, ksm_u);
- conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u);
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, offsetof(struct ksock_msg, ksm_u));
- break;
-
- case KSOCK_PROTO_V1:
- /* Receiving bare struct lnet_hdr */
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
- kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
- kvec->iov_len = sizeof(struct lnet_hdr);
- conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr);
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, sizeof(struct lnet_hdr));
- break;
-
- default:
- LBUG();
- }
- conn->ksnc_rx_csum = ~0;
- return 1;
- }
-
- /*
- * Set up to skip as much as possible now. If there's more left
- * (ran out of iov entries) we'll get called again
- */
- conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
- conn->ksnc_rx_nob_left = nob_to_skip;
- skipped = 0;
- niov = 0;
-
- do {
- nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer));
-
- kvec[niov].iov_base = ksocknal_slop_buffer;
- kvec[niov].iov_len = nob;
- niov++;
- skipped += nob;
- nob_to_skip -= nob;
-
- } while (nob_to_skip && /* mustn't overflow conn's rx iov */
- niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct iovec));
-
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, niov, skipped);
- return 0;
-}
-
-static int
-ksocknal_process_receive(struct ksock_conn *conn)
-{
- struct kvec *kvec = conn->ksnc_rx_iov_space;
- struct lnet_hdr *lhdr;
- struct lnet_process_id *id;
- int rc;
-
- LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0);
-
- /* NB: sched lock NOT held */
- /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
- LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
- conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
- again:
- if (iov_iter_count(&conn->ksnc_rx_to)) {
- rc = ksocknal_receive(conn);
-
- if (rc <= 0) {
- LASSERT(rc != -EAGAIN);
-
- if (!rc)
- CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n",
- conn,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- else if (!conn->ksnc_closing)
- CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n",
- conn, rc,
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
-
- /* it's not an error if conn is being closed */
- ksocknal_close_conn_and_siblings(conn,
- (conn->ksnc_closing) ? 0 : rc);
- return (!rc ? -ESHUTDOWN : rc);
- }
-
- if (iov_iter_count(&conn->ksnc_rx_to)) {
- /* short read */
- return -EAGAIN;
- }
- }
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_KSM_HEADER:
- if (conn->ksnc_flip) {
- __swab32s(&conn->ksnc_msg.ksm_type);
- __swab32s(&conn->ksnc_msg.ksm_csum);
- __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
- __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
- }
-
- if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
- conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
- CERROR("%s: Unknown message type: %x\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_type);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return -EPROTO;
- }
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
- conn->ksnc_msg.ksm_csum && /* has checksum */
- conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
- /* NOOP Checksum error */
- CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return -EIO;
- }
-
- if (conn->ksnc_msg.ksm_zc_cookies[1]) {
- __u64 cookie = 0;
-
- LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
- cookie = conn->ksnc_msg.ksm_zc_cookies[0];
-
- rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
- conn->ksnc_msg.ksm_zc_cookies[1]);
-
- if (rc) {
- CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, -EPROTO);
- return rc;
- }
- }
-
- if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
- ksocknal_new_packet(conn, 0);
-			return 0; /* NOOP processed; nothing more to do */
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
- conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg);
-
- kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg;
- kvec->iov_len = sizeof(struct ksock_lnet_msg);
-
- iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec,
- 1, sizeof(struct ksock_lnet_msg));
-
- goto again; /* read lnet header now */
-
- case SOCKNAL_RX_LNET_HEADER:
- /* unpack message header */
- conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
-
- if (conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) {
- /* Userspace peer */
- lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
- id = &conn->ksnc_peer->ksnp_id;
-
- /* Substitute process ID assigned at connection time */
- lhdr->src_pid = cpu_to_le32(id->pid);
- lhdr->src_nid = cpu_to_le64(id->nid);
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
- ksocknal_conn_addref(conn); /* ++ref while parsing */
-
- rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
- &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
- conn->ksnc_peer->ksnp_id.nid, conn, 0);
- if (rc < 0) {
- /* I just received garbage: give up on this conn */
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, rc);
- ksocknal_conn_decref(conn);
- return -EPROTO;
- }
-
- /* I'm racing with ksocknal_recv() */
- LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
- conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
-
- if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
- return 0;
-
- /* ksocknal_recv() got called */
- goto again;
-
- case SOCKNAL_RX_LNET_PAYLOAD:
- /* payload all received */
- rc = 0;
-
- if (!conn->ksnc_rx_nob_left && /* not truncating */
- conn->ksnc_msg.ksm_csum && /* has checksum */
- conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
- CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id),
- conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
- rc = -EIO;
- }
-
- if (!rc && conn->ksnc_msg.ksm_zc_cookies[0]) {
- LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
-
- lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
- id = &conn->ksnc_peer->ksnp_id;
-
- rc = conn->ksnc_proto->pro_handle_zcreq(conn,
- conn->ksnc_msg.ksm_zc_cookies[0],
- *ksocknal_tunables.ksnd_nonblk_zcack ||
- le64_to_cpu(lhdr->src_nid) != id->nid);
- }
-
- lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
-
- if (rc) {
- ksocknal_new_packet(conn, 0);
- ksocknal_close_conn_and_siblings(conn, rc);
- return -EPROTO;
- }
- /* Fall through */
-
- case SOCKNAL_RX_SLOP:
- /* starting new packet? */
- if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left))
- return 0; /* come back later */
- goto again; /* try to finish reading slop now */
-
- default:
- break;
- }
-
- /* Not Reached */
- LBUG();
- return -EINVAL; /* keep gcc happy */
-}
-
-int
-ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg,
- int delayed, struct iov_iter *to, unsigned int rlen)
-{
- struct ksock_conn *conn = private;
- struct ksock_sched *sched = conn->ksnc_scheduler;
-
- LASSERT(iov_iter_count(to) <= rlen);
- LASSERT(to->nr_segs <= LNET_MAX_IOV);
-
- conn->ksnc_cookie = msg;
- conn->ksnc_rx_nob_left = rlen;
-
- conn->ksnc_rx_to = *to;
-
- LASSERT(conn->ksnc_rx_scheduled);
-
- spin_lock_bh(&sched->kss_lock);
-
- switch (conn->ksnc_rx_state) {
- case SOCKNAL_RX_PARSE_WAIT:
- list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
- wake_up(&sched->kss_waitq);
- LASSERT(conn->ksnc_rx_ready);
- break;
-
- case SOCKNAL_RX_PARSE:
- /* scheduler hasn't noticed I'm parsing yet */
- break;
- }
-
- conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
-
- spin_unlock_bh(&sched->kss_lock);
- ksocknal_conn_decref(conn);
- return 0;
-}
-
-static inline int
-ksocknal_sched_cansleep(struct ksock_sched *sched)
-{
- int rc;
-
- spin_lock_bh(&sched->kss_lock);
-
- rc = !ksocknal_data.ksnd_shuttingdown &&
- list_empty(&sched->kss_rx_conns) &&
- list_empty(&sched->kss_tx_conns);
-
- spin_unlock_bh(&sched->kss_lock);
- return rc;
-}
-
-int ksocknal_scheduler(void *arg)
-{
- struct ksock_sched_info *info;
- struct ksock_sched *sched;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
- int rc;
- int nloops = 0;
- long id = (long)arg;
-
- info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
- sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
-
- rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
- if (rc) {
- CWARN("Can't set CPU partition affinity to %d: %d\n",
- info->ksi_cpt, rc);
- }
-
- spin_lock_bh(&sched->kss_lock);
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- int did_something = 0;
-
- /* Ensure I progress everything semi-fairly */
-
- if (!list_empty(&sched->kss_rx_conns)) {
- conn = list_entry(sched->kss_rx_conns.next,
- struct ksock_conn, ksnc_rx_list);
- list_del(&conn->ksnc_rx_list);
-
- LASSERT(conn->ksnc_rx_scheduled);
- LASSERT(conn->ksnc_rx_ready);
-
- /*
- * clear rx_ready in case receive isn't complete.
-			 * Do it BEFORE we call process_receive, since
- * data_ready can set it any time after we release
- * kss_lock.
- */
- conn->ksnc_rx_ready = 0;
- spin_unlock_bh(&sched->kss_lock);
-
- rc = ksocknal_process_receive(conn);
-
- spin_lock_bh(&sched->kss_lock);
-
- /* I'm the only one that can clear this flag */
- LASSERT(conn->ksnc_rx_scheduled);
-
- /* Did process_receive get everything it wanted? */
- if (!rc)
- conn->ksnc_rx_ready = 1;
-
- if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
- /*
-				 * Conn blocked waiting for ksocknal_recv();
-				 * I change its state (under lock) to signal
-				 * it can be rescheduled.
- */
- conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
- } else if (conn->ksnc_rx_ready) {
- /* reschedule for rx */
- list_add_tail(&conn->ksnc_rx_list,
- &sched->kss_rx_conns);
- } else {
- conn->ksnc_rx_scheduled = 0;
- /* drop my ref */
- ksocknal_conn_decref(conn);
- }
-
- did_something = 1;
- }
-
- if (!list_empty(&sched->kss_tx_conns)) {
- LIST_HEAD(zlist);
-
- if (!list_empty(&sched->kss_zombie_noop_txs)) {
- list_add(&zlist, &sched->kss_zombie_noop_txs);
- list_del_init(&sched->kss_zombie_noop_txs);
- }
-
- conn = list_entry(sched->kss_tx_conns.next,
- struct ksock_conn, ksnc_tx_list);
- list_del(&conn->ksnc_tx_list);
-
- LASSERT(conn->ksnc_tx_scheduled);
- LASSERT(conn->ksnc_tx_ready);
- LASSERT(!list_empty(&conn->ksnc_tx_queue));
-
- tx = list_entry(conn->ksnc_tx_queue.next,
- struct ksock_tx, tx_list);
-
- if (conn->ksnc_tx_carrier == tx)
- ksocknal_next_tx_carrier(conn);
-
- /* dequeue now so empty list => more to send */
- list_del(&tx->tx_list);
-
- /*
- * Clear tx_ready in case send isn't complete. Do
- * it BEFORE we call process_transmit, since
- * write_space can set it any time after we release
- * kss_lock.
- */
- conn->ksnc_tx_ready = 0;
- spin_unlock_bh(&sched->kss_lock);
-
- if (!list_empty(&zlist)) {
- /*
-				 * free zombie noop txs; it's fast because
-				 * noop txs are just put back on the freelist
- */
- ksocknal_txlist_done(NULL, &zlist, 0);
- }
-
- rc = ksocknal_process_transmit(conn, tx);
-
- if (rc == -ENOMEM || rc == -EAGAIN) {
- /*
- * Incomplete send: replace tx on HEAD of
- * tx_queue
- */
- spin_lock_bh(&sched->kss_lock);
- list_add(&tx->tx_list, &conn->ksnc_tx_queue);
- } else {
- /* Complete send; tx -ref */
- ksocknal_tx_decref(tx);
-
- spin_lock_bh(&sched->kss_lock);
- /* assume space for more */
- conn->ksnc_tx_ready = 1;
- }
-
- if (rc == -ENOMEM) {
- /*
- * Do nothing; after a short timeout, this
- * conn will be reposted on kss_tx_conns.
- */
- } else if (conn->ksnc_tx_ready &&
- !list_empty(&conn->ksnc_tx_queue)) {
- /* reschedule for tx */
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- } else {
- conn->ksnc_tx_scheduled = 0;
- /* drop my ref */
- ksocknal_conn_decref(conn);
- }
-
- did_something = 1;
- }
- if (!did_something || /* nothing to do */
- ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
- spin_unlock_bh(&sched->kss_lock);
-
- nloops = 0;
-
- if (!did_something) { /* wait for something to do */
- rc = wait_event_interruptible_exclusive(
- sched->kss_waitq,
- !ksocknal_sched_cansleep(sched));
- LASSERT(!rc);
- } else {
- cond_resched();
- }
-
- spin_lock_bh(&sched->kss_lock);
- }
- }
-
- spin_unlock_bh(&sched->kss_lock);
- ksocknal_thread_fini();
- return 0;
-}
-
-/*
- * Add connection to kss_rx_conns of scheduler
- * and wake up the scheduler.
- */
-void ksocknal_read_callback(struct ksock_conn *conn)
-{
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- conn->ksnc_rx_ready = 1;
-
- if (!conn->ksnc_rx_scheduled) { /* not being progressed */
- list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
- conn->ksnc_rx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
- spin_unlock_bh(&sched->kss_lock);
-}
-
-/*
- * Add connection to kss_tx_conns of scheduler
- * and wake up the scheduler.
- */
-void ksocknal_write_callback(struct ksock_conn *conn)
-{
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- conn->ksnc_tx_ready = 1;
-
- if (!conn->ksnc_tx_scheduled && /* not being progressed */
- !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
- list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
- conn->ksnc_tx_scheduled = 1;
- /* extra ref for scheduler */
- ksocknal_conn_addref(conn);
-
- wake_up(&sched->kss_waitq);
- }
-
- spin_unlock_bh(&sched->kss_lock);
-}
-
-static struct ksock_proto *
-ksocknal_parse_proto_version(struct ksock_hello_msg *hello)
-{
- __u32 version = 0;
-
- if (hello->kshm_magic == LNET_PROTO_MAGIC)
- version = hello->kshm_version;
- else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
- version = __swab32(hello->kshm_version);
-
- if (version) {
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 1)
- return NULL;
-
- if (*ksocknal_tunables.ksnd_protocol == 2 &&
- version == KSOCK_PROTO_V3)
- return NULL;
-#endif
- if (version == KSOCK_PROTO_V2)
- return &ksocknal_protocol_v2x;
-
- if (version == KSOCK_PROTO_V3)
- return &ksocknal_protocol_v3x;
-
- return NULL;
- }
-
- if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
- struct lnet_magicversion *hmv = (struct lnet_magicversion *)hello;
-
- BUILD_BUG_ON(sizeof(struct lnet_magicversion) !=
- offsetof(struct ksock_hello_msg, kshm_src_nid));
-
- if (hmv->version_major == cpu_to_le16(KSOCK_PROTO_V1_MAJOR) &&
- hmv->version_minor == cpu_to_le16(KSOCK_PROTO_V1_MINOR))
- return &ksocknal_protocol_v1x;
- }
-
- return NULL;
-}
-
-int
-ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- lnet_nid_t peer_nid, struct ksock_hello_msg *hello)
-{
- /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
- struct ksock_net *net = (struct ksock_net *)ni->ni_data;
-
- LASSERT(hello->kshm_nips <= LNET_MAX_INTERFACES);
-
- /* rely on caller to hold a ref on socket so it wouldn't disappear */
- LASSERT(conn->ksnc_proto);
-
- hello->kshm_src_nid = ni->ni_nid;
- hello->kshm_dst_nid = peer_nid;
- hello->kshm_src_pid = the_lnet.ln_pid;
-
- hello->kshm_src_incarnation = net->ksnn_incarnation;
- hello->kshm_ctype = conn->ksnc_type;
-
- return conn->ksnc_proto->pro_send_hello(conn, hello);
-}
-
-static int
-ksocknal_invert_type(int type)
-{
- switch (type) {
- case SOCKLND_CONN_ANY:
- case SOCKLND_CONN_CONTROL:
- return type;
- case SOCKLND_CONN_BULK_IN:
- return SOCKLND_CONN_BULK_OUT;
- case SOCKLND_CONN_BULK_OUT:
- return SOCKLND_CONN_BULK_IN;
- default:
- return SOCKLND_CONN_NONE;
- }
-}
-
-int
-ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn,
- struct ksock_hello_msg *hello,
- struct lnet_process_id *peerid,
- __u64 *incarnation)
-{
- /* Return < 0 fatal error
- * 0 success
- * EALREADY lost connection race
- * EPROTO protocol version mismatch
- */
- struct socket *sock = conn->ksnc_sock;
- int active = !!conn->ksnc_proto;
- int timeout;
- int proto_match;
- int rc;
- struct ksock_proto *proto;
- struct lnet_process_id recv_id;
-
- /* socket type set on active connections - not set on passive */
-	LASSERT(!active == (conn->ksnc_type == SOCKLND_CONN_NONE));
-
- timeout = active ? *ksocknal_tunables.ksnd_timeout :
- lnet_acceptor_timeout();
-
- rc = lnet_sock_read(sock, &hello->kshm_magic,
- sizeof(hello->kshm_magic), timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- if (hello->kshm_magic != LNET_PROTO_MAGIC &&
- hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
- hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
- /* Unexpected magic! */
- CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n",
- __cpu_to_le32(hello->kshm_magic),
- LNET_PROTO_TCP_MAGIC,
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- rc = lnet_sock_read(sock, &hello->kshm_version,
- sizeof(hello->kshm_version), timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- proto = ksocknal_parse_proto_version(hello);
- if (!proto) {
- if (!active) {
- /* unknown protocol from peer, tell peer my protocol */
- conn->ksnc_proto = &ksocknal_protocol_v3x;
-#if SOCKNAL_VERSION_DEBUG
- if (*ksocknal_tunables.ksnd_protocol == 2)
- conn->ksnc_proto = &ksocknal_protocol_v2x;
- else if (*ksocknal_tunables.ksnd_protocol == 1)
- conn->ksnc_proto = &ksocknal_protocol_v1x;
-#endif
- hello->kshm_nips = 0;
- ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
- }
-
- CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
- conn->ksnc_proto->pro_version,
- &conn->ksnc_ipaddr);
-
- return -EPROTO;
- }
-
- proto_match = (conn->ksnc_proto == proto);
- conn->ksnc_proto = proto;
-
- /* receive the rest of hello message anyway */
- rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
- if (rc) {
-		CERROR("Error %d reading or checking hello from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0);
- return rc;
- }
-
- *incarnation = hello->kshm_src_incarnation;
-
- if (hello->kshm_src_nid == LNET_NID_ANY) {
- CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n",
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- if (!active &&
- conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
- /* Userspace NAL assigns peer process ID from socket */
- recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
- recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
- conn->ksnc_ipaddr);
- } else {
- recv_id.nid = hello->kshm_src_nid;
- recv_id.pid = hello->kshm_src_pid;
- }
-
- if (!active) {
- *peerid = recv_id;
-
- /* peer determines type */
- conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
- if (conn->ksnc_type == SOCKLND_CONN_NONE) {
- CERROR("Unexpected type %d from %s ip %pI4h\n",
- hello->kshm_ctype, libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- return 0;
- }
-
- if (peerid->pid != recv_id.pid ||
- peerid->nid != recv_id.nid) {
- LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n",
- libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr,
- libcfs_id2str(recv_id));
- return -EPROTO;
- }
-
- if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
- /* Possible protocol mismatch or I lost the connection race */
- return proto_match ? EALREADY : EPROTO;
- }
-
- if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
- CERROR("Mismatched types: me %d, %s ip %pI4h %d\n",
- conn->ksnc_type, libcfs_id2str(*peerid),
- &conn->ksnc_ipaddr, hello->kshm_ctype);
- return -EPROTO;
- }
-
- return 0;
-}
-
-static int
-ksocknal_connect(struct ksock_route *route)
-{
- LIST_HEAD(zombies);
- struct ksock_peer *peer = route->ksnr_peer;
- int type;
- int wanted;
- struct socket *sock;
- unsigned long deadline;
- int retry_later = 0;
- int rc = 0;
-
- deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ;
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- LASSERT(route->ksnr_scheduled);
- LASSERT(!route->ksnr_connecting);
-
- route->ksnr_connecting = 1;
-
- for (;;) {
- wanted = ksocknal_route_mask() & ~route->ksnr_connected;
-
- /*
- * stop connecting if peer/route got closed under me, or
- * route got connected while queued
- */
- if (peer->ksnp_closing || route->ksnr_deleted ||
- !wanted) {
- retry_later = 0;
- break;
- }
-
- /* reschedule if peer is connecting to me */
- if (peer->ksnp_accepting > 0) {
- CDEBUG(D_NET,
- "peer %s(%d) already connecting to me, retry later.\n",
- libcfs_nid2str(peer->ksnp_id.nid),
- peer->ksnp_accepting);
- retry_later = 1;
- }
-
- if (retry_later) /* needs reschedule */
- break;
-
- if (wanted & BIT(SOCKLND_CONN_ANY)) {
- type = SOCKLND_CONN_ANY;
- } else if (wanted & BIT(SOCKLND_CONN_CONTROL)) {
- type = SOCKLND_CONN_CONTROL;
- } else if (wanted & BIT(SOCKLND_CONN_BULK_IN)) {
- type = SOCKLND_CONN_BULK_IN;
- } else {
- LASSERT(wanted & BIT(SOCKLND_CONN_BULK_OUT));
- type = SOCKLND_CONN_BULK_OUT;
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- if (time_after_eq(jiffies, deadline)) {
- rc = -ETIMEDOUT;
- lnet_connect_console_error(rc, peer->ksnp_id.nid,
- route->ksnr_ipaddr,
- route->ksnr_port);
- goto failed;
- }
-
- rc = lnet_connect(&sock, peer->ksnp_id.nid,
- route->ksnr_myipaddr,
- route->ksnr_ipaddr, route->ksnr_port);
- if (rc)
- goto failed;
-
- rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
- if (rc < 0) {
- lnet_connect_console_error(rc, peer->ksnp_id.nid,
- route->ksnr_ipaddr,
- route->ksnr_port);
- goto failed;
- }
-
- /*
- * A +ve RC means I have to retry because I lost the connection
- * race or I have to renegotiate protocol version
- */
- retry_later = (rc);
- if (retry_later)
- CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
- libcfs_nid2str(peer->ksnp_id.nid));
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
- }
-
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
-
- if (retry_later) {
- /*
- * re-queue for attention; this frees me up to handle
- * the peer's incoming connection request
- */
- if (rc == EALREADY ||
- (!rc && peer->ksnp_accepting > 0)) {
- /*
- * We want to introduce a delay before next
-			 * attempt to connect if we lost the conn race,
-			 * but the race is usually resolved quickly,
-			 * so min_reconnectms should be a good heuristic
- */
- route->ksnr_retry_interval =
- *ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000;
- route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
- }
-
- ksocknal_launch_connection_locked(route);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
- return retry_later;
-
- failed:
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- route->ksnr_scheduled = 0;
- route->ksnr_connecting = 0;
-
- /* This is a retry rather than a new connection */
- route->ksnr_retry_interval *= 2;
- route->ksnr_retry_interval =
- max(route->ksnr_retry_interval,
- (long)*ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000);
- route->ksnr_retry_interval =
- min(route->ksnr_retry_interval,
- (long)*ksocknal_tunables.ksnd_max_reconnectms * HZ / 1000);
-
- LASSERT(route->ksnr_retry_interval);
- route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
-
- if (!list_empty(&peer->ksnp_tx_queue) &&
- !peer->ksnp_accepting &&
- !ksocknal_find_connecting_route_locked(peer)) {
- struct ksock_conn *conn;
-
- /*
- * ksnp_tx_queue is queued on a conn on successful
- * connection for V1.x and V2.x
- */
- if (!list_empty(&peer->ksnp_conns)) {
- conn = list_entry(peer->ksnp_conns.next,
- struct ksock_conn, ksnc_list);
- LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
- }
-
- /*
- * take all the blocked packets while I've got the lock and
- * complete below...
- */
- list_splice_init(&peer->ksnp_tx_queue, &zombies);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_peer_failed(peer);
- ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
- return 0;
-}
-
-/*
- * Check whether we need to create more connds.
- * It will try to create a new thread if necessary; @timeout can be
- * updated if thread creation fails, so the caller doesn't keep retrying
- * while running out of resources.
- */
-static int
-ksocknal_connd_check_start(time64_t sec, long *timeout)
-{
- char name[16];
- int rc;
- int total = ksocknal_data.ksnd_connd_starting +
- ksocknal_data.ksnd_connd_running;
-
- if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
-		/* still initializing */
- return 0;
- }
-
- if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
- total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
- /*
-		 * can't create more connds, or there are still enough
-		 * threads to handle more connection attempts
- */
- return 0;
- }
-
- if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
- /* no pending connecting request */
- return 0;
- }
-
- if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
- /* may run out of resource, retry later */
- *timeout = HZ;
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_starting > 0) {
- /* serialize starting to avoid flood */
- return 0;
- }
-
- ksocknal_data.ksnd_connd_starting_stamp = sec;
- ksocknal_data.ksnd_connd_starting++;
- spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-
- /* NB: total is the next id */
- snprintf(name, sizeof(name), "socknal_cd%02d", total);
- rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
-
- spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
- if (!rc)
- return 1;
-
- /* we tried ... */
- LASSERT(ksocknal_data.ksnd_connd_starting > 0);
- ksocknal_data.ksnd_connd_starting--;
- ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds();
-
- return 1;
-}
-
-/*
- * Check whether the current thread can exit; it will return 1 if there are
- * too many threads and none has been created in the past 120 seconds.
- * Also, this function may update @timeout to make the caller come back
- * again to recheck these conditions.
- */
-static int
-ksocknal_connd_check_stop(time64_t sec, long *timeout)
-{
- int val;
-
- if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
-		/* still initializing */
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_starting > 0) {
- /* in progress of starting new thread */
- return 0;
- }
-
- if (ksocknal_data.ksnd_connd_running <=
- *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
- return 0;
- }
-
-	/* was a thread created in the past 120 seconds? */
- val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
- SOCKNAL_CONND_TIMEOUT - sec);
-
- *timeout = (val > 0) ? val * HZ :
- SOCKNAL_CONND_TIMEOUT * HZ;
- if (val > 0)
- return 0;
-
-	/* no thread created in the past 120 seconds */
-
- return ksocknal_data.ksnd_connd_running >
- ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
-}
-
-/*
- * Go through connd_routes queue looking for a route that we can process
- * right now; @timeout_p can be updated if we need to come back later
- */
-static struct ksock_route *
-ksocknal_connd_get_route_locked(signed long *timeout_p)
-{
- struct ksock_route *route;
- unsigned long now;
-
- now = jiffies;
-
- /* connd_routes can contain both pending and ordinary routes */
- list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes,
- ksnr_connd_list) {
- if (!route->ksnr_retry_interval ||
- time_after_eq(now, route->ksnr_timeout))
- return route;
-
- if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
- (int)*timeout_p > (int)(route->ksnr_timeout - now))
- *timeout_p = (int)(route->ksnr_timeout - now);
- }
-
- return NULL;
-}
-
-int
-ksocknal_connd(void *arg)
-{
- spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
- struct ksock_connreq *cr;
- wait_queue_entry_t wait;
- int nloops = 0;
- int cons_retry = 0;
-
- init_waitqueue_entry(&wait, current);
-
- spin_lock_bh(connd_lock);
-
- LASSERT(ksocknal_data.ksnd_connd_starting > 0);
- ksocknal_data.ksnd_connd_starting--;
- ksocknal_data.ksnd_connd_running++;
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- struct ksock_route *route = NULL;
- time64_t sec = ktime_get_real_seconds();
- long timeout = MAX_SCHEDULE_TIMEOUT;
- int dropped_lock = 0;
-
- if (ksocknal_connd_check_stop(sec, &timeout)) {
- /* wakeup another one to check stop */
- wake_up(&ksocknal_data.ksnd_connd_waitq);
- break;
- }
-
- if (ksocknal_connd_check_start(sec, &timeout)) {
- /* created new thread */
- dropped_lock = 1;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
- /* Connection accepted by the listener */
- cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
- struct ksock_connreq, ksncr_list);
-
- list_del(&cr->ksncr_list);
- spin_unlock_bh(connd_lock);
- dropped_lock = 1;
-
- ksocknal_create_conn(cr->ksncr_ni, NULL,
- cr->ksncr_sock, SOCKLND_CONN_NONE);
- lnet_ni_decref(cr->ksncr_ni);
- kfree(cr);
-
- spin_lock_bh(connd_lock);
- }
-
- /*
- * Only handle an outgoing connection request if there
- * is a thread left to handle incoming connections and
-		 * to create new connds
- */
- if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
- ksocknal_data.ksnd_connd_running) {
- route = ksocknal_connd_get_route_locked(&timeout);
- }
- if (route) {
- list_del(&route->ksnr_connd_list);
- ksocknal_data.ksnd_connd_connecting++;
- spin_unlock_bh(connd_lock);
- dropped_lock = 1;
-
- if (ksocknal_connect(route)) {
- /* consecutive retry */
- if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
- CWARN("massive consecutive re-connecting to %pI4h\n",
- &route->ksnr_ipaddr);
- cons_retry = 0;
- }
- } else {
- cons_retry = 0;
- }
-
- ksocknal_route_decref(route);
-
- spin_lock_bh(connd_lock);
- ksocknal_data.ksnd_connd_connecting--;
- }
-
- if (dropped_lock) {
- if (++nloops < SOCKNAL_RESCHED)
- continue;
- spin_unlock_bh(connd_lock);
- nloops = 0;
- cond_resched();
- spin_lock_bh(connd_lock);
- continue;
- }
-
- /* Nothing to do for 'timeout' */
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq,
- &wait);
- spin_unlock_bh(connd_lock);
-
- nloops = 0;
- schedule_timeout(timeout);
-
- remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
- spin_lock_bh(connd_lock);
- }
- ksocknal_data.ksnd_connd_running--;
- spin_unlock_bh(connd_lock);
-
- ksocknal_thread_fini();
- return 0;
-}
-
-static struct ksock_conn *
-ksocknal_find_timed_out_conn(struct ksock_peer *peer)
-{
- /* We're called with a shared lock on ksnd_global_lock */
- struct ksock_conn *conn;
- struct list_head *ctmp;
-
- list_for_each(ctmp, &peer->ksnp_conns) {
- int error;
-
- conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
- /* Don't need the {get,put}connsock dance to deref ksnc_sock */
- LASSERT(!conn->ksnc_closing);
-
- /*
-		 * SOCK_ERROR will reset the error code of the socket on
-		 * some platforms (like Darwin 8.x)
- */
- error = conn->ksnc_sock->sk->sk_err;
- if (error) {
- ksocknal_conn_addref(conn);
-
- switch (error) {
- case ECONNRESET:
- CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- case ETIMEDOUT:
- CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- default:
-				CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d)\n",
- error,
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- break;
- }
-
- return conn;
- }
-
- if (conn->ksnc_rx_started &&
- time_after_eq(jiffies,
- conn->ksnc_rx_deadline)) {
- /* Timed out incomplete incoming message */
- ksocknal_conn_addref(conn);
- CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %zd left %d\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port,
- conn->ksnc_rx_state,
- iov_iter_count(&conn->ksnc_rx_to),
- conn->ksnc_rx_nob_left);
- return conn;
- }
-
- if ((!list_empty(&conn->ksnc_tx_queue) ||
- conn->ksnc_sock->sk->sk_wmem_queued) &&
- time_after_eq(jiffies,
- conn->ksnc_tx_deadline)) {
- /*
- * Timed out messages queued for sending or
- * buffered in the socket's send buffer
- */
- ksocknal_conn_addref(conn);
-			CNETERR("Timeout sending data to %s (%pI4h:%d); the network or that node may be down.\n",
- libcfs_id2str(peer->ksnp_id),
- &conn->ksnc_ipaddr,
- conn->ksnc_port);
- return conn;
- }
- }
-
- return NULL;
-}
-
-static inline void
-ksocknal_flush_stale_txs(struct ksock_peer *peer)
-{
- struct ksock_tx *tx;
- struct ksock_tx *tmp;
- LIST_HEAD(stale_txs);
-
- write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_tx_queue, tx_list) {
- if (!time_after_eq(jiffies,
- tx->tx_deadline))
- break;
-
- list_del(&tx->tx_list);
- list_add_tail(&tx->tx_list, &stale_txs);
- }
-
- write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
-}
-
-static int
-ksocknal_send_keepalive_locked(struct ksock_peer *peer)
- __must_hold(&ksocknal_data.ksnd_global_lock)
-{
- struct ksock_sched *sched;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
-
- /* last_alive will be updated by create_conn */
- if (list_empty(&peer->ksnp_conns))
- return 0;
-
- if (peer->ksnp_proto != &ksocknal_protocol_v3x)
- return 0;
-
- if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
- time_before(jiffies,
- peer->ksnp_last_alive + *ksocknal_tunables.ksnd_keepalive * HZ))
- return 0;
-
- if (time_before(jiffies, peer->ksnp_send_keepalive))
- return 0;
-
- /*
-	 * retry 10 secs later, so we don't put pressure
-	 * on this peer if we fail to send a keepalive this time
- */
- peer->ksnp_send_keepalive = jiffies + 10 * HZ;
-
- conn = ksocknal_find_conn_locked(peer, NULL, 1);
- if (conn) {
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
- if (!list_empty(&conn->ksnc_tx_queue)) {
- spin_unlock_bh(&sched->kss_lock);
-			/* there is a queued ACK; no keepalive needed */
- return 0;
- }
-
- spin_unlock_bh(&sched->kss_lock);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- /* cookie = 1 is reserved for keepalive PING */
- tx = ksocknal_alloc_tx_noop(1, 1);
- if (!tx) {
- read_lock(&ksocknal_data.ksnd_global_lock);
- return -ENOMEM;
- }
-
- if (!ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) {
- read_lock(&ksocknal_data.ksnd_global_lock);
- return 1;
- }
-
- ksocknal_free_tx(tx);
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- return -EIO;
-}
-
-static void
-ksocknal_check_peer_timeouts(int idx)
-{
- struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
- struct ksock_peer *peer;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
-
- again:
- /*
- * NB. We expect to have a look at all the peers and not find any
- * connections to time out, so we just use a shared lock while we
- * take a look...
- */
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- list_for_each_entry(peer, peers, ksnp_list) {
- unsigned long deadline = 0;
- struct ksock_tx *tx_stale;
- int resid = 0;
- int n = 0;
-
- if (ksocknal_send_keepalive_locked(peer)) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
- goto again;
- }
-
- conn = ksocknal_find_timed_out_conn(peer);
-
- if (conn) {
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
-
- /*
- * NB we won't find this one again, but we can't
- * just proceed with the next peer, since we dropped
- * ksnd_global_lock and it might be dead already!
- */
- ksocknal_conn_decref(conn);
- goto again;
- }
-
- /*
- * we can't process stale txs right here because we're
-		 * holding only the shared lock
- */
- if (!list_empty(&peer->ksnp_tx_queue)) {
- tx = list_entry(peer->ksnp_tx_queue.next,
- struct ksock_tx, tx_list);
-
- if (time_after_eq(jiffies,
- tx->tx_deadline)) {
- ksocknal_peer_addref(peer);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- ksocknal_flush_stale_txs(peer);
-
- ksocknal_peer_decref(peer);
- goto again;
- }
- }
-
- if (list_empty(&peer->ksnp_zc_req_list))
- continue;
-
- tx_stale = NULL;
- spin_lock(&peer->ksnp_lock);
- list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
- if (!time_after_eq(jiffies,
- tx->tx_deadline))
- break;
- /* ignore the TX if connection is being closed */
- if (tx->tx_conn->ksnc_closing)
- continue;
- if (!tx_stale)
- tx_stale = tx;
- n++;
- }
-
- if (!tx_stale) {
- spin_unlock(&peer->ksnp_lock);
- continue;
- }
-
- deadline = tx_stale->tx_deadline;
- resid = tx_stale->tx_resid;
- conn = tx_stale->tx_conn;
- ksocknal_conn_addref(conn);
-
- spin_unlock(&peer->ksnp_lock);
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n",
- n, libcfs_nid2str(peer->ksnp_id.nid), tx_stale,
- (jiffies - deadline) / HZ,
- resid, conn->ksnc_sock->sk->sk_wmem_queued);
-
- ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
- ksocknal_conn_decref(conn);
- goto again;
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-int
-ksocknal_reaper(void *arg)
-{
- wait_queue_entry_t wait;
- struct ksock_conn *conn;
- struct ksock_sched *sched;
- struct list_head enomem_conns;
- int nenomem_conns;
- long timeout;
- int i;
- int peer_index = 0;
- unsigned long deadline = jiffies;
-
- INIT_LIST_HEAD(&enomem_conns);
- init_waitqueue_entry(&wait, current);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- while (!ksocknal_data.ksnd_shuttingdown) {
- if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) {
- conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next,
- struct ksock_conn, ksnc_list);
- list_del(&conn->ksnc_list);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_terminate_conn(conn);
- ksocknal_conn_decref(conn);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- continue;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) {
- conn = list_entry(ksocknal_data.ksnd_zombie_conns.next,
- struct ksock_conn, ksnc_list);
- list_del(&conn->ksnc_list);
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_destroy_conn(conn);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- continue;
- }
-
- if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) {
- list_add(&enomem_conns,
- &ksocknal_data.ksnd_enomem_conns);
- list_del_init(&ksocknal_data.ksnd_enomem_conns);
- }
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- /* reschedule all the connections that stalled with ENOMEM... */
- nenomem_conns = 0;
- while (!list_empty(&enomem_conns)) {
- conn = list_entry(enomem_conns.next, struct ksock_conn,
- ksnc_tx_list);
- list_del(&conn->ksnc_tx_list);
-
- sched = conn->ksnc_scheduler;
-
- spin_lock_bh(&sched->kss_lock);
-
- LASSERT(conn->ksnc_tx_scheduled);
- conn->ksnc_tx_ready = 1;
- list_add_tail(&conn->ksnc_tx_list,
- &sched->kss_tx_conns);
- wake_up(&sched->kss_waitq);
-
- spin_unlock_bh(&sched->kss_lock);
- nenomem_conns++;
- }
-
- /* careful with the jiffy wrap... */
- while ((timeout = deadline - jiffies) <= 0) {
- const int n = 4;
- const int p = 1;
- int chunk = ksocknal_data.ksnd_peer_hash_size;
-
- /*
- * Time to check for timeouts on a few more peers: I do
- * checks every 'p' seconds on a proportion of the peer
- * table and I need to check every connection 'n' times
- * within a timeout interval, to ensure I detect a
- * timeout on any connection within (n+1)/n times the
- * timeout interval.
- */
- if (*ksocknal_tunables.ksnd_timeout > n * p)
- chunk = (chunk * n * p) /
- *ksocknal_tunables.ksnd_timeout;
- if (!chunk)
- chunk = 1;
-
- for (i = 0; i < chunk; i++) {
- ksocknal_check_peer_timeouts(peer_index);
- peer_index = (peer_index + 1) %
- ksocknal_data.ksnd_peer_hash_size;
- }
-
- deadline = deadline + p * HZ;
- }
-
- if (nenomem_conns) {
- /*
- * Reduce my timeout if I rescheduled ENOMEM conns.
- * This also prevents me getting woken immediately
- * if any go back on my enomem list.
- */
- timeout = SOCKNAL_ENOMEM_RETRY;
- }
- ksocknal_data.ksnd_reaper_waketime = jiffies + timeout;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait);
-
- if (!ksocknal_data.ksnd_shuttingdown &&
- list_empty(&ksocknal_data.ksnd_deathrow_conns) &&
- list_empty(&ksocknal_data.ksnd_zombie_conns))
- schedule_timeout(timeout);
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait);
-
- spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
- }
-
- spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
-
- ksocknal_thread_fini();
- return 0;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
deleted file mode 100644
index 93a02cd..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
+++ /dev/null
@@ -1,534 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-
-#include <linux/highmem.h>
-#include "socklnd.h"
-
-int
-ksocknal_lib_get_conn_addrs(struct ksock_conn *conn)
-{
- int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr,
- &conn->ksnc_port);
-
- /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
- LASSERT(!conn->ksnc_closing);
-
- if (rc) {
- CERROR("Error %d getting sock peer IP\n", rc);
- return rc;
- }
-
- rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL);
- if (rc) {
- CERROR("Error %d getting sock local IP\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-int
-ksocknal_lib_zc_capable(struct ksock_conn *conn)
-{
- int caps = conn->ksnc_sock->sk->sk_route_caps;
-
- if (conn->ksnc_proto == &ksocknal_protocol_v1x)
- return 0;
-
- /*
- * ZC if the socket supports scatter/gather and doesn't need software
- * checksums
- */
- return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK));
-}
-
-int
-ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
- struct socket *sock = conn->ksnc_sock;
- int nob, i;
-
- if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
- conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
-	    tx->tx_nob == tx->tx_resid && /* first sending */
- !tx->tx_msg.ksm_csum) /* not checksummed */
- ksocknal_lib_csum_tx(tx);
-
- for (nob = i = 0; i < tx->tx_niov; i++)
- nob += tx->tx_iov[i].iov_len;
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- nob < tx->tx_resid)
- msg.msg_flags |= MSG_MORE;
-
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
- tx->tx_iov, tx->tx_niov, nob);
- return sock_sendmsg(sock, &msg);
-}
-
-int
-ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
- struct socket *sock = conn->ksnc_sock;
- struct bio_vec *kiov = tx->tx_kiov;
- int rc;
- int nob;
-
- /* Not NOOP message */
- LASSERT(tx->tx_lnetmsg);
-
- if (tx->tx_msg.ksm_zc_cookies[0]) {
- /* Zero copy is enabled */
- struct sock *sk = sock->sk;
- struct page *page = kiov->bv_page;
- int offset = kiov->bv_offset;
- int fragsize = kiov->bv_len;
- int msgflg = MSG_DONTWAIT;
-
- CDEBUG(D_NET, "page %p + offset %x for %d\n",
- page, offset, kiov->bv_len);
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- fragsize < tx->tx_resid)
- msgflg |= MSG_MORE;
-
- if (sk->sk_prot->sendpage) {
- rc = sk->sk_prot->sendpage(sk, page,
- offset, fragsize, msgflg);
- } else {
- rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
- }
- } else {
- struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
- int i;
-
- for (nob = i = 0; i < tx->tx_nkiov; i++)
- nob += kiov[i].bv_len;
-
- if (!list_empty(&conn->ksnc_tx_queue) ||
- nob < tx->tx_resid)
- msg.msg_flags |= MSG_MORE;
-
- iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
- kiov, tx->tx_nkiov, nob);
- rc = sock_sendmsg(sock, &msg);
- }
- return rc;
-}
-
-void
-ksocknal_lib_eager_ack(struct ksock_conn *conn)
-{
- int opt = 1;
- struct socket *sock = conn->ksnc_sock;
-
- /*
- * Remind the socket to ACK eagerly. If I don't, the socket might
- * think I'm about to send something it could piggy-back the ACK
- * on, introducing delay in completing zero-copy sends in my
- * peer.
- */
- kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt,
- sizeof(opt));
-}
-
-static int lustre_csum(struct kvec *v, void *context)
-{
- struct ksock_conn *conn = context;
- conn->ksnc_rx_csum = crc32_le(conn->ksnc_rx_csum,
- v->iov_base, v->iov_len);
- return 0;
-}
-
-int
-ksocknal_lib_recv(struct ksock_conn *conn)
-{
- struct msghdr msg = { .msg_iter = conn->ksnc_rx_to };
- __u32 saved_csum;
- int rc;
-
- rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT);
- if (rc <= 0)
- return rc;
-
- saved_csum = conn->ksnc_msg.ksm_csum;
- if (!saved_csum)
- return rc;
-
-	/* the header is checksummed only in V2; V3 checksums only the bulk data */
- if (!(conn->ksnc_rx_to.type & ITER_BVEC) &&
- conn->ksnc_proto != &ksocknal_protocol_v2x)
- return rc;
-
- /* accumulate checksum */
- conn->ksnc_msg.ksm_csum = 0;
- iov_iter_for_each_range(&conn->ksnc_rx_to, rc, lustre_csum, conn);
- conn->ksnc_msg.ksm_csum = saved_csum;
-
- return rc;
-}
-
-void
-ksocknal_lib_csum_tx(struct ksock_tx *tx)
-{
- int i;
- __u32 csum;
- void *base;
-
- LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg);
- LASSERT(tx->tx_conn);
- LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
-
- tx->tx_msg.ksm_csum = 0;
-
- csum = crc32_le(~0, tx->tx_iov[0].iov_base,
- tx->tx_iov[0].iov_len);
-
- if (tx->tx_kiov) {
- for (i = 0; i < tx->tx_nkiov; i++) {
- base = kmap(tx->tx_kiov[i].bv_page) +
- tx->tx_kiov[i].bv_offset;
-
- csum = crc32_le(csum, base, tx->tx_kiov[i].bv_len);
-
- kunmap(tx->tx_kiov[i].bv_page);
- }
- } else {
- for (i = 1; i < tx->tx_niov; i++)
- csum = crc32_le(csum, tx->tx_iov[i].iov_base,
- tx->tx_iov[i].iov_len);
- }
-
- if (*ksocknal_tunables.ksnd_inject_csum_error) {
- csum++;
- *ksocknal_tunables.ksnd_inject_csum_error = 0;
- }
-
- tx->tx_msg.ksm_csum = csum;
-}
-
-int
-ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem,
- int *rxmem, int *nagle)
-{
- struct socket *sock = conn->ksnc_sock;
- int len;
- int rc;
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) {
- LASSERT(conn->ksnc_closing);
- *txmem = *rxmem = *nagle = 0;
- return -ESHUTDOWN;
- }
-
- rc = lnet_sock_getbuf(sock, txmem, rxmem);
- if (!rc) {
- len = sizeof(*nagle);
- rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char *)nagle, &len);
- }
-
- ksocknal_connsock_decref(conn);
-
- if (!rc)
- *nagle = !*nagle;
- else
- *txmem = *rxmem = *nagle = 0;
-
- return rc;
-}
-
-int
-ksocknal_lib_setup_sock(struct socket *sock)
-{
- int rc;
- int option;
- int keep_idle;
- int keep_intvl;
- int keep_count;
- int do_keepalive;
- struct linger linger;
-
- sock->sk->sk_allocation = GFP_NOFS;
-
- /*
- * Ensure this socket aborts active sends immediately when we close
- * it.
- */
- linger.l_onoff = 0;
- linger.l_linger = 0;
-
- rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger,
- sizeof(linger));
- if (rc) {
- CERROR("Can't set SO_LINGER: %d\n", rc);
- return rc;
- }
-
- option = -1;
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option,
- sizeof(option));
- if (rc) {
-		CERROR("Can't set TCP_LINGER2: %d\n", rc);
- return rc;
- }
-
- if (!*ksocknal_tunables.ksnd_nagle) {
- option = 1;
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char *)&option, sizeof(option));
- if (rc) {
- CERROR("Can't disable nagle: %d\n", rc);
- return rc;
- }
- }
-
- rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size,
- *ksocknal_tunables.ksnd_rx_buffer_size);
- if (rc) {
-		CERROR("Can't set tx buffer %d, rx buffer %d: %d\n",
- *ksocknal_tunables.ksnd_tx_buffer_size,
- *ksocknal_tunables.ksnd_rx_buffer_size, rc);
- return rc;
- }
-
-/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
-
- /* snapshot tunables */
- keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
- keep_count = *ksocknal_tunables.ksnd_keepalive_count;
- keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
-
- do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
-
- option = (do_keepalive ? 1 : 0);
- rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option,
- sizeof(option));
- if (rc) {
- CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
- return rc;
- }
-
- if (!do_keepalive)
- return 0;
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle,
- sizeof(keep_idle));
- if (rc) {
- CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
- return rc;
- }
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keep_intvl, sizeof(keep_intvl));
- if (rc) {
- CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
- return rc;
- }
-
- rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count,
- sizeof(keep_count));
- if (rc) {
- CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-void
-ksocknal_lib_push_conn(struct ksock_conn *conn)
-{
- struct sock *sk;
- struct tcp_sock *tp;
- int nonagle;
- int val = 1;
- int rc;
-
- rc = ksocknal_connsock_addref(conn);
- if (rc) /* being shut down */
- return;
-
- sk = conn->ksnc_sock->sk;
- tp = tcp_sk(sk);
-
- lock_sock(sk);
- nonagle = tp->nonagle;
- tp->nonagle = 1;
- release_sock(sk);
-
- rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
- (char *)&val, sizeof(val));
- LASSERT(!rc);
-
- lock_sock(sk);
- tp->nonagle = nonagle;
- release_sock(sk);
-
- ksocknal_connsock_decref(conn);
-}
-
-/*
- * socket callbacks in Linux
- */
-static void
-ksocknal_data_ready(struct sock *sk)
-{
- struct ksock_conn *conn;
-
- /* interleave correctly with closing sockets... */
- LASSERT(!in_irq());
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = sk->sk_user_data;
- if (!conn) { /* raced with ksocknal_terminate_conn */
- LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
- sk->sk_data_ready(sk);
- } else {
- ksocknal_read_callback(conn);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-static void
-ksocknal_write_space(struct sock *sk)
-{
- struct ksock_conn *conn;
- int wspace;
-	int min_wspace;
-
- /* interleave correctly with closing sockets... */
- LASSERT(!in_irq());
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = sk->sk_user_data;
- wspace = sk_stream_wspace(sk);
-	min_wspace = sk_stream_min_wspace(sk);
-
- CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
-	       sk, wspace, min_wspace, conn,
- !conn ? "" : (conn->ksnc_tx_ready ?
- " ready" : " blocked"),
- !conn ? "" : (conn->ksnc_tx_scheduled ?
- " scheduled" : " idle"),
- !conn ? "" : (list_empty(&conn->ksnc_tx_queue) ?
- " empty" : " queued"));
-
- if (!conn) { /* raced with ksocknal_terminate_conn */
- LASSERT(sk->sk_write_space != &ksocknal_write_space);
- sk->sk_write_space(sk);
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return;
- }
-
-	if (wspace >= min_wspace) { /* got enough space */
- ksocknal_write_callback(conn);
-
- /*
- * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
- * ENOMEM check in ksocknal_transmit is race-free (think about
- * it).
- */
- clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-void
-ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn)
-{
- conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
- conn->ksnc_saved_write_space = sock->sk->sk_write_space;
-}
-
-void
-ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn)
-{
- sock->sk->sk_user_data = conn;
- sock->sk->sk_data_ready = ksocknal_data_ready;
- sock->sk->sk_write_space = ksocknal_write_space;
-}
-
-void
-ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn)
-{
- /*
- * Remove conn's network callbacks.
- * NB I _have_ to restore the callback, rather than storing a noop,
- * since the socket could survive past this module being unloaded!!
- */
- sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
- sock->sk->sk_write_space = conn->ksnc_saved_write_space;
-
- /*
- * A callback could be in progress already; they hold a read lock
- * on ksnd_global_lock (to serialise with me) and NOOP if
- * sk_user_data is NULL.
- */
- sock->sk->sk_user_data = NULL;
-}
-
-int
-ksocknal_lib_memory_pressure(struct ksock_conn *conn)
-{
- int rc = 0;
- struct ksock_sched *sched;
-
- sched = conn->ksnc_scheduler;
- spin_lock_bh(&sched->kss_lock);
-
- if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
- !conn->ksnc_tx_ready) {
- /*
- * SOCK_NOSPACE is set when the socket fills
- * and cleared in the write_space callback
- * (which also sets ksnc_tx_ready). If
- * SOCK_NOSPACE and ksnc_tx_ready are BOTH
- * zero, I didn't fill the socket and
- * write_space won't reschedule me, so I
- * return -ENOMEM to get my caller to retry
- * after a timeout
- */
- rc = -ENOMEM;
- }
-
- spin_unlock_bh(&sched->kss_lock);
-
- return rc;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
deleted file mode 100644
index 5663a4c..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include "socklnd.h"
-
-static int sock_timeout = 50;
-module_param(sock_timeout, int, 0644);
-MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
-
-static int credits = 256;
-module_param(credits, int, 0444);
-MODULE_PARM_DESC(credits, "# concurrent sends");
-
-static int peer_credits = 8;
-module_param(peer_credits, int, 0444);
-MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
-
-static int peer_buffer_credits;
-module_param(peer_buffer_credits, int, 0444);
-MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
-
-static int peer_timeout = 180;
-module_param(peer_timeout, int, 0444);
-MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
-
-/*
- * Number of daemons in each thread pool, which is per-CPT;
- * we will estimate a reasonable value based on CPUs if it's not set.
- */
-static unsigned int nscheds;
-module_param(nscheds, int, 0444);
-MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
-
-static int nconnds = 4;
-module_param(nconnds, int, 0444);
-MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
-
-static int nconnds_max = 64;
-module_param(nconnds_max, int, 0444);
-MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
-
-static int min_reconnectms = 1000;
-module_param(min_reconnectms, int, 0644);
-MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
-
-static int max_reconnectms = 60000;
-module_param(max_reconnectms, int, 0644);
-MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
-
-# define DEFAULT_EAGER_ACK 0
-static int eager_ack = DEFAULT_EAGER_ACK;
-module_param(eager_ack, int, 0644);
-MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
-
-static int typed_conns = 1;
-module_param(typed_conns, int, 0444);
-MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
-
-static int min_bulk = 1 << 10;
-module_param(min_bulk, int, 0644);
-MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
-
-# define DEFAULT_BUFFER_SIZE 0
-static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
-module_param(tx_buffer_size, int, 0644);
-MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
-
-static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
-module_param(rx_buffer_size, int, 0644);
-MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
-
-static int nagle;
-module_param(nagle, int, 0644);
-MODULE_PARM_DESC(nagle, "enable NAGLE?");
-
-static int round_robin = 1;
-module_param(round_robin, int, 0644);
-MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
-
-static int keepalive = 30;
-module_param(keepalive, int, 0644);
-MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
-
-static int keepalive_idle = 30;
-module_param(keepalive_idle, int, 0644);
-MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
-
-#define DEFAULT_KEEPALIVE_COUNT 5
-static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
-module_param(keepalive_count, int, 0644);
-MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
-
-static int keepalive_intvl = 5;
-module_param(keepalive_intvl, int, 0644);
-MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
-
-static int enable_csum;
-module_param(enable_csum, int, 0644);
-MODULE_PARM_DESC(enable_csum, "enable checksum");
-
-static int inject_csum_error;
-module_param(inject_csum_error, int, 0644);
-MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
-
-static int nonblk_zcack = 1;
-module_param(nonblk_zcack, int, 0644);
-MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
-
-static unsigned int zc_min_payload = 16 << 10;
-module_param(zc_min_payload, int, 0644);
-MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
-
-static unsigned int zc_recv;
-module_param(zc_recv, int, 0644);
-MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
-
-static unsigned int zc_recv_min_nfrags = 16;
-module_param(zc_recv_min_nfrags, int, 0644);
-MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
-
-#if SOCKNAL_VERSION_DEBUG
-static int protocol = 3;
-module_param(protocol, int, 0644);
-MODULE_PARM_DESC(protocol, "protocol version");
-#endif
-
-struct ksock_tunables ksocknal_tunables;
-
-int ksocknal_tunables_init(void)
-{
- /* initialize ksocknal_tunables structure */
- ksocknal_tunables.ksnd_timeout = &sock_timeout;
- ksocknal_tunables.ksnd_nscheds = &nscheds;
- ksocknal_tunables.ksnd_nconnds = &nconnds;
- ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
- ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
- ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
- ksocknal_tunables.ksnd_eager_ack = &eager_ack;
- ksocknal_tunables.ksnd_typed_conns = &typed_conns;
- ksocknal_tunables.ksnd_min_bulk = &min_bulk;
- ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
- ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
- ksocknal_tunables.ksnd_nagle = &nagle;
- ksocknal_tunables.ksnd_round_robin = &round_robin;
- ksocknal_tunables.ksnd_keepalive = &keepalive;
- ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
- ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
- ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
- ksocknal_tunables.ksnd_credits = &credits;
- ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
- ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
- ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
- ksocknal_tunables.ksnd_enable_csum = &enable_csum;
- ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
- ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
- ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
- ksocknal_tunables.ksnd_zc_recv = &zc_recv;
- ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
-
-#if SOCKNAL_VERSION_DEBUG
- ksocknal_tunables.ksnd_protocol = &protocol;
-#endif
-
- if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
- *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10;
-
- return 0;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
deleted file mode 100644
index 05982da..0000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
+++ /dev/null
@@ -1,810 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2012, Intel Corporation.
- *
- * Author: Zach Brown <zab@zabbo.net>
- * Author: Peter J. Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include "socklnd.h"
-
-/*
- * Protocol entries :
- * pro_send_hello : send hello message
- * pro_recv_hello : receive hello message
- * pro_pack : pack message header
- * pro_unpack : unpack message header
- * pro_queue_tx_zcack() : Called holding BH lock: kss_lock
- * return 1 if ACK is piggybacked, otherwise return 0
- * pro_queue_tx_msg() : Called holding BH lock: kss_lock
- *                          return the ACK that was piggybacked by my message, or NULL
- * pro_handle_zcreq() : handler of incoming ZC-REQ
- * pro_handle_zcack() : handler of incoming ZC-ACK
- * pro_match_tx() : Called holding glock
- */
-
-static struct ksock_tx *
-ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg)
-{
- /* V1.x, just enqueue it */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- return NULL;
-}
-
-void
-ksocknal_next_tx_carrier(struct ksock_conn *conn)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
- /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
- LASSERT(!list_empty(&conn->ksnc_tx_queue));
- LASSERT(tx);
-
- /* Next TX that can carry ZC-ACK or LNet message */
- if (tx->tx_list.next == &conn->ksnc_tx_queue) {
- /* no more packets queued */
- conn->ksnc_tx_carrier = NULL;
- } else {
- conn->ksnc_tx_carrier = list_next_entry(tx, tx_list);
- LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
- }
-}
-
-static int
-ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn,
- struct ksock_tx *tx_ack, __u64 cookie)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
- LASSERT(!tx_ack ||
- tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
-	/*
-	 * Enqueue or piggyback tx_ack / cookie:
-	 * . if no tx can piggyback the cookie of tx_ack (or cookie), just
-	 *   enqueue tx_ack (if tx_ack != NULL) and return 0.
-	 * . if some tx can piggyback the cookie of tx_ack (or cookie),
-	 *   piggyback the cookie on it and return 1.
-	 */
- if (!tx) {
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_ack;
- }
- return 0;
- }
-
- if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
- /* tx is noop zc-ack, can't piggyback zc-ack cookie */
- if (tx_ack)
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- return 0;
- }
-
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
- LASSERT(!tx->tx_msg.ksm_zc_cookies[1]);
-
- if (tx_ack)
- cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
-
- /* piggyback the zc-ack cookie */
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- /* move on to the next TX which can carry cookie */
- ksocknal_next_tx_carrier(conn);
-
- return 1;
-}
-
-static struct ksock_tx *
-ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg)
-{
- struct ksock_tx *tx = conn->ksnc_tx_carrier;
-
- /*
- * Enqueue tx_msg:
- * . If there is no NOOP on the connection, just enqueue
- * tx_msg and return NULL
- * . If there is NOOP on the connection, piggyback the cookie
- * and replace the NOOP tx, and return the NOOP tx.
- */
- if (!tx) { /* nothing on queue */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_msg;
- return NULL;
- }
-
- if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
- list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
- return NULL;
- }
-
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
-	/* There is a noop zc-ack that can be piggybacked */
- tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
- ksocknal_next_tx_carrier(conn);
-
- /* use new_tx to replace the noop zc-ack packet */
- list_add(&tx_msg->tx_list, &tx->tx_list);
- list_del(&tx->tx_list);
-
- return tx;
-}
-
-static int
-ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn,
- struct ksock_tx *tx_ack, __u64 cookie)
-{
- struct ksock_tx *tx;
-
- if (conn->ksnc_type != SOCKLND_CONN_ACK)
- return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
-
- /* non-blocking ZC-ACK (to router) */
- LASSERT(!tx_ack ||
- tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
- tx = conn->ksnc_tx_carrier;
- if (!tx) {
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list,
- &conn->ksnc_tx_queue);
- conn->ksnc_tx_carrier = tx_ack;
- }
- return 0;
- }
-
- /* conn->ksnc_tx_carrier */
-
- if (tx_ack)
- cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
-
- if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
- return 1;
-
- if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
- /* replace the keepalive PING with a real ACK */
- LASSERT(!tx->tx_msg.ksm_zc_cookies[0]);
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- return 1;
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
- cookie == tx->tx_msg.ksm_zc_cookies[1]) {
- CWARN("%s: duplicated ZC cookie: %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
- return 1; /* XXX return error in the future */
- }
-
- if (!tx->tx_msg.ksm_zc_cookies[0]) {
- /*
- * NOOP tx has only one ZC-ACK cookie,
- * can carry at least one more
- */
- if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
- tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- } else {
- tx->tx_msg.ksm_zc_cookies[0] = cookie;
- }
-
- if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
- /*
- * not likely to carry more ACKs, skip it
- * to simplify logic
- */
- ksocknal_next_tx_carrier(conn);
- }
-
- return 1;
- }
-
- /* takes two or more cookies already */
-
- if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
- __u64 tmp = 0;
-
-		/* two separate cookies: (a+2, a) or (a+1, a) */
- LASSERT(tx->tx_msg.ksm_zc_cookies[0] -
- tx->tx_msg.ksm_zc_cookies[1] <= 2);
-
- if (tx->tx_msg.ksm_zc_cookies[0] -
- tx->tx_msg.ksm_zc_cookies[1] == 2) {
- if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
- tmp = cookie;
- } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
- tmp = tx->tx_msg.ksm_zc_cookies[1];
- } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
- tmp = tx->tx_msg.ksm_zc_cookies[0];
- }
-
- if (tmp) {
- /* range of cookies */
- tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
- tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
- return 1;
- }
-
- } else {
- /*
- * ksm_zc_cookies[0] < ksm_zc_cookies[1],
- * it is range of cookies
- */
- if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
- cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
- CWARN("%s: duplicated ZC cookie: %llu\n",
- libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
- return 1; /* XXX: return error in the future */
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
- tx->tx_msg.ksm_zc_cookies[1] = cookie;
- return 1;
- }
-
- if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
- tx->tx_msg.ksm_zc_cookies[0] = cookie;
- return 1;
- }
- }
-
- /* failed to piggyback ZC-ACK */
- if (tx_ack) {
- list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
- /* the next tx can piggyback at least 1 ACK */
- ksocknal_next_tx_carrier(conn);
- }
-
- return 0;
-}
-
-static int
-ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk)
-{
- int nob;
-
-#if SOCKNAL_VERSION_DEBUG
- if (!*ksocknal_tunables.ksnd_typed_conns)
- return SOCKNAL_MATCH_YES;
-#endif
-
- if (!tx || !tx->tx_lnetmsg) {
- /* noop packet */
- nob = offsetof(struct ksock_msg, ksm_u);
- } else {
- nob = tx->tx_lnetmsg->msg_len +
- ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
- sizeof(struct lnet_hdr) : sizeof(struct ksock_msg));
- }
-
- /* default checking for typed connection */
- switch (conn->ksnc_type) {
- default:
- CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
- LBUG();
- case SOCKLND_CONN_ANY:
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_BULK_IN:
- return SOCKNAL_MATCH_MAY;
-
- case SOCKLND_CONN_BULK_OUT:
- if (nob < *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_CONTROL:
- if (nob >= *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
- }
-}
-
-static int
-ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk)
-{
- int nob;
-
- if (!tx || !tx->tx_lnetmsg)
- nob = offsetof(struct ksock_msg, ksm_u);
- else
- nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg);
-
- switch (conn->ksnc_type) {
- default:
- CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
- LBUG();
- case SOCKLND_CONN_ANY:
- return SOCKNAL_MATCH_NO;
-
- case SOCKLND_CONN_ACK:
- if (nonblk)
- return SOCKNAL_MATCH_YES;
- else if (!tx || !tx->tx_lnetmsg)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_NO;
-
- case SOCKLND_CONN_BULK_OUT:
- if (nonblk)
- return SOCKNAL_MATCH_NO;
- else if (nob < *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
-
- case SOCKLND_CONN_CONTROL:
- if (nonblk)
- return SOCKNAL_MATCH_NO;
- else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
- return SOCKNAL_MATCH_MAY;
- else
- return SOCKNAL_MATCH_YES;
- }
-}
-
-/* (Sink) handle incoming ZC request from sender */
-static int
-ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote)
-{
- struct ksock_peer *peer = c->ksnc_peer;
- struct ksock_conn *conn;
- struct ksock_tx *tx;
- int rc;
-
- read_lock(&ksocknal_data.ksnd_global_lock);
-
- conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
- if (conn) {
- struct ksock_sched *sched = conn->ksnc_scheduler;
-
- LASSERT(conn->ksnc_proto->pro_queue_tx_zcack);
-
- spin_lock_bh(&sched->kss_lock);
-
- rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
-
- spin_unlock_bh(&sched->kss_lock);
-
- if (rc) { /* piggybacked */
- read_unlock(&ksocknal_data.ksnd_global_lock);
- return 0;
- }
- }
-
- read_unlock(&ksocknal_data.ksnd_global_lock);
-
- /* ACK connection is not ready, or can't piggyback the ACK */
- tx = ksocknal_alloc_tx_noop(cookie, !!remote);
- if (!tx)
- return -ENOMEM;
-
- rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id);
- if (!rc)
- return 0;
-
- ksocknal_free_tx(tx);
- return rc;
-}
-
-/* (Sender) handle ZC_ACK from sink */
-static int
-ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2)
-{
- struct ksock_peer *peer = conn->ksnc_peer;
- struct ksock_tx *tx;
- struct ksock_tx *temp;
- struct ksock_tx *tmp;
- LIST_HEAD(zlist);
- int count;
-
- if (!cookie1)
- cookie1 = cookie2;
-
- count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
-
- if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
- conn->ksnc_proto == &ksocknal_protocol_v3x) {
- /* keepalive PING for V3.x, just ignore it */
- return count == 1 ? 0 : -EPROTO;
- }
-
- spin_lock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list,
- tx_zc_list) {
- __u64 c = tx->tx_msg.ksm_zc_cookies[0];
-
- if (c == cookie1 || c == cookie2 ||
- (cookie1 < c && c < cookie2)) {
- tx->tx_msg.ksm_zc_cookies[0] = 0;
- list_del(&tx->tx_zc_list);
- list_add(&tx->tx_zc_list, &zlist);
-
- if (!--count)
- break;
- }
- }
-
- spin_unlock(&peer->ksnp_lock);
-
- list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) {
- list_del(&tx->tx_zc_list);
- ksocknal_tx_decref(tx);
- }
-
- return !count ? 0 : -EPROTO;
-}
-
-static int
-ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello)
-{
- struct socket *sock = conn->ksnc_sock;
- struct lnet_hdr *hdr;
- struct lnet_magicversion *hmv;
- int rc;
- int i;
-
- BUILD_BUG_ON(sizeof(struct lnet_magicversion) != offsetof(struct lnet_hdr, src_nid));
-
- hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
- if (!hdr) {
- CERROR("Can't allocate struct lnet_hdr\n");
- return -ENOMEM;
- }
-
- hmv = (struct lnet_magicversion *)&hdr->dest_nid;
-
- /*
- * Re-organize V2.x message header to V1.x (struct lnet_hdr)
- * header and send out
- */
- hmv->magic = cpu_to_le32(LNET_PROTO_TCP_MAGIC);
- hmv->version_major = cpu_to_le16(KSOCK_PROTO_V1_MAJOR);
- hmv->version_minor = cpu_to_le16(KSOCK_PROTO_V1_MINOR);
-
- if (the_lnet.ln_testprotocompat) {
- /* single-shot proto check */
- LNET_LOCK();
- if (the_lnet.ln_testprotocompat & 1) {
- hmv->version_major++; /* just different! */
- the_lnet.ln_testprotocompat &= ~1;
- }
- if (the_lnet.ln_testprotocompat & 2) {
- hmv->magic = LNET_PROTO_MAGIC;
- the_lnet.ln_testprotocompat &= ~2;
- }
- LNET_UNLOCK();
- }
-
- hdr->src_nid = cpu_to_le64(hello->kshm_src_nid);
- hdr->src_pid = cpu_to_le32(hello->kshm_src_pid);
- hdr->type = cpu_to_le32(LNET_MSG_HELLO);
- hdr->payload_length = cpu_to_le32(hello->kshm_nips * sizeof(__u32));
- hdr->msg.hello.type = cpu_to_le32(hello->kshm_ctype);
- hdr->msg.hello.incarnation = cpu_to_le64(hello->kshm_src_incarnation);
-
- rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
- rc, &conn->ksnc_ipaddr, conn->ksnc_port);
- goto out;
- }
-
- if (!hello->kshm_nips)
- goto out;
-
- for (i = 0; i < (int)hello->kshm_nips; i++)
- hello->kshm_ips[i] = __cpu_to_le32(hello->kshm_ips[i]);
-
- rc = lnet_sock_write(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
- rc, hello->kshm_nips,
- &conn->ksnc_ipaddr, conn->ksnc_port);
- }
-out:
- kfree(hdr);
-
- return rc;
-}
-
-static int
-ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello)
-{
- struct socket *sock = conn->ksnc_sock;
- int rc;
-
- hello->kshm_magic = LNET_PROTO_MAGIC;
- hello->kshm_version = conn->ksnc_proto->pro_version;
-
- if (the_lnet.ln_testprotocompat) {
- /* single-shot proto check */
- LNET_LOCK();
- if (the_lnet.ln_testprotocompat & 1) {
- hello->kshm_version++; /* just different! */
- the_lnet.ln_testprotocompat &= ~1;
- }
- LNET_UNLOCK();
- }
-
- rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n",
- rc, &conn->ksnc_ipaddr, conn->ksnc_port);
- return rc;
- }
-
- if (!hello->kshm_nips)
- return 0;
-
- rc = lnet_sock_write(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32),
- lnet_acceptor_timeout());
- if (rc) {
- CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n",
- rc, hello->kshm_nips,
- &conn->ksnc_ipaddr, conn->ksnc_port);
- }
-
- return rc;
-}
-
-static int
-ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello,
- int timeout)
-{
- struct socket *sock = conn->ksnc_sock;
- struct lnet_hdr *hdr;
- int rc;
- int i;
-
- hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
- if (!hdr) {
- CERROR("Can't allocate struct lnet_hdr\n");
- return -ENOMEM;
- }
-
- rc = lnet_sock_read(sock, &hdr->src_nid,
- sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid),
- timeout);
- if (rc) {
- CERROR("Error %d reading rest of HELLO hdr from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- goto out;
- }
-
- /* ...and check we got what we expected */
- if (hdr->type != cpu_to_le32(LNET_MSG_HELLO)) {
- CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n",
- le32_to_cpu(hdr->type),
- &conn->ksnc_ipaddr);
- rc = -EPROTO;
- goto out;
- }
-
- hello->kshm_src_nid = le64_to_cpu(hdr->src_nid);
- hello->kshm_src_pid = le32_to_cpu(hdr->src_pid);
- hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation);
- hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type);
- hello->kshm_nips = le32_to_cpu(hdr->payload_length) /
- sizeof(__u32);
-
- if (hello->kshm_nips > LNET_MAX_INTERFACES) {
- CERROR("Bad nips %d from ip %pI4h\n",
- hello->kshm_nips, &conn->ksnc_ipaddr);
- rc = -EPROTO;
- goto out;
- }
-
- if (!hello->kshm_nips)
- goto out;
-
- rc = lnet_sock_read(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32), timeout);
- if (rc) {
- CERROR("Error %d reading IPs from ip %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- goto out;
- }
-
- for (i = 0; i < (int)hello->kshm_nips; i++) {
- hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
-
- if (!hello->kshm_ips[i]) {
- CERROR("Zero IP[%d] from ip %pI4h\n",
- i, &conn->ksnc_ipaddr);
- rc = -EPROTO;
- break;
- }
- }
-out:
- kfree(hdr);
-
- return rc;
-}
-
-static int
-ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello,
- int timeout)
-{
- struct socket *sock = conn->ksnc_sock;
- int rc;
- int i;
-
- if (hello->kshm_magic == LNET_PROTO_MAGIC)
- conn->ksnc_flip = 0;
- else
- conn->ksnc_flip = 1;
-
- rc = lnet_sock_read(sock, &hello->kshm_src_nid,
- offsetof(struct ksock_hello_msg, kshm_ips) -
- offsetof(struct ksock_hello_msg, kshm_src_nid),
- timeout);
- if (rc) {
- CERROR("Error %d reading HELLO from %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- return rc;
- }
-
- if (conn->ksnc_flip) {
- __swab32s(&hello->kshm_src_pid);
- __swab64s(&hello->kshm_src_nid);
- __swab32s(&hello->kshm_dst_pid);
- __swab64s(&hello->kshm_dst_nid);
- __swab64s(&hello->kshm_src_incarnation);
- __swab64s(&hello->kshm_dst_incarnation);
- __swab32s(&hello->kshm_ctype);
- __swab32s(&hello->kshm_nips);
- }
-
- if (hello->kshm_nips > LNET_MAX_INTERFACES) {
- CERROR("Bad nips %d from ip %pI4h\n",
- hello->kshm_nips, &conn->ksnc_ipaddr);
- return -EPROTO;
- }
-
- if (!hello->kshm_nips)
- return 0;
-
- rc = lnet_sock_read(sock, hello->kshm_ips,
- hello->kshm_nips * sizeof(__u32), timeout);
- if (rc) {
- CERROR("Error %d reading IPs from ip %pI4h\n",
- rc, &conn->ksnc_ipaddr);
- LASSERT(rc < 0 && rc != -EALREADY);
- return rc;
- }
-
- for (i = 0; i < (int)hello->kshm_nips; i++) {
- if (conn->ksnc_flip)
- __swab32s(&hello->kshm_ips[i]);
-
- if (!hello->kshm_ips[i]) {
- CERROR("Zero IP[%d] from ip %pI4h\n",
- i, &conn->ksnc_ipaddr);
- return -EPROTO;
- }
- }
-
- return 0;
-}
-
-static void
-ksocknal_pack_msg_v1(struct ksock_tx *tx)
-{
- /* V1.x has no KSOCK_MSG_NOOP */
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
- LASSERT(tx->tx_lnetmsg);
-
- tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr;
- tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr);
-
- tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr);
- tx->tx_resid = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr);
-}
-
-static void
-ksocknal_pack_msg_v2(struct ksock_tx *tx)
-{
- tx->tx_iov[0].iov_base = &tx->tx_msg;
-
- if (tx->tx_lnetmsg) {
- LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
-
- tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
- tx->tx_iov[0].iov_len = sizeof(struct ksock_msg);
- tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len;
- tx->tx_resid = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len;
- } else {
- LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
-
- tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- tx->tx_resid = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr);
- }
- /*
- * Don't checksum before start sending, because packet can be
- * piggybacked with ACK
- */
-}
-
-static void
-ksocknal_unpack_msg_v1(struct ksock_msg *msg)
-{
- msg->ksm_csum = 0;
- msg->ksm_type = KSOCK_MSG_LNET;
- msg->ksm_zc_cookies[0] = 0;
- msg->ksm_zc_cookies[1] = 0;
-}
-
-static void
-ksocknal_unpack_msg_v2(struct ksock_msg *msg)
-{
- return; /* Do nothing */
-}
-
-struct ksock_proto ksocknal_protocol_v1x = {
- .pro_version = KSOCK_PROTO_V1,
- .pro_send_hello = ksocknal_send_hello_v1,
- .pro_recv_hello = ksocknal_recv_hello_v1,
- .pro_pack = ksocknal_pack_msg_v1,
- .pro_unpack = ksocknal_unpack_msg_v1,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1,
- .pro_handle_zcreq = NULL,
- .pro_handle_zcack = NULL,
- .pro_queue_tx_zcack = NULL,
- .pro_match_tx = ksocknal_match_tx
-};
-
-struct ksock_proto ksocknal_protocol_v2x = {
- .pro_version = KSOCK_PROTO_V2,
- .pro_send_hello = ksocknal_send_hello_v2,
- .pro_recv_hello = ksocknal_recv_hello_v2,
- .pro_pack = ksocknal_pack_msg_v2,
- .pro_unpack = ksocknal_unpack_msg_v2,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
- .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2,
- .pro_handle_zcreq = ksocknal_handle_zcreq,
- .pro_handle_zcack = ksocknal_handle_zcack,
- .pro_match_tx = ksocknal_match_tx
-};
-
-struct ksock_proto ksocknal_protocol_v3x = {
- .pro_version = KSOCK_PROTO_V3,
- .pro_send_hello = ksocknal_send_hello_v2,
- .pro_recv_hello = ksocknal_recv_hello_v2,
- .pro_pack = ksocknal_pack_msg_v2,
- .pro_unpack = ksocknal_unpack_msg_v2,
- .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
- .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3,
- .pro_handle_zcreq = ksocknal_handle_zcreq,
- .pro_handle_zcack = ksocknal_handle_zcack,
- .pro_match_tx = ksocknal_match_tx_v3
-};