summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/ceph/ceph_common.c27
-rw-r--r--net/ceph/cls_lock_client.c51
-rw-r--r--net/ceph/debugfs.c7
-rw-r--r--net/ceph/osd_client.c139
-rw-r--r--net/ceph/pagelist.c2
-rw-r--r--net/ceph/snapshot.c6
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c2
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/llc/llc_conn.c4
-rw-r--r--net/llc/llc_sap.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c8
-rw-r--r--net/smc/af_smc.c2
-rw-r--r--net/sunrpc/Kconfig1
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/sched.c5
-rw-r--r--net/sunrpc/svc.c134
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/sunrpc/xprt.c1
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c8
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c71
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c89
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c79
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c512
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c978
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c110
-rw-r--r--net/sunrpc/xprtrdma/transport.c57
-rw-r--r--net/sunrpc/xprtrdma/verbs.c323
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h22
-rw-r--r--net/vmw_vsock/virtio_transport.c6
34 files changed, 1706 insertions, 972 deletions
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eb773c..4fd0283 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -45,6 +45,17 @@ bool libceph_compatible(void *data)
}
EXPORT_SYMBOL(libceph_compatible);
+static int param_get_supported_features(char *buffer,
+ const struct kernel_param *kp)
+{
+ return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT);
+}
+static const struct kernel_param_ops param_ops_supported_features = {
+ .get = param_get_supported_features,
+};
+module_param_cb(supported_features, &param_ops_supported_features, NULL,
+ S_IRUGO);
+
/*
* find filename portion of a path (/foo/bar/baz -> baz)
*/
@@ -596,9 +607,7 @@ EXPORT_SYMBOL(ceph_client_gid);
/*
* create a fresh client instance
*/
-struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
- u64 supported_features,
- u64 required_features)
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
@@ -615,14 +624,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
init_waitqueue_head(&client->auth_wq);
client->auth_err = 0;
- if (!ceph_test_opt(client, NOMSGAUTH))
- required_features |= CEPH_FEATURE_MSG_AUTH;
-
client->extra_mon_dispatch = NULL;
- client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
- supported_features;
- client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
- required_features;
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
+
+ if (!ceph_test_opt(client, NOMSGAUTH))
+ client->required_features |= CEPH_FEATURE_MSG_AUTH;
/* msgr */
if (ceph_test_opt(client, MYIP))
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index b9233b9..08ada89 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_cls_break_lock);
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie)
+{
+ int cookie_op_buf_size;
+ int name_len = strlen(lock_name);
+ int old_cookie_len = strlen(old_cookie);
+ int tag_len = strlen(tag);
+ int new_cookie_len = strlen(new_cookie);
+ void *p, *end;
+ struct page *cookie_op_page;
+ int ret;
+
+ cookie_op_buf_size = name_len + sizeof(__le32) +
+ old_cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ new_cookie_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (cookie_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ cookie_op_page = alloc_page(GFP_NOIO);
+ if (!cookie_op_page)
+ return -ENOMEM;
+
+ p = page_address(cookie_op_page);
+ end = p + cookie_op_buf_size;
+
+ /* encode cls_lock_set_cookie_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, old_cookie, old_cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, new_cookie, new_cookie_len);
+
+ dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n",
+ __func__, lock_name, type, old_cookie, tag, new_cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie",
+ CEPH_OSD_FLAG_WRITE, cookie_op_page,
+ cookie_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(cookie_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_set_cookie);
+
void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
{
int i;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index c62b2b0..71ba139 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p)
return 0;
down_read(&osdc->lock);
- seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
+ seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
+ osdc->epoch_barrier, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pi =
@@ -177,9 +178,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
seq_printf(s, "%llu\t", req->r_tid);
dump_target(s, &req->r_t);
- seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
- le32_to_cpu(req->r_replay_version.epoch),
- le64_to_cpu(req->r_replay_version.version));
+ seq_printf(s, "\t%d", req->r_attempts);
for (i = 0; i < req->r_num_ops; i++) {
struct ceph_osd_req_op *op = &req->r_ops[i];
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 242d7c0..924f07c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_abort_on_full = true;
req->r_flags = flags;
req->r_base_oloc.pool = layout->pool_id;
req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
@@ -1005,7 +1006,7 @@ static bool osd_registered(struct ceph_osd *osd)
*/
static void osd_init(struct ceph_osd *osd)
{
- atomic_set(&osd->o_ref, 1);
+ refcount_set(&osd->o_ref, 1);
RB_CLEAR_NODE(&osd->o_node);
osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT;
@@ -1050,9 +1051,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
static struct ceph_osd *get_osd(struct ceph_osd *osd)
{
- if (atomic_inc_not_zero(&osd->o_ref)) {
- dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
- atomic_read(&osd->o_ref));
+ if (refcount_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
+ refcount_read(&osd->o_ref));
return osd;
} else {
dout("get_osd %p FAIL\n", osd);
@@ -1062,9 +1063,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd)
static void put_osd(struct ceph_osd *osd)
{
- dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
- atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref)) {
+ dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
+ refcount_read(&osd->o_ref) - 1);
+ if (refcount_dec_and_test(&osd->o_ref)) {
osd_cleanup(osd);
kfree(osd);
}
@@ -1297,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
__pool_full(pi);
WARN_ON(pi->id != t->base_oloc.pool);
- return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
+ ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
+ (osdc->osdmap->epoch < osdc->epoch_barrier);
}
enum calc_target_result {
@@ -1503,9 +1505,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
ceph_encode_32(&p, req->r_flags);
ceph_encode_timespec(p, &req->r_mtime);
p += sizeof(struct ceph_timespec);
- /* aka reassert_version */
- memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
- p += sizeof(req->r_replay_version);
+
+ /* reassert_version */
+ memset(p, 0, sizeof(struct ceph_eversion));
+ p += sizeof(struct ceph_eversion);
/* oloc */
ceph_start_encoding(&p, 5, 4,
@@ -1626,6 +1629,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc)
ceph_monc_renew_subs(&osdc->client->monc);
}
+static void complete_request(struct ceph_osd_request *req, int err);
static void send_map_check(struct ceph_osd_request *req);
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1635,6 +1639,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
enum calc_target_result ct_res;
bool need_send = false;
bool promoted = false;
+ bool need_abort = false;
WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
@@ -1650,8 +1655,13 @@ again:
goto promote;
}
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ if (osdc->osdmap->epoch < osdc->epoch_barrier) {
+ dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
+ osdc->epoch_barrier);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
dout("req %p pausewr\n", req);
req->r_t.paused = true;
maybe_request_map(osdc);
@@ -1669,6 +1679,8 @@ again:
pr_warn_ratelimited("FULL or reached pool quota\n");
req->r_t.paused = true;
maybe_request_map(osdc);
+ if (req->r_abort_on_full)
+ need_abort = true;
} else if (!osd_homeless(osd)) {
need_send = true;
} else {
@@ -1685,6 +1697,8 @@ again:
link_request(osd, req);
if (need_send)
send_request(req);
+ else if (need_abort)
+ complete_request(req, -ENOSPC);
mutex_unlock(&osd->lock);
if (ct_res == CALC_TARGET_POOL_DNE)
@@ -1799,6 +1813,97 @@ static void abort_request(struct ceph_osd_request *req, int err)
complete_request(req, err);
}
+static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ if (likely(eb > osdc->epoch_barrier)) {
+ dout("updating epoch_barrier from %u to %u\n",
+ osdc->epoch_barrier, eb);
+ osdc->epoch_barrier = eb;
+ /* Request map if we're not to the barrier yet */
+ if (eb > osdc->osdmap->epoch)
+ maybe_request_map(osdc);
+ }
+}
+
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ down_read(&osdc->lock);
+ if (unlikely(eb > osdc->epoch_barrier)) {
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ update_epoch_barrier(osdc, eb);
+ up_write(&osdc->lock);
+ } else {
+ up_read(&osdc->lock);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
+
+/*
+ * Drop all pending requests that are stalled waiting on a full condition to
+ * clear, and complete them with ENOSPC as the return code. Set the
+ * osdc->epoch_barrier to the latest map epoch that we've seen if any were
+ * cancelled.
+ */
+static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+ bool victims = false;
+
+ dout("enter abort_on_full\n");
+
+ if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc))
+ goto out;
+
+ /* Scan list and see if there is anything to abort */
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full) {
+ victims = true;
+ break;
+ }
+ }
+ if (victims)
+ break;
+ }
+
+ if (!victims)
+ goto out;
+
+ /*
+ * Update the barrier to current epoch if it's behind that point,
+ * since we know we have some calls to be aborted in the tree.
+ */
+ update_epoch_barrier(osdc, osdc->osdmap->epoch);
+
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.target_oloc.pool)))
+ abort_request(req, -ENOSPC);
+ }
+ }
+out:
+ dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier);
+}
+
static void check_pool_dne(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -3252,11 +3357,13 @@ done:
pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
have_pool_full(osdc);
- if (was_pauserd || was_pausewr || pauserd || pausewr)
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ osdc->osdmap->epoch < osdc->epoch_barrier)
maybe_request_map(osdc);
kick_requests(osdc, &need_resend, &need_resend_linger);
+ ceph_osdc_abort_on_full(osdc);
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
up_write(&osdc->lock);
@@ -4126,7 +4233,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
close_osd(osd);
}
up_write(&osdc->lock);
- WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
osd_cleanup(&osdc->homeless_osd);
WARN_ON(!list_empty(&osdc->osd_lru));
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 6864007..ce09f73 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
void ceph_pagelist_release(struct ceph_pagelist *pl)
{
- if (!atomic_dec_and_test(&pl->refcnt))
+ if (!refcount_dec_and_test(&pl->refcnt))
return;
ceph_pagelist_unmap_tail(pl);
while (!list_empty(&pl->head)) {
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 705414e..e14a5d0 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
if (!snapc)
return NULL;
- atomic_set(&snapc->nref, 1);
+ refcount_set(&snapc->nref, 1);
snapc->num_snaps = snap_count;
return snapc;
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context);
struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
{
if (sc)
- atomic_inc(&sc->nref);
+ refcount_inc(&sc->nref);
return sc;
}
EXPORT_SYMBOL(ceph_get_snap_context);
@@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc)
{
if (!sc)
return;
- if (atomic_dec_and_test(&sc->nref)) {
+ if (refcount_dec_and_test(&sc->nref)) {
/*printk(" deleting snap_context %p\n", sc);*/
kfree(sc);
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b99168b..f75482b 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e..840f14a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp6_request_sock_ops,
.twsk_prot = &dccp6_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a51582..5ab2aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,7 +2395,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497..7a8237a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1917,7 +1917,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cb4fff7..8364fe5 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,7 +142,7 @@ static struct proto llc_proto = {
.name = "LLC",
.owner = THIS_MODULE,
.obj_size = sizeof(struct llc_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
/**
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 8bc5a1b..9b02c13 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_estab_match(sap, daddr, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_listener_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 5404d0d..63b6ab0 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_dgram_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3c8f1ed..e847dba 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net,
continue;
/* kill only if still in same netns -- might have moved due to
- * SLAB_DESTROY_BY_RCU rules.
+ * SLAB_TYPESAFE_BY_RCU rules.
*
* We steal the timer reference. If that fails timer has
* already fired or someone else deleted it. Just drop ref
@@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net,
/*
* Do not use kmem_cache_zalloc(), as this cache uses
- * SLAB_DESTROY_BY_RCU.
+ * SLAB_TYPESAFE_BY_RCU.
*/
ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
@@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct)
struct net *net = nf_ct_net(ct);
/* A freed object has refcnt == 0, that's
- * the golden rule for SLAB_DESTROY_BY_RCU
+ * the golden rule for SLAB_TYPESAFE_BY_RCU
*/
NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
@@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void)
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
NFCT_INFOMASK + 1,
- SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
if (!nf_conntrack_cachep)
goto err_cachep;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b6ee21..6793d73 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -101,7 +101,7 @@ struct proto smc_proto = {
.unhash = smc_unhash_sk,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 04ce2c0..ac09ca8 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
default SUNRPC && INFINIBAND
+ select SG_POOL
help
This option allows the NFS client and server to use RDMA
transports (InfiniBand, iWARP, or RoCE).
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 52da3ce..b5cb921 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
struct rpc_task *task;
task = rpc_new_task(task_setup_data);
- if (IS_ERR(task))
- goto out;
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
atomic_inc(&task->tk_count);
rpc_execute(task);
-out:
return task;
}
EXPORT_SYMBOL_GPL(rpc_run_task);
@@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
* Create an rpc_task to send the data
*/
task = rpc_new_task(&task_setup_data);
- if (IS_ERR(task)) {
- xprt_free_bc_request(req);
- goto out;
- }
task->tk_rqstp = req;
/*
@@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
WARN_ON_ONCE(atomic_read(&task->tk_count) != 2);
rpc_execute(task);
-out:
dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
return task;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 5db68b3..0cc8383 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
if (task == NULL) {
task = rpc_alloc_task();
- if (task == NULL) {
- rpc_release_calldata(setup_data->callback_ops,
- setup_data->callback_data);
- return ERR_PTR(-ENOMEM);
- }
flags = RPC_TASK_DYNAMIC;
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a08aeb5..bc0f5a0 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -702,59 +702,32 @@ found_pool:
return task;
}
-/*
- * Create or destroy enough new threads to make the number
- * of threads the given number. If `pool' is non-NULL, applies
- * only to threads in that pool, otherwise round-robins between
- * all pools. Caller must ensure that mutual exclusion between this and
- * server startup or shutdown.
- *
- * Destroying threads relies on the service threads filling in
- * rqstp->rq_task, which only the nfs ones do. Assumes the serv
- * has been created using svc_create_pooled().
- *
- * Based on code that used to be in nfsd_svc() but tweaked
- * to be pool-aware.
- */
-int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+/* create new threads */
+static int
+svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct svc_rqst *rqstp;
struct task_struct *task;
struct svc_pool *chosen_pool;
- int error = 0;
unsigned int state = serv->sv_nrthreads-1;
int node;
- if (pool == NULL) {
- /* The -1 assumes caller has done a svc_get() */
- nrservs -= (serv->sv_nrthreads-1);
- } else {
- spin_lock_bh(&pool->sp_lock);
- nrservs -= pool->sp_nrthreads;
- spin_unlock_bh(&pool->sp_lock);
- }
-
- /* create new threads */
- while (nrservs > 0) {
+ do {
nrservs--;
chosen_pool = choose_pool(serv, pool, &state);
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (IS_ERR(rqstp)) {
- error = PTR_ERR(rqstp);
- break;
- }
+ if (IS_ERR(rqstp))
+ return PTR_ERR(rqstp);
__module_get(serv->sv_ops->svo_module);
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
- error = PTR_ERR(task);
module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
- break;
+ return PTR_ERR(task);
}
rqstp->rq_task = task;
@@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
svc_sock_update_bufs(serv);
wake_up_process(task);
- }
+ } while (nrservs > 0);
+
+ return 0;
+}
+
+
+/* destroy old threads */
+static int
+svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
/* destroy old threads */
- while (nrservs < 0 &&
- (task = choose_victim(serv, pool, &state)) != NULL) {
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
send_sig(SIGINT, task, 1);
nrservs++;
+ } while (nrservs < 0);
+
+ return 0;
+}
+
+/*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number. If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools. Caller must ensure that mutual exclusion between this and
+ * server startup or shutdown.
+ *
+ * Destroying threads relies on the service threads filling in
+ * rqstp->rq_task, which only the nfs ones do. Assumes the serv
+ * has been created using svc_create_pooled().
+ *
+ * Based on code that used to be in nfsd_svc() but tweaked
+ * to be pool-aware.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
}
- return error;
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_signal_kthreads(serv, pool, nrservs);
+ return 0;
}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
+/* destroy old threads */
+static int
+svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
+ /* destroy old threads */
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
+ kthread_stop(task);
+ nrservs++;
+ } while (nrservs < 0);
+ return 0;
+}
+
+int
+svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
+ }
+
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_stop_kthreads(serv, pool, nrservs);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
+
/*
* Called from a server thread as it's exiting. Caller must hold the "service
* mutex" for the service.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1f70821..e34f4ee 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
EXPORT_SYMBOL_GPL(xdr_init_decode);
/**
- * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer from which to decode data
* @pages: list of pages to decode into
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index b530a28..3e63c5e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
+EXPORT_SYMBOL_GPL(xprt_force_disconnect);
/**
* xprt_conditional_disconnect - force a transport to disconnect
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index ef19fa4..c1ae814 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
- module.o
+ svc_rdma_rw.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a044be2..694e9b1 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
}
sge->length = len;
- ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
sge->length, DMA_TO_DEVICE);
req->rl_send_wr.num_sge++;
return true;
@@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
sge[sge_no].addr = rdmab_addr(rb);
sge[sge_no].length = xdr->head[0].iov_len;
sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
sge[sge_no].length, DMA_TO_DEVICE);
/* If there is a Read chunk, the page list is being handled
@@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
return 0;
out_err:
- pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
- PTR_ERR(iptr));
- r_xprt->rx_stats.failed_marshal_count++;
+ if (PTR_ERR(iptr) != -ENOBUFS) {
+ pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+ PTR_ERR(iptr));
+ r_xprt->rx_stats.failed_marshal_count++;
+ }
return PTR_ERR(iptr);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c846ca9..a4a8f69 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
-unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
-static unsigned int min_max_inline = 4096;
-static unsigned int max_max_inline = 65536;
+unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
@@ -247,8 +247,6 @@ int svc_rdma_init(void)
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %u\n",
- svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index ff1df40..c676ed0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -12,7 +12,17 @@
#undef SVCRDMA_BACKCHANNEL_DEBUG
-int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+/**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel reply
+ * @xprt: controlling backchannel transport
+ * @rdma_resp: pointer to incoming transport header
+ * @rcvbuf: XDR buffer into which to decode the reply
+ *
+ * Returns:
+ * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
+ * %-EAGAIN if server should call ->recvfrom again.
+ */
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
struct xdr_buf *rcvbuf)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
p = (__be32 *)src->iov_base;
len = src->iov_len;
- xid = rmsgp->rm_xid;
+ xid = *rdma_resp;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: xid=%08x, length=%zu\n",
__func__, be32_to_cpu(xid), len);
pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
pr_info("%s: RPC: %*ph\n",
__func__, (int)len, p);
#endif
@@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
goto out_unlock;
memcpy(dst->iov_base, p, len);
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@@ -90,9 +100,9 @@ out_notfound:
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
*
- * This is similar to svc_rdma_reply, but takes an rpc_rqst
- * instead, does not support chunks, and avoids blocking memory
- * allocation.
+ * This is similar to svc_rdma_send_reply_msg, but takes a struct
+ * rpc_rqst instead, does not support chunks, and avoids blocking
+ * memory allocation.
*
* XXX: There is still an opportunity to block in svc_rdma_send()
* if there are no SQ entries to post the Send. This may occur if
@@ -101,59 +111,36 @@ out_notfound:
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst)
{
- struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
struct svc_rdma_op_ctxt *ctxt;
- struct svc_rdma_req_map *vec;
- struct ib_send_wr send_wr;
int ret;
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
- if (ret)
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* rpcrdma_bc_send_request builds the transport header and
+ * the backchannel RPC message in the same buffer. Thus only
+ * one SGE is needed to send both.
+ */
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
+ rqst->rq_snd_buf.len);
+ if (ret < 0)
goto out_err;
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
if (ret)
goto out_err;
- ctxt = svc_rdma_get_context(rdma);
- ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
- ctxt->count = 1;
-
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = sndbuf->len;
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
- sndbuf->len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
- ret = -EIO;
- goto out_unmap;
- }
- svc_rdma_count_mappings(rdma, ctxt);
-
- memset(&send_wr, 0, sizeof(send_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = 1;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- ret = svc_rdma_send(rdma, &send_wr);
- if (ret) {
- ret = -EIO;
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
+ if (ret)
goto out_unmap;
- }
out_err:
- svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: %s returns %d\n", __func__, ret);
return ret;
out_unmap:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
+ ret = -EIO;
goto out_err;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 1c4aabf..bdcf7d8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -166,92 +166,3 @@ out_inval:
dprintk("svcrdma: failed to parse transport header\n");
return -EINVAL;
}
-
-int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, __be32 *va)
-{
- __be32 *startp = va;
-
- *va++ = rmsgp->rm_xid;
- *va++ = rmsgp->rm_vers;
- *va++ = xprt->sc_fc_credits;
- *va++ = rdma_error;
- *va++ = cpu_to_be32(err);
- if (err == ERR_VERS) {
- *va++ = rpcrdma_version;
- *va++ = rpcrdma_version;
- }
-
- return (int)((unsigned long)va - (unsigned long)startp);
-}
-
-/**
- * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
- * @rdma_resp: buffer containing Reply transport header
- *
- * Returns length of transport header, in bytes.
- */
-unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
-{
- unsigned int nsegs;
- __be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
-}
-
-void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
-{
- struct rpcrdma_write_array *ary;
-
- /* no read-list */
- rmsgp->rm_body.rm_chunks[0] = xdr_zero;
-
- /* write-array discrim */
- ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-
- /* write-list terminator */
- ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
-
- /* reply-array discriminator */
- ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
-}
-
-void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
- int chunks)
-{
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-}
-
-void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
- int chunk_no,
- __be32 rs_handle,
- __be64 rs_offset,
- u32 write_len)
-{
- struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
- seg->rs_handle = rs_handle;
- seg->rs_offset = rs_offset;
- seg->rs_length = cpu_to_be32(write_len);
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f7b2daf..27a99bf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.buflen = head->arg.buflen;
}
+static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
+ __be32 *rdma_argp, int status)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p, *err_msgp;
+ unsigned int length;
+ struct page *page;
+ int ret;
+
+ ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
+ if (ret)
+ return;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return;
+ err_msgp = page_address(page);
+
+ p = err_msgp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = xprt->sc_fc_credits;
+ *p++ = rdma_error;
+ if (status == -EPROTONOSUPPORT) {
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p++ = rpcrdma_version;
+ } else {
+ *p++ = err_chunk;
+ }
+ length = (unsigned long)p - (unsigned long)err_msgp;
+
+ /* Map transport header; no RPC message payload */
+ ctxt = svc_rdma_get_context(xprt);
+ ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
+ if (ret) {
+ dprintk("svcrdma: Error %d mapping send for protocol error\n",
+ ret);
+ return;
+ }
+
+ ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
+}
+
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool
-svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
+ __be32 *rdma_resp)
{
- __be32 *p = (__be32 *)rmsgp;
+ __be32 *p;
if (!xprt->xpt_bc_xprt)
return false;
- if (rmsgp->rm_type != rdma_msg)
+ p = rdma_resp + 3;
+ if (*p++ != rdma_msg)
return false;
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != rmsgp->rm_xid)
+ /* XID sanity */
+ if (*p++ != *rdma_resp)
return false;
/* call direction */
- if (p[8] == cpu_to_be32(RPC_CALL))
+ if (*p == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
- ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
+ &rmsgp->rm_xid,
&rqstp->rq_arg);
svc_rdma_put_context(ctxt, 0);
if (ret)
@@ -686,7 +739,7 @@ complete:
return ret;
out_err:
- svc_rdma_send_error(rdma_xprt, rmsgp, ret);
+ svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
svc_rdma_put_context(ctxt, 0);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 0000000..0cf6202
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
+ */
+
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/debug.h>
+
+#include <rdma/rw.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* Each R/W context contains state for one chain of RDMA Read or
+ * Write Work Requests.
+ *
+ * Each WR chain handles a single contiguous server-side buffer,
+ * because scatterlist entries after the first have to start on
+ * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ *
+ * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
+ * from a client may contain a unique R_key, so each WR chain moves
+ * up to one segment at a time.
+ *
+ * The scatterlist makes this data structure over 4KB in size. To
+ * make it less likely to fail, and to handle the allocation for
+ * smaller I/O requests without disabling bottom-halves, these
+ * contexts are created on demand, but cached and reused until the
+ * controlling svcxprt_rdma is destroyed.
+ */
+struct svc_rdma_rw_ctxt {
+ struct list_head rw_list;
+ struct rdma_rw_ctx rw_ctx;
+ int rw_nents;
+ struct sg_table rw_sg_table;
+ struct scatterlist rw_first_sgl[0];
+};
+
+static inline struct svc_rdma_rw_ctxt *
+svc_rdma_next_ctxt(struct list_head *list)
+{
+ return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
+ rw_list);
+}
+
+static struct svc_rdma_rw_ctxt *
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+
+ ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
+ if (ctxt) {
+ list_del(&ctxt->rw_list);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ } else {
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ ctxt = kmalloc(sizeof(*ctxt) +
+ SG_CHUNK_SIZE * sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!ctxt)
+ goto out;
+ INIT_LIST_HEAD(&ctxt->rw_list);
+ }
+
+ ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
+ if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
+ ctxt->rw_sg_table.sgl)) {
+ kfree(ctxt);
+ ctxt = NULL;
+ }
+out:
+ return ctxt;
+}
+
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ sg_free_table_chained(&ctxt->rw_sg_table, true);
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+ list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+}
+
+/**
+ * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
+ * @rdma: transport about to be destroyed
+ *
+ */
+void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+ kfree(ctxt);
+ }
+}
+
+/* A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ *
+ * These are small, acquired with a single allocator call, and
+ * no more than one is needed per chunk. They are allocated on
+ * demand, and not cached.
+ */
+struct svc_rdma_chunk_ctxt {
+ struct ib_cqe cc_cqe;
+ struct svcxprt_rdma *cc_rdma;
+ struct list_head cc_rwctxts;
+ int cc_sqecount;
+ enum dma_data_direction cc_dir;
+};
+
+static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_chunk_ctxt *cc,
+ enum dma_data_direction dir)
+{
+ cc->cc_rdma = rdma;
+ svc_xprt_get(&rdma->sc_xprt);
+
+ INIT_LIST_HEAD(&cc->cc_rwctxts);
+ cc->cc_sqecount = 0;
+ cc->cc_dir = dir;
+}
+
+static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+
+ rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, cc->cc_dir);
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ }
+ svc_xprt_put(&rdma->sc_xprt);
+}
+
+/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments
+ * - Stores arguments for the SGL constructor functions
+ */
+struct svc_rdma_write_info {
+ /* write state of this chunk */
+ unsigned int wi_seg_off;
+ unsigned int wi_seg_no;
+ unsigned int wi_nsegs;
+ __be32 *wi_segs;
+
+ /* SGL constructor arguments */
+ struct xdr_buf *wi_xdr;
+ unsigned char *wi_base;
+ unsigned int wi_next_off;
+
+ struct svc_rdma_chunk_ctxt wi_cc;
+};
+
+static struct svc_rdma_write_info *
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+{
+ struct svc_rdma_write_info *info;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return info;
+
+ info->wi_seg_off = 0;
+ info->wi_seg_no = 0;
+ info->wi_nsegs = be32_to_cpup(++chunk);
+ info->wi_segs = ++chunk;
+ svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
+ return info;
+}
+
+static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
+{
+ svc_rdma_cc_release(&info->wi_cc);
+ kfree(info);
+}
+
+/**
+ * svc_rdma_write_done - Write chunk completion
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ * Pages under I/O are freed by a subsequent Send completion.
+ */
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_chunk_ctxt *cc =
+ container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_write_info *info =
+ container_of(cc, struct svc_rdma_write_info, wi_cc);
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ }
+
+ svc_rdma_write_info_free(info);
+}
+
+/* This function sleeps when the transport's Send Queue is congested.
+ *
+ * Assumptions:
+ * - If ib_post_send() succeeds, only one completion is expected,
+ * even if one or more WRs are flushed. This is true when posting
+ * an rdma_rw_ctx or when posting a single signaled WR.
+ */
+static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+ struct ib_send_wr *first_wr, *bad_wr;
+ struct list_head *tmp;
+ struct ib_cqe *cqe;
+ int ret;
+
+ first_wr = NULL;
+ cqe = &cc->cc_cqe;
+ list_for_each(tmp, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+
+ do {
+ if (atomic_sub_return(cc->cc_sqecount,
+ &rdma->sc_sq_avail) > 0) {
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ break;
+ return 0;
+ }
+
+ atomic_inc(&rdma_stat_sq_starve);
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wait_event(rdma->sc_send_wait,
+ atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
+ } while (1);
+
+ pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+ /* If even one was posted, there will be a completion. */
+ if (bad_wr != first_wr)
+ return 0;
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+ return -ENOTCONN;
+}
+
+/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+ */
+static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ struct scatterlist *sg = ctxt->rw_sg_table.sgl;
+
+ sg_set_buf(&sg[0], info->wi_base, len);
+ info->wi_base += len;
+
+ ctxt->rw_nents = 1;
+}
+
+/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+ */
+static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ unsigned int sge_no, sge_bytes, page_off, page_no;
+ struct xdr_buf *xdr = info->wi_xdr;
+ struct scatterlist *sg;
+ struct page **page;
+
+ page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
+ page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
+ page = xdr->pages + page_no;
+ info->wi_next_off += remaining;
+ sg = ctxt->rw_sg_table.sgl;
+ sge_no = 0;
+ do {
+ sge_bytes = min_t(unsigned int, remaining,
+ PAGE_SIZE - page_off);
+ sg_set_page(sg, *page, sge_bytes, page_off);
+
+ remaining -= sge_bytes;
+ sg = sg_next(sg);
+ page_off = 0;
+ sge_no++;
+ page++;
+ } while (remaining);
+
+ ctxt->rw_nents = sge_no;
+}
+
+/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
+ * an RPC Reply.
+ */
+static int
+svc_rdma_build_writes(struct svc_rdma_write_info *info,
+ void (*constructor)(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt),
+ unsigned int remaining)
+{
+ struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+ __be32 *seg;
+ int ret;
+
+ cc->cc_cqe.done = svc_rdma_write_done;
+ seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
+ do {
+ unsigned int write_len;
+ u32 seg_length, seg_handle;
+ u64 seg_offset;
+
+ if (info->wi_seg_no >= info->wi_nsegs)
+ goto out_overflow;
+
+ seg_handle = be32_to_cpup(seg);
+ seg_length = be32_to_cpup(seg + 1);
+ xdr_decode_hyper(seg + 2, &seg_offset);
+ seg_offset += info->wi_seg_off;
+
+ write_len = min(remaining, seg_length - info->wi_seg_off);
+ ctxt = svc_rdma_get_rw_ctxt(rdma,
+ (write_len >> PAGE_SHIFT) + 2);
+ if (!ctxt)
+ goto out_noctx;
+
+ constructor(info, write_len, ctxt);
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, 0, seg_offset,
+ seg_handle, DMA_TO_DEVICE);
+ if (ret < 0)
+ goto out_initerr;
+
+ list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+ cc->cc_sqecount += ret;
+ if (write_len == seg_length - info->wi_seg_off) {
+ seg += 4;
+ info->wi_seg_no++;
+ info->wi_seg_off = 0;
+ } else {
+ info->wi_seg_off += write_len;
+ }
+ remaining -= write_len;
+ } while (remaining);
+
+ return 0;
+
+out_overflow:
+ dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
+ info->wi_nsegs);
+ return -E2BIG;
+
+out_noctx:
+ dprintk("svcrdma: no R/W ctxs available\n");
+ return -ENOMEM;
+
+out_initerr:
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
+ return -EIO;
+}
+
+/* Send one of an xdr_buf's kvecs by itself. To send a Reply
+ * chunk, the whole RPC Reply is written back to the client.
+ * This function writes either the head or tail of the xdr_buf
+ * containing the Reply.
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+ struct kvec *vec)
+{
+ info->wi_base = vec->iov_base;
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is
+ * just the page list. a Reply chunk is the head, page list,
+ * and tail. This function is shared between the two types
+ * of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+ struct xdr_buf *xdr)
+{
+ info->wi_xdr = xdr;
+ info->wi_next_off = 0;
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ xdr->page_len);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Write chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+ struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int ret;
+
+ if (!xdr->page_len)
+ return 0;
+
+ info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return xdr->page_len;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rp_ch: Reply chunk provided by client
+ * @writelist: true if client provided a Write list
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Reply chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
+ bool writelist, struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int consumed, ret;
+
+ info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed = xdr->head[0].iov_len;
+
+ /* Send the page list in the Reply chunk only if the
+ * client did not provide Write chunks.
+ */
+ if (!writelist && xdr->page_len) {
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->page_len;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->tail[0].iov_len;
+ }
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return consumed;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b..1736337 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
@@ -40,6 +41,63 @@
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
+ * The ownership of all of the Reply's pages are transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ * successfully, or get flushed. Either way, the Send completion
+ * handler releases the Reply's pages.
+ * - If the Send WR cannot be not posted, the forward path releases
+ * the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
return (len & 3) ? (4 - (len & 3)) : 0;
}
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec,
- bool write_chunk_present)
+/* Returns length of transport header, in bytes.
+ */
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
- int sge_no;
- u32 sge_bytes;
- u32 page_bytes;
- u32 page_off;
- int page_no;
-
- if (xdr->len !=
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
- return -EIO;
- }
+ unsigned int nsegs;
+ __be32 *p;
- /* Skip the first sge, this is for the RPCRDMA header */
- sge_no = 1;
+ p = rdma_resp;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
- /* Head SGE */
- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
- sge_no++;
-
- /* pages SGE */
- page_no = 0;
- page_bytes = xdr->page_len;
- page_off = xdr->page_base;
- while (page_bytes) {
- vec->sge[sge_no].iov_base =
- page_address(xdr->pages[page_no]) + page_off;
- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
- page_bytes -= sge_bytes;
- vec->sge[sge_no].iov_len = sge_bytes;
-
- sge_no++;
- page_no++;
- page_off = 0; /* reset for next time through loop */
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
}
- /* Tail SGE */
- if (xdr->tail[0].iov_len) {
- unsigned char *base = xdr->tail[0].iov_base;
- size_t len = xdr->tail[0].iov_len;
- u32 xdr_pad = xdr_padsize(xdr->page_len);
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
- if (write_chunk_present && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
+ return (unsigned long)p - (unsigned long)rdma_resp;
+}
- if (len) {
- vec->sge[sge_no].iov_base = base;
- vec->sge[sge_no].iov_len = len;
- sge_no++;
+/* One Write chunk is copied from Call transport header to Reply
+ * transport header. Each segment's length field is updated to
+ * reflect number of bytes consumed in the segment.
+ *
+ * Returns number of segments in this chunk.
+ */
+static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
+ unsigned int remaining)
+{
+ unsigned int i, nsegs;
+ u32 seg_len;
+
+ /* Write list discriminator */
+ *dst++ = *src++;
+
+ /* number of segments in this chunk */
+ nsegs = be32_to_cpup(src);
+ *dst++ = *src++;
+
+ for (i = nsegs; i; i--) {
+ /* segment's RDMA handle */
+ *dst++ = *src++;
+
+ /* bytes returned in this segment */
+ seg_len = be32_to_cpu(*src);
+ if (remaining >= seg_len) {
+ /* entire segment was consumed */
+ *dst = *src;
+ remaining -= seg_len;
+ } else {
+ /* segment only partly filled */
+ *dst = cpu_to_be32(remaining);
+ remaining = 0;
}
- }
+ dst++; src++;
- dprintk("svcrdma: %s: sge_no %d page_no %d "
- "page_base %u page_len %u head_len %zu tail_len %zu\n",
- __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
- xdr->head[0].iov_len, xdr->tail[0].iov_len);
+ /* segment's RDMA offset */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
- vec->count = sge_no;
- return 0;
+ return nsegs;
}
-static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- u32 xdr_off, size_t len, int dir)
+/* The client provided a Write list in the Call message. Fill in
+ * the segments in the first Write chunk in the Reply's transport
+ * header with the number of bytes consumed in each segment.
+ * Remaining chunks are returned unused.
+ *
+ * Assumptions:
+ * - Client has provided only one Write chunk
+ */
+static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
+ unsigned int consumed)
{
- struct page *page;
- dma_addr_t dma_addr;
- if (xdr_off < xdr->head[0].iov_len) {
- /* This offset is in the head */
- xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->head[0].iov_base);
- } else {
- xdr_off -= xdr->head[0].iov_len;
- if (xdr_off < xdr->page_len) {
- /* This offset is in the page list */
- xdr_off += xdr->page_base;
- page = xdr->pages[xdr_off >> PAGE_SHIFT];
- xdr_off &= ~PAGE_MASK;
- } else {
- /* This offset is in the tail */
- xdr_off -= xdr->page_len;
- xdr_off += (unsigned long)
- xdr->tail[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->tail[0].iov_base);
- }
+ unsigned int nsegs;
+ __be32 *p, *q;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ q = wr_ch;
+ while (*q != xdr_zero) {
+ nsegs = xdr_encode_write_chunk(p, q, consumed);
+ q += 2 + nsegs * rpcrdma_segment_maxsz;
+ p += 2 + nsegs * rpcrdma_segment_maxsz;
+ consumed = 0;
}
- dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
- min_t(size_t, PAGE_SIZE, len), dir);
- return dma_addr;
+
+ /* Terminate Write list */
+ *p++ = xdr_zero;
+
+ /* Reply chunk discriminator; may be replaced later */
+ *p = xdr_zero;
+}
+
+/* The client provided a Reply chunk in the Call message. Fill in
+ * the segments in the Reply chunk in the Reply message with the
+ * number of bytes consumed in each segment.
+ *
+ * Assumptions:
+ * - Reply can always fit in the provided Reply chunk
+ */
+static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
+ unsigned int consumed)
+{
+ __be32 *p;
+
+ /* Find the Reply chunk in the Reply's xprt header.
+ * RPC-over-RDMA V1 replies never have a Read list.
+ */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ /* Skip past Write list */
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+
+ xdr_encode_write_chunk(p, rp_ch, consumed);
}
/* Parse the RPC Call's transport header.
*/
-static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array **write,
- struct rpcrdma_write_array **reply)
+static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
+ __be32 **write, __be32 **reply)
{
__be32 *p;
- p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
+ p = rdma_argp + rpcrdma_fixed_maxsz;
/* Read list */
while (*p++ != xdr_zero)
@@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Write list */
if (*p != xdr_zero) {
- *write = (struct rpcrdma_write_array *)p;
+ *write = p;
while (*p++ != xdr_zero)
p += 1 + be32_to_cpu(*p) * 4;
} else {
@@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Reply chunk */
if (*p != xdr_zero)
- *reply = (struct rpcrdma_write_array *)p;
+ *reply = p;
else
*reply = NULL;
}
@@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
* Invalidate, and responder chooses one rkey to invalidate.
*
* Find a candidate rkey to invalidate when sending a reply. Picks the
- * first rkey it finds in the chunks lists.
+ * first R_key it finds in the chunk lists.
*
* Returns zero if RPC's chunk lists are empty.
*/
-static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_write_array *rp_ary)
+static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
- struct rpcrdma_read_chunk *rd_ary;
- struct rpcrdma_segment *arg_ch;
+ __be32 *p;
- rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
- if (rd_ary->rc_discrim != xdr_zero)
- return be32_to_cpu(rd_ary->rc_target.rs_handle);
+ p = rdma_argp + rpcrdma_fixed_maxsz;
+ if (*p != xdr_zero)
+ p += 2;
+ else if (wr_lst && be32_to_cpup(wr_lst + 1))
+ p = wr_lst + 2;
+ else if (rp_ch && be32_to_cpup(rp_ch + 1))
+ p = rp_ch + 2;
+ else
+ return 0;
+ return be32_to_cpup(p);
+}
- if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
- arg_ch = &wr_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+ * is used during completion to DMA-unmap this memory, and
+ * it uses ib_dma_unmap_page() exclusively.
+ */
+static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ unsigned char *base,
+ unsigned int len)
+{
+ unsigned long offset = (unsigned long)base & ~PAGE_MASK;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
- arg_ch = &rp_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+ dma_addr = ib_dma_map_page(dev, virt_to_page(base),
+ offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -EIO;
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
return 0;
}
-/* Assumptions:
- * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
- */
-static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
- u32 rmr, u64 to,
- u32 xdr_off, int write_len,
- struct svc_rdma_req_map *vec)
+static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ struct page *page,
+ unsigned int offset,
+ unsigned int len)
{
- struct ib_rdma_wr write_wr;
- struct ib_sge *sge;
- int xdr_sge_no;
- int sge_no;
- int sge_bytes;
- int sge_off;
- int bc;
- struct svc_rdma_op_ctxt *ctxt;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (vec->count > RPCSVC_MAXPAGES) {
- pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+ dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
return -EIO;
- }
- dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
- "write_len=%d, vec->sge=%p, vec->count=%lu\n",
- rmr, (unsigned long long)to, xdr_off,
- write_len, vec->sge, vec->count);
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
+ return 0;
+}
- ctxt = svc_rdma_get_context(xprt);
+/**
+ * svc_rdma_map_reply_hdr - DMA map the transport header buffer
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for the Send WR
+ * @rdma_resp: buffer containing transport header
+ * @len: length of transport header
+ *
+ * Returns:
+ * %0 if the header is DMA mapped,
+ * %-EIO if DMA mapping failed.
+ */
+int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ __be32 *rdma_resp,
+ unsigned int len)
+{
ctxt->direction = DMA_TO_DEVICE;
- sge = ctxt->sge;
-
- /* Find the SGE associated with xdr_off */
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
- xdr_sge_no++) {
- if (vec->sge[xdr_sge_no].iov_len > bc)
- break;
- bc -= vec->sge[xdr_sge_no].iov_len;
- }
-
- sge_off = bc;
- bc = write_len;
- sge_no = 0;
-
- /* Copy the remaining SGE */
- while (bc != 0) {
- sge_bytes = min_t(size_t,
- bc, vec->sge[xdr_sge_no].iov_len-sge_off);
- sge[sge_no].length = sge_bytes;
- sge[sge_no].addr =
- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- sge[sge_no].addr))
- goto err;
- svc_rdma_count_mappings(xprt, ctxt);
- sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->count++;
- sge_off = 0;
- sge_no++;
- xdr_sge_no++;
- if (xdr_sge_no > vec->count) {
- pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
- goto err;
- }
- bc -= sge_bytes;
- if (sge_no == xprt->sc_max_sge)
- break;
- }
-
- /* Prepare WRITE WR */
- memset(&write_wr, 0, sizeof write_wr);
- ctxt->cqe.done = svc_rdma_wc_write;
- write_wr.wr.wr_cqe = &ctxt->cqe;
- write_wr.wr.sg_list = &sge[0];
- write_wr.wr.num_sge = sge_no;
- write_wr.wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.wr.send_flags = IB_SEND_SIGNALED;
- write_wr.rkey = rmr;
- write_wr.remote_addr = to;
-
- /* Post It */
- atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr.wr))
- goto err;
- return write_len - bc;
- err:
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
- return -EIO;
+ ctxt->pages[0] = virt_to_page(rdma_resp);
+ ctxt->count = 1;
+ return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
}
-noinline
-static int send_write_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* Load the xdr_buf into the ctxt's sge array, and DMA map each
+ * element as it is added.
+ *
+ * Returns the number of sge elements loaded on success, or
+ * a negative errno on failure.
+ */
+static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct xdr_buf *xdr, __be32 *wr_lst)
{
- u32 xfer_len = rqstp->rq_res.page_len;
- int write_len;
- u32 xdr_off;
- int chunk_off;
- int chunk_no;
- int nchunks;
- struct rpcrdma_write_array *res_ary;
+ unsigned int len, sge_no, remaining, page_off;
+ struct page **ppages;
+ unsigned char *base;
+ u32 xdr_pad;
int ret;
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[1];
-
- /* Write chunks start at the pagelist */
- nchunks = be32_to_cpu(wr_ary->wc_nchunks);
- for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- struct rpcrdma_segment *arg_ch;
- u64 rs_offset;
-
- arg_ch = &wr_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
-
- /* Prepare the response chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- arg_ch->rs_handle,
- arg_ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(arg_ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
+ sge_no = 1;
+
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
+ xdr->head[0].iov_base,
+ xdr->head[0].iov_len);
+ if (ret < 0)
+ return ret;
+
+ /* If a Write chunk is present, the xdr_buf's page list
+ * is not included inline. However the Upper Layer may
+ * have added XDR padding in the tail buffer, and that
+ * should not be included inline.
+ */
+ if (wr_lst) {
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+ xdr_pad = xdr_padsize(xdr->page_len);
+
+ if (len && xdr_pad) {
+ base += xdr_pad;
+ len -= xdr_pad;
}
+
+ goto tail;
+ }
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_off = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - page_off, remaining);
+
+ ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
+ *ppages++, page_off, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ page_off = 0;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
- return rqstp->rq_res.page_len;
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+tail:
+ if (len) {
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
+ if (ret < 0)
+ return ret;
+ }
-out_err:
- pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
- return -EIO;
+ return sge_no - 1;
}
-noinline
-static int send_reply_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *rp_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt)
{
- u32 xfer_len = rqstp->rq_res.len;
- int write_len;
- u32 xdr_off;
- int chunk_no;
- int chunk_off;
- int nchunks;
- struct rpcrdma_segment *ch;
- struct rpcrdma_write_array *res_ary;
- int ret;
+ int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
- /* XXX: need to fix when reply lists occur with read-list and or
- * write-list */
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[2];
-
- /* xdr offset starts at RPC message */
- nchunks = be32_to_cpu(rp_ary->wc_nchunks);
- for (xdr_off = 0, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- u64 rs_offset;
- ch = &rp_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
-
- /* Prepare the reply chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- ch->rs_handle, ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
- }
+ ctxt->count += pages;
+ for (i = 0; i < pages; i++) {
+ ctxt->pages[i + 1] = rqstp->rq_respages[i];
+ rqstp->rq_respages[i] = NULL;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+}
- return rqstp->rq_res.len;
+/**
+ * svc_rdma_post_send_wr - Set up and post one Send Work Request
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for transmitting the Send WR
+ * @num_sge: number of SGEs to send
+ * @inv_rkey: R_key argument to Send With Invalidate, or zero
+ *
+ * Returns:
+ * %0 if the Send* was posted successfully,
+ * %-ENOTCONN if the connection was lost or dropped,
+ * %-EINVAL if there was a problem with the Send we built,
+ * %-ENOMEM if ib_post_send failed.
+ */
+int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt, int num_sge,
+ u32 inv_rkey)
+{
+ struct ib_send_wr *send_wr = &ctxt->send_wr;
-out_err:
- pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret);
- return -EIO;
+ dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);
+
+ send_wr->next = NULL;
+ ctxt->cqe.done = svc_rdma_wc_send;
+ send_wr->wr_cqe = &ctxt->cqe;
+ send_wr->sg_list = ctxt->sge;
+ send_wr->num_sge = num_sge;
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ if (inv_rkey) {
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
+ send_wr->ex.invalidate_rkey = inv_rkey;
+ } else {
+ send_wr->opcode = IB_WR_SEND;
+ }
+
+ return svc_rdma_send(rdma, send_wr);
}
-/* This function prepares the portion of the RPCRDMA message to be
- * sent in the RDMA_SEND. This function is called after data sent via
- * RDMA has already been transmitted. There are three cases:
- * - The RPCRDMA header, RPC header, and payload are all sent in a
- * single RDMA_SEND. This is the "inline" case.
- * - The RPCRDMA header and some portion of the RPC header and data
- * are sent via this RDMA_SEND and another portion of the data is
- * sent via RDMA.
- * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
- * header and data are all transmitted via RDMA.
- * In all three cases, this function prepares the RPCRDMA header in
- * sge[0], the 'type' parameter indicates the type to place in the
- * RPCRDMA header, and the 'byte_count' field indicates how much of
- * the XDR to include in this RDMA_SEND. NB: The offset of the payload
- * to send is zero in the XDR.
+/* Prepare the portion of the RPC Reply that will be transmitted
+ * via RDMA Send. The RPC-over-RDMA transport header is prepared
+ * in sge[0], and the RPC xdr_buf is prepared in following sges.
+ *
+ * Depending on whether a Write list or Reply chunk is present,
+ * the server may send all, a portion of, or none of the xdr_buf.
+ * In the latter case, only the transport header (sge[0]) is
+ * transmitted.
+ *
+ * RDMA Send is the last step of transmitting an RPC reply. Pages
+ * involved in the earlier RDMA Writes are here transferred out
+ * of the rqstp and into the ctxt's page array. These pages are
+ * DMA unmapped by each Write completion, but the subsequent Send
+ * completion finally releases these pages.
+ *
+ * Assumptions:
+ * - The Reply's transport header will never be larger than a page.
*/
-static int send_reply(struct svcxprt_rdma *rdma,
- struct svc_rqst *rqstp,
- struct page *page,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_req_map *vec,
- int byte_count,
- u32 inv_rkey)
+static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_argp, __be32 *rdma_resp,
+ struct svc_rqst *rqstp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
struct svc_rdma_op_ctxt *ctxt;
- struct ib_send_wr send_wr;
- u32 xdr_off;
- int sge_no;
- int sge_bytes;
- int page_no;
- int pages;
- int ret = -EIO;
-
- /* Prepare the context */
+ u32 inv_rkey;
+ int ret;
+
+ dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
+ (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"),
+ rqstp->rq_res.head[0].iov_len,
+ rqstp->rq_res.page_len,
+ rqstp->rq_res.tail[0].iov_len);
+
ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->pages[0] = page;
- ctxt->count = 1;
- /* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length =
- svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
- ctxt->sge[0].length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
+ svc_rdma_reply_hdr_len(rdma_resp));
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
-
- ctxt->direction = DMA_TO_DEVICE;
- /* Map the payload indicated by 'byte_count' */
- xdr_off = 0;
- for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
- sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
- byte_count -= sge_bytes;
- ctxt->sge[sge_no].addr =
- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
- ctxt->sge[sge_no].addr))
+ if (!rp_ch) {
+ ret = svc_rdma_map_reply_msg(rdma, ctxt,
+ &rqstp->rq_res, wr_lst);
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
- ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[sge_no].length = sge_bytes;
}
- if (byte_count != 0) {
- pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ inv_rkey = 0;
+ if (rdma->sc_snd_w_inv)
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
+ if (ret)
goto err;
- }
- /* Save all respages in the ctxt and remove them from the
- * respages array. They are our pages until the I/O
- * completes.
+ return 0;
+
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ return ret;
+}
+
+/* Given the client-provided Write and Reply chunks, the server was not
+ * able to form a complete reply. Return an RDMA_ERROR message so the
+ * client can retire this RPC transaction. As above, the Send completion
+ * routine releases payload pages that were part of a previous RDMA Write.
+ *
+ * Remote Invalidation is skipped for simplicity.
+ */
+static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_resp, struct svc_rqst *rqstp)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p;
+ int ret;
+
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* Replace the original transport header with an
+ * RDMA_ERROR response. XID etc are preserved.
*/
- pages = rqstp->rq_next_page - rqstp->rq_respages;
- for (page_no = 0; page_no < pages; page_no++) {
- ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
- ctxt->count++;
- rqstp->rq_respages[page_no] = NULL;
- }
- rqstp->rq_next_page = rqstp->rq_respages + 1;
+ p = rdma_resp + 3;
+ *p++ = rdma_error;
+ *p = err_chunk;
- if (sge_no > rdma->sc_max_sge) {
- pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
+ if (ret < 0)
goto err;
- }
- memset(&send_wr, 0, sizeof send_wr);
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = sge_no;
- if (inv_rkey) {
- send_wr.opcode = IB_WR_SEND_WITH_INV;
- send_wr.ex.invalidate_rkey = inv_rkey;
- } else
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
- ret = svc_rdma_send(rdma, &send_wr);
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
if (ret)
goto err;
return 0;
- err:
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
return ret;
@@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}
+/**
+ * svc_rdma_sendto - Transmit an RPC reply
+ * @rqstp: processed RPC request, reply XDR already in ::rq_res
+ *
+ * Any resources still associated with @rqstp are released upon return.
+ * If no reply message was possible, the connection is closed.
+ *
+ * Returns:
+ * %0 if an RPC reply has been successfully posted,
+ * %-ENOMEM if a resource shortage occurred (connection is lost),
+ * %-ENOTCONN if posting failed (connection is lost).
+ */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- struct rpcrdma_msg *rdma_argp;
- struct rpcrdma_msg *rdma_resp;
- struct rpcrdma_write_array *wr_ary, *rp_ary;
- int ret;
- int inline_bytes;
+ __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
+ struct xdr_buf *xdr = &rqstp->rq_res;
struct page *res_page;
- struct svc_rdma_req_map *vec;
- u32 inv_rkey;
- __be32 *p;
-
- dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+ int ret;
- /* Get the RDMA request header. The receive logic always
- * places this at the start of page 0.
+ /* Find the call's chunk lists to decide how to send the reply.
+ * Receive places the Call's xprt header at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
-
- inv_rkey = 0;
- if (rdma->sc_snd_w_inv)
- inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
- /* Build an req vec for the XDR */
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
- if (ret)
- goto err0;
- inline_bytes = rqstp->rq_res.len;
+ dprintk("svcrdma: preparing response for XID 0x%08x\n",
+ be32_to_cpup(rdma_argp));
/* Create the RDMA response header. xprt->xpt_mutex,
* acquired in svc_send(), serializes RPC replies. The
@@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
rdma_resp = page_address(res_page);
- p = &rdma_resp->rm_xid;
- *p++ = rdma_argp->rm_xid;
- *p++ = rdma_argp->rm_vers;
+ p = rdma_resp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+ *p++ = rp_ch ? rdma_nomsg : rdma_msg;
/* Start with empty chunks */
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- /* Send any write-chunk data and build resp write-list */
- if (wr_ary) {
- ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec);
+ if (wr_lst) {
+ /* XXX: Presume the client sent only one Write chunk */
+ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret + xdr_padsize(ret);
+ goto err2;
+ svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
}
-
- /* Send any reply-list data and update resp reply-list */
- if (rp_ary) {
- ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
+ if (rp_ch) {
+ ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret;
+ goto err2;
+ svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
}
- /* Post a fresh Receive buffer _before_ sending the reply */
ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
if (ret)
goto err1;
-
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
- inline_bytes, inv_rkey);
+ ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
+ wr_lst, rp_ch);
if (ret < 0)
goto err0;
+ return 0;
- svc_rdma_put_req_map(rdma, vec);
- dprintk("svcrdma: send_reply returns %d\n", ret);
- return ret;
+ err2:
+ if (ret != -E2BIG)
+ goto err1;
+
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+ ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
+ if (ret < 0)
+ goto err0;
+ return 0;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(rdma, vec);
pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
ret);
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
return -ENOTCONN;
}
-
-void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
- int status)
-{
- struct ib_send_wr err_wr;
- struct page *p;
- struct svc_rdma_op_ctxt *ctxt;
- enum rpcrdma_errcode err;
- __be32 *va;
- int length;
- int ret;
-
- ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
- if (ret)
- return;
-
- p = alloc_page(GFP_KERNEL);
- if (!p)
- return;
- va = page_address(p);
-
- /* XDR encode an error reply */
- err = ERR_CHUNK;
- if (status == -EPROTONOSUPPORT)
- err = ERR_VERS;
- length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
-
- ctxt = svc_rdma_get_context(xprt);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->count = 1;
- ctxt->pages[0] = p;
-
- /* Prepare SGE for local address */
- ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = length;
- ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
- p, 0, length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
- dprintk("svcrdma: Error mapping buffer for protocol error\n");
- svc_rdma_put_context(ctxt, 1);
- return;
- }
- svc_rdma_count_mappings(xprt, ctxt);
-
- /* Prepare SEND WR */
- memset(&err_wr, 0, sizeof(err_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- err_wr.wr_cqe = &ctxt->cqe;
- err_wr.sg_list = ctxt->sge;
- err_wr.num_sge = 1;
- err_wr.opcode = IB_WR_SEND;
- err_wr.send_flags = IB_SEND_SIGNALED;
-
- /* Post It */
- ret = svc_rdma_send(xprt, &err_wr);
- if (ret) {
- dprintk("svcrdma: Error %d posting send for protocol error\n",
- ret);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 1);
- }
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fc8f14c..a9d9cb1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
}
}
-static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
-{
- struct svc_rdma_req_map *map;
-
- map = kmalloc(sizeof(*map), flags);
- if (map)
- INIT_LIST_HEAD(&map->free);
- return map;
-}
-
-static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
-{
- unsigned int i;
-
- /* One for each receive buffer on this connection. */
- i = xprt->sc_max_requests;
-
- while (i--) {
- struct svc_rdma_req_map *map;
-
- map = alloc_req_map(GFP_KERNEL);
- if (!map) {
- dprintk("svcrdma: No memory for request map\n");
- return false;
- }
- list_add(&map->free, &xprt->sc_maps);
- }
- return true;
-}
-
-struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
-{
- struct svc_rdma_req_map *map = NULL;
-
- spin_lock(&xprt->sc_map_lock);
- if (list_empty(&xprt->sc_maps))
- goto out_empty;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del_init(&map->free);
- spin_unlock(&xprt->sc_map_lock);
-
-out:
- map->count = 0;
- return map;
-
-out_empty:
- spin_unlock(&xprt->sc_map_lock);
-
- /* Pre-allocation amount was incorrect */
- map = alloc_req_map(GFP_NOIO);
- if (map)
- goto out;
-
- WARN_ONCE(1, "svcrdma: empty request map list?\n");
- return NULL;
-}
-
-void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
- struct svc_rdma_req_map *map)
-{
- spin_lock(&xprt->sc_map_lock);
- list_add(&map->free, &xprt->sc_maps);
- spin_unlock(&xprt->sc_map_lock);
-}
-
-static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
-{
- while (!list_empty(&xprt->sc_maps)) {
- struct svc_rdma_req_map *map;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del(&map->free);
- kfree(map);
- }
-}
-
/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
@@ -474,24 +395,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
}
/**
- * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
- * @cq: completion queue
- * @wc: completed WR
- *
- */
-void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct svc_rdma_op_ctxt *ctxt;
-
- svc_rdma_send_wc_common_put(cq, wc, "write");
-
- ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
-}
-
-/**
* svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
* @cq: completion queue
* @wc: completed WR
@@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_maps);
+ INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
spin_lock_init(&cma_xprt->sc_ctxt_lock);
- spin_lock_init(&cma_xprt->sc_map_lock);
+ spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
* Note that this implies that the underlying transport support
@@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt, newxprt->sc_cm_id);
dev = newxprt->sc_cm_id->device;
+ newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
@@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ newxprt->sc_sq_depth = newxprt->sc_rq_depth;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
- if (!svc_rdma_prealloc_maps(newxprt))
- goto errout;
/*
* Limit ORD based on client limit, local device limit, and
@@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
+ qp_attr.port_num = newxprt->sc_cm_id->port_num;
+ qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
}
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_rw_ctxts(rdma);
svc_rdma_destroy_ctxts(rdma);
- svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c717f54..62ecbcc 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 0;
+unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+int xprt_rdma_pad_optimize;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap);
if (rc)
goto out1;
@@ -457,19 +457,33 @@ out1:
return ERR_PTR(rc);
}
-/*
- * Close a connection, during shutdown or timeout/reconnect
+/**
+ * xprt_rdma_close - Close down RDMA connection
+ * @xprt: generic transport to be closed
+ *
+ * Called during transport shutdown reconnect, or device
+ * removal. Caller holds the transport's write lock.
*/
static void
xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ dprintk("RPC: %s: closing xprt %p\n", __func__, xprt);
- dprintk("RPC: %s: closing\n", __func__);
- if (r_xprt->rx_ep.rep_connected > 0)
+ if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
+ xprt_clear_connected(xprt);
+ rpcrdma_ia_remove(ia);
+ return;
+ }
+ if (ep->rep_connected == -ENODEV)
+ return;
+ if (ep->rep_connected > 0)
xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_disconnect(ep, ia);
}
static void
@@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
dprintk("RPC: %s: %u\n", __func__, port);
}
+/**
+ * xprt_rdma_timer - invoked when an RPC times out
+ * @xprt: controlling RPC transport
+ * @task: RPC task that timed out
+ *
+ * Invoked when the transport is still connected, but an RPC
+ * retransmit timeout occurs.
+ *
+ * Since RDMA connections don't have a keep-alive, forcibly
+ * disconnect and retry to connect. This drives full
+ * detection of the network path, and retransmissions of
+ * all pending RPCs.
+ */
+static void
+xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt);
+
+ xprt_force_disconnect(xprt);
+}
+
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
@@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task)
* xprt_rdma_send_request - marshal and send an RPC request
* @task: RPC task with an RPC message in rq_snd_buf
*
+ * Caller holds the transport's write lock.
+ *
* Return values:
* 0: The request has been sent
* ENOTCONN: Caller needs to invoke connect logic then call again
@@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ if (!xprt_connected(xprt))
+ goto drop_connection;
+
/* On retransmit, remove any previously registered chunks */
if (unlikely(!list_empty(&req->rl_registered)))
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.alloc_slot = xprt_alloc_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
+ .timer = xprt_rdma_timer,
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
.set_port = xprt_rdma_set_port,
.connect = xprt_rdma_connect,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b332b3..3dbce9a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,7 +53,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
-#include <linux/module.h> /* try_module_get()/module_put() */
+
#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -69,8 +69,11 @@
/*
* internal functions
*/
+static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
+static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
-static struct workqueue_struct *rpcrdma_receive_wq;
+static struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
int
rpcrdma_alloc_wq(void)
@@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
- ib_dma_sync_single_for_cpu(rep->rr_device,
+ ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
__func__, ep);
complete(&ia->ri_done);
break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: removing device for %pIS:%u\n",
+ sap, rpc_get_port(sap));
+#endif
+ set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
+ ep->rep_connected = -ENODEV;
+ xprt_force_disconnect(&xprt->rx_xprt);
+ wait_for_completion(&ia->ri_remove_done);
+
+ ia->ri_id = NULL;
+ ia->ri_pd = NULL;
+ ia->ri_device = NULL;
+ /* Return 1 to ensure the core destroys the id. */
+ return 1;
case RDMA_CM_EVENT_ESTABLISHED:
connstate = 1;
ib_query_qp(ia->ri_id->qp, attr,
@@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
- goto connected;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- connstate = -ENODEV;
connected:
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
@@ -329,14 +344,6 @@ connected:
return 0;
}
-static void rpcrdma_destroy_id(struct rdma_cm_id *id)
-{
- if (id) {
- module_put(id->device->owner);
- rdma_destroy_id(id);
- }
-}
-
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
int rc;
init_completion(&ia->ri_done);
+ init_completion(&ia->ri_remove_done);
id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
IB_QPT_RC);
@@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
goto out;
}
- /* FIXME:
- * Until xprtrdma supports DEVICE_REMOVAL, the provider must
- * be pinned while there are active NFS/RDMA mounts to prevent
- * hangs and crashes at umount time.
- */
- if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
- dprintk("RPC: %s: Failed to get device module\n",
- __func__);
- ia->ri_async_rc = -ENODEV;
- }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
dprintk("RPC: %s: wait() exited: %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = ia->ri_async_rc;
if (rc)
- goto put;
+ goto out;
return id;
-put:
- module_put(id->device->owner);
+
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -413,13 +410,16 @@ out:
* Exported functions.
*/
-/*
- * Open and initialize an Interface Adapter.
- * o initializes fields of struct rpcrdma_ia, including
- * interface and provider attributes and protection zone.
+/**
+ * rpcrdma_ia_open - Open and initialize an Interface Adapter.
+ * @xprt: controlling transport
+ * @addr: IP address of remote peer
+ *
+ * Returns 0 on success, negative errno if an appropriate
+ * Interface Adapter could not be found and opened.
*/
int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
@@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
- goto out1;
+ goto out_err;
}
ia->ri_device = ia->ri_id->device;
@@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out2;
+ goto out_err;
}
- switch (memreg) {
+ switch (xprt_rdma_memreg_strategy) {
case RPCRDMA_FRMR:
if (frwr_is_supported(ia)) {
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
@@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*FALLTHROUGH*/
default:
- pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
- memreg);
+ pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
+ ia->ri_device->name, xprt_rdma_memreg_strategy);
rc = -EINVAL;
- goto out3;
+ goto out_err;
}
return 0;
-out3:
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-out2:
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
-out1:
+out_err:
+ rpcrdma_ia_close(ia);
return rc;
}
-/*
- * Clean up/close an IA.
- * o if event handles and PD have been initialized, free them.
- * o close the IA
+/**
+ * rpcrdma_ia_remove - Handle device driver unload
+ * @ia: interface adapter being removed
+ *
+ * Divest transport H/W resources associated with this adapter,
+ * but allow it to be restored later.
+ */
+void
+rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
+
+ /* This is similar to rpcrdma_ep_destroy, but:
+ * - Don't cancel the connect worker.
+ * - Don't call rpcrdma_ep_disconnect, which waits
+ * for another conn upcall, which will deadlock.
+ * - rdma_disconnect is unneeded, the underlying
+ * connection is already gone.
+ */
+ if (ia->ri_id->qp) {
+ ib_drain_qp(ia->ri_id->qp);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
+ }
+ ib_free_cq(ep->rep_attr.recv_cq);
+ ib_free_cq(ep->rep_attr.send_cq);
+
+ /* The ULP is responsible for ensuring all DMA
+ * mappings and MRs are gone.
+ */
+ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
+ rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
+ }
+ rpcrdma_destroy_mrs(buf);
+
+ /* Allow waiters to continue */
+ complete(&ia->ri_remove_done);
+}
+
+/**
+ * rpcrdma_ia_close - Clean up/close an IA.
+ * @ia: interface adapter to close
+ *
*/
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
@@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
+ rdma_destroy_id(ia->ri_id);
}
+ ia->ri_id = NULL;
+ ia->ri_device = NULL;
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
}
/*
@@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.send_cq);
}
+/* Re-establish a connection after a device removal event.
+ * Unlike a normal reconnection, a fresh PD and a new set
+ * of MRs and buffers is needed.
+ */
+static int
+rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ int rc, err;
+
+ pr_info("%s: r_xprt = %p\n", __func__, r_xprt);
+
+ rc = -EHOSTUNREACH;
+ if (rpcrdma_ia_open(r_xprt, sap))
+ goto out1;
+
+ rc = -ENOMEM;
+ err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
+ if (err) {
+ pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
+ goto out2;
+ }
+
+ rc = -ENETUNREACH;
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
+ goto out3;
+ }
+
+ rpcrdma_create_mrs(r_xprt);
+ return 0;
+
+out3:
+ rpcrdma_ep_destroy(ep, ia);
+out2:
+ rpcrdma_ia_close(ia);
+out1:
+ return rc;
+}
+
+static int
+rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
+ struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ struct rdma_cm_id *id, *old;
+ int err, rc;
+
+ dprintk("RPC: %s: reconnecting...\n", __func__);
+
+ rpcrdma_ep_disconnect(ep, ia);
+
+ rc = -EHOSTUNREACH;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
+ if (IS_ERR(id))
+ goto out;
+
+ /* As long as the new ID points to the same device as the
+ * old ID, we can reuse the transport's existing PD and all
+ * previously allocated MRs. Also, the same device means
+ * the transport's previous DMA mappings are still valid.
+ *
+ * This is a sanity check only. There should be no way these
+ * point to two different devices here.
+ */
+ old = id;
+ rc = -ENETUNREACH;
+ if (ia->ri_device != id->device) {
+ pr_err("rpcrdma: can't reconnect on different device!\n");
+ goto out_destroy;
+ }
+
+ err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ dprintk("RPC: %s: rdma_create_qp returned %d\n",
+ __func__, err);
+ goto out_destroy;
+ }
+
+ /* Atomically replace the transport's ID and QP. */
+ rc = 0;
+ old = ia->ri_id;
+ ia->ri_id = id;
+ rdma_destroy_qp(old);
+
+out_destroy:
+ rdma_destroy_id(old);
+out:
+ return rc;
+}
+
/*
* Connect unconnected endpoint.
*/
@@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- struct rdma_cm_id *id, *old;
- struct sockaddr *sap;
unsigned int extras;
- int rc = 0;
+ int rc;
- if (ep->rep_connected != 0) {
retry:
- dprintk("RPC: %s: reconnecting...\n", __func__);
-
- rpcrdma_ep_disconnect(ep, ia);
-
- sap = (struct sockaddr *)&r_xprt->rx_data.addr;
- id = rpcrdma_create_id(r_xprt, ia, sap);
- if (IS_ERR(id)) {
- rc = -EHOSTUNREACH;
- goto out;
- }
- /* TEMP TEMP TEMP - fail if new device:
- * Deregister/remarshal *all* requests!
- * Close and recreate adapter, pd, etc!
- * Re-determine all attributes still sane!
- * More stuff I haven't thought of!
- * Rrrgh!
- */
- if (ia->ri_device != id->device) {
- printk("RPC: %s: can't reconnect on "
- "different device!\n", __func__);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
- /* END TEMP */
- rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
-
- old = ia->ri_id;
- ia->ri_id = id;
-
- rdma_destroy_qp(old);
- rpcrdma_destroy_id(old);
- } else {
+ switch (ep->rep_connected) {
+ case 0:
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- /* do not update ep->rep_connected */
- return -ENETUNREACH;
+ rc = -ENETUNREACH;
+ goto out_noupdate;
}
+ break;
+ case -ENODEV:
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ if (rc)
+ goto out_noupdate;
+ break;
+ default:
+ rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ if (rc)
+ goto out;
}
ep->rep_connected = 0;
@@ -736,6 +845,8 @@ retry:
out:
if (rc)
ep->rep_connected = rc;
+
+out_noupdate:
return rc;
}
@@ -878,7 +989,6 @@ struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_rep *rep;
int rc;
@@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_device = ia->ri_device;
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
@@ -1037,6 +1146,7 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
cancel_delayed_work_sync(&buf->rb_recovery_worker);
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
out_nomws:
dprintk("RPC: %s: no MWs available\n", __func__);
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
+ if (r_xprt->rx_ep.rep_connected != -ENODEV)
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
/* Allow the reply handler and refresh worker to run */
cond_resched();
@@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
+ struct ib_device *device = ia->ri_device;
+
if (rb->rg_direction == DMA_NONE)
return false;
- rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ rb->rg_iov.addr = ib_dma_map_single(device,
(void *)rb->rg_base,
rdmab_length(rb),
rb->rg_direction);
- if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ if (ib_dma_mapping_error(device, rdmab_addr(rb)))
return false;
- rb->rg_device = ia->ri_device;
+ rb->rg_device = device;
rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
return true;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 171a351..1d66acf 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -69,6 +69,7 @@ struct rpcrdma_ia {
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct completion ri_done;
+ struct completion ri_remove_done;
int ri_async_rc;
unsigned int ri_max_segs;
unsigned int ri_max_frmr_depth;
@@ -78,10 +79,15 @@ struct rpcrdma_ia {
bool ri_reminv_expected;
bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
+ unsigned long ri_flags;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
+enum {
+ RPCRDMA_IAF_REMOVING = 0,
+};
+
/*
* RDMA Endpoint -- one per transport instance
*/
@@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+static inline struct ib_device *
+rdmab_device(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_device;
+}
+
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
/* To ensure a transport can always make forward progress,
@@ -209,7 +221,6 @@ struct rpcrdma_rep {
unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
- struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct list_head rr_list;
@@ -380,7 +391,6 @@ struct rpcrdma_buffer {
spinlock_t rb_mwlock; /* protect rb_mws list */
struct list_head rb_mws;
struct list_head rb_all;
- char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
int rb_send_count, rb_recv_count;
@@ -497,10 +507,16 @@ struct rpcrdma_xprt {
* Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;
+/* This setting controls the hunt for a supported memory
+ * registration strategy.
+ */
+extern unsigned int xprt_rdma_memreg_strategy;
+
/*
* Interface Adapter calls - xprtrdma/verbs.c
*/
-int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr);
+void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
void rpcrdma_ia_close(struct rpcrdma_ia *);
bool frwr_is_supported(struct rpcrdma_ia *);
bool fmr_is_supported(struct rpcrdma_ia *);
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 9dffe02..403d86e 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -576,9 +576,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->vdev = vdev;
- ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
- vsock->vqs, callbacks, names,
- NULL);
+ ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names,
+ NULL);
if (ret < 0)
goto out;
OpenPOWER on IntegriCloud