From 7a9a7b774fd543467313894fe53bc7dcc47d5708 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 May 2014 13:47:14 -0400 Subject: SUNRPC: Fix a module reference issue in rpcsec_gss We're not taking a reference in the case where _gss_mech_get_by_pseudoflavor loops without finding the correct rpcsec_gss flavour, so why are we releasing it? Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_mech_switch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 27ce262..92d5ab9 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -218,10 +218,8 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) spin_lock(®istered_mechs_lock); list_for_each_entry(pos, ®istered_mechs, gm_list) { - if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { - module_put(pos->gm_owner); + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; - } if (try_module_get(pos->gm_owner)) gm = pos; break; -- cgit v1.1 From c6c8fe79a83e1a03e5dd83d0bac178d6ba5ef30a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 7 May 2014 13:03:41 -0700 Subject: net, sunrpc: suppress allocation warning in rpc_malloc() rpc_malloc() allocates with GFP_NOWAIT without making any attempt at reclaim so it easily fails when low on memory. This ends up spamming the kernel log: SLAB: Unable to allocate memory on node 0 (gfp=0x4000) cache: kmalloc-8192, object size: 8192, order: 1 node 0: slabs: 207/207, objs: 207/207, free: 0 rekonq: page allocation failure: order:1, mode:0x204000 CPU: 2 PID: 14321 Comm: rekonq Tainted: G O 3.15.0-rc3-12.gfc9498b-desktop+ #6 Hardware name: System manufacturer System Product Name/M4A785TD-V EVO, BIOS 2105 07/23/2010 0000000000000000 ffff880010ff17d0 ffffffff815e693c 0000000000204000 ffff880010ff1858 ffffffff81137bd2 0000000000000000 0000001000000000 ffff88011ffebc38 0000000000000001 0000000000204000 ffff88011ffea000 Call Trace: [] dump_stack+0x4d/0x6f [] warn_alloc_failed+0xd2/0x140 [] __alloc_pages_nodemask+0x7e9/0xa30 [] kmem_getpages+0x58/0x140 [] fallback_alloc+0x1d6/0x210 [] ____cache_alloc_node+0x123/0x150 [] __kmalloc+0x203/0x490 [] rpc_malloc+0x32/0xa0 [sunrpc] [] call_allocate+0xb9/0x170 [sunrpc] [] __rpc_execute+0x88/0x460 [sunrpc] [] rpc_execute+0x59/0xc0 [sunrpc] [] rpc_run_task+0x6b/0x90 [sunrpc] [] nfs4_call_sync_sequence+0x51/0x80 [nfsv4] [] _nfs4_do_setattr+0x1ed/0x280 [nfsv4] [] nfs4_do_setattr+0x72/0x180 [nfsv4] [] nfs4_proc_setattr+0xbc/0x140 [nfsv4] [] nfs_setattr+0xd8/0x240 [nfs] [] notify_change+0x231/0x380 [] chmod_common+0xfc/0x120 [] SyS_chmod+0x40/0x90 [] system_call_fastpath+0x1a/0x1f ... If the allocation fails, simply return NULL and avoid spamming the kernel log. Reported-by: Marc Dietrich Signed-off-by: David Rientjes Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 25578af..c0365c1 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -832,7 +832,8 @@ static void rpc_async_schedule(struct work_struct *work) * @size: requested byte size * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL if the request cannot be serviced immediately. + * returning NULL and suppressing warning if the request cannot be serviced + * immediately. 
* The caller can arrange to sleep in a way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a @@ -845,7 +846,7 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) gfp |= __GFP_MEMALLOC; -- cgit v1.1 From 0fc6c4e7bb287148eb5e949efd89327929d4841d Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 28 May 2014 10:32:00 -0400 Subject: xprtrdma: mind the device's max fast register page list depth Some rdma devices don't support a fast register page list depth of at least RPCRDMA_MAX_DATA_SEGS. So xprtrdma needs to chunk its fast register regions according to the minimum of the device max supported depth or RPCRDMA_MAX_DATA_SEGS. Signed-off-by: Steve Wise Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---- net/sunrpc/xprtrdma/verbs.c | 47 ++++++++++++++++++++++++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 36 insertions(+), 16 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 96ead52..400aa1b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, /* success. all failures return above */ req->rl_nchunks = nchunks; - BUG_ON(nchunks == 0); - BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) - && (nchunks > 3)); - /* * finish off header. If write, marshal discrim and nchunks. */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 93726560..55fb09a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) __func__); memreg = RPCRDMA_REGISTER; #endif + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + devattr.max_fast_reg_page_list_len); } break; } @@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: + case RPCRDMA_FRMR: { + int depth = 7; + /* Add room for frmr register and invalidate WRs. * 1. FRMR reg WR for head * 2. FRMR invalidate WR for head - * 3. FRMR reg WR for pagelist - * 4. FRMR invalidate WR for pagelist + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist * 5. FRMR reg WR for tail * 6. FRMR invalidate WR for tail * 7. The RDMA_SEND WR */ - ep->rep_attr.cap.max_send_wr *= 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. 
+ */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + int delta = RPCRDMA_MAX_DATA_SEGS - + ia->ri_max_frmr_depth; + + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + + } + ep->rep_attr.cap.max_send_wr *= depth; if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { - cdata->max_requests = devattr.max_qp_wr / 7; + cdata->max_requests = devattr.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; } break; + } case RPCRDMA_MEMWINDOWS_ASYNC: case RPCRDMA_MEMWINDOWS: /* Add room for mw_binds+unbinds - overkill! */ @@ -1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, case RPCRDMA_FRMR: for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - RPCRDMA_MAX_SEGS); + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_mr)) { rc = PTR_ERR(r->r.frmr.fr_mr); dprintk("RPC: %s: ib_alloc_fast_reg_mr" " failed %i\n", __func__, rc); goto out; } - r->r.frmr.fr_pgl = - ib_alloc_fast_reg_page_list(ia->ri_id->device, - RPCRDMA_MAX_SEGS); + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); if (IS_ERR(r->r.frmr.fr_pgl)) { rc = PTR_ERR(r->r.frmr.fr_pgl); dprintk("RPC: %s: " @@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, seg1->mr_offset -= pageoff; /* start of page */ seg1->mr_len += pageoff; len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; + if (*nsegs > ia->ri_max_frmr_depth) + *nsegs = ia->ri_max_frmr_depth; for (page_no = i = 0; i < *nsegs;) { rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cc1445d..98340a3 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -66,6 +66,7 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; + unsigned int ri_max_frmr_depth; }; /* -- cgit v1.1 From 4034ba04231f554abb97ad8900a4c1af03f8e21d Mon Sep 17 00:00:00 2001 From: Allen Andrews Date: Wed, 28 May 2014 10:32:09 -0400 Subject: nfs-rdma: Fix for FMR leaks Two memory region leaks were found during testing: 1. rpcrdma_buffer_create: While allocating RPCRDMA_FRMRs, ib_alloc_fast_reg_mr is called and then ib_alloc_fast_reg_page_list is called. If ib_alloc_fast_reg_page_list returns an error, the routine bails out, dropping the last frmr region obtained from ib_alloc_fast_reg_mr and creating a memory leak. Added code to dereg the last frmr if ib_alloc_fast_reg_page_list fails. 2. rpcrdma_buffer_destroy: While cleaning up, the routine will only free the MR's on the rb_mws list if there are rb_send_bufs present. However, in rpcrdma_buffer_create, if one of the MR allocation requests fails after some MR's have already been added to the rb_mws list, the routine never gets to create any rb_send_bufs; instead it jumps to the rpcrdma_buffer_destroy routine, which will never free the MR's on the rb_mws list because the rb_send_bufs were never created. This leaks all the MR's on the rb_mws list that were created prior to one of the MR allocations failing. Issue(2) was seen during testing. Our adapter had a finite number of MR's available and we created enough connections that we saw an MR allocation failure on our Nth NFS connection request. 
After the kernel cleaned up the resources it had allocated for the Nth connection we noticed that FMR's had been leaked due to the coding error described above. Issue(1) was seen during a code review while debugging issue(2). Signed-off-by: Allen Andrews Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 73 +++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 35 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 55fb09a..8f9704e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1081,6 +1081,8 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, dprintk("RPC: %s: " "ib_alloc_fast_reg_page_list " "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); goto out; } list_add(&r->mw_list, &buf->rb_mws); @@ -1217,41 +1219,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_recv_bufs[i]); } if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = ib_dealloc_mw(r->r.mw); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_mw" - " failed %i\n", - __func__, rc); - break; - default: - break; - } - } rpcrdma_deregister_internal(ia, buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); @@ -1259,6 +1226,42 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s:" + " ib_dereg_mr" + " failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + break; + case RPCRDMA_MTHCAFMR: + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_fmr" + " failed %i\n", + __func__, rc); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + rc = ib_dealloc_mw(r->r.mw); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_mw" + " failed %i\n", + __func__, rc); + break; + default: + break; + } + } + kfree(buf->rb_pool); } -- cgit v1.1 From 254f91e2fa1f4cc18fd2eb9d5481888ffe126d5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:32:17 -0400 Subject: xprtrdma: RPC/RDMA must invoke xprt_wake_pending_tasks() in process context An IB provider can invoke rpcrdma_conn_func() in an IRQ context, thus rpcrdma_conn_func() cannot be allowed to directly invoke generic RPC functions like xprt_wake_pending_tasks(). 
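In outline, the fix applies the usual defer-to-workqueue pattern. A minimal sketch of that pattern follows (condensed from the patch below; the example_ names are illustrative, and setup and error handling are omitted):

    /* Runs in process context: safe to take transport_lock and
     * to call xprt_wake_pending_tasks(). */
    static void example_connect_worker(struct work_struct *work)
    {
            struct rpcrdma_ep *ep = container_of(work,
                    struct rpcrdma_ep, rep_connect_worker.work);

            /* mark the connection up or down, wake pending tasks */
    }

    /* May be invoked by the IB provider in IRQ context, so it
     * only schedules the work and touches no generic RPC state. */
    void example_conn_func(struct rpcrdma_ep *ep)
    {
            schedule_delayed_work(&ep->rep_connect_worker, 0);
    }

The worker is set up with INIT_DELAYED_WORK() when the endpoint is created and must be cancelled with cancel_delayed_work_sync() before the endpoint is destroyed, which is what the patch does in rpcrdma_ep_create() and rpcrdma_ep_destroy().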
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 22 +++++++++++++++------- net/sunrpc/xprtrdma/verbs.c | 3 +++ net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 400aa1b..c296468 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -676,15 +676,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) rqst->rq_private_buf = rqst->rq_rcv_buf; } -/* - * This function is called when an async event is posted to - * the connection which changes the connection state. All it - * does at this point is mark the connection up/down, the rpc - * timers do the rest. - */ void -rpcrdma_conn_func(struct rpcrdma_ep *ep) +rpcrdma_connect_worker(struct work_struct *work) { + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); struct rpc_xprt *xprt = ep->rep_xprt; spin_lock_bh(&xprt->transport_lock); @@ -701,6 +697,18 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) } /* + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. + */ +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + schedule_delayed_work(&ep->rep_connect_worker, 0); +} + +/* * This function is called when memory window unbind which we are waiting * for completes. Just use rr_func (zeroed by upcall) to signal completion. */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8f9704e..9cb88f3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -742,6 +742,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, INIT_CQCOUNT(ep); ep->rep_ia = ia; init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); /* * Create a single cq for receive dto and mw_bind (only ever @@ -817,6 +818,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); + cancel_delayed_work_sync(&ep->rep_connect_worker); + if (ia->ri_id->qp) { rc = rpcrdma_ep_disconnect(ep, ia); if (rc) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 98340a3..c620d13 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -43,6 +43,7 @@ #include /* wait_queue_head_t, etc */ #include /* spinlock_t, etc */ #include /* atomic_t, etc */ +#include /* struct work_struct */ #include /* RDMA connection api */ #include /* RDMA verbs api */ @@ -87,6 +88,7 @@ struct rpcrdma_ep { struct rpc_xprt *rep_xprt; /* for rep_func */ struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; + struct delayed_work rep_connect_worker; }; #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) @@ -336,6 +338,7 @@ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ +void rpcrdma_connect_worker(struct work_struct *); void rpcrdma_conn_func(struct rpcrdma_ep *); void rpcrdma_reply_handler(struct rpcrdma_rep *); -- cgit v1.1 From 03ff8821eb5ed168792667cfc3ddff903e97af99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:32:26 -0400 Subject: xprtrdma: Remove BOUNCEBUFFERS 
memory registration mode Clean up: This memory registration mode is slow and was never meant for use in production environments. Remove it to reduce implementation complexity. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 11 ----------- net/sunrpc/xprtrdma/transport.c | 13 ------------- net/sunrpc/xprtrdma/verbs.c | 5 +---- 3 files changed, 1 insertion(+), 28 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c296468..02b2941 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -77,9 +77,6 @@ static const char transfertypes[][12] = { * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk * elements. Segments are then coalesced when registered, if possible * within the selected memreg mode. - * - * Note, this routine is never called if the connection's memory - * registration strategy is 0 (bounce buffers). */ static int @@ -439,14 +436,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) wtype = rpcrdma_noch; BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); - if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && - (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { - /* forced to "pure inline"? */ - dprintk("RPC: %s: too much data (%d/%d) for inline\n", - __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); - return -1; - } - hdrlen = 28; /*sizeof *headerp;*/ padlen = 0; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 1eb9c46..8c5035a2 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -503,18 +503,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) * If the allocation or registration fails, the RPC framework * will (doggedly) retry. */ - if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == - RPCRDMA_BOUNCEBUFFERS) { - /* forced to "pure inline" */ - dprintk("RPC: %s: too much data (%zd) for inline " - "(r/w max %d/%d)\n", __func__, size, - rpcx_to_rdmad(xprt).inline_rsize, - rpcx_to_rdmad(xprt).inline_wsize); - size = req->rl_size; - rpc_exit(task, -EIO); /* fail the operation */ - rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; - goto out; - } if (task->tk_flags & RPC_TASK_SWAPPER) nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); else @@ -543,7 +531,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = nreq; } dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); -out: req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_xdr_buf; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9cb88f3..4a4e4ea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -557,7 +557,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * adapter. 
*/ switch (memreg) { - case RPCRDMA_BOUNCEBUFFERS: case RPCRDMA_REGISTER: case RPCRDMA_FRMR: break; @@ -778,9 +777,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) - ep->rep_remote_cma.responder_resources = 0; - else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; -- cgit v1.1 From b45ccfd25d506e83d9ecf93d0ac7edf031d35d2f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:32:34 -0400 Subject: xprtrdma: Remove MEMWINDOWS registration modes The MEMWINDOWS and MEMWINDOWS_ASYNC memory registration modes were intended as stop-gap modes before the introduction of FRMR. They are now considered obsolete. MEMWINDOWS_ASYNC is also considered unsafe because it can leave client memory registered and exposed for an indeterminate time after each I/O. At this point, the MEMWINDOWS modes add needless complexity, so remove them. Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 34 +-------- net/sunrpc/xprtrdma/transport.c | 9 +-- net/sunrpc/xprtrdma/verbs.c | 165 ++-------------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 - 4 files changed, 7 insertions(+), 203 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 02b2941..46b5172 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -199,7 +199,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return 0; do { - /* bind/register the memory, then build chunk from result. */ int n = rpcrdma_register_external(seg, nsegs, cur_wchunk != NULL, r_xprt); if (n <= 0) @@ -698,16 +697,6 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) } /* - * This function is called when memory window unbind which we are waiting - * for completes. Just use rr_func (zeroed by upcall) to signal completion. - */ -static void -rpcrdma_unbind_func(struct rpcrdma_rep *rep) -{ - wake_up(&rep->rr_unbind); -} - -/* * Called as a tasklet to do req/reply match and complete a request * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. @@ -721,7 +710,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpc_xprt *xprt = rep->rr_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; - int i, rdmalen, status; + int rdmalen, status; /* Check status. If bad, signal disconnect and return rep to pool */ if (rep->rr_len == ~0U) { @@ -850,27 +839,6 @@ badheader: break; } - /* If using mw bind, start the deregister process now. 
*/ - /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ - if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS: - for (i = 0; req->rl_nchunks-- > 1;) - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); - /* Optionally wait (not here) for unbinds to complete */ - rep->rr_func = rpcrdma_unbind_func; - (void) rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, rep); - break; - case RPCRDMA_MEMWINDOWS_ASYNC: - for (i = 0; req->rl_nchunks--;) - i += rpcrdma_deregister_external(&req->rl_segments[i], - r_xprt, NULL); - break; - default: - break; - } - dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); xprt_complete_rqst(rqst->rq_task, status); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c5035a2..c23b0c1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -566,9 +566,7 @@ xprt_rdma_free(void *buffer) __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); /* - * Finish the deregistration. When using mw bind, this was - * begun in rpcrdma_reply_handler(). In all other modes, we - * do it here, in thread context. The process is considered + * Finish the deregistration. The process is considered * complete when the rr_func vector becomes NULL - this * was put in place during rpcrdma_reply_handler() - the wait * call below will not block if the dereg is "done". If @@ -580,11 +578,6 @@ xprt_rdma_free(void *buffer) &req->rl_segments[i], r_xprt, NULL); } - if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { - rep->rr_func = NULL; /* abandon the callback */ - req->rl_reply = NULL; - } - if (req->rl_iov.length == 0) { /* see allocate above */ struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; oreq->rl_reply = req->rl_reply; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4a4e4ea..304c7ad 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -152,7 +152,7 @@ void rpcrdma_event_process(struct ib_wc *wc) dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", __func__, rep, wc->status, wc->opcode, wc->byte_len); - if (!rep) /* send or bind completion that we don't care about */ + if (!rep) /* send completion that we don't care about */ return; if (IB_WC_SUCCESS != wc->status) { @@ -197,8 +197,6 @@ void rpcrdma_event_process(struct ib_wc *wc) } atomic_set(&rep->rr_buffer->rb_credits, credits); } - /* fall through */ - case IB_WC_BIND_MW: rpcrdma_schedule_tasklet(rep); break; default: @@ -233,7 +231,7 @@ rpcrdma_cq_poll(struct ib_cq *cq) /* * rpcrdma_cq_event_upcall * - * This upcall handles recv, send, bind and unbind events. + * This upcall handles recv and send events. * It is reentrant but processes single events in order to maintain * ordering of receives to keep server credits. 
* @@ -494,16 +492,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } switch (memreg) { - case RPCRDMA_MEMWINDOWS: - case RPCRDMA_MEMWINDOWS_ASYNC: - if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { - dprintk("RPC: %s: MEMWINDOWS registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; - } - break; case RPCRDMA_MTHCAFMR: if (!ia->ri_id->device->alloc_fmr) { #if RPCRDMA_PERSISTENT_REGISTRATION @@ -567,16 +555,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) IB_ACCESS_REMOTE_READ; goto register_setup; #endif - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_MW_BIND; - goto register_setup; case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; +#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: +#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -699,14 +684,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } break; } - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Add room for mw_binds+unbinds - overkill! */ - ep->rep_attr.cap.max_send_wr++; - ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) - return -EINVAL; - break; default: break; } @@ -728,14 +705,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - ep->rep_cqinit -= RPCRDMA_MAX_SEGS; - break; - default: - break; - } if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); @@ -743,11 +712,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - /* - * Create a single cq for receive dto and mw_bind (only ever - * care about unbind, really). Send completions are suppressed. - * Use single threaded tasklet upcalls to maintain ordering. - */ ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, rpcrdma_cq_async_error_upcall, NULL, ep->rep_attr.cap.max_recv_wr + @@ -1020,11 +984,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * sizeof(struct rpcrdma_mw); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; default: break; } @@ -1055,11 +1014,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, } p += cdata->padding; - /* - * Allocate the fmr's, or mw's for mw_bind chunk registration. - * We "cycle" the mw's in order to minimize rkey reuse, - * and also reduce unbind-to-bind collision. 
- */ INIT_LIST_HEAD(&buf->rb_mws); r = (struct rpcrdma_mw *)p; switch (ia->ri_memreg_strategy) { @@ -1107,21 +1061,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, ++r; } break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - /* Allocate one extra request's worth, for full cycling */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); - if (IS_ERR(r->r.mw)) { - rc = PTR_ERR(r->r.mw); - dprintk("RPC: %s: ib_alloc_mw" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } - break; default: break; } @@ -1170,7 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, memset(rep, 0, sizeof(struct rpcrdma_rep)); buf->rb_recv_bufs[i] = rep; buf->rb_recv_bufs[i]->rr_buffer = buf; - init_waitqueue_head(&rep->rr_unbind); rc = rpcrdma_register_internal(ia, rep->rr_base, len - offsetof(struct rpcrdma_rep, rr_base), @@ -1204,7 +1142,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) - * 1a. bind mw memory * 2. send mr memory (mr free, then kfree) * 3. padding (if any) [moved to rpcrdma_ep_destroy] * 4. arrays @@ -1248,15 +1185,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) " failed %i\n", __func__, rc); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = ib_dealloc_mw(r->r.mw); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_mw" - " failed %i\n", - __func__, rc); - break; default: break; } @@ -1331,15 +1259,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_niovs = 0; if (req->rl_reply) { buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - init_waitqueue_head(&req->rl_reply->rr_unbind); req->rl_reply->rr_func = NULL; req->rl_reply = NULL; } switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: /* * Cycle mw's back in reverse order, and "spin" them. * This delays and scrambles reuse as much as possible. @@ -1384,8 +1309,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) /* * Put reply buffers back into pool when not attached to - * request. This happens in error conditions, and when - * aborting unbinds. Pre-decrement counter/array index. + * request. This happens in error conditions. */ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) @@ -1688,74 +1612,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, } static int -rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct ib_mw_bind param; - int rc; - - *nsegs = 1; - rpcrdma_map_one(ia, seg, writing); - param.bind_info.mr = ia->ri_bind_mem; - param.wr_id = 0ULL; /* no send cookie */ - param.bind_info.addr = seg->mr_dma; - param.bind_info.length = seg->mr_len; - param.send_flags = 0; - param.bind_info.mw_access_flags = mem_priv; - - DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); - if (rc) { - dprintk("RPC: %s: failed ib_bind_mw " - "%u@0x%llx status %i\n", - __func__, seg->mr_len, - (unsigned long long)seg->mr_dma, rc); - rpcrdma_unmap_one(ia, seg); - } else { - seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; - seg->mr_base = param.bind_info.addr; - seg->mr_nsegs = 1; - } - return rc; -} - -static int -rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt, void **r) -{ - struct ib_mw_bind param; - LIST_HEAD(l); - int rc; - - BUG_ON(seg->mr_nsegs != 1); - param.bind_info.mr = ia->ri_bind_mem; - param.bind_info.addr = 0ULL; /* unbind */ - param.bind_info.length = 0; - param.bind_info.mw_access_flags = 0; - if (*r) { - param.wr_id = (u64) (unsigned long) *r; - param.send_flags = IB_SEND_SIGNALED; - INIT_CQCOUNT(&r_xprt->rx_ep); - } else { - param.wr_id = 0ULL; - param.send_flags = 0; - DECR_CQCOUNT(&r_xprt->rx_ep); - } - rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); - rpcrdma_unmap_one(ia, seg); - if (rc) - dprintk("RPC: %s: failed ib_(un)bind_mw," - " status %i\n", __func__, rc); - else - *r = NULL; /* will upcall on completion */ - return rc; -} - -static int rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, int *nsegs, int writing, struct rpcrdma_ia *ia) { @@ -1845,12 +1701,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; - /* Registration using memory windows */ - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); - break; - /* Default registration each time */ default: rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); @@ -1887,11 +1737,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_deregister_fmr_external(seg, ia); break; - case RPCRDMA_MEMWINDOWS_ASYNC: - case RPCRDMA_MEMWINDOWS: - rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); - break; - default: rc = rpcrdma_deregister_default_external(seg, ia); break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c620d13..bf08ee0b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -127,7 +127,6 @@ struct rpcrdma_rep { struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ struct list_head rr_list; /* tasklet list */ - wait_queue_head_t rr_unbind; /* optional unbind wait */ struct ib_sge rr_iov; /* for posting */ struct ib_mr *rr_handle; /* handle for mem in rr_iov */ char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ @@ -162,7 +161,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ struct ib_mr *rl_mr; /* if registered directly */ struct rpcrdma_mw { /* if registered from region */ union { - struct ib_mw *mw; struct ib_fmr *fmr; struct { struct ib_fast_reg_page_list *fr_pgl; -- cgit v1.1 From 0ac531c1832318efa3dc3d723e356a7e09330e80 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 
28 May 2014 10:32:43 -0400 Subject: xprtrdma: Remove REGISTER memory registration mode All kernel RDMA providers except amso1100 support either MTHCAFMR or FRMR, both of which are faster than REGISTER. amso1100 can continue to use ALLPHYSICAL. The only other ULP consumer in the kernel that uses the reg_phys_mr verb is Lustre. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 3 +- net/sunrpc/xprtrdma/verbs.c | 90 ++---------------------------------------- 2 files changed, 5 insertions(+), 88 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 46b5172..aae1726 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -476,8 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (wtype == rpcrdma_noch && - r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) + if (wtype == rpcrdma_noch) wtype = rpcrdma_replych; } } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 304c7ad..6bb9a07 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -494,19 +494,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_MTHCAFMR: if (!ia->ri_id->device->alloc_fmr) { -#if RPCRDMA_PERSISTENT_REGISTRATION dprintk("RPC: %s: MTHCAFMR registration " "specified but not supported by adapter, " "using riskier RPCRDMA_ALLPHYSICAL\n", __func__); memreg = RPCRDMA_ALLPHYSICAL; -#else - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; -#endif } break; case RPCRDMA_FRMR: @@ -514,19 +506,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if ((devattr.device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { -#if RPCRDMA_PERSISTENT_REGISTRATION dprintk("RPC: %s: FRMR registration " "specified but not supported by adapter, " "using riskier RPCRDMA_ALLPHYSICAL\n", __func__); memreg = RPCRDMA_ALLPHYSICAL; -#else - dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using slower RPCRDMA_REGISTER\n", - __func__); - memreg = RPCRDMA_REGISTER; -#endif } else { /* Mind the ia limit on FRMR page list depth */ ia->ri_max_frmr_depth = min_t(unsigned int, @@ -545,7 +529,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) * adapter. */ switch (memreg) { - case RPCRDMA_REGISTER: case RPCRDMA_FRMR: break; #if RPCRDMA_PERSISTENT_REGISTRATION @@ -565,11 +548,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n\t" - "Will continue with degraded performance\n", + "phys register failed with %lX\n", __func__, PTR_ERR(ia->ri_bind_mem)); - memreg = RPCRDMA_REGISTER; - ia->ri_bind_mem = NULL; + rc = -ENOMEM; + goto out2; } break; default: @@ -1611,67 +1593,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, return rc; } -static int -rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : - IB_ACCESS_REMOTE_READ); - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; - int len, i, rc = 0; - - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (len = 0, i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - ipb[i].addr = seg->mr_dma; - ipb[i].size = seg->mr_len; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) - break; - } - seg1->mr_base = seg1->mr_dma; - seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, - ipb, i, mem_priv, &seg1->mr_base); - if (IS_ERR(seg1->mr_chunk.rl_mr)) { - rc = PTR_ERR(seg1->mr_chunk.rl_mr); - dprintk("RPC: %s: failed ib_reg_phys_mr " - "%u@0x%llx (%d)... status %i\n", - __func__, len, - (unsigned long long)seg1->mr_dma, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - int rc; - - rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); - seg1->mr_chunk.rl_mr = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - if (rc) - dprintk("RPC: %s: failed ib_dereg_mr," - " status %i\n", __func__, rc); - return rc; -} - int rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int nsegs, int writing, struct rpcrdma_xprt *r_xprt) @@ -1701,10 +1622,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); break; - /* Default registration each time */ default: - rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); - break; + return -1; } if (rc) return -1; @@ -1738,7 +1657,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, break; default: - rc = rpcrdma_deregister_default_external(seg, ia); break; } if (r) { -- cgit v1.1 From f10eafd3a6ce9da7e96999c124b643ea6c4921f3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:32:51 -0400 Subject: xprtrdma: Fall back to MTHCAFMR when FRMR is not supported An audit of in-kernel RDMA providers that do not support the FRMR memory registration shows that several of them support MTHCAFMR. Prefer MTHCAFMR when FRMR is not supported. If MTHCAFMR is not supported, only then choose ALLPHYSICAL. 
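In outline, the new selection logic is a simple fallback cascade. A minimal sketch (abbreviated from the patch below; hca_supports_frmr() is a stand-in for the real test of devattr.device_cap_flags against IB_DEVICE_MEM_MGT_EXTENSIONS and IB_DEVICE_LOCAL_DMA_LKEY):

    /* Prefer FRMR; fall back to FMR; ALLPHYSICAL is the last resort. */
    if (memreg == RPCRDMA_FRMR && !hca_supports_frmr(&devattr))
            memreg = RPCRDMA_MTHCAFMR;
    if (memreg == RPCRDMA_MTHCAFMR && !ia->ri_id->device->alloc_fmr) {
    #if RPCRDMA_PERSISTENT_REGISTRATION
            memreg = RPCRDMA_ALLPHYSICAL;
    #else
            rc = -EINVAL;
            goto out2;
    #endif
    }

Flattening the old switch statement into successive ifs is what lets an unsupported mode degrade into the next candidate rather than into a single hard-coded default.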
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 6bb9a07..a352798 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -491,33 +491,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; } - switch (memreg) { - case RPCRDMA_MTHCAFMR: - if (!ia->ri_id->device->alloc_fmr) { - dprintk("RPC: %s: MTHCAFMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); - memreg = RPCRDMA_ALLPHYSICAL; - } - break; - case RPCRDMA_FRMR: + if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ if ((devattr.device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { dprintk("RPC: %s: FRMR registration " - "specified but not supported by adapter, " - "using riskier RPCRDMA_ALLPHYSICAL\n", - __func__); - memreg = RPCRDMA_ALLPHYSICAL; + "not supported by HCA\n", __func__); + memreg = RPCRDMA_MTHCAFMR; } else { /* Mind the ia limit on FRMR page list depth */ ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, devattr.max_fast_reg_page_list_len); } - break; + } + if (memreg == RPCRDMA_MTHCAFMR) { + if (!ia->ri_id->device->alloc_fmr) { + dprintk("RPC: %s: MTHCAFMR registration " + "not supported by HCA\n", __func__); +#if RPCRDMA_PERSISTENT_REGISTRATION + memreg = RPCRDMA_ALLPHYSICAL; +#else + rc = -EINVAL; + goto out2; +#endif + } } /* -- cgit v1.1 From cdd9ade711599e7672a635add0406080856f8b92 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:00 -0400 Subject: xprtrdma: mount reports "Invalid mount option" if memreg mode not supported If the selected memory registration mode is not supported by the underlying provider/HCA, the NFS mount command reports that there was an invalid mount option, and fails. This is misleading. Reporting a problem allocating memory is a lot closer to the truth. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a352798..7c7e9b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -513,7 +513,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) #if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; #else - rc = -EINVAL; + rc = -ENOMEM; goto out2; #endif } @@ -554,9 +554,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } break; default: - printk(KERN_ERR "%s: invalid memory registration mode %d\n", - __func__, memreg); - rc = -EINVAL; + printk(KERN_ERR "RPC: Unsupported memory " + "registration mode: %d\n", memreg); + rc = -ENOMEM; goto out2; } dprintk("RPC: %s: memory registration strategy is %d\n", -- cgit v1.1 From 13c9ff8f673862b69e795ea99a237b461c557eb3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:08 -0400 Subject: xprtrdma: Simplify rpcrdma_deregister_external() synopsis Clean up: All remaining callers of rpcrdma_deregister_external() pass NULL as the last argument, so remove that argument. 
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 8 +------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index aae1726..436d229 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -270,7 +270,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, out: for (pos = 0; nchunks--;) pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt, NULL); + &req->rl_segments[pos], r_xprt); return 0; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c23b0c1..430cabb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -575,7 +575,7 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt, NULL); + &req->rl_segments[i], r_xprt); } if (req->rl_iov.length == 0) { /* see allocate above */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7c7e9b4..0cbc83c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1632,7 +1632,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, int rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt, void *r) + struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; int nsegs = seg->mr_nsegs, rc; @@ -1658,12 +1658,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, default: break; } - if (r) { - struct rpcrdma_rep *rep = r; - void (*func)(struct rpcrdma_rep *) = rep->rr_func; - rep->rr_func = NULL; - func(rep); /* dereg done, callback now */ - } return nsegs; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bf08ee0b..3f44d6a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -331,7 +331,7 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *, int rpcrdma_register_external(struct rpcrdma_mr_seg *, int, int, struct rpcrdma_xprt *); int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *, void *); + struct rpcrdma_xprt *); /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c -- cgit v1.1 From 7f1d54191ed6fa0f79f584fe3ebf6519738e817f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:16 -0400 Subject: xprtrdma: Make rpcrdma_ep_destroy() return void Clean up: rpcrdma_ep_destroy() returns a value that is used only to print a debugging message. rpcrdma_ep_destroy() already prints debugging messages in all error cases. Make rpcrdma_ep_destroy() return void instead. 
Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 8 ++------ net/sunrpc/xprtrdma/verbs.c | 7 +------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 4 insertions(+), 13 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 430cabb..d18b2a3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -229,7 +229,6 @@ static void xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; dprintk("RPC: %s: called\n", __func__); @@ -238,10 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_clear_connected(xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", - __func__, rc); + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); @@ -391,7 +387,7 @@ out4: xprt_rdma_free_addresses(xprt); rc = -EINVAL; out3: - (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 0cbc83c..edc951e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -748,11 +748,8 @@ out1: * Disconnect and destroy endpoint. After this, the only * valid operations on the ep are to free it (if dynamically * allocated) or re-create it. - * - * The caller's error handling must be sure to not leak the endpoint - * if this function fails. */ -int +void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; @@ -782,8 +779,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); - - return rc; } /* diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3f44d6a..362a19d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -301,7 +301,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); */ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); -int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); -- cgit v1.1 From fc66448549bbb77f2f1a38b270ab2d6b6a22da33 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:25 -0400 Subject: xprtrdma: Split the completion queue The current CQ handler uses the ib_wc.opcode field to distinguish between event types. However, the contents of that field are not reliable if the completion status is not IB_WC_SUCCESS. When an error completion occurs on a send event, the CQ handler schedules a tasklet with something that is not a struct rpcrdma_rep. This is never correct behavior, and sometimes it results in a panic. To resolve this issue, split the completion queue into a send CQ and a receive CQ. The send CQ handler now handles only struct rpcrdma_mw wr_id's, and the receive CQ handler now handles only struct rpcrdma_rep wr_id's. 
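With the completion queue split, each handler can interpret wr_id without consulting the opcode. A minimal sketch of the idea (names shortened; the full handlers are in the patch below):

    /* Send CQ: a signaled wr_id is always an rpcrdma_mw, or 0 for an
     * ordinary send -- even when wc->status != IB_WC_SUCCESS and
     * wc->opcode is therefore unreliable. */
    static void send_wc(struct ib_wc *wc)
    {
            struct rpcrdma_mw *frmr =
                    (struct rpcrdma_mw *)(unsigned long)wc->wr_id;

            if (wc->wr_id == 0ULL || wc->status != IB_WC_SUCCESS)
                    return;
            /* update frmr->r.frmr.state from wc->opcode */
    }

    /* Receive CQ: a wr_id is always an rpcrdma_rep. */
    static void recv_wc(struct ib_wc *wc)
    {
            struct rpcrdma_rep *rep =
                    (struct rpcrdma_rep *)(unsigned long)wc->wr_id;

            /* record rep->rr_len and schedule the reply tasklet */
    }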
Fix suggested by Shirley Ma Reported-by: Rafael Reiter Fixes: 5c635e09cec0feeeb310968e51dad01040244851 BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211 Signed-off-by: Chuck Lever Tested-by: Klemens Senn Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 228 ++++++++++++++++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 137 insertions(+), 92 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index edc951e..af2d097 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } -static inline -void rpcrdma_event_process(struct ib_wc *wc) +static void +rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - struct rpcrdma_mw *frmr; - struct rpcrdma_rep *rep = - (struct rpcrdma_rep *)(unsigned long) wc->wr_id; + struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", - __func__, rep, wc->status, wc->opcode, wc->byte_len); + dprintk("RPC: %s: frmr %p status %X opcode %d\n", + __func__, frmr, wc->status, wc->opcode); - if (!rep) /* send completion that we don't care about */ + if (wc->wr_id == 0ULL) return; - - if (IB_WC_SUCCESS != wc->status) { - dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", - __func__, wc->opcode, wc->status); - rep->rr_len = ~0U; - if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) - rpcrdma_schedule_tasklet(rep); + if (wc->status != IB_WC_SUCCESS) return; - } - switch (wc->opcode) { - case IB_WC_FAST_REG_MR: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + if (wc->opcode == IB_WC_FAST_REG_MR) frmr->r.frmr.state = FRMR_IS_VALID; - break; - case IB_WC_LOCAL_INV: - frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + else if (wc->opcode == IB_WC_LOCAL_INV) frmr->r.frmr.state = FRMR_IS_INVALID; - break; - case IB_WC_RECV: - rep->rr_len = wc->byte_len; - ib_dma_sync_single_for_cpu( - rdmab_to_ia(rep->rr_buffer)->ri_id->device, - rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); - /* Keep (only) the most recent credits, after check validity */ - if (rep->rr_len >= 16) { - struct rpcrdma_msg *p = - (struct rpcrdma_msg *) rep->rr_base; - unsigned int credits = ntohl(p->rm_credit); - if (credits == 0) { - dprintk("RPC: %s: server" - " dropped credits to 0!\n", __func__); - /* don't deadlock */ - credits = 1; - } else if (credits > rep->rr_buffer->rb_max_requests) { - dprintk("RPC: %s: server" - " over-crediting: %d (%d)\n", - __func__, credits, - rep->rr_buffer->rb_max_requests); - credits = rep->rr_buffer->rb_max_requests; - } - atomic_set(&rep->rr_buffer->rb_credits, credits); - } - rpcrdma_schedule_tasklet(rep); - break; - default: - dprintk("RPC: %s: unexpected WC event %X\n", - __func__, wc->opcode); - break; - } } -static inline int -rpcrdma_cq_poll(struct ib_cq *cq) +static int +rpcrdma_sendcq_poll(struct ib_cq *cq) { struct ib_wc wc; int rc; - for (;;) { - rc = ib_poll_cq(cq, 1, &wc); - if (rc < 0) { - dprintk("RPC: %s: ib_poll_cq failed %i\n", - __func__, rc); - return rc; - } - if (rc == 0) - break; + while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) + rpcrdma_sendcq_process_wc(&wc); + return rc; +} - rpcrdma_event_process(&wc); +/* + * Handle send, fast_reg_mr, and local_inv completions. + * + * Send events are typically suppressed and thus do not result + * in an upcall. Occasionally one is signaled, however. 
This + * prevents the provider's completion queue from wrapping and + * losing a completion. + */ +static void +rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) +{ + int rc; + + rc = rpcrdma_sendcq_poll(cq); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); + return; } - return 0; + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + + rpcrdma_sendcq_poll(cq); +} + +static void +rpcrdma_recvcq_process_wc(struct ib_wc *wc) +{ + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long)wc->wr_id; + + dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (wc->status != IB_WC_SUCCESS) { + rep->rr_len = ~0U; + goto out_schedule; + } + if (wc->opcode != IB_WC_RECV) + return; + + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > rep->rr_buffer->rb_max_requests) + credits = rep->rr_buffer->rb_max_requests; + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + +out_schedule: + rpcrdma_schedule_tasklet(rep); +} + +static int +rpcrdma_recvcq_poll(struct ib_cq *cq) +{ + struct ib_wc wc; + int rc; + + while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) + rpcrdma_recvcq_process_wc(&wc); + return rc; } /* - * rpcrdma_cq_event_upcall + * Handle receive completions. * - * This upcall handles recv and send events. * It is reentrant but processes single events in order to maintain * ordering of receives to keep server credits. * @@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq) * connection shutdown. That is, the structures required for * the completion of the reply handler must remain intact until * all memory has been reclaimed. - * - * Note that send events are suppressed and do not result in an upcall. 
*/ static void -rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) +rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { int rc; - rc = rpcrdma_cq_poll(cq); - if (rc) + rc = rpcrdma_recvcq_poll(cq); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); return; + } rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); if (rc) { - dprintk("RPC: %s: ib_req_notify_cq failed %i\n", + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); return; } - rpcrdma_cq_poll(cq); + rpcrdma_recvcq_poll(cq); } #ifdef RPC_DEBUG @@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr devattr; + struct ib_cq *sendcq, *recvcq; int rc, err; rc = ib_query_device(ia->ri_id->device, &devattr); @@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_sge); /* set trigger for requesting send completion */ - ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); @@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, + sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, rpcrdma_cq_async_error_upcall, NULL, - ep->rep_attr.cap.max_recv_wr + ep->rep_attr.cap.max_send_wr + 1, 0); - if (IS_ERR(ep->rep_cq)) { - rc = PTR_ERR(ep->rep_cq); - dprintk("RPC: %s: ib_create_cq failed: %i\n", + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", __func__, rc); goto out1; } - rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); + rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); if (rc) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); goto out2; } - ep->rep_attr.send_cq = ep->rep_cq; - ep->rep_attr.recv_cq = ep->rep_cq; + recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, NULL, + ep->rep_attr.cap.max_recv_wr + 1, 0); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + ib_destroy_cq(recvcq); + goto out2; + } + + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; /* Initialize cma parameters */ @@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return 0; out2: - err = ib_destroy_cq(ep->rep_cq); + err = ib_destroy_cq(sendcq); if (err) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); @@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ep->rep_pad_mr = NULL; } - rpcrdma_clean_cq(ep->rep_cq); - rc = ib_destroy_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rc = ib_destroy_cq(ep->rep_attr.recv_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); + + rpcrdma_clean_cq(ep->rep_attr.send_cq); + rc = ib_destroy_cq(ep->rep_attr.send_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); @@ -798,7 +842,9 @@ retry: if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_cq); + + 
rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_cq); + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, ib_dma_sync_single_for_cpu(ia->ri_id->device, rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); - DECR_CQCOUNT(ep); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); if (rc) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 362a19d..334ab6e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -79,7 +79,6 @@ struct rpcrdma_ep { int rep_cqinit; int rep_connected; struct rpcrdma_ia *rep_ia; - struct ib_cq *rep_cq; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct ib_sge rep_pad; /* holds zeroed pad */ -- cgit v1.1 From 7f23f6f6e388d2003c4ecf5d558f3c2191e12530 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:34 -0400 Subject: xprtrdma: Reduce lock contention in completion handlers Skip the ib_poll_cq() after re-arming, if the provider knows there are no additional items waiting. (Have a look at commit ed23a727 for more details). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index af2d097..c7d5281 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -192,8 +192,11 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) return; } - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); return; @@ -272,8 +275,11 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) return; } - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { + rc = ib_req_notify_cq(cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc == 0) + return; + if (rc < 0) { dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", __func__, rc); return; -- cgit v1.1 From 1c00dd0776543608e13c74a527660cb8cd28a74f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:42 -0400 Subject: xprtrdma: Reduce calls to ib_poll_cq() in completion handlers Change the completion handlers to grab up to 16 items per ib_poll_cq() call. No extra ib_poll_cq() is needed if fewer than 16 items are returned.
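As an illustration of the batching pattern being introduced here, a minimal sketch (simplified and renamed for clarity -- POLLSIZE and process_wc() stand in for RPCRDMA_POLLSIZE and the per-CQ handlers, and the caller supplies a wcs array with at least POLLSIZE entries):

	#define POLLSIZE 16

	/* Drain a CQ in fixed-size batches instead of one WC at a time.
	 * A short read from ib_poll_cq() means the CQ is empty, so no
	 * confirming extra poll call is needed. */
	static int poll_cq_batched(struct ib_cq *cq, struct ib_wc *wcs,
				   void (*process_wc)(struct ib_wc *wc))
	{
		int i, rc;

		do {
			rc = ib_poll_cq(cq, POLLSIZE, wcs);
			if (rc <= 0)
				return rc;	/* 0: drained; < 0: provider error */
			for (i = 0; i < rc; i++)
				process_wc(&wcs[i]);
		} while (rc == POLLSIZE);	/* a full batch: more may be waiting */
		return 0;
	}

A later patch in this series (8301a2c0) bounds this loop further by budgeting how many batches one upcall may handle.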
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 56 ++++++++++++++++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++ 2 files changed, 42 insertions(+), 18 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c7d5281..b8caee9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -162,14 +162,23 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) } static int -rpcrdma_sendcq_poll(struct ib_cq *cq) +rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { - struct ib_wc wc; - int rc; + struct ib_wc *wcs; + int count, rc; - while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) - rpcrdma_sendcq_process_wc(&wc); - return rc; + do { + wcs = ep->rep_send_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + return rc; + + count = rc; + while (count-- > 0) + rpcrdma_sendcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE); + return 0; } /* @@ -183,9 +192,10 @@ rpcrdma_sendcq_poll(struct ib_cq *cq) static void rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) { + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; int rc; - rc = rpcrdma_sendcq_poll(cq); + rc = rpcrdma_sendcq_poll(cq, ep); if (rc) { dprintk("RPC: %s: ib_poll_cq failed: %i\n", __func__, rc); @@ -202,7 +212,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) return; } - rpcrdma_sendcq_poll(cq); + rpcrdma_sendcq_poll(cq, ep); } static void @@ -241,14 +251,23 @@ out_schedule: } static int -rpcrdma_recvcq_poll(struct ib_cq *cq) +rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { - struct ib_wc wc; - int rc; + struct ib_wc *wcs; + int count, rc; - while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) - rpcrdma_recvcq_process_wc(&wc); - return rc; + do { + wcs = ep->rep_recv_wcs; + + rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); + if (rc <= 0) + return rc; + + count = rc; + while (count-- > 0) + rpcrdma_recvcq_process_wc(wcs++); + } while (rc == RPCRDMA_POLLSIZE); + return 0; } /* @@ -266,9 +285,10 @@ rpcrdma_recvcq_poll(struct ib_cq *cq) static void rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { + struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; int rc; - rc = rpcrdma_recvcq_poll(cq); + rc = rpcrdma_recvcq_poll(cq, ep); if (rc) { dprintk("RPC: %s: ib_poll_cq failed: %i\n", __func__, rc); @@ -285,7 +305,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) return; } - rpcrdma_recvcq_poll(cq); + rpcrdma_recvcq_poll(cq, ep); } #ifdef RPC_DEBUG @@ -721,7 +741,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, - rpcrdma_cq_async_error_upcall, NULL, + rpcrdma_cq_async_error_upcall, ep, ep->rep_attr.cap.max_send_wr + 1, 0); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); @@ -738,7 +758,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, - rpcrdma_cq_async_error_upcall, NULL, + rpcrdma_cq_async_error_upcall, ep, ep->rep_attr.cap.max_recv_wr + 1, 0); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 334ab6e..cb4c882 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -74,6 +74,8 @@ struct rpcrdma_ia { * RDMA Endpoint -- one per transport instance */ +#define RPCRDMA_POLLSIZE (16) + struct rpcrdma_ep { 
atomic_t rep_cqcount; int rep_cqinit; @@ -88,6 +90,8 @@ struct rpcrdma_ep { struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; + struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; + struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; }; #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) -- cgit v1.1 From 8301a2c047cc25dabd645e5590c1db0ead4c5af4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:51 -0400 Subject: xprtrdma: Limit work done by completion handler Sagi Grimberg points out that a steady stream of CQ events could starve other work because of the boundless loop polling in rpcrdma_{send,recv}_poll(). Instead of a (potentially infinite) while loop, return after collecting a budgeted number of completions. Signed-off-by: Chuck Lever Acked-by: Sagi Grimberg Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 10 ++++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b8caee9..1d08366 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -165,8 +165,9 @@ static int rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { struct ib_wc *wcs; - int count, rc; + int budget, count, rc; + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_send_wcs; @@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) count = rc; while (count-- > 0) rpcrdma_sendcq_process_wc(wcs++); - } while (rc == RPCRDMA_POLLSIZE); + } while (rc == RPCRDMA_POLLSIZE && --budget); return 0; } @@ -254,8 +255,9 @@ static int rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { struct ib_wc *wcs; - int count, rc; + int budget, count, rc; + budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_recv_wcs; @@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) count = rc; while (count-- > 0) rpcrdma_recvcq_process_wc(wcs++); - } while (rc == RPCRDMA_POLLSIZE); + } while (rc == RPCRDMA_POLLSIZE && --budget); return 0; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cb4c882..0c3b88e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -74,6 +74,7 @@ struct rpcrdma_ia { * RDMA Endpoint -- one per transport instance */ +#define RPCRDMA_WC_BUDGET (128) #define RPCRDMA_POLLSIZE (16) struct rpcrdma_ep { -- cgit v1.1 From 65866f8259851cea5e356d2fd46fc37a4e26330e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:33:59 -0400 Subject: xprtrdma: Reduce the number of hardway buffer allocations While marshaling an RPC/RDMA request, the inline_{rsize,wsize} settings determine whether an inline request is used, or whether read or write chunk lists are built. The current default value of these settings is 1024. Any RPC request smaller than 1024 bytes is sent to the NFS server completely inline. rpcrdma_buffer_create() allocates and pre-registers a set of RPC buffers for each transport instance, also based on the inline rsize and wsize settings. RPC/RDMA requests and replies are built in these buffers. However, if an RPC/RDMA request is expected to be larger than 1024, a buffer has to be allocated and registered for that RPC, and deregistered and released when the RPC is complete. This is known as a "hardway allocation."
Since the introduction of NFSv4, the size of RPC requests has become larger, and hardway allocations are thus more frequent. Hardway allocations are significant overhead, and they waste the existing RPC buffers pre-allocated by rpcrdma_buffer_create(). We'd like fewer hardway allocations. Increasing the size of the pre-registered buffers is the most direct way to do this. However, a blanket increase of the inline thresholds has interoperability consequences. On my 64-bit system, rpcrdma_buffer_create() requests roughly 7000 bytes for each RPC request buffer, using kmalloc(). Due to internal fragmentation, this wastes nearly 1200 bytes because kmalloc() already returns an 8192-byte piece of memory for a 7000-byte allocation request, though the extra space remains unused. So let's round up the size of the pre-allocated buffers, and make use of the unused space in the kmalloc'd memory. This change reduces the amount of hardway allocated memory for an NFSv4 general connectathon run from 1322092 to 9472 bytes (99%). Signed-off-by: Chuck Lever Tested-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1d08366..c80995a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include #include /* for Tavor hack below */ #include +#include #include "xprt_rdma.h" @@ -1005,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { char *p; - size_t len; + size_t len, rlen, wlen; int i, rc; struct rpcrdma_mw *r; @@ -1120,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * Allocate/init the request/reply buffers. Doing this * using kmalloc for now -- one for each buf. */ + wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); + rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); + dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", + __func__, wlen, rlen); + for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; struct rpcrdma_rep *rep; - len = cdata->inline_wsize + sizeof(struct rpcrdma_req); - /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! 
*/ - /* Typical ~2400b, so rounding up saves work later */ - if (len < 4096) - len = 4096; - req = kmalloc(len, GFP_KERNEL); + req = kmalloc(wlen, GFP_KERNEL); if (req == NULL) { dprintk("RPC: %s: request buffer %d alloc" " failed\n", __func__, i); @@ -1141,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, buf->rb_send_bufs[i]->rl_buffer = buf; rc = rpcrdma_register_internal(ia, req->rl_base, - len - offsetof(struct rpcrdma_req, rl_base), + wlen - offsetof(struct rpcrdma_req, rl_base), &buf->rb_send_bufs[i]->rl_handle, &buf->rb_send_bufs[i]->rl_iov); if (rc) goto out; - buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); + buf->rb_send_bufs[i]->rl_size = wlen - + sizeof(struct rpcrdma_req); - len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); - rep = kmalloc(len, GFP_KERNEL); + rep = kmalloc(rlen, GFP_KERNEL); if (rep == NULL) { dprintk("RPC: %s: reply buffer %d alloc failed\n", __func__, i); @@ -1162,7 +1163,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, buf->rb_recv_bufs[i]->rr_buffer = buf; rc = rpcrdma_register_internal(ia, rep->rr_base, - len - offsetof(struct rpcrdma_rep, rr_base), + rlen - offsetof(struct rpcrdma_rep, rr_base), &buf->rb_recv_bufs[i]->rr_handle, &buf->rb_recv_bufs[i]->rr_iov); if (rc) -- cgit v1.1 From ec62f40d3505a643497d105c297093bb90afd44e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:07 -0400 Subject: xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting Devesh Sharma reports that after a disconnect, his HCA is failing to create a fresh QP, leaving ia_ri->ri_id->qp set to NULL. But xprtrdma still allows RPCs to wake up and post LOCAL_INV as they exit, causing an oops. rpcrdma_ep_connect() is allowing the wake-up by leaking the QP creation error code (-EPERM in this case) to the RPC client's generic layer. xprt_connect_status() does not recognize -EPERM, so it kills pending RPC tasks immediately rather than retrying the connect. Re-arrange the QP creation logic so that when it fails on reconnect, it leaves ->qp with the old QP rather than NULL. If pending RPC tasks wake and exit, LOCAL_INV work requests will flush rather than oops. On initial connect, leaving ->qp == NULL is OK, since there are no pending RPCs that might use ->qp. But be sure not to try to destroy a NULL QP when rpcrdma_ep_connect() is retried. 
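The re-arranged logic is a make-before-break pattern. A sketch of the reconnect path (new_id is the cm_id freshly obtained from rpcrdma_create_id(); error handling abbreviated):

	/* Create the replacement QP first, so ia->ri_id->qp is never
	 * observed as NULL by RPC tasks that wake and exit concurrently. */
	rc = rdma_create_qp(new_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		rdma_destroy_id(new_id);
		return -ENETUNREACH;	/* the old id and QP stay in place */
	}
	rdma_destroy_qp(ia->ri_id);
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = new_id;		/* swap in only after success */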
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c80995a..54edf2a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -867,6 +867,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (ep->rep_connected != 0) { struct rpcrdma_xprt *xprt; retry: + dprintk("RPC: %s: reconnecting...\n", __func__); rc = rpcrdma_ep_disconnect(ep, ia); if (rc && rc != -ENOTCONN) dprintk("RPC: %s: rpcrdma_ep_disconnect" @@ -879,7 +880,7 @@ retry: id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { - rc = PTR_ERR(id); + rc = -EHOSTUNREACH; goto out; } /* TEMP TEMP TEMP - fail if new device: @@ -893,20 +894,30 @@ retry: printk("RPC: %s: can't reconnect on " "different device!\n", __func__); rdma_destroy_id(id); - rc = -ENETDOWN; + rc = -ENETUNREACH; goto out; } /* END TEMP */ + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rdma_destroy_id(id); + rc = -ENETUNREACH; + goto out; + } rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = id; - } - - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - goto out; + } else { + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + /* do not update ep->rep_connected */ + return -ENETUNREACH; + } } /* XXX Tavor device performs badly with 2K MTU! */ -- cgit v1.1 From 5bc4bc729275a0bfc2bfd04466e8ab7c85af2f6e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:16 -0400 Subject: xprtrdma: Remove Tavor MTU setting Clean up. Remove HCA-specific clutter in xprtrdma, which is supposed to be device-independent. Hal Rosenstock observes: > Note that there is OpenSM option (enable_quirks) to return 1K MTU > in SA PathRecord responses for Tavor so that can be used for this. > The default setting for enable_quirks is FALSE so that would need > changing. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 54edf2a..515dfc1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -48,7 +48,6 @@ */ #include -#include /* for Tavor hack below */ #include #include @@ -920,19 +919,6 @@ retry: } } -/* XXX Tavor device performs badly with 2K MTU! 
*/ -if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { - struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); - if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && - (pcid->vendor == PCI_VENDOR_ID_MELLANOX || - pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { - struct ib_qp_attr attr = { - .path_mtu = IB_MTU_1024 - }; - rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); - } -} - ep->rep_connected = 0; rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); -- cgit v1.1 From 196c69989d84ab902bbe545f7bd8ce78ee74dac4 Mon Sep 17 00:00:00 2001 From: Shirley Ma Date: Wed, 28 May 2014 10:34:24 -0400 Subject: xprtrdma: Allocate missing pagelist GETACL relies on the transport layer to allocate memory for the reply buffer. However, xprtrdma assumes that the reply buffer (pagelist) has been pre-allocated by the upper layer. This problem was reported by an IOL OFA lab test on PPC. Signed-off-by: Shirley Ma Reviewed-by: Chuck Lever Tested-by: Edward Mossman Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 436d229..dc4a826 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -99,6 +99,12 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; while (len && n < nsegs) { + if (!ppages[p]) { + /* alloc the pagelist for receiving buffer */ + ppages[p] = alloc_page(GFP_ATOMIC); + if (!ppages[p]) + return 0; + } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); -- cgit v1.1 From bfaee096deaa680195df5491eb650f81051c145d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:32 -0400 Subject: xprtrdma: Use macros for reconnection timeout constants Clean up: Ensure the same max and min constant values are used everywhere when setting reconnect timeouts.
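For reference, the doubling-and-clamping these macros feed (visible in the transport.c hunk below) could equally be written with the kernel's clamp_t() helper; a sketch, not part of the patch:

	/* Exponential backoff between reconnect attempts, bounded by the
	 * named constants instead of bare (5 * HZ) / (30 * HZ) literals. */
	xprt->reestablish_timeout = clamp_t(unsigned long,
					    xprt->reestablish_timeout << 1,
					    RPCRDMA_INIT_REEST_TO,
					    RPCRDMA_MAX_REEST_TO);

Keeping the bounds in one header means the initial values set in xprt_setup_rdma() and the clamp in xprt_rdma_connect() can no longer drift apart.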
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d18b2a3..6b84d7d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = { #endif +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void @@ -285,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args) /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; - xprt->bind_timeout = (60U * HZ); - xprt->reestablish_timeout = (5U * HZ); - xprt->idle_timeout = (5U * 60 * HZ); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ @@ -432,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) schedule_delayed_work(&r_xprt->rdma_connect, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout > (30 * HZ)) - xprt->reestablish_timeout = (30 * HZ); - else if (xprt->reestablish_timeout < (5 * HZ)) - xprt->reestablish_timeout = (5 * HZ); + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; } else { schedule_delayed_work(&r_xprt->rdma_connect, 0); if (!RPC_IS_ASYNC(task)) -- cgit v1.1 From 18906972aa1103c07869c9b43860a52e0e27e8e5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:41 -0400 Subject: xprtrdma: Reset connection timeout after successful reconnect If the new connection is able to make forward progress, reset the re-establish timeout. Otherwise it keeps growing even if disconnect events are rare. The same behavior as TCP is adopted: reconnect immediately if the transport instance has been able to make some forward progress. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index dc4a826..ac65b0c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -770,6 +770,7 @@ repost: /* from here on, the reply is no longer an orphan */ req->rl_reply = rep; + xprt->reestablish_timeout = 0; /* check for expected message types */ /* The order of some of these tests is important. */ -- cgit v1.1 From 4f4cf5ad6fc1b16dc8dc9d750bb80b35eba5e98d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:49 -0400 Subject: SUNRPC: Move congestion window constants to header file I would like to use one of the RPC client's congestion algorithm constants in transport-specific code. 
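The constants being moved encode the estimator in fixed point, with RPC_CWNDSCALE (1 << RPC_CWNDSHIFT, i.e. 256) as the unit. Roughly, this is what xprt_adjust_cwnd() does with them (a sketch of the arithmetic only, not the full function):

	/* On a timeout: multiplicative decrease, floor of one request. */
	cwnd >>= 1;
	if (cwnd < RPC_CWNDSCALE)
		cwnd = RPC_CWNDSCALE;

	/* On a reply with a full pipe: additive increase of 1/cwnd of a
	 * request, so the window grows by about one slot per round trip. */
	cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
	if (cwnd > RPC_MAXCWND(xprt))
		cwnd = RPC_MAXCWND(xprt);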
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d173f79..2d1d5a6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. - */ -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) - -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * - * We use a time-smoothed congestion estimator to avoid heavy oscillation. + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. */ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { -- cgit v1.1 From e7ce710a8802351bd4118c5d6136c1d850f67cf9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:34:57 -0400 Subject: xprtrdma: Avoid deadlock when credit window is reset Update the cwnd while processing the server's reply. Otherwise the next task on the xprt_sending queue is still subject to the old credit window. Currently, no task is awoken if the old congestion window is still exceeded, even if the new window is larger, and a deadlock results. This is an issue during a transport reconnect. Servers don't normally shrink the credit window, but the client does reset it to 1 when reconnecting so the server can safely grow it again. As a minor optimization, remove the hack of grabbing the initial cwnd size (which happens to be RPC_CWNDSCALE) and using that value as the congestion scaling factor. The scaling value is invariant, and we are better off without the multiplication operation. 
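Since RPC_CWNDSCALE is by definition 1 << RPC_CWNDSHIFT, caching the initial cwnd as a per-transport scaling factor bought nothing; a sketch of the equivalence (credits is the server's advertised credit count):

	xprt->cwnd = credits << RPC_CWNDSHIFT;	/* what the patch does */
	xprt->cwnd = credits * RPC_CWNDSCALE;	/* arithmetically identical */

Doing this update in the reply handler, followed by xprt_release_rqst_cong() when the window grew, is what lets a task blocked on the old, smaller window wake up promptly.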
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++ net/sunrpc/xprtrdma/transport.c | 19 +------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 7 insertions(+), 19 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ac65b0c..77b84cf 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -716,6 +716,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; int rdmalen, status; + unsigned long cwnd; /* Check status. If bad, signal disconnect and return rep to pool */ if (rep->rr_len == ~0U) { @@ -845,6 +846,11 @@ badheader: break; } + cwnd = xprt->cwnd; + xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); xprt_complete_rqst(rqst->rq_task, status); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6b84d7d..187894b 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -448,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -static int -xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int credits = atomic_read(&r_xprt->rx_buf.rb_credits); - - /* == RPC_CWNDSCALE @ init, but *after* setup */ - if (r_xprt->rx_buf.rb_cwndscale == 0UL) { - r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; - dprintk("RPC: %s: cwndscale %lu\n", __func__, - r_xprt->rx_buf.rb_cwndscale); - BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); - } - xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; - return xprt_reserve_xprt_cong(xprt, task); -} - /* * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv @@ -686,7 +669,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) */ static struct rpc_xprt_ops xprt_rdma_procs = { - .reserve_xprt = xprt_rdma_reserve_xprt, + .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0c3b88e..89e7cd4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -212,7 +212,6 @@ struct rpcrdma_req { struct rpcrdma_buffer { spinlock_t rb_lock; /* protects indexes */ atomic_t rb_credits; /* most recent server credits */ - unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ int rb_send_index; -- cgit v1.1 From c977dea22708688eae31774f70126c97aa4dfe83 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:35:06 -0400 Subject: xprtrdma: Remove BUG_ON() call sites If an error occurs in the marshaling logic, fail the RPC request being processed, but leave the client running. 
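The conversion pattern is uniform: each assertion becomes cleanup plus an error return that the caller turns into a failed RPC. For example, the FRMR length check (mirroring the verbs.c hunk below):

	/* Instead of BUG_ON(frmr_wr.wr.fast_reg.length < len): back out
	 * the segments mapped so far and report a marshaling failure. */
	if (frmr_wr.wr.fast_reg.length < len) {
		while (seg1->mr_nsegs--)
			rpcrdma_unmap_one(ia, seg++);
		return -EIO;
	}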
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 187894b..93fe775 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -463,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) struct rpcrdma_req *req, *nreq; req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); - BUG_ON(NULL == req); + if (req == NULL) + return NULL; if (size > req->rl_size) { dprintk("RPC: %s: size %zd too large for buffer[%zd]: " diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 515dfc1..13dbd1c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1302,7 +1302,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) int i; unsigned long flags; - BUG_ON(req->rl_nchunks != 0); spin_lock_irqsave(&buffers->rb_lock, flags); buffers->rb_send_bufs[--buffers->rb_send_index] = req; req->rl_niovs = 0; @@ -1535,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, } else post_wr = &frmr_wr; - /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); - /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; @@ -1549,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, frmr_wr.wr.fast_reg.page_list_len = page_no; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - BUG_ON(frmr_wr.wr.fast_reg.length < len); + if (frmr_wr.wr.fast_reg.length < len) { + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + return -EIO; + } + + /* Bump the key */ + key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); @@ -1709,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, #if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: - BUG_ON(nsegs != 1); rpcrdma_unmap_one(ia, seg); - rc = 0; break; #endif -- cgit v1.1 From c93c62231cf55df4a26bd08937efeea97e6fc5e8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 May 2014 10:35:14 -0400 Subject: xprtrdma: Disconnect on registration failure If rpcrdma_register_external() fails during request marshaling, the current RPC request is killed. Instead, this RPC should be retried after reconnecting the transport instance. The most likely reason for registration failure with FRMR is a failed post_send, which would be due to a remote transport disconnect or memory exhaustion. These issues can be recovered by a retry. Problems encountered in the marshaling logic itself will not be corrected by trying again, so these should still kill a request. Now that we've added a clean exit for marshaling errors, take the opportunity to defang some BUG_ON's. 
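The resulting triage in the send path separates permanent from transient failures; a condensed sketch (matching the transport.c hunk below):

	rc = rpcrdma_marshal_req(rqst);
	if (rc < 0) {
		r_xprt->rx_stats.failed_marshal_count++;
		if (rc == -EIO)
			return -EIO;	/* malformed request: kill this RPC only */
		xprt_disconnect_done(xprt);
		return -ENOTCONN;	/* reconnect, then retry the RPC */
	}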
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 48 ++++++++++++++++++++++++++--------------- net/sunrpc/xprtrdma/transport.c | 17 +++++++++------ 2 files changed, 42 insertions(+), 23 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 77b84cf..693966d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -77,6 +77,8 @@ static const char transfertypes[][12] = { * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk * elements. Segments are then coalesced when registered, if possible * within the selected memreg mode. + * + * Returns positive number of segments converted, or a negative errno. */ static int @@ -103,12 +105,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, /* alloc the pagelist for receiving buffer */ ppages[p] = alloc_page(GFP_ATOMIC); if (!ppages[p]) - return 0; + return -ENOMEM; } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - BUG_ON(seg[n].mr_len > PAGE_SIZE); + if (seg[n].mr_len > PAGE_SIZE) + return -EIO; len -= seg[n].mr_len; ++n; ++p; @@ -117,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, /* Message overflows the seg array */ if (len && n == nsegs) - return 0; + return -EIO; if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing @@ -126,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; if (n == nsegs) /* Tail remains, but we're out of segments */ - return 0; + return -EIO; seg[n].mr_page = NULL; seg[n].mr_offset = xdrbuf->tail[0].iov_base; seg[n].mr_len = xdrbuf->tail[0].iov_len; @@ -167,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * Reply chunk (a counted array): * N elements: * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns positive RPC/RDMA header size, or negative errno. */ -static unsigned int +static ssize_t rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int nsegs, nchunks = 0; + int n, nsegs, nchunks = 0; unsigned int pos; struct rpcrdma_mr_seg *seg = req->rl_segments; struct rpcrdma_read_chunk *cur_rchunk = NULL; @@ -201,11 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, pos = target->head[0].iov_len; nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); - if (nsegs == 0) - return 0; + if (nsegs < 0) + return nsegs; do { - int n = rpcrdma_register_external(seg, nsegs, + n = rpcrdma_register_external(seg, nsegs, cur_wchunk != NULL, r_xprt); if (n <= 0) goto out; @@ -277,7 +282,7 @@ out: for (pos = 0; nchunks--;) pos += rpcrdma_deregister_external( &req->rl_segments[pos], r_xprt); - return 0; + return n; } /* @@ -359,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. * [2] -- optional padding. * [3] -- if padded, header only in [1] and data here. + * + * Returns zero on success, otherwise a negative errno. 
*/ int @@ -368,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t hdrlen, rpclen, padlen; + size_t rpclen, padlen; + ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -439,7 +447,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) wtype = rpcrdma_noch; - BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + dprintk("RPC: %s: cannot marshal multiple chunk lists\n", + __func__); + return -EIO; + } hdrlen = 28; /*sizeof *headerp;*/ padlen = 0; @@ -464,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - BUG_ON(wtype != rpcrdma_noch); - + if (wtype != rpcrdma_noch) { + dprintk("RPC: %s: invalid chunk list\n", + __func__); + return -EIO; + } } else { headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; @@ -500,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, headerp, wtype); } - - if (hdrlen == 0) - return -1; + if (hdrlen < 0) + return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 93fe775..66f91f0 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -595,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; - /* marshal the send itself */ - if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { - r_xprt->rx_stats.failed_marshal_count++; - dprintk("RPC: %s: rpcrdma_marshal_req failed\n", - __func__); - return -EIO; + if (req->rl_niovs == 0) { + rc = rpcrdma_marshal_req(rqst); + if (rc < 0) + goto failed_marshal; } if (req->rl_reply == NULL) /* e.g. reconnection */ @@ -625,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_bytes_sent = 0; return 0; +failed_marshal: + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", + __func__, rc); + if (rc == -EIO) + return -EIO; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ -- cgit v1.1
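Taken together, the last two patches give the marshaling stack one consistent error convention. As a closing summary (return values as of the end of this series; see the kerneldoc comments added above):

	/* rpcrdma_convert_iovs()   : segment count, or -ENOMEM / -EIO
	 * rpcrdma_create_chunks()  : RPC/RDMA header size, or negative errno
	 * rpcrdma_marshal_req()    : zero, or the negative errno from below
	 * xprt_rdma_send_request() : -EIO fails the RPC immediately; any
	 *                            other errno disconnects the transport
	 *                            so the RPC is retried after reconnect */
	hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, headerp, wtype);
	if (hdrlen < 0)
		return hdrlen;		/* propagated unchanged to the sender */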