From 92b98361f119a6919061d067c6584de9ae2de9f4 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:14:12 -0500
Subject: xprtrdma: Return an errno from rpcrdma_register_external()

The RPC/RDMA send_request method and the chunk registration code
expect an errno from the registration function. This allows the
upper layers to distinguish between a recoverable failure (for
example, temporary memory exhaustion) and a hard failure (for
example, a bug in the registration logic).

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 61c4129..6ea2942 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1918,10 +1918,10 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
 		break;

 	default:
-		return -1;
+		return -EIO;
 	}
 	if (rc)
-		return -1;
+		return rc;
 	return nsegs;
 }

--
cgit v1.1
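A note on the change above: with a real errno, callers can branch on the
failure class instead of treating every failure as fatal. A minimal,
self-contained sketch of that caller-side logic (the function name and
return conventions here are illustrative, not the actual xprtrdma call
sites):

    #include <errno.h>
    #include <stdio.h>

    /* Hypothetical caller: retry transient failures, surface hard ones. */
    static int handle_registration_result(int rc)
    {
            if (rc == -ENOMEM)
                    return 1;       /* recoverable: back off and retry */
            if (rc < 0)
                    return -1;      /* hard failure: drop the connection */
            return 0;               /* rc is the count of registered segments */
    }

    int main(void)
    {
            printf("-ENOMEM -> %d\n", handle_registration_result(-ENOMEM));
            printf("-EIO    -> %d\n", handle_registration_result(-EIO));
            printf("3 segs  -> %d\n", handle_registration_result(3));
            return 0;
    }

With the old "return -1" convention (which happens to equal -EPERM), none
of these cases could be told apart.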
From e7104a2a96069975d489c60a30564372c6273a85 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:14:20 -0500
Subject: xprtrdma: Cap req_cqinit

Recent work made FRMR registration and invalidation completions
unsignaled. This greatly reduces the adapter interrupt rate.

Every so often, however, a posted send Work Request is allowed to
signal. Otherwise, the provider's Work Queue will wrap and the
workload will hang.

The number of Work Requests that are allowed to remain unsignaled
is determined by the value of rep_cqinit. Currently, this is set
to the size of the send Work Queue divided by two, minus 1.

For FRMR, the send Work Queue is the maximum number of concurrent
RPCs (currently 32) times the maximum number of Work Requests an
RPC might use (currently 7, though some adapters may need more).
For mlx4, this is 224 entries. This leaves completion signaling
disabled for 111 send Work Requests.

Some providers hold back dispatching Work Requests until a CQE is
generated. If completions are disabled, then no CQEs are generated
for quite some time, and that can stall the Work Queue.

I've seen this occur when running xfstests generic/113 over NFSv4:
eventually, posting a FAST_REG_MR Work Request fails with -ENOMEM
because the Work Queue has overflowed. The connection is dropped
and re-established.

Cap the rep_cqinit setting so completions are not left turned off
for too long.

BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=269
Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c     | 4 +++-
 net/sunrpc/xprtrdma/xprt_rdma.h | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 6ea2942..af45cf3 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -733,7 +733,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,

 	/* set trigger for requesting send completion */
 	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
-	if (ep->rep_cqinit <= 2)
+	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
+		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
+	else if (ep->rep_cqinit <= 2)
 		ep->rep_cqinit = 0;
 	INIT_CQCOUNT(ep);
 	ep->rep_ia = ia;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ac7fc9a..b799041 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -97,6 +97,12 @@ struct rpcrdma_ep {
 	struct ib_wc		rep_recv_wcs[RPCRDMA_POLLSIZE];
 };

+/*
+ * Force a signaled SEND Work Request every so often,
+ * in case the provider needs to do some housekeeping.
+ */
+#define RPCRDMA_MAX_UNSIGNALED_SENDS	(32)
+
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)

--
cgit v1.1
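The arithmetic in the description can be checked directly. A standalone
sketch of the capped trigger calculation (RPCRDMA_MAX_UNSIGNALED_SENDS is
copied from the patch; the helper name is illustrative):

    #include <stdio.h>

    #define RPCRDMA_MAX_UNSIGNALED_SENDS    (32)

    /* 32 concurrent RPCs x 7 Work Requests = 224 send queue entries,
     * so the old trigger was 224/2 - 1 = 111 unsignaled sends before
     * one signaled send. The cap pulls that down to 32. */
    static int cqinit_for(unsigned int max_send_wr)
    {
            int cqinit = max_send_wr / 2 - 1;

            if (cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
                    cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
            else if (cqinit <= 2)
                    cqinit = 0;     /* tiny queue: signal every send */
            return cqinit;
    }

    int main(void)
    {
            printf("mlx4 (%d WRs) -> cqinit %d\n", 32 * 7, cqinit_for(32 * 7));
            printf("tiny (%d WRs) -> cqinit %d\n", 4, cqinit_for(4));
            return 0;
    }

This prints cqinit 32 for the mlx4 case (instead of the old 111) and 0
for a tiny queue, where every send stays signaled.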
From 467c9674bccc073684ee34f4bd205cf1b135d76e Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:14:29 -0500
Subject: xprtrdma: unmap all FMRs during transport disconnect

When using RPCRDMA_MTHCAFMR memory registration, after a few
transport disconnect / reconnect cycles, ib_map_phys_fmr() starts
to return EINVAL because the provider has exhausted its map pool.

Make sure that all FMRs are unmapped during transport disconnect,
and that ->send_request remarshals them during an RPC retransmit.
This resets the transport's MRs to ensure that none are leaked
during a disconnect.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/transport.c |  2 +-
 net/sunrpc/xprtrdma/verbs.c     | 42 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 6a4615d..cfe9a81 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -599,7 +599,7 @@ xprt_rdma_send_request(struct rpc_task *task)

 	if (req->rl_niovs == 0)
 		rc = rpcrdma_marshal_req(rqst);
-	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+	else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
 		rc = rpcrdma_marshal_chunks(rqst, 0);
 	if (rc < 0)
 		goto failed_marshal;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index af45cf3..3c88276 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -62,6 +62,7 @@
 #endif

 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);

 /*
  * internal functions
@@ -868,8 +869,19 @@ retry:
 		rpcrdma_ep_disconnect(ep, ia);
 		rpcrdma_flush_cqs(ep);

-		if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+		switch (ia->ri_memreg_strategy) {
+		case RPCRDMA_FRMR:
 			rpcrdma_reset_frmrs(ia);
+			break;
+		case RPCRDMA_MTHCAFMR:
+			rpcrdma_reset_fmrs(ia);
+			break;
+		case RPCRDMA_ALLPHYSICAL:
+			break;
+		default:
+			rc = -EIO;
+			goto out;
+		}

 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 		id = rpcrdma_create_id(xprt, ia,
@@ -1289,6 +1301,34 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	kfree(buf->rb_pool);
 }

+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	LIST_HEAD(l);
+	int rc;
+
+	list_for_each(pos, &buf->rb_all) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+		INIT_LIST_HEAD(&l);
+		list_add(&r->r.fmr->list, &l);
+		rc = ib_unmap_fmr(&l);
+		if (rc)
+			dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
+				__func__, rc);
+	}
+}
+
 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
  * an unusable state. Find FRMRs in this state and dereg / reg
  * each.  FRMRs that are VALID and attached to an rpcrdma_req are

--
cgit v1.1
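The reset loop above issues one ib_unmap_fmr() call per MR, which keeps
error reporting per-MR. Since ib_unmap_fmr() accepts a list of FMRs, a
batched variant is also possible. A sketch of that alternative (not what
the patch does; rpcrdma_reset_fmrs_batched is a hypothetical name, using
the same structures as verbs.c):

    static void
    rpcrdma_reset_fmrs_batched(struct rpcrdma_ia *ia)
    {
            struct rpcrdma_xprt *r_xprt =
                            container_of(ia, struct rpcrdma_xprt, rx_ia);
            struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
            struct rpcrdma_mw *r;
            LIST_HEAD(l);
            int rc;

            /* Collect every FMR on one list... */
            list_for_each_entry(r, &buf->rb_all, mw_all)
                    list_add(&r->r.fmr->list, &l);

            /* ...and release all of their mappings in a single verb call,
             * at the cost of not knowing which MR failed. */
            rc = ib_unmap_fmr(&l);
            if (rc)
                    dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
                            __func__, rc);
    }

The per-MR loop the patch chose trades one verb call per MR for precise
diagnostics when a particular unmap fails.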
From f1a03b76fecdb23983e2ad7e817e98d093ece9f7 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:14:37 -0500
Subject: xprtrdma: Refactor tasklet scheduling

Restore the separate function that schedules the reply handling
tasklet. I need to call it from two different paths.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3c88276..478b2fd 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -107,6 +107,17 @@ rpcrdma_run_tasklet(unsigned long data)
 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

 static void
+rpcrdma_schedule_tasklet(struct list_head *sched_list)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_splice_tail(sched_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+static void
 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;
@@ -244,7 +255,6 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 	struct list_head sched_list;
 	struct ib_wc *wcs;
 	int budget, count, rc;
-	unsigned long flags;

 	INIT_LIST_HEAD(&sched_list);
 	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
@@ -262,10 +272,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
 	rc = 0;

 out_schedule:
-	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
-	list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
-	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
-	tasklet_schedule(&rpcrdma_tasklet_g);
+	rpcrdma_schedule_tasklet(&sched_list);
 	return rc;
 }

--
cgit v1.1
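The helper consolidates a pattern used by the receive completion path
and, in the next patch, the CQ flush path: batch replies on a private
list, splice them onto the global tasklet list under the irq-safe lock
(the producers can run in interrupt context), then schedule the tasklet
once for the whole batch. In generic form (a sketch with illustrative
names, not xprtrdma code):

    #include <linux/interrupt.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    /* Splice a privately built batch onto a shared work list and poke
     * the tasklet exactly once, no matter how large the batch is. */
    static void
    schedule_batch(struct list_head *batch, struct list_head *global,
                   spinlock_t *lock, struct tasklet_struct *t)
    {
            unsigned long flags;

            spin_lock_irqsave(lock, flags);
            list_splice_tail(batch, global);
            spin_unlock_irqrestore(lock, flags);
            tasklet_schedule(t);
    }

Holding the lock only for the splice keeps the critical section short;
the tasklet itself drains the global list later, outside the lock.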
") BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=276 Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 478b2fd..e6ac964 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -317,8 +317,15 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) static void rpcrdma_flush_cqs(struct rpcrdma_ep *ep) { - rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); - rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); + struct ib_wc wc; + LIST_HEAD(sched_list); + + while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) + rpcrdma_recvcq_process_wc(&wc, &sched_list); + if (!list_empty(&sched_list)) + rpcrdma_schedule_tasklet(&sched_list); + while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0) + rpcrdma_sendcq_process_wc(&wc); } #ifdef RPC_DEBUG -- cgit v1.1 From d5440e27d3e572272db957d6b9661a902b7c339f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Nov 2014 20:14:53 -0500 Subject: xprtrdma: Enable pad optimization The Linux NFS/RDMA server used to reject NFSv3 WRITE requests when pad optimization was enabled. That bug was fixed by commit e560e3b510d2 ("svcrdma: Add zero padding if the client doesn't send it"). We can now enable pad optimization on the client, which helps performance and is supported now by both Linux and Solaris servers. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index cfe9a81..8ed2576 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,7 +73,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; - int xprt_rdma_pad_optimize = 0; + int xprt_rdma_pad_optimize = 1; #ifdef RPC_DEBUG -- cgit v1.1 From 7ff11de1bae02a41cac6503f858218ac1b9a3cbe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Nov 2014 20:15:01 -0500 Subject: xprtrdma: Display async errors An async error upcall is a hard error, and should be reported in the system log. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e6ac964..5783c1a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -106,6 +106,32 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); +static const char * const async_event[] = { + "CQ error", + "QP fatal error", + "QP request error", + "QP access error", + "communication established", + "send queue drained", + "path migration successful", + "path mig error", + "device fatal error", + "port active", + "port error", + "LID change", + "P_key change", + "SM change", + "SRQ error", + "SRQ limit reached", + "last WQE reached", + "client reregister", + "GID change", +}; + +#define ASYNC_MSG(status) \ + ((status) < ARRAY_SIZE(async_event) ? 
From d5440e27d3e572272db957d6b9661a902b7c339f Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:14:53 -0500
Subject: xprtrdma: Enable pad optimization

The Linux NFS/RDMA server used to reject NFSv3 WRITE requests when
pad optimization was enabled. That bug was fixed by commit
e560e3b510d2 ("svcrdma: Add zero padding if the client doesn't
send it").

We can now enable pad optimization on the client. It improves
performance and is now supported by both Linux and Solaris servers.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index cfe9a81..8ed2576 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -73,7 +73,7 @@
 static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
-		int xprt_rdma_pad_optimize = 0;
+		int xprt_rdma_pad_optimize = 1;

 #ifdef RPC_DEBUG

--
cgit v1.1

From 7ff11de1bae02a41cac6503f858218ac1b9a3cbe Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sat, 8 Nov 2014 20:15:01 -0500
Subject: xprtrdma: Display async errors

An async error upcall is a hard error and should be reported in
the system log.

Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/verbs.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e6ac964..5783c1a 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -106,6 +106,32 @@ rpcrdma_run_tasklet(unsigned long data)

 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

+static const char * const async_event[] = {
+	"CQ error",
+	"QP fatal error",
+	"QP request error",
+	"QP access error",
+	"communication established",
+	"send queue drained",
+	"path migration successful",
+	"path mig error",
+	"device fatal error",
+	"port active",
+	"port error",
+	"LID change",
+	"P_key change",
+	"SM change",
+	"SRQ error",
+	"SRQ limit reached",
+	"last WQE reached",
+	"client reregister",
+	"GID change",
+};
+
+#define ASYNC_MSG(status)					\
+	((status) < ARRAY_SIZE(async_event) ?			\
+		async_event[(status)] : "unknown async error")
+
 static void
 rpcrdma_schedule_tasklet(struct list_head *sched_list)
 {
@@ -122,8 +148,9 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;

-	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
-		__func__, event->event, event->device->name, context);
+	pr_err("RPC:       %s: %s on device %s ep %p\n",
+	       __func__, ASYNC_MSG(event->event),
+	       event->device->name, context);
 	if (ep->rep_connected == 1) {
 		ep->rep_connected = -EIO;
 		ep->rep_func(ep);
@@ -136,8 +163,9 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
 {
 	struct rpcrdma_ep *ep = context;

-	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
-		__func__, event->event, event->device->name, context);
+	pr_err("RPC:       %s: %s on device %s ep %p\n",
+	       __func__, ASYNC_MSG(event->event),
+	       event->device->name, context);
 	if (ep->rep_connected == 1) {
 		ep->rep_connected = -EIO;
 		ep->rep_func(ep);

--
cgit v1.1
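ASYNC_MSG() is a bounds-checked table lookup: event codes that index
into the table yield a human-readable string, and anything past the end
falls back to a fixed message rather than reading out of range. The same
logic as a self-contained userspace program (table trimmed to two
entries; ARRAY_SIZE is open-coded here, while in the kernel it comes
from <linux/kernel.h>):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    /* Trimmed copy of the table; the full version above carries one
     * string per IB async event code, in enum order. */
    static const char * const async_event[] = {
            "CQ error",
            "QP fatal error",
    };

    #define ASYNC_MSG(status) \
            ((status) < ARRAY_SIZE(async_event) ? \
                    async_event[(status)] : "unknown async error")

    int main(void)
    {
            printf("event 0:  %s\n", ASYNC_MSG(0U));   /* in the table */
            printf("event 99: %s\n", ASYNC_MSG(99U));  /* out of range */
            return 0;
    }

The bounds check matters because the provider, not this code, chooses
the event value, so a newer adapter or core stack can report codes the
table has never heard of.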