From 9faaff593404a9c4e5abc6839a641635d7b9d0cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:40 +0000 Subject: rxrpc: Provide a different lockdep key for call->user_mutex for kernel calls Provide a different lockdep key for rxrpc_call::user_mutex when the call is made on a kernel socket, such as by the AFS filesystem. The problem is that lockdep registers a false positive between userspace calling the sendmsg syscall on a user socket where call->user_mutex is held whilst userspace memory is accessed whereas the AFS filesystem may perform operations with mmap_sem held by the caller. In such a case, the following warning is produced. ====================================================== WARNING: possible circular locking dependency detected 4.14.0-fscache+ #243 Tainted: G E ------------------------------------------------------ modpost/16701 is trying to acquire lock: (&vnode->io_lock){+.+.}, at: [] afs_begin_vnode_operation+0x33/0x77 [kafs] but task is already holding lock: (&mm->mmap_sem){++++}, at: [] __do_page_fault+0x1ef/0x486 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++}: __might_fault+0x61/0x89 _copy_from_iter_full+0x40/0x1fa rxrpc_send_data+0x8dc/0xff3 rxrpc_do_sendmsg+0x62f/0x6a1 rxrpc_sendmsg+0x166/0x1b7 sock_sendmsg+0x2d/0x39 ___sys_sendmsg+0x1ad/0x22b __sys_sendmsg+0x41/0x62 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #2 (&call->user_mutex){+.+.}: __mutex_lock+0x86/0x7d2 rxrpc_new_client_call+0x378/0x80e rxrpc_kernel_begin_call+0xf3/0x154 afs_make_call+0x195/0x454 [kafs] afs_vl_get_capabilities+0x193/0x198 [kafs] afs_vl_lookup_vldb+0x5f/0x151 [kafs] afs_create_volume+0x2e/0x2f4 [kafs] afs_mount+0x56a/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #1 (k-sk_lock-AF_RXRPC){+.+.}: lock_sock_nested+0x74/0x8a rxrpc_kernel_begin_call+0x8a/0x154 afs_make_call+0x195/0x454 [kafs] afs_fs_get_capabilities+0x17a/0x17f [kafs] afs_probe_fileserver+0xf7/0x2f0 [kafs] afs_select_fileserver+0x83f/0x903 [kafs] afs_fetch_status+0x89/0x11d [kafs] afs_iget+0x16f/0x4f8 [kafs] afs_mount+0x6c6/0x8d7 [kafs] mount_fs+0x6a/0x109 vfs_kern_mount+0x67/0x135 do_mount+0x90b/0xb57 SyS_mount+0x72/0x98 do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 -> #0 (&vnode->io_lock){+.+.}: lock_acquire+0x174/0x19f __mutex_lock+0x86/0x7d2 afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 __clear_user+0x3d/0x60 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be return_from_SYSCALL_64+0x0/0x75 other info that might help us debug this: Chain exists of: &vnode->io_lock --> &call->user_mutex --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&call->user_mutex); lock(&mm->mmap_sem); lock(&vnode->io_lock); *** DEADLOCK *** 1 lock held by modpost/16701: #0: (&mm->mmap_sem){++++}, at: [] __do_page_fault+0x1ef/0x486 stack backtrace: CPU: 0 PID: 16701 Comm: modpost Tainted: G E 4.14.0-fscache+ #243 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack+0x67/0x8e print_circular_bug+0x341/0x34f check_prev_add+0x11f/0x5d4 ? add_lock_to_list.isra.12+0x8b/0x8b ? add_lock_to_list.isra.12+0x8b/0x8b ? __lock_acquire+0xf77/0x10b4 __lock_acquire+0xf77/0x10b4 lock_acquire+0x174/0x19f ? afs_begin_vnode_operation+0x33/0x77 [kafs] __mutex_lock+0x86/0x7d2 ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] ? afs_begin_vnode_operation+0x33/0x77 [kafs] afs_begin_vnode_operation+0x33/0x77 [kafs] afs_fetch_data+0x80/0x12a [kafs] afs_readpages+0x314/0x405 [kafs] __do_page_cache_readahead+0x203/0x2ba ? filemap_fault+0x179/0x54d filemap_fault+0x179/0x54d __do_fault+0x17/0x60 __handle_mm_fault+0x6d7/0x95c handle_mm_fault+0x24e/0x2a3 __do_page_fault+0x301/0x486 do_page_fault+0x236/0x259 page_fault+0x22/0x30 RIP: 0010:__clear_user+0x3d/0x60 RSP: 0018:ffff880071e93da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 000000000000011c RCX: 000000000000011c RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000060f720 RBP: 000000000060f720 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8800b5459b68 R12: ffff8800ce150e00 R13: 000000000060f720 R14: 00000000006127a8 R15: 0000000000000000 padzero+0x1c/0x2b load_elf_binary+0x785/0xdc7 search_binary_handler+0x81/0x1ff do_execveat_common.isra.14+0x600/0x888 do_execve+0x1f/0x21 SyS_execve+0x28/0x2f do_syscall_64+0x89/0x1be entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fdb6009ee07 RSP: 002b:00007fff566d9728 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 000055ba57280900 RCX: 00007fdb6009ee07 RDX: 000055ba5727f270 RSI: 000055ba5727cac0 RDI: 000055ba57280900 RBP: 000055ba57280900 R08: 00007fff566d9700 R09: 0000000000000000 R10: 000055ba5727cac0 R11: 0000000000000246 R12: 0000000000000000 R13: 000055ba5727cac0 R14: 000055ba5727f270 R15: 0000000000000000 Signed-off-by: David Howells --- net/rxrpc/call_object.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'net/rxrpc/call_object.c') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4c7fbc6..1f141dc 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -55,6 +55,8 @@ static void rxrpc_call_timer_expired(unsigned long _call) rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); } +static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; + /* * find an extant server call * - called in process context with IRQs enabled @@ -95,7 +97,7 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) { struct rxrpc_call *call; @@ -114,6 +116,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) goto nomem_2; mutex_init(&call->user_mutex); + + /* Prevent lockdep reporting a deadlock false positive between the afs + * filesystem and sys_sendmsg() via the mmap sem. + */ + if (rx->sk.sk_kern_sock) + lockdep_set_class(&call->user_mutex, + &rxrpc_call_user_mutex_lock_class_key); + setup_timer(&call->timer, rxrpc_call_timer_expired, (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); @@ -151,7 +161,8 @@ nomem: /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; @@ -159,7 +170,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, _enter(""); - call = rxrpc_alloc_call(gfp); + call = rxrpc_alloc_call(rx, gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -210,7 +221,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); -- cgit v1.1 From 4812417894770f8c13e5dd8a66479ae44f4b01ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Split the call params from the operation params When rxrpc_sendmsg() parses the control message buffer, it places the parameters extracted into a structure, but lumps together call parameters (such as user call ID) with operation parameters (such as whether to send data, send an abort or accept a call). Split the call parameters out into their own structure, a copy of which is then embedded in the operation parameters struct. The call parameters struct is then passed down into the places that need it instead of passing the individual parameters. This allows for extra call parameters to be added. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/rxrpc/call_object.c') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1f141dc..c3e1fa8 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -208,8 +208,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, - unsigned long user_call_ID, - s64 tx_total_len, + struct rxrpc_call_params *p, gfp_t gfp) __releases(&rx->sk.sk_lock.slock) { @@ -219,7 +218,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, const void *here = __builtin_return_address(0); int ret; - _enter("%p,%lx", rx, user_call_ID); + _enter("%p,%lx", rx, p->user_call_ID); call = rxrpc_alloc_client_call(rx, srx, gfp); if (IS_ERR(call)) { @@ -228,9 +227,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - call->tx_total_len = tx_total_len; + call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), - here, (const void *)user_call_ID); + here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. @@ -246,16 +245,16 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, parent = *pp; xcall = rb_entry(parent, struct rxrpc_call, sock_node); - if (user_call_ID < xcall->user_call_ID) + if (p->user_call_ID < xcall->user_call_ID) pp = &(*pp)->rb_left; - else if (user_call_ID > xcall->user_call_ID) + else if (p->user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else goto error_dup_user_ID; } rcu_assign_pointer(call->socket, rx); - call->user_call_ID = user_call_ID; + call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); -- cgit v1.1 From a158bdd3247b9656df36ba133235fff702e9fdc3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:41 +0000 Subject: rxrpc: Fix call timeouts Fix the rxrpc call expiration timeouts and make them settable from userspace. By analogy with other rx implementations, there should be three timeouts: (1) "Normal timeout" This is set for all calls and is triggered if we haven't received any packets from the peer in a while. It is measured from the last time we received any packet on that call. This is not reset by any connection packets (such as CHALLENGE/RESPONSE packets). If a service operation takes a long time, the server should generate PING ACKs at a duration that's substantially less than the normal timeout so is to keep both sides alive. This is set at 1/6 of normal timeout. (2) "Idle timeout" This is set only for a service call and is triggered if we stop receiving the DATA packets that comprise the request data. It is measured from the last time we received a DATA packet. (3) "Hard timeout" This can be set for a call and specified the maximum lifetime of that call. It should not be specified by default. Some operations (such as volume transfer) take a long time. Allow userspace to set/change the timeouts on a call with sendmsg, using a control message: RXRPC_SET_CALL_TIMEOUTS The data to the message is a number of 32-bit words, not all of which need be given: u32 hard_timeout; /* sec from first packet */ u32 idle_timeout; /* msec from packet Rx */ u32 normal_timeout; /* msec from data Rx */ This can be set in combination with any other sendmsg() that affects a call. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net/rxrpc/call_object.c') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c3e1fa8..b305970 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -51,8 +51,10 @@ static void rxrpc_call_timer_expired(unsigned long _call) _enter("%d", call->debug_id); - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); + if (call->state < RXRPC_CALL_COMPLETE) { + trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + rxrpc_queue_call(call); + } } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; @@ -139,6 +141,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) atomic_set(&call->usage, 1); call->debug_id = atomic_inc_return(&rxrpc_debug_id); call->tx_total_len = -1; + call->next_rx_timo = 20 * HZ; + call->next_req_timo = 1 * HZ; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); @@ -189,15 +193,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, */ static void rxrpc_start_call_timer(struct rxrpc_call *call) { - ktime_t now = ktime_get_real(), expire_at; - - expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); - call->expire_at = expire_at; - call->ack_at = expire_at; - call->ping_at = expire_at; - call->resend_at = expire_at; - call->timer.expires = jiffies + LONG_MAX / 2; - rxrpc_set_timer(call, rxrpc_timer_begin, now); + unsigned long now = jiffies; + unsigned long j = now + MAX_JIFFY_OFFSET; + + call->ack_at = j; + call->resend_at = j; + call->ping_at = j; + call->expect_rx_by = j; + call->expect_req_by = j; + call->expect_term_by = j; + call->timer.expires = now; } /* -- cgit v1.1 From bd1fdf8cfdf3fdbccd2b21c33ec649ebd7429af7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2017 10:18:42 +0000 Subject: rxrpc: Add a timeout for detecting lost ACKs/lost DATA Add an extra timeout that is set/updated when we send a DATA packet that has the request-ack flag set. This allows us to detect if we don't get an ACK in response to the latest flagged packet. The ACK packet is adjudged to have been lost if it doesn't turn up within 2*RTT of the transmission. If the timeout occurs, we schedule the sending of a PING ACK to find out the state of the other side. If a new DATA packet is ready to go sooner, we cancel the sending of the ping and set the request-ack flag on that instead. If we get back a PING-RESPONSE ACK that indicates a lower tx_top than what we had at the time of the ping transmission, we adjudge all the DATA packets sent between the response tx_top and the ping-time tx_top to have been lost and retransmit immediately. Rather than sending a PING ACK, we could just pick a DATA packet and speculatively retransmit that with request-ack set. It should result in either a REQUESTED ACK or a DUPLICATE ACK which we can then use in lieu the a PING-RESPONSE ACK mentioned above. Signed-off-by: David Howells --- net/rxrpc/call_object.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/rxrpc/call_object.c') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index b305970..7ee3d6c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -197,6 +197,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) unsigned long j = now + MAX_JIFFY_OFFSET; call->ack_at = j; + call->ack_lost_at = j; call->resend_at = j; call->ping_at = j; call->expect_rx_by = j; -- cgit v1.1