From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sctp/socket.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c194127..f34adcc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5702,7 +5702,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sctp_sock *sp = sctp_sk(sk); unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + poll_wait(file, sk_sleep(sk), wait); /* A TCP-style listening socket becomes readable when the accept queue * is not empty. @@ -5943,7 +5943,7 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) int error; DEFINE_WAIT(wait); - prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); @@ -5980,14 +5980,14 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) sctp_lock_sock(sk); ready: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return 0; interrupted: error = sock_intr_errno(*timeo_p); out: - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); *err = error; return error; } @@ -6061,8 +6061,8 @@ static void __sctp_write_space(struct sctp_association *asoc) wake_up_interruptible(&asoc->wait); if (sctp_writeable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. 
@@ -6296,7 +6296,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo) for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&ep->asocs)) { @@ -6322,7 +6322,7 @@ static int sctp_wait_for_accept(struct sock *sk, long timeo) break; } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; } @@ -6332,7 +6332,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout) DEFINE_WAIT(wait); do { - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&sctp_sk(sk)->ep->asocs)) break; sctp_release_sock(sk); @@ -6340,7 +6340,7 @@ static void sctp_wait_for_close(struct sock *sk, long timeout) sctp_lock_sock(sk); } while (!signal_pending(current) && timeout); - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); } static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk) -- cgit v1.1 From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Apr 2010 15:13:20 -0700 Subject: net: sk_add_backlog() take rmem_alloc into account Current socket backlog limit is not enough to really stop DDOS attacks, because user thread spend many time to process a full backlog each round, and user might crazy spin on socket lock. We should add backlog size and receive_queue size (aka rmem_alloc) to pace writers, and let user run without being slow down too much. Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in stress situations. Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp receiver can now process ~200.000 pps (instead of ~100 pps before the patch) on a 8 core machine. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sctp/socket.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f34adcc..13d8229 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3721,9 +3721,6 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); percpu_counter_inc(&sctp_sockets_allocated); - /* Set socket backlog limit. */ - sk->sk_backlog.limit = sysctl_sctp_rmem[1]; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); local_bh_enable(); -- cgit v1.1 From 561b1733a465cf9677356b40c27653dd45f1ac56 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 28 Apr 2010 08:47:18 +0000 Subject: sctp: avoid irq lock inversion while call sk->sk_data_ready() sk->sk_data_ready() of sctp socket can be called from both BH and non-BH contexts, but the default sk->sk_data_ready(), sock_def_readable(), can not be used in this case. Therefore, we have to make a new function sctp_data_ready() to grab sk->sk_data_ready() with BH disabling. ========================================================= [ INFO: possible irq lock inversion dependency detected ] 2.6.33-rc6 #129 --------------------------------------------------------- sctp_darn/1517 just changed the state of lock: (clock-AF_INET){++.?..}, at: [] sock_def_readable+0x20/0x80 but this lock took another, SOFTIRQ-unsafe lock in the past: (slock-AF_INET){+.-...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: 1 lock held by sctp_darn/1517: #0: (sk_lock-AF_INET){+.+.+.}, at: [] sctp_sendmsg+0x23d/0xc00 [sctp] Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/endpointola.c | 1 + net/sctp/socket.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'net/sctp') diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 905fda5..7ec09ba 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -144,6 +144,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; + sk->sk_data_ready = sctp_data_ready; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 007e8ba..efa2bc3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6189,6 +6189,16 @@ do_nonblock: goto out; } +void sctp_data_ready(struct sock *sk, int len) +{ + read_lock_bh(&sk->sk_callback_lock); + if (sk_has_sleeper(sk)) + wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + read_unlock_bh(&sk->sk_callback_lock); +} + /* If socket sndbuf has changed, wake up all per association waiters. */ void sctp_write_space(struct sock *sk) { -- cgit v1.1 From 0c42749cffbb4a06be86c5e5db6c7ebad548781f Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 28 Apr 2010 08:47:19 +0000 Subject: sctp: fix potential reference of a freed pointer When sctp attempts to update an assocition, it removes any addresses that were not in the updated INITs. However, the loop may attempt to refrence a transport with address after removing it. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index df5abbf..99c93ee 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1194,8 +1194,10 @@ void sctp_assoc_update(struct sctp_association *asoc, /* Remove any peer addresses not present in the new association. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { trans = list_entry(pos, struct sctp_transport, transports); - if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) - sctp_assoc_del_peer(asoc, &trans->ipaddr); + if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) { + sctp_assoc_rm_peer(asoc, trans); + continue; + } if (asoc->state >= SCTP_STATE_ESTABLISHED) sctp_transport_reset(trans); -- cgit v1.1 From 81419d862db743fe4450a021893f24bab4698c1d Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 28 Apr 2010 08:47:20 +0000 Subject: sctp: per_cpu variables should be in bh_disabled section Since the change of the atomics to percpu variables, we now have to disable BH in process context when touching percpu variables. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index efa2bc3..44a1ab0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3719,12 +3719,12 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) sp->hmac = NULL; SCTP_DBG_OBJCNT_INC(sock); - percpu_counter_inc(&sctp_sockets_allocated); /* Set socket backlog limit. */ sk->sk_backlog.limit = sysctl_sctp_rmem[1]; local_bh_disable(); + percpu_counter_inc(&sctp_sockets_allocated); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); local_bh_enable(); @@ -3741,8 +3741,8 @@ SCTP_STATIC void sctp_destroy_sock(struct sock *sk) /* Release our hold on the endpoint. 
*/ ep = sctp_sk(sk)->ep; sctp_endpoint_free(ep); - percpu_counter_dec(&sctp_sockets_allocated); local_bh_disable(); + percpu_counter_dec(&sctp_sockets_allocated); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); } -- cgit v1.1 From a8170c35e738d62e9919ce5b109cf4ed66e95bde Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 28 Apr 2010 08:47:21 +0000 Subject: sctp: fix to calc the INIT/INIT-ACK chunk length correctly is set When calculating the INIT/INIT-ACK chunk length, we should not only account the length of parameters, but also the parameters zero padding length, such as AUTH HMACS parameter and CHUNKS parameter. Without the parameters zero padding length we may get following oops. skb_over_panic: text:ce2068d2 len:130 put:6 head:cac3fe00 data:cac3fe00 tail:0xcac3fe82 end:0xcac3fe80 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:127! invalid opcode: 0000 [#2] SMP last sysfs file: /sys/module/aes_generic/initstate Modules linked in: authenc ...... Pid: 4102, comm: sctp_darn Tainted: G D 2.6.34-rc2 #6 EIP: 0060:[] EFLAGS: 00010282 CPU: 0 EIP is at skb_over_panic+0x37/0x3e EAX: 00000078 EBX: c07c024b ECX: c07c02b9 EDX: cb607b78 ESI: 00000000 EDI: cac3fe7a EBP: 00000002 ESP: cb607b74 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Process sctp_darn (pid: 4102, ti=cb607000 task=cabdc990 task.ti=cb607000) Stack: c07c02b9 ce2068d2 00000082 00000006 cac3fe00 cac3fe00 cac3fe82 cac3fe80 <0> c07c024b cac3fe7c cac3fe7a c0608dec ca986e80 ce2068d2 00000006 0000007a <0> cb8120ca ca986e80 cb812000 00000003 cb8120c4 ce208a25 cb8120ca cadd9400 Call Trace: [] ? sctp_addto_chunk+0x45/0x85 [sctp] [] ? skb_put+0x2e/0x32 [] ? sctp_addto_chunk+0x45/0x85 [sctp] [] ? sctp_make_init+0x279/0x28c [sctp] [] ? apic_timer_interrupt+0x2a/0x30 [] ? sctp_sf_do_prm_asoc+0x2b/0x7b [sctp] [] ? sctp_do_sm+0xa0/0x14a [sctp] [] ? sctp_pname+0x0/0x14 [sctp] [] ? sctp_primitive_ASSOCIATE+0x2b/0x31 [sctp] [] ? sctp_sendmsg+0x7a0/0x9eb [sctp] [] ? inet_sendmsg+0x3b/0x43 [] ? task_tick_fair+0x2d/0xd9 [] ? sock_sendmsg+0xa7/0xc1 [] ? smp_apic_timer_interrupt+0x6b/0x75 [] ? dequeue_task_fair+0x34/0x19b [] ? sched_clock_local+0x17/0x11e [] ? _copy_from_user+0x2b/0x10c [] ? verify_iovec+0x3c/0x6a [] ? sys_sendmsg+0x186/0x1e2 [] ? __wake_up_common+0x34/0x5b [] ? __wake_up+0x2c/0x3b [] ? tty_wakeup+0x43/0x47 [] ? remove_wait_queue+0x16/0x24 [] ? n_tty_read+0x5b8/0x65e [] ? default_wake_function+0x0/0x8 [] ? sys_socketcall+0x17f/0x1cd [] ? sysenter_do_call+0x12/0x22 Code: 0f 45 de 53 ff b0 98 00 00 00 ff b0 94 ...... EIP: [] skb_over_panic+0x37/0x3e SS:ESP 0068:cb607b74 To reproduce: # modprobe sctp # echo 1 > /proc/sys/net/sctp/addip_enable # echo 1 > /proc/sys/net/sctp/auth_enable # sctp_test -H 3ffe:501:ffff:100:20c:29ff:fe4d:f37e -P 800 -l # sctp_darn -H 3ffe:501:ffff:100:20c:29ff:fe4d:f37e -P 900 -h 192.168.0.21 -p 800 -I -s -t sctp_darn ready to send... 3ffe:501:ffff:100:20c:29ff:fe4d:f37e:900-192.168.0.21:800 Interactive mode> bindx-add=192.168.0.21 3ffe:501:ffff:100:20c:29ff:fe4d:f37e:900-192.168.0.21:800 Interactive mode> bindx-add=192.168.1.21 3ffe:501:ffff:100:20c:29ff:fe4d:f37e:900-192.168.0.21:800 Interactive mode> snd=10 ------------------------------------------------------------------ eth0 has addresses: 3ffe:501:ffff:100:20c:29ff:fe4d:f37e and 192.168.0.21 eth1 has addresses: 192.168.1.21 ------------------------------------------------------------------ Reported-by: George Cheimonidis Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/sm_make_chunk.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 17cb400e..f6fc5c1 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -208,7 +208,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); - chunksize = sizeof(init) + addrs_len + SCTP_SAT_LEN(num_types); + chunksize = sizeof(init) + addrs_len; + chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (sctp_prsctp_enable) @@ -238,14 +239,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += ntohs(auth_hmacs->length); + chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += ntohs(auth_chunks->length); + chunksize += WORD_ROUND(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -255,7 +256,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += sizeof(sctp_supported_ext_param_t) + num_ext; + chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -397,13 +399,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += ntohs(auth_hmacs->length); + chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += ntohs(auth_chunks->length); + chunksize += WORD_ROUND(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -412,7 +414,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += sizeof(sctp_supported_ext_param_t) + num_ext; + chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_chunk(asoc, SCTP_CID_INIT_ACK, 0, chunksize); -- cgit v1.1 From c0786693404cffd80ca3cb6e75ee7b35186b2825 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 28 Apr 2010 08:47:22 +0000 Subject: sctp: Fix oops when sending queued ASCONF chunks When we finish processing ASCONF_ACK chunk, we try to send the next queued ASCONF. This action runs the sctp state machine recursively and it's not prepared to do so. kernel BUG at kernel/timer.c:790! 
invalid opcode: 0000 [#1] SMP last sysfs file: /sys/module/ipv6/initstate Modules linked in: sha256_generic sctp libcrc32c ipv6 dm_multipath uinput 8139too i2c_piix4 8139cp mii i2c_core pcspkr virtio_net joydev floppy virtio_blk virtio_pci [last unloaded: scsi_wait_scan] Pid: 0, comm: swapper Not tainted 2.6.34-rc4 #15 /Bochs EIP: 0060:[] EFLAGS: 00010286 CPU: 0 EIP is at add_timer+0xd/0x1b EAX: cecbab14 EBX: 000000f0 ECX: c0957b1c EDX: 03595cf4 ESI: cecba800 EDI: cf276f00 EBP: c0957aa0 ESP: c0957aa0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process swapper (pid: 0, ti=c0956000 task=c0988ba0 task.ti=c0956000) Stack: c0957ae0 d1851214 c0ab62e4 c0ab5f26 0500ffff 00000004 00000005 00000004 <0> 00000000 d18694fd 00000004 1666b892 cecba800 cecba800 c0957b14 00000004 <0> c0957b94 d1851b11 ceda8b00 cecba800 cf276f00 00000001 c0957b14 000000d0 Call Trace: [] ? sctp_side_effects+0x607/0xdfc [sctp] [] ? sctp_do_sm+0x108/0x159 [sctp] [] ? sctp_pname+0x0/0x1d [sctp] [] ? sctp_primitive_ASCONF+0x36/0x3b [sctp] [] ? sctp_process_asconf_ack+0x2a4/0x2d3 [sctp] [] ? sctp_sf_do_asconf_ack+0x1dd/0x2b4 [sctp] [] ? sctp_do_sm+0xb8/0x159 [sctp] [] ? sctp_cname+0x0/0x52 [sctp] [] ? sctp_assoc_bh_rcv+0xac/0xe1 [sctp] [] ? sctp_inq_push+0x2d/0x30 [sctp] [] ? sctp_rcv+0x797/0x82e [sctp] Tested-by: Wei Yongjun Signed-off-by: Yuansong Qiao Signed-off-by: Shuaijun Zhang Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 15 --------------- net/sctp/sm_sideeffect.c | 26 ++++++++++++++++++++++++++ net/sctp/sm_statefuns.c | 8 +++++++- 3 files changed, 33 insertions(+), 16 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f6fc5c1..0fd5b4c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3318,21 +3318,6 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; - /* Send the next asconf chunk from the addip chunk queue. */ - if (!list_empty(&asoc->addip_chunk_list)) { - struct list_head *entry = asoc->addip_chunk_list.next; - asconf = list_entry(entry, struct sctp_chunk, list); - - list_del_init(entry); - - /* Hold the chunk until an ASCONF_ACK is received. */ - sctp_chunk_hold(asconf); - if (sctp_primitive_ASCONF(asoc, asconf)) - sctp_chunk_free(asconf); - else - asoc->addip_last_asconf = asconf; - } - return retval; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 4c5bed9..d5ae450 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -962,6 +962,29 @@ static int sctp_cmd_send_msg(struct sctp_association *asoc, } +/* Sent the next ASCONF packet currently stored in the association. + * This happens after the ASCONF_ACK was succeffully processed. + */ +static void sctp_cmd_send_asconf(struct sctp_association *asoc) +{ + /* Send the next asconf chunk from the addip chunk + * queue. + */ + if (!list_empty(&asoc->addip_chunk_list)) { + struct list_head *entry = asoc->addip_chunk_list.next; + struct sctp_chunk *asconf = list_entry(entry, + struct sctp_chunk, list); + list_del_init(entry); + + /* Hold the chunk until an ASCONF_ACK is received. 
*/ + sctp_chunk_hold(asconf); + if (sctp_primitive_ASCONF(asoc, asconf)) + sctp_chunk_free(asconf); + else + asoc->addip_last_asconf = asconf; + } +} + /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real @@ -1617,6 +1640,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, } error = sctp_cmd_send_msg(asoc, cmd->obj.msg); break; + case SCTP_CMD_SEND_NEXT_ASCONF: + sctp_cmd_send_asconf(asoc); + break; default: printk(KERN_WARNING "Impossible command: %u, %p\n", cmd->verb, cmd->obj.ptr); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index abf601a..24b2cd5 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3676,8 +3676,14 @@ sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); if (!sctp_process_asconf_ack((struct sctp_association *)asoc, - asconf_ack)) + asconf_ack)) { + /* Successfully processed ASCONF_ACK. We can + * release the next asconf if we have one. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF, + SCTP_NULL()); return SCTP_DISPOSITION_CONSUME; + } abort = sctp_make_abort(asoc, asconf_ack, sizeof(sctp_errhdr_t)); -- cgit v1.1 From 5fa782c2f5ef6c2e4f04d3e228412c9b4a4c8809 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Apr 2010 10:30:59 +0000 Subject: sctp: Fix skb_over_panic resulting from multiple invalid parameter errors (CVE-2010-1173) (v4) Ok, version 4 Change Notes: 1) Minor cleanups, from Vlads notes Summary: Hey- Recently, it was reported to me that the kernel could oops in the following way: <5> kernel BUG at net/core/skbuff.c:91! <5> invalid operand: 0000 [#1] <5> Modules linked in: sctp netconsole nls_utf8 autofs4 sunrpc iptable_filter ip_tables cpufreq_powersave parport_pc lp parport vmblock(U) vsock(U) vmci(U) vmxnet(U) vmmemctl(U) vmhgfs(U) acpiphp dm_mirror dm_mod button battery ac md5 ipv6 uhci_hcd ehci_hcd snd_ens1371 snd_rawmidi snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd_page_alloc snd_ac97_codec snd soundcore pcnet32 mii floppy ext3 jbd ata_piix libata mptscsih mptsas mptspi mptscsi mptbase sd_mod scsi_mod <5> CPU: 0 <5> EIP: 0060:[] Not tainted VLI <5> EFLAGS: 00010216 (2.6.9-89.0.25.EL) <5> EIP is at skb_over_panic+0x1f/0x2d <5> eax: 0000002c ebx: c033f461 ecx: c0357d96 edx: c040fd44 <5> esi: c033f461 edi: df653280 ebp: 00000000 esp: c040fd40 <5> ds: 007b es: 007b ss: 0068 <5> Process swapper (pid: 0, threadinfo=c040f000 task=c0370be0) <5> Stack: c0357d96 e0c29478 00000084 00000004 c033f461 df653280 d7883180 e0c2947d <5> 00000000 00000080 df653490 00000004 de4f1ac0 de4f1ac0 00000004 df653490 <5> 00000001 e0c2877a 08000800 de4f1ac0 df653490 00000000 e0c29d2e 00000004 <5> Call Trace: <5> [] sctp_addto_chunk+0xb0/0x128 [sctp] <5> [] sctp_addto_chunk+0xb5/0x128 [sctp] <5> [] sctp_init_cause+0x3f/0x47 [sctp] <5> [] sctp_process_unk_param+0xac/0xb8 [sctp] <5> [] sctp_verify_init+0xcc/0x134 [sctp] <5> [] sctp_sf_do_5_1B_init+0x83/0x28e [sctp] <5> [] sctp_do_sm+0x41/0x77 [sctp] <5> [] cache_grow+0x140/0x233 <5> [] sctp_endpoint_bh_rcv+0xc5/0x108 [sctp] <5> [] sctp_inq_push+0xe/0x10 [sctp] <5> [] sctp_rcv+0x454/0x509 [sctp] <5> [] ipt_hook+0x17/0x1c [iptable_filter] <5> [] nf_iterate+0x40/0x81 <5> [] ip_local_deliver_finish+0x0/0x151 <5> [] ip_local_deliver_finish+0xc6/0x151 <5> [] nf_hook_slow+0x83/0xb5 <5> [] ip_local_deliver+0x1a2/0x1a9 <5> [] ip_local_deliver_finish+0x0/0x151 <5> [] ip_rcv+0x334/0x3b4 <5> [] netif_receive_skb+0x320/0x35b <5> [] 
init_stall_timer+0x67/0x6a [uhci_hcd] <5> [] process_backlog+0x6c/0xd9 <5> [] net_rx_action+0xfe/0x1f8 <5> [] __do_softirq+0x35/0x79 <5> [] handle_IRQ_event+0x0/0x4f <5> [] do_softirq+0x46/0x4d Its an skb_over_panic BUG halt that results from processing an init chunk in which too many of its variable length parameters are in some way malformed. The problem is in sctp_process_unk_param: if (NULL == *errp) *errp = sctp_make_op_error_space(asoc, chunk, ntohs(chunk->chunk_hdr->length)); if (*errp) { sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, WORD_ROUND(ntohs(param.p->length))); sctp_addto_chunk(*errp, WORD_ROUND(ntohs(param.p->length)), param.v); When we allocate an error chunk, we assume that the worst case scenario requires that we have chunk_hdr->length data allocated, which would be correct nominally, given that we call sctp_addto_chunk for the violating parameter. Unfortunately, we also, in sctp_init_cause insert a sctp_errhdr_t structure into the error chunk, so the worst case situation in which all parameters are in violation requires chunk_hdr->length+(sizeof(sctp_errhdr_t)*param_count) bytes of data. The result of this error is that a deliberately malformed packet sent to a listening host can cause a remote DOS, described in CVE-2010-1173: http://cve.mitre.org/cgi-bin/cvename.cgi?name=2010-1173 I've tested the below fix and confirmed that it fixes the issue. We move to a strategy whereby we allocate a fixed size error chunk and ignore errors we don't have space to report. Tested by me successfully Signed-off-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 62 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 5 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0fd5b4c..30c1767 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -108,7 +108,7 @@ static const struct sctp_paramhdr prsctp_param = { cpu_to_be16(sizeof(struct sctp_paramhdr)), }; -/* A helper to initialize to initialize an op error inside a +/* A helper to initialize an op error inside a * provided chunk, as most cause codes will be embedded inside an * abort chunk. */ @@ -125,6 +125,29 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err); } +/* A helper to initialize an op error inside a + * provided chunk, as most cause codes will be embedded inside an + * abort chunk. Differs from sctp_init_cause in that it won't oops + * if there isn't enough space in the op error chunk + */ +int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, + size_t paylen) +{ + sctp_errhdr_t err; + __u16 len; + + /* Cause code constants are now defined in network order. 
*/ + err.cause = cause_code; + len = sizeof(sctp_errhdr_t) + paylen; + err.length = htons(len); + + if (skb_tailroom(chunk->skb) > len) + return -ENOSPC; + chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, + sizeof(sctp_errhdr_t), + &err); + return 0; +} /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two @@ -1132,6 +1155,24 @@ nodata: return retval; } +/* Create an Operation Error chunk of a fixed size, + * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) + * This is a helper function to allocate an error chunk for + * for those invalid parameter codes in which we may not want + * to report all the errors, if the incomming chunk is large + */ +static inline struct sctp_chunk *sctp_make_op_error_fixed( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + size_t size = asoc ? asoc->pathmtu : 0; + + if (!size) + size = SCTP_DEFAULT_MAXSEGMENT; + + return sctp_make_op_error_space(asoc, chunk, size); +} + /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, @@ -1374,6 +1415,18 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) return target; } +/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient + * space in the chunk + */ +void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk, + int len, const void *data) +{ + if (skb_tailroom(chunk->skb) > len) + return sctp_addto_chunk(chunk, len, data); + else + return NULL; +} + /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. @@ -1977,13 +2030,12 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, * returning multiple unknown parameters. */ if (NULL == *errp) - *errp = sctp_make_op_error_space(asoc, chunk, - ntohs(chunk->chunk_hdr->length)); + *errp = sctp_make_op_error_fixed(asoc, chunk); if (*errp) { - sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, + sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, WORD_ROUND(ntohs(param.p->length))); - sctp_addto_chunk(*errp, + sctp_addto_chunk_fixed(*errp, WORD_ROUND(ntohs(param.p->length)), param.v); } else { -- cgit v1.1 From a5f4cea74f1397bb29d0bbdabeb05bd05a23a741 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 21:42:42 -0400 Subject: sctp: Use correct address family in sctp_getsockopt_peer_addrs() The function should use the address family of the address when trying to determine the length of the structure. Signed-off-by: Vlad Yasevich --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 13d8229..1282a0e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4384,7 +4384,7 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, transports) { memcpy(&temp, &from->ipaddr, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); - addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) -- cgit v1.1 From c17b02b38aa99ef806c7066ef19a6f51122304f1 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 21:42:43 -0400 Subject: sctp: send SHUTDOWN-ACK chunk back to the source. 
SHUTDOWN-ACK is alaways sent to the primary path at the first time, but should better transmit SHUTDOWN-ACK chunk to the same destination transport address from which it received the SHUTDOWN chunk. Based on the work from Wei Yongjun . Signed-off-by: Vlad Yasevich --- net/sctp/sm_sideeffect.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 4c5bed9..49fb9ac 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -697,11 +697,15 @@ static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds, { struct sctp_transport *t; - t = sctp_assoc_choose_alter_transport(asoc, + if (chunk->transport) + t = chunk->transport; + else { + t = sctp_assoc_choose_alter_transport(asoc, asoc->shutdown_last_sent_to); + chunk->transport = t; + } asoc->shutdown_last_sent_to = t; asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; - chunk->transport = t; } /* Helper function to change the state of an association. */ -- cgit v1.1 From bd69b981a354be40cc709f3046f0c56f00da6163 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 30 Apr 2010 21:42:43 -0400 Subject: sctp: assure at least one T3-rtx timer is running if a FORWARD TSN is sent PR-SCTP extension section 3.5 Sender Side Implementation of PR-SCTP: C5) If a FORWARD TSN is sent, the sender MUST assure that at least one T3-rtx timer is running. So this patch fix to assure at least one T3-rtx timer is running if a FORWARD TSN is or will to sent. Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index abfc0b8..16d451a 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -854,6 +854,12 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &q->control_chunk_list); + } else if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) { + /* PR-SCTP C5) If a FORWARD TSN is sent, the + * sender MUST assure that at least one T3-rtx + * timer is running. + */ + sctp_transport_reset_timers(transport, 0); } break; -- cgit v1.1 From 6429d3dc4bd6251b01c11b851e23a4d60f079e06 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 30 Apr 2010 21:42:44 -0400 Subject: sctp: missing set src and dest port while lookup output route While lookup the output route, we do not set the src and dest port. This will cause we got a wrong route if we had set the outbund transport to IPsec with src or dst port. 
Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich --- net/sctp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 704298f..1827498 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -474,13 +474,17 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, memset(&fl, 0x0, sizeof(struct flowi)); fl.fl4_dst = daddr->v4.sin_addr.s_addr; + fl.fl_ip_dport = daddr->v4.sin_port; fl.proto = IPPROTO_SCTP; if (asoc) { fl.fl4_tos = RT_CONN_FLAGS(asoc->base.sk); fl.oif = asoc->base.sk->sk_bound_dev_if; + fl.fl_ip_sport = htons(asoc->base.bind_addr.port); } - if (saddr) + if (saddr) { fl.fl4_src = saddr->v4.sin_addr.s_addr; + fl.fl_ip_sport = saddr->v4.sin_port; + } SCTP_DEBUG_PRINTK("%s: DST:%pI4, SRC:%pI4 - ", __func__, &fl.fl4_dst, &fl.fl4_src); @@ -528,6 +532,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, if ((laddr->state == SCTP_ADDR_SRC) && (AF_INET == laddr->a.sa.sa_family)) { fl.fl4_src = laddr->a.v4.sin_addr.s_addr; + fl.fl_ip_sport = laddr->a.v4.sin_port; if (!ip_route_output_key(&init_net, &rt, &fl)) { dst = &rt->u.dst; goto out_unlock; -- cgit v1.1 From bc4f841a05364b2572bcc266e9fd7e9cf5f06d5b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 30 Apr 2010 22:38:53 -0400 Subject: sctp: fix to retranmit at least one DATA chunk While doing retranmit, if control chunk exists, such as FORWARD TSN chunk, and the DATA chunk can not be bundled with this control chunk because of PMTU limit, no DATA chunk will be retranmitted in the current implementation. This patch makes sure to retranmit at least one DATA chunk in this case. Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 16d451a..e333d58 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -598,11 +598,23 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, if (fast_rtx && !chunk->fast_retransmit) continue; +redo: /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); switch (status) { case SCTP_XMIT_PMTU_FULL: + if (!pkt->has_data && !pkt->has_cookie_echo) { + /* If this packet did not contain DATA then + * retransmission did not happen, so do it + * again. We'll ignore the error here since + * control chunks are already freed so there + * is nothing we can do. + */ + sctp_packet_transmit(pkt); + goto redo; + } + /* Send this packet. */ error = sctp_packet_transmit(pkt); -- cgit v1.1 From fbdf501c9374966a56829ecca3a7f25d2b49a305 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:39:26 -0400 Subject: sctp: Do no select unconfirmed transports for retransmissions An unconfirmed transport is one that we have not been able to reach since the beginning. There is no point in trying to retrasnmit data on those transports. Also, the specification forbids it due to security issues. 
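The rule this change enforces can be summarized in a small standalone sketch. This is an illustrative model of the selection logic only, not the kernel code: the state names mirror the SCTP transport states, while the structure, values and helper are made up for the example. It also hints at the corner case (no usable transport found) that the later "Make sure we always return valid retransmit path" commit in this series addresses.

/* Illustrative sketch: never pick an unconfirmed transport for retransmits. */
#define SCTP_ACTIVE      0
#define SCTP_INACTIVE    1
#define SCTP_UNCONFIRMED 2   /* address never confirmed by a HEARTBEAT-ACK */

struct transport {
	int state;
	struct transport *next;
};

static struct transport *pick_retran_path(struct transport *list)
{
	struct transport *t, *fallback = NULL;

	for (t = list; t; t = t->next) {
		if (t->state == SCTP_ACTIVE)
			return t;                /* confirmed and reachable: use it */
		if (t->state != SCTP_UNCONFIRMED && !fallback)
			fallback = t;            /* confirmed but currently unreachable */
		/* unconfirmed transports are skipped entirely */
	}
	return fallback;                         /* may be NULL if nothing qualifies */
}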
Reported-by: Frank Schuster Signed-off-by: Vlad Yasevich --- net/sctp/associola.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index df5abbf..de830c2 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -762,7 +762,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, asoc->peer.retran_path = peer; } - if (asoc->peer.active_path == asoc->peer.retran_path) { + if (asoc->peer.active_path == asoc->peer.retran_path && + peer->state != SCTP_UNCONFIRMED) { asoc->peer.retran_path = peer; } @@ -1318,7 +1319,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc) /* Keep track of the next transport in case * we don't find any active transport. */ - if (!next) + if (t->state != SCTP_UNCONFIRMED && !next) next = t; } } -- cgit v1.1 From ec7b9519509061bbc09a43284c3570aa492e07f0 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: use sctp_chunk_is_data macro to decide a chunk is data chunk sctp_chunk_is_data macro is defined to decide that whether a chunk is data chunk or not. Signed-off-by: Shan Wei Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e333d58..a4fe7de 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -308,7 +308,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) /* If it is data, queue it up, otherwise, send it * immediately. */ - if (SCTP_CID_DATA == chunk->chunk_hdr->type) { + if (sctp_chunk_is_data(chunk)) { /* Is it OK to queue data chunks? */ /* From 9. Termination of Association * -- cgit v1.1 From 787a51a0878f7bee3a9a83040077301e1556b69a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: implement sctp association probing module This patch implement sctp association probing module, the module will be called sctp_probe. This module allows for capturing the changes to SCTP association state in response to incoming packets. It is used for debugging SCTP congestion control algorithms. Usage: $ modprobe sctp_probe [full=n] [port=n] [bufsize=n] $ cat /proc/net/sctpprobe The output format is: TIME ASSOC LPORT RPORT MTU RWND UNACK ... The output will be like this: 9.226086 c4064c48 9000 8000 1500 53352 1 *192.168.0.19 1 4380 54784 1252 0 1500 9.287195 c4064c48 9000 8000 1500 45144 5 *192.168.0.19 1 5880 54784 6500 0 1500 9.289130 c4064c48 9000 8000 1500 42724 5 *192.168.0.19 1 7380 54784 6500 0 1500 9.620332 c4064c48 9000 8000 1500 48284 4 *192.168.0.19 1 8880 54784 5200 0 1500 ...... Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich --- net/sctp/Kconfig | 12 +++ net/sctp/Makefile | 3 + net/sctp/probe.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 net/sctp/probe.c (limited to 'net/sctp') diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 58b3e88..126b014e 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -37,6 +37,18 @@ menuconfig IP_SCTP if IP_SCTP +config NET_SCTPPROBE + tristate "SCTP: Association probing" + depends on PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to SCTP association + state in response to incoming packets. It is used for debugging + SCTP congestion control algorithms. If you don't understand + what was just said, you don't need it: say N. 
+ + To compile this code as a module, choose M here: the + module will be called sctp_probe. + config SCTP_DBG_MSG bool "SCTP: Debug messages" help diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 6b79473..5c30b7a 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_IP_SCTP) += sctp.o +obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ protocol.o endpointola.o associola.o \ @@ -11,6 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o ssnmap.o auth.o +sctp_probe-y := probe.o + sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o sctp-$(CONFIG_PROC_FS) += proc.o sctp-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sctp/probe.c b/net/sctp/probe.c new file mode 100644 index 0000000..8f025d5 --- /dev/null +++ b/net/sctp/probe.c @@ -0,0 +1,213 @@ +/* + * sctp_probe - Observe the SCTP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * Modified for SCTP from Stephen Hemminger's code + * Copyright (C) 2010, Wei Yongjun + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Wei Yongjun "); +MODULE_DESCRIPTION("SCTP snooper"); +MODULE_LICENSE("GPL"); + +static int port __read_mostly = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize __read_mostly = 64 * 1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static int full __read_mostly = 1; +MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); +module_param(full, int, 0); + +static const char procname[] = "sctpprobe"; + +static struct { + struct kfifo fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timespec tstart; +} sctpw; + +static void printl(const char *fmt, ...) 
+{ + va_list args; + int len; + char tbuf[256]; + + va_start(args, fmt); + len = vscnprintf(tbuf, sizeof(tbuf), fmt, args); + va_end(args); + + kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); + wake_up(&sctpw.wait); +} + +static int sctpprobe_open(struct inode *inode, struct file *file) +{ + kfifo_reset(&sctpw.fifo); + getnstimeofday(&sctpw.tstart); + + return 0; +} + +static ssize_t sctpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt = 0; + unsigned char *tbuf; + + if (!buf) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(sctpw.wait, + kfifo_len(&sctpw.fifo) != 0); + if (error) + goto out_free; + + cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock); + error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0; + +out_free: + vfree(tbuf); + + return error ? error : cnt; +} + +static const struct file_operations sctpprobe_fops = { + .owner = THIS_MODULE, + .open = sctpprobe_open, + .read = sctpprobe_read, +}; + +sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *sp; + static __u32 lcwnd = 0; + struct timespec now; + + sp = asoc->peer.primary_path; + + if ((full || sp->cwnd != lcwnd) && + (!port || asoc->peer.port == port || + ep->base.bind_addr.port == port)) { + lcwnd = sp->cwnd; + + getnstimeofday(&now); + now = timespec_sub(now, sctpw.tstart); + + printl("%lu.%06lu ", (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + + printl("%p %5d %5d %5d %8d %5d ", asoc, + ep->base.bind_addr.port, asoc->peer.port, + asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data); + + list_for_each_entry(sp, &asoc->peer.transport_addr_list, + transports) { + if (sp == asoc->peer.primary_path) + printl("*"); + + if (sp->ipaddr.sa.sa_family == AF_INET) + printl("%pI4 ", &sp->ipaddr.v4.sin_addr); + else + printl("%pI6 ", &sp->ipaddr.v6.sin6_addr); + + printl("%2u %8u %8u %8u %8u %8u ", + sp->state, sp->cwnd, sp->ssthresh, + sp->flight_size, sp->partial_bytes_acked, + sp->pathmtu); + } + printl("\n"); + } + + jprobe_return(); + return 0; +} + +static struct jprobe sctp_recv_probe = { + .kp = { + .symbol_name = "sctp_sf_eat_sack_6_2", + }, + .entry = jsctp_sf_eat_sack, +}; + +static __init int sctpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&sctpw.wait); + spin_lock_init(&sctpw.lock); + if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL)) + return ret; + + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, + &sctpprobe_fops)) + goto free_kfifo; + + ret = register_jprobe(&sctp_recv_probe); + if (ret) + goto remove_proc; + + pr_info("SCTP probe registered (port=%d)\n", port); + + return 0; + +remove_proc: + proc_net_remove(&init_net, procname); +free_kfifo: + kfifo_free(&sctpw.fifo); + return ret; +} + +static __exit void sctpprobe_exit(void) +{ + kfifo_free(&sctpw.fifo); + proc_net_remove(&init_net, procname); + unregister_jprobe(&sctp_recv_probe); +} + +module_init(sctpprobe_init); +module_exit(sctpprobe_exit); -- cgit v1.1 From b99a4d53a74ac25eb4b930eef6c745579149c571 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: cleanup: remove duplicate assignment This assignment isn't needed because we did it earlier already. 
Also another reason to delete the assignment is because it triggers a Smatch warning about checking for NULL pointers after a dereference. Reported-by: Vlad Yasevich Signed-off-by: Dan Carpenter Signed-off-by: Vlad Yasevich --- net/sctp/sm_make_chunk.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 17cb400e..33aed1c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -419,10 +419,17 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, if (!retval) goto nomem_chunk; - /* Per the advice in RFC 2960 6.4, send this reply to - * the source of the INIT packet. + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it received the DATA or control chunk + * to which it is replying. + * + * [INIT ACK back to where the INIT came from.] */ retval->transport = chunk->transport; + retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); @@ -461,18 +468,6 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; - /* RFC 2960 6.4 Multi-homed SCTP Endpoints - * - * An endpoint SHOULD transmit reply chunks (e.g., SACK, - * HEARTBEAT ACK, * etc.) to the same destination transport - * address from which it received the DATA or control chunk - * to which it is replying. - * - * [INIT ACK back to where the INIT came from.] - */ - if (chunk) - retval->transport = chunk->transport; - nomem_chunk: kfree(cookie); nomem_cookie: -- cgit v1.1 From d598b166ced20d9b9281ea3527c0e18405ddb803 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: Make sure we always return valid retransmit path commit 4951feda0c60d1ef681f1a270afdd617924ab041 sctp: Do no select unconfirmed transports for retransmissions added code to make sure that we do not select unconfirmed paths for data transmission. This caused a problem when there are only 2 paths, 1 unconfirmed and 1 unreachable. In that case, the next retransmit path returned is NULL and that causes a kernel crash. The solution is to only change retransmit paths if we found one to use. Reported-by: Frank Schuster Signed-off-b: Vlad Yasevich --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index de830c2..fab9cb2 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1324,7 +1324,8 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc) } } - asoc->peer.retran_path = t; + if (t) + asoc->peer.retran_path = t; SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association" " %p addr: ", -- cgit v1.1 From ae19c54866450f6c6f79223ca7d37965859a54e1 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: remove 'resent' bit from the chunk The 'resent' bit is used to make sure that we don't update rto estimate based on retransmitted chunks. However, we already have the 'rto_pending' bit that we test when need to update rto, so 'resent' bit is just extra. 
Additionally, we currently have a bug in that we always set a 'resent' bit and thus rto estimate is only updated by Heartbeats. Signed-off-by: Vlad Yasevich --- net/sctp/output.c | 25 +++++++++---------------- net/sctp/outqueue.c | 1 - net/sctp/sm_make_chunk.c | 1 - 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/output.c b/net/sctp/output.c index fad261d..35e49b9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -429,24 +429,17 @@ int sctp_packet_transmit(struct sctp_packet *packet) list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ - if (!chunk->resent) { - - /* 6.3.1 C4) When data is in flight and when allowed - * by rule C5, a new RTT measurement MUST be made each - * round trip. Furthermore, new RTT measurements - * SHOULD be made no more than once per round-trip - * for a given destination transport address. - */ - - if (!tp->rto_pending) { - chunk->rtt_in_progress = 1; - tp->rto_pending = 1; - } + if (!tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; } - - chunk->resent = 1; - has_data = 1; } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index a4fe7de..4e551ba 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1405,7 +1405,6 @@ static void sctp_check_transmitted(struct sctp_outq *q, * instance). */ if (!tchunk->tsn_gap_acked && - !tchunk->resent && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 33aed1c..24effdf 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1205,7 +1205,6 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; - retval->resent = 0; retval->has_tsn = 0; retval->has_ssn = 0; retval->rtt_in_progress = 0; -- cgit v1.1 From d9efc2231b28bc199f9de4dd594248b7341188e5 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:09 -0400 Subject: sctp: Do not force T3 timer on fast retransmissions. We don't need to force the T3 timer any more and it's actually wrong to do as it causes too long of a delay. The timer will be started if one is not running, but if one is running, we leave it alone. Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 15 +++------------ net/sctp/transport.c | 4 ++-- 2 files changed, 5 insertions(+), 14 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4e551ba..786c4ff 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -659,14 +659,6 @@ redo: if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; - /* Force start T3-rtx timer when fast retransmitting - * the earliest outstanding TSN - */ - if (!timer && fast_rtx && - ntohl(chunk->subh.data_hdr->tsn) == - asoc->ctsn_ack_point + 1) - timer = 2; - q->empty = 0; break; } @@ -871,7 +863,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) * sender MUST assure that at least one T3-rtx * timer is running. 
*/ - sctp_transport_reset_timers(transport, 0); + sctp_transport_reset_timers(transport); } break; @@ -924,8 +916,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) rtx_timeout, &start_timer); if (start_timer) - sctp_transport_reset_timers(transport, - start_timer-1); + sctp_transport_reset_timers(transport); /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. @@ -1058,7 +1049,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) list_add_tail(&chunk->transmitted_list, &transport->transmitted); - sctp_transport_reset_timers(transport, 0); + sctp_transport_reset_timers(transport); q->empty = 0; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index be4d63d..0ebb97f 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -195,7 +195,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport) /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ -void sctp_transport_reset_timers(struct sctp_transport *transport, int force) +void sctp_transport_reset_timers(struct sctp_transport *transport) { /* RFC 2960 6.3.2 Retransmission Timer Rules * @@ -205,7 +205,7 @@ void sctp_transport_reset_timers(struct sctp_transport *transport, int force) * address. */ - if (force || !timer_pending(&transport->T3_rtx_timer)) + if (!timer_pending(&transport->T3_rtx_timer)) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); -- cgit v1.1 From b2cf9b6bd93af1cc047d3356f1c6cc9367fe3731 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: update transport initializations Right now, sctp transports are not fully initialized and when adding any new fields, they have to be explicitely initialized. This is prone to mistakes. So we switch to calling kzalloc() which makes things much simpler. Signed-off-by: Vlad Yasevich --- net/sctp/associola.c | 3 --- net/sctp/endpointola.c | 2 -- net/sctp/transport.c | 25 ------------------------- 3 files changed, 30 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index fab9cb2..37753cd 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -87,9 +87,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Retrieve the SCTP per socket area. */ sp = sctp_sk((struct sock *)sk); - /* Init all variables to a known value. */ - memset(asoc, 0, sizeof(struct sctp_association)); - /* Discarding const is appropriate here. */ asoc->ep = (struct sctp_endpoint *)ep; sctp_endpoint_hold(asoc->ep); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 905fda5..2f8763b 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -70,8 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sctp_shared_key *null_key; int err; - memset(ep, 0, sizeof(struct sctp_endpoint)); - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); if (!ep->digest) return NULL; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 0ebb97f..854228b 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -64,9 +64,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, /* Copy in the address. 
*/ peer->ipaddr = *addr; peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); - peer->asoc = NULL; - - peer->dst = NULL; memset(&peer->saddr, 0, sizeof(union sctp_addr)); /* From 6.3.1 RTO Calculation: @@ -76,34 +73,21 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, * parameter 'RTO.Initial'. */ peer->rto = msecs_to_jiffies(sctp_rto_initial); - peer->rtt = 0; - peer->rttvar = 0; - peer->srtt = 0; - peer->rto_pending = 0; - peer->hb_sent = 0; - peer->fast_recovery = 0; peer->last_time_heard = jiffies; peer->last_time_ecne_reduced = jiffies; - peer->init_sent_count = 0; - peer->param_flags = SPP_HB_DISABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; - peer->hbinterval = 0; /* Initialize the default path max_retrans. */ peer->pathmaxrxt = sctp_max_retrans_path; - peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); INIT_LIST_HEAD(&peer->send_ready); INIT_LIST_HEAD(&peer->transports); - peer->T3_rtx_timer.expires = 0; - peer->hb_timer.expires = 0; - setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, (unsigned long)peer); setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event, @@ -113,15 +97,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); atomic_set(&peer->refcnt, 1); - peer->dead = 0; - - peer->malloced = 0; - - /* Initialize the state information for SFR-CACC */ - peer->cacc.changeover_active = 0; - peer->cacc.cycling_changeover = 0; - peer->cacc.next_tsn_at_change = 0; - peer->cacc.cacc_saw_newack = 0; return peer; } -- cgit v1.1 From cf9b4812e18aab6f86ff998bd7425a9e823269c3 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: fast recovery algorithm is per association. SCTP fast recovery algorithm really applies per association and impacts all transports. Signed-off-by: Vlad Yasevich --- net/sctp/transport.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 854228b..fccf494 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -378,15 +378,16 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) void sctp_transport_raise_cwnd(struct sctp_transport *transport, __u32 sack_ctsn, __u32 bytes_acked) { + struct sctp_association *asoc = transport->asoc; __u32 cwnd, ssthresh, flight_size, pba, pmtu; cwnd = transport->cwnd; flight_size = transport->flight_size; /* See if we need to exit Fast Recovery first */ - if (transport->fast_recovery && - TSN_lte(transport->fast_recovery_exit, sack_ctsn)) - transport->fast_recovery = 0; + if (asoc->fast_recovery && + TSN_lte(asoc->fast_recovery_exit, sack_ctsn)) + asoc->fast_recovery = 0; /* The appropriate cwnd increase algorithm is performed if, and only * if the cumulative TSN whould advanced and the congestion window is @@ -415,7 +416,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, * 2) the destination's path MTU. This upper bound protects * against the ACK-Splitting attack outlined in [SAVAGE99]. 
*/ - if (transport->fast_recovery) + if (asoc->fast_recovery) return; if (bytes_acked > pmtu) @@ -466,6 +467,8 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, void sctp_transport_lower_cwnd(struct sctp_transport *transport, sctp_lower_cwnd_t reason) { + struct sctp_association *asoc = transport->asoc; + switch (reason) { case SCTP_LOWER_CWND_T3_RTX: /* RFC 2960 Section 7.2.3, sctpimpguide @@ -476,11 +479,11 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pathmtu); - transport->cwnd = transport->asoc->pathmtu; + 4*asoc->pathmtu); + transport->cwnd = asoc->pathmtu; - /* T3-rtx also clears fast recovery on the transport */ - transport->fast_recovery = 0; + /* T3-rtx also clears fast recovery */ + asoc->fast_recovery = 0; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -496,15 +499,15 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * cwnd = ssthresh * partial_bytes_acked = 0 */ - if (transport->fast_recovery) + if (asoc->fast_recovery) return; /* Mark Fast recovery */ - transport->fast_recovery = 1; - transport->fast_recovery_exit = transport->asoc->next_tsn - 1; + asoc->fast_recovery = 1; + asoc->fast_recovery_exit = asoc->next_tsn - 1; transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pathmtu); + 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -524,7 +527,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if (time_after(jiffies, transport->last_time_ecne_reduced + transport->rtt)) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pathmtu); + 4*asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -540,7 +543,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * interval. */ transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pathmtu); + 4*asoc->pathmtu); break; } @@ -625,7 +628,6 @@ void sctp_transport_reset(struct sctp_transport *t) t->error_count = 0; t->rto_pending = 0; t->hb_sent = 0; - t->fast_recovery = 0; /* Initialize the state information for SFR-CACC */ t->cacc.changeover_active = 0; -- cgit v1.1 From 65883371894be2631603d5d412f90f8c09290fef Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: rwnd_press should be cumulative rwnd_press tracks the pressure on the receive window. Every time the receive buffer overflows, we truncate the receive window and then grow it back. However, if we don't track the cumulative pressure, it's possible to reach a situation where the receive buffer is empty, but rwnd stays truncated.
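To illustrate the accounting change that the patch below makes, here is a minimal userspace C sketch of the idea. It is not the kernel code: the structure and the two helpers are simplified stand-ins for sctp_association, sctp_assoc_rwnd_decrease() and sctp_assoc_rwnd_increase(), and the recovery step is collapsed into a single refill (the kernel restores the pressed-out window more gradually). What it demonstrates is that accumulating into rwnd_press, instead of overwriting it, lets the window return to full size once the receive buffer drains.

#include <stdio.h>

/* Simplified model of the association's receive-window accounting. */
struct assoc_rwnd {
	unsigned int rwnd;		/* advertised receive window */
	unsigned int rwnd_press;	/* window withheld under buffer pressure */
};

/* Charge len bytes against the window; "over" means the receive buffer
 * overflowed, so squash what is left of the window and remember,
 * cumulatively, how much was squashed.
 */
static void rwnd_decrease(struct assoc_rwnd *a, unsigned int len, int over)
{
	if (a->rwnd >= len) {
		a->rwnd -= len;
		if (over) {
			a->rwnd_press += a->rwnd;	/* accumulate, do not overwrite */
			a->rwnd = 0;
		}
	} else {
		a->rwnd = 0;
	}
}

/* Return len bytes to the window and give back the pressed-out part
 * (collapsed to a single step here; the kernel does this gradually).
 */
static void rwnd_increase(struct assoc_rwnd *a, unsigned int len)
{
	a->rwnd += len;
	a->rwnd += a->rwnd_press;
	a->rwnd_press = 0;
}

int main(void)
{
	struct assoc_rwnd a = { .rwnd = 4096, .rwnd_press = 0 };

	rwnd_decrease(&a, 1024, 1);	/* overflow: 3072 bytes pressed out */
	rwnd_decrease(&a, 0, 1);	/* another overflow adds no extra pressure */
	rwnd_increase(&a, 1024);	/* buffer drains: window fully restored */
	printf("rwnd=%u press=%u\n", a.rwnd, a.rwnd_press);	/* rwnd=4096 press=0 */
	return 0;
}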
Signed-off-by: Vlad Yasevich --- net/sctp/associola.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 37753cd..65f9a7c 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1482,7 +1482,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len) if (asoc->rwnd >= len) { asoc->rwnd -= len; if (over) { - asoc->rwnd_press = asoc->rwnd; + asoc->rwnd_press += asoc->rwnd; asoc->rwnd = 0; } } else { -- cgit v1.1 From ea862c8d1f4a0d193979c7412c3b946f600721ce Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: correctly mark missing chunks in fast recovery According to RFC 4960 Section 7.2.4: If an endpoint is in Fast Recovery and a SACK arrives that advances the Cumulative TSN Ack Point, the miss indications are incremented for all TSNs reported missing in the SACK. Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 786c4ff..b491a1a 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1154,6 +1154,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) struct sctp_transport *primary = asoc->peer.primary_path; int count_of_newacks = 0; int gap_ack_blocks; + u8 accum_moved = 0; /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; @@ -1232,16 +1233,22 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) count_of_newacks ++; } + /* Move the Cumulative TSN Ack Point if appropriate. */ + if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { + asoc->ctsn_ack_point = sack_ctsn; + accum_moved = 1; + } + if (gap_ack_blocks) { + + if (asoc->fast_recovery && accum_moved) + highest_new_tsn = highest_tsn; + list_for_each_entry(transport, transport_list, transports) sctp_mark_missing(q, &transport->transmitted, transport, highest_new_tsn, count_of_newacks); } - /* Move the Cumulative TSN Ack Point if appropriate. */ - if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) - asoc->ctsn_ack_point = sack_ctsn; - /* Update unack_data field in the assoc. */ sctp_sack_update_unack_data(asoc, sack); @@ -1685,7 +1692,8 @@ static void sctp_mark_missing(struct sctp_outq *q, struct sctp_chunk *chunk; __u32 tsn; char do_fast_retransmit = 0; - struct sctp_transport *primary = q->asoc->peer.primary_path; + struct sctp_association *asoc = q->asoc; + struct sctp_transport *primary = asoc->peer.primary_path; list_for_each_entry(chunk, transmitted_queue, transmitted_list) { -- cgit v1.1 From bfa0d9843ac5feb9667990706b4524390fee4df9 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: Optimize computation of highest new tsn in SACK. Right now, if the highest tsn in the SACK doesn't change, we'll end up scanning the transmitted lists on the transports twice: once for locating the highest _new_ tsn, and once for actually tagging chunks as acked. This is a waste, since we can record the highest _new_ tsn at the same time as tagging chunks. Long ago this was not possible because we would try to mark chunks as missing at the same time as tagging them acked and this approach didn't work. Now that the two steps are separate, we can re-use the old approach. 
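The single-pass idea reads more clearly in isolation, so here is a standalone C sketch of the shape the patch below gives sctp_check_transmitted(): the walk that marks a chunk as newly acked also records the highest newly acked TSN through an output pointer, so no second scan of the transmitted lists is needed. The chunk structure and the SACK predicate here are toy stand-ins, not the kernel's.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a transmitted DATA chunk. */
struct chunk {
	uint32_t tsn;
	int tsn_gap_acked;	/* already credited by an earlier SACK? */
};

/* Serial-number comparison in the spirit of the kernel's TSN_lt(). */
static int tsn_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Toy SACK predicate: the kernel's sctp_acked() also parses gap-ack blocks. */
static int sack_acks(uint32_t tsn, uint32_t cum_tsn)
{
	return !tsn_lt(cum_tsn, tsn);
}

/* One pass: credit chunks and record the highest newly acked TSN through
 * an output pointer instead of rescanning the list afterwards.
 */
static void check_transmitted(struct chunk *list, int n, uint32_t cum_tsn,
			      uint32_t *highest_new_tsn)
{
	for (int i = 0; i < n; i++) {
		if (!sack_acks(list[i].tsn, cum_tsn))
			continue;
		if (!list[i].tsn_gap_acked) {
			list[i].tsn_gap_acked = 1;
			if (tsn_lt(*highest_new_tsn, list[i].tsn))
				*highest_new_tsn = list[i].tsn;
		}
	}
}

int main(void)
{
	struct chunk q[] = { { 10, 1 }, { 11, 0 }, { 12, 0 }, { 13, 0 } };
	uint32_t cum_tsn = 12, highest_new = cum_tsn;

	check_transmitted(q, 4, cum_tsn, &highest_new);
	printf("highest newly acked tsn: %u\n", highest_new);	/* 12 */
	return 0;
}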
Signed-off-by: Vlad Yasevich --- net/sctp/outqueue.c | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index b491a1a..5d05717 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -62,7 +62,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, struct sctp_sackhdr *sack, - __u32 highest_new_tsn); + __u32 *highest_new_tsn); static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, @@ -1109,32 +1109,6 @@ static void sctp_sack_update_unack_data(struct sctp_association *assoc, assoc->unack_data = unack_data; } -/* Return the highest new tsn that is acknowledged by the given SACK chunk. */ -static __u32 sctp_highest_new_tsn(struct sctp_sackhdr *sack, - struct sctp_association *asoc) -{ - struct sctp_transport *transport; - struct sctp_chunk *chunk; - __u32 highest_new_tsn, tsn; - struct list_head *transport_list = &asoc->peer.transport_addr_list; - - highest_new_tsn = ntohl(sack->cum_tsn_ack); - - list_for_each_entry(transport, transport_list, transports) { - list_for_each_entry(chunk, &transport->transmitted, - transmitted_list) { - tsn = ntohl(chunk->subh.data_hdr->tsn); - - if (!chunk->tsn_gap_acked && - TSN_lt(highest_new_tsn, tsn) && - sctp_acked(sack, tsn)) - highest_new_tsn = tsn; - } - } - - return highest_new_tsn; -} - /* This is where we REALLY process a SACK. * * Process the SACK against the outqueue. Mostly, this just frees @@ -1203,18 +1177,15 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) if (gap_ack_blocks) highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); - if (TSN_lt(asoc->highest_sacked, highest_tsn)) { - highest_new_tsn = highest_tsn; + if (TSN_lt(asoc->highest_sacked, highest_tsn)) asoc->highest_sacked = highest_tsn; - } else { - highest_new_tsn = sctp_highest_new_tsn(sack, asoc); - } + highest_new_tsn = sack_ctsn; /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ - sctp_check_transmitted(q, &q->retransmit, NULL, sack, highest_new_tsn); + sctp_check_transmitted(q, &q->retransmit, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. @@ -1223,7 +1194,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) */ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, - transport, sack, highest_new_tsn); + transport, sack, &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of @@ -1331,7 +1302,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, struct sctp_sackhdr *sack, - __u32 highest_new_tsn_in_sack) + __u32 *highest_new_tsn_in_sack) { struct list_head *lchunk; struct sctp_chunk *tchunk; @@ -1419,6 +1390,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, */ if (!tchunk->tsn_gap_acked) { tchunk->tsn_gap_acked = 1; + *highest_new_tsn_in_sack = tsn; bytes_acked += sctp_data_size(tchunk); if (!tchunk->transport) migrate_bytes += sctp_data_size(tchunk); -- cgit v1.1 From 0e3aef8d09a8c11e3fb83cdcb24b5bc7421b3726 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 30 Apr 2010 22:41:10 -0400 Subject: sctp: Tag messages that can be Nagle delayed at creation. 
When we create the sctp_datamsg and fragment the user data, we know exactly whether we are sending full segments and how they might be bundled. During this time, we can mark messages as Nagle capable or not. This makes the check at transmit time much simpler. Signed-off-by: Vlad Yasevich --- net/sctp/chunk.c | 4 ++-- net/sctp/output.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sctp') diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3eab6db..476caaf 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -58,9 +58,9 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg) msg->send_failed = 0; msg->send_error = 0; msg->can_abandon = 0; + msg->can_delay = 1; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); - msg->msg_size = 0; } /* Allocate and initialize datamsg. */ @@ -157,7 +157,6 @@ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chu { sctp_datamsg_hold(msg); chunk->msg = msg; - msg->msg_size += chunk->skb->len; } @@ -247,6 +246,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (msg_len >= first_len) { msg_len -= first_len; whole = 1; + msg->can_delay = 0; } /* How many full sized? How many bytes leftover? */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 35e49b9..a646681 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -674,7 +674,7 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, * Don't delay large message writes that may have been * fragmeneted into small peices. */ - if ((len < max) && (chunk->msg->msg_size < max)) { + if ((len < max) && chunk->msg->can_delay) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } -- cgit v1.1 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion The sk_callback_lock rwlock actually protects the sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed: 1) Add a new structure, called "struct socket_wq", to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect the RCU grace period when freeing a "struct socket_wq" 4) Replace the sk_sleep pointer in "struct sock" with sk_wq, a pointer to "struct socket_wq" 5) Change the sk_sleep() function to use the new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper(), which must be used inside an rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to: - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to wake up tasks when needed. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use RCU protection as well. 9) Exceptions: macvtap, drivers/net/tun.c and af_unix use an integrated "struct socket_wq" instead of dynamically allocated ones. They don't need RCU freeing. Some cleanups or follow-ups are probably needed (a sk_callback_lock conversion to a spinlock, for example). Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sctp') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 13d8229..d54700a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6065,7 +6065,7 @@ static void __sctp_write_space(struct sctp_association *asoc) * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ - if (sock->fasync_list && + if (sock->wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); -- cgit v1.1
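After this conversion, a wakeup callback no longer takes sk_callback_lock at all; it follows the RCU pattern described in steps 5)-7) above. The sketch below shows that shape for a hypothetical callback; kernel context is assumed, so it is not compilable on its own, and the exact wake_up variant and poll flags differ between the real callbacks such as sock_def_readable().

#include <net/sock.h>

/* Hypothetical data-ready callback illustrating the post-conversion pattern. */
static void example_data_ready(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	/* wq_has_sleeper() pairs its barrier with the waiter side, so we only
	 * touch the wait queue when somebody is actually sleeping on it.
	 */
	if (wq_has_sleeper(wq))
		wake_up_interruptible(&wq->wait);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

The sctp hunk above is the matching consumer-side change: __sctp_write_space() now reaches the fasync list through sock->wq instead of through the socket itself.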