From 14c04493cb77bc38404dbcb39d5ccbb667831ad7 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:17 +0200 Subject: tipc: add ability to order and receive topology events in driver As preparation for introducing communication groups, we add the ability to issue topology subscriptions and receive topology events from kernel space. This will make it possible for group member sockets to keep track of other group members. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d50edd6..9a7e7b5c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -896,6 +896,10 @@ exit: kfree_skb(skb); } +static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt) +{ +} + /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -1671,20 +1675,24 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); unsigned int limit = rcvbuf_limit(sk, skb); int err = TIPC_OK; - int usr = msg_user(hdr); - u32 onode; - - if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - } - if (unlikely(usr == SOCK_WAKEUP)) { - onode = msg_orignode(hdr); + if (unlikely(!msg_isdata(hdr))) { + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_proto_rcv(tsk, skb, xmitq); + return false; + case SOCK_WAKEUP: + u32_del(&tsk->cong_links, msg_orignode(hdr)); + tsk->cong_link_cnt--; + sk->sk_write_space(sk); + break; + case TOP_SRV: + tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + break; + default: + break; + } kfree_skb(skb); - u32_del(&tsk->cong_links, onode); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); return false; } -- cgit v1.1 From 23998835be98a6842e5698fa1824f404c7de850d Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:18 +0200 Subject: tipc: improve address sanity check in tipc_connect() The address given to tipc_connect() is not completely sanity checked, under the assumption that this will be done later in the function __tipc_sendmsg() when the address is used there. However, the latter functon will in the next commits serve as caller to several other send functions, so we want to move the corresponding sanity check there to the beginning of that function, before we possibly need to grab the address stored by tipc_connect(). We must therefore be able to trust that this address already has been thoroughly checked. We do this in this commit. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9a7e7b5c..7659e79 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1911,28 +1911,27 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int previous; int res = 0; + if (destlen != sizeof(struct sockaddr_tipc)) + return -EINVAL; + lock_sock(sk); - /* DGRAM/RDM connect(), just save the destaddr */ - if (tipc_sk_type_connectionless(sk)) { - if (dst->family == AF_UNSPEC) { - memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); - } else if (destlen != sizeof(struct sockaddr_tipc)) { + if (dst->family == AF_UNSPEC) { + memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); + if (!tipc_sk_type_connectionless(sk)) res = -EINVAL; - } else { - memcpy(&tsk->peer, dest, destlen); - } goto exit; + } else if (dst->family != AF_TIPC) { + res = -EINVAL; } - - /* - * Reject connection attempt using multicast address - * - * Note: send_msg() validates the rest of the address fields, - * so there's no need to do it here - */ - if (dst->addrtype == TIPC_ADDR_MCAST) { + if (dst->addrtype != TIPC_ADDR_ID && dst->addrtype != TIPC_ADDR_NAME) res = -EINVAL; + if (res) + goto exit; + + /* DGRAM/RDM connect(), just save the destaddr */ + if (tipc_sk_type_connectionless(sk)) { + memcpy(&tsk->peer, dest, destlen); goto exit; } -- cgit v1.1 From 64ac5f5977df5b276374fb2f051082129f5cdb22 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:20 +0200 Subject: tipc: refactor function filter_rcv() In the following commits we will need to handle multiple incoming and rejected/returned buffers in the function socket.c::filter_rcv(). As a preparation for this, we generalize the function by handling buffer queues instead of individual buffers. We also introduce a help function tipc_skb_reject(), and rename filter_rcv() to tipc_sk_filter_rcv() in line with other functions in socket.c. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 161 +++++++++++++++++++++++++++--------------------------- 1 file changed, 80 insertions(+), 81 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7659e79..bc226f5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -111,7 +111,7 @@ struct tipc_sock { struct rcu_head rcu; }; -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); @@ -453,7 +453,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; - sk->sk_backlog_rcv = tipc_backlog_rcv; + sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; @@ -850,16 +850,16 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } /** - * tipc_sk_proto_rcv - receive a connection mng protocol message + * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - struct sock *sk = &tsk->sk; - u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); + u32 onode = tsk_own_node(tsk); + struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); bool conn_cong; @@ -1536,14 +1536,41 @@ static void tipc_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); } +static void tipc_sk_proto_rcv(struct sock *sk, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + return; + case SOCK_WAKEUP: + u32_del(&tsk->cong_links, msg_orignode(hdr)); + tsk->cong_link_cnt--; + sk->sk_write_space(sk); + break; + case TOP_SRV: + tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + break; + default: + break; + } + + kfree_skb(skb); +} + /** - * filter_connect - Handle all incoming messages for a connection-based socket + * tipc_filter_connect - Handle incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * * Returns true if everything ok, false otherwise */ -static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -1657,7 +1684,7 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) } /** - * filter_rcv - validate incoming message + * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. * @@ -1666,75 +1693,49 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * * Called with socket lock already taken * - * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { + bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); - unsigned int limit = rcvbuf_limit(sk, skb); - int err = TIPC_OK; + struct net *net = sock_net(sk); + struct sk_buff_head inputq; + int limit, err = TIPC_OK; - if (unlikely(!msg_isdata(hdr))) { - switch (msg_user(hdr)) { - case CONN_MANAGER: - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - case SOCK_WAKEUP: - u32_del(&tsk->cong_links, msg_orignode(hdr)); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); - break; - case TOP_SRV: - tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); - break; - default: - break; - } - kfree_skb(skb); - return false; - } + TIPC_SKB_CB(skb)->bytes_read = 0; + __skb_queue_head_init(&inputq); + __skb_queue_tail(&inputq, skb); - /* Drop if illegal message type */ - if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { - kfree_skb(skb); - return false; - } + if (unlikely(!msg_isdata(hdr))) + tipc_sk_proto_rcv(sk, &inputq, xmitq); + else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) + return kfree_skb(skb); - /* Reject if wrong message type for current socket state */ - if (tipc_sk_type_connectionless(sk)) { - if (msg_connected(hdr)) { + /* Validate and add to receive buffer if there is space */ + while ((skb = __skb_dequeue(&inputq))) { + hdr = buf_msg(skb); + limit = rcvbuf_limit(sk, skb); + if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || + (!sk_conn && msg_connected(hdr))) err = TIPC_ERR_NO_PORT; - goto reject; - } - } else if (unlikely(!filter_connect(tsk, skb))) { - err = TIPC_ERR_NO_PORT; - goto reject; - } + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + err = TIPC_ERR_OVERLOAD; - /* Reject message if there isn't room to queue it */ - if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { - err = TIPC_ERR_OVERLOAD; - goto reject; + if (unlikely(err)) { + tipc_skb_reject(net, err, skb, xmitq); + err = TIPC_OK; + continue; + } + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk); } - - /* Enqueue message */ - TIPC_SKB_CB(skb)->bytes_read = 0; - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - - sk->sk_data_ready(sk); - return true; - -reject: - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) - __skb_queue_tail(xmitq, skb); - return false; } /** - * tipc_backlog_rcv - handle incoming message from backlog queue + * tipc_sk_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @skb: message * @@ -1742,27 +1743,25 @@ reject: * * Returns 0 */ -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - unsigned int truesize = skb->truesize; + unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; u32 dnode, selector; + unsigned int added; __skb_queue_head_init(&xmitq); - if (likely(filter_rcv(sk, skb, &xmitq))) { - atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); - return 0; - } - - if (skb_queue_empty(&xmitq)) - return 0; + tipc_sk_filter_rcv(sk, skb, &xmitq); + added = sk_rmem_alloc_get(sk) - before; + atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); - /* Send response/rejected message */ - skb = __skb_dequeue(&xmitq); - dnode = msg_destnode(buf_msg(skb)); - selector = msg_origport(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + /* Send pending response/rejected messages, if any */ + while ((skb = __skb_dequeue(&xmitq))) { + selector = msg_origport(buf_msg(skb)); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + } return 0; } @@ -1794,7 +1793,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb, xmitq); + tipc_sk_filter_rcv(sk, skb, xmitq); continue; } -- cgit v1.1 From f70d37b796241f617107d5585ee96a7e1b660b63 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:21 +0200 Subject: tipc: add new function for sending multiple small messages We see an increasing need to send multiple single-buffer messages of TIPC_SYSTEM_IMPORTANCE to different individual destination nodes. Instead of looping over the send queue and sending each buffer individually, as we do now, we add a new help function tipc_node_distr_xmit() to do this. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index bc226f5..c7c6749 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1740,14 +1740,11 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, * @skb: message * * Caller must hold socket lock - * - * Returns 0 */ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; - u32 dnode, selector; unsigned int added; __skb_queue_head_init(&xmitq); @@ -1757,11 +1754,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - selector = msg_origport(buf_msg(skb)); - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); return 0; } @@ -1840,10 +1833,7 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) spin_unlock_bh(&sk->sk_lock.slock); } /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(net, skb, dnode, dport); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); sock_put(sk); continue; } -- cgit v1.1 From a80ae5306a7346d4e52f59462878beb8362f4bbd Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:22 +0200 Subject: tipc: improve destination linked list We often see a need for a linked list of destination identities, sometimes containing a port number, sometimes a node identity, and sometimes both. The currently defined struct u32_list is not generic enough to cover all cases, so we extend it to contain two u32 integers and rename it to struct tipc_dest_list. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index c7c6749..daf7c4d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -565,7 +565,7 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - u32_list_purge(&tsk->cong_links); + tipc_dest_list_purge(&tsk->cong_links); tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -826,8 +826,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_nametbl_mc_translate(net, msg_nametype(msg), msg_namelower(msg), msg_nameupper(msg), scope, &dports); - portid = u32_pop(&dports); - for (; portid; portid = u32_pop(&dports)) { + while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -1000,7 +999,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } /* Block or return if destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(clinks, dnode, 0)); if (unlikely(rc)) return rc; @@ -1012,7 +1012,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { - u32_push(clinks, dnode); + tipc_dest_push(clinks, dnode, 0); tsk->cong_link_cnt++; rc = 0; } @@ -1549,7 +1549,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, tipc_sk_conn_proto_rcv(tsk, skb, xmitq); return; case SOCK_WAKEUP: - u32_del(&tsk->cong_links, msg_orignode(hdr)); + tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); tsk->cong_link_cnt--; sk->sk_write_space(sk); break; -- cgit v1.1 From 75da2163dbb6af9f2dce1d80056d11d290dd19a5 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:23 +0200 Subject: tipc: introduce communication groups As a preparation for introducing flow control for multicast and datagram messaging we need a more strictly defined framework than we have now. A socket must be able keep track of exactly how many and which other sockets it is allowed to communicate with at any moment, and keep the necessary state for those. We therefore introduce a new concept we have named Communication Group. Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN. The call takes four parameters: 'type' serves as group identifier, 'instance' serves as an logical member identifier, and 'scope' indicates the visibility of the group (node/cluster/zone). Finally, 'flags' makes it possible to set certain properties for the member. For now, there is only one flag, indicating if the creator of the socket wants to receive a copy of broadcast or multicast messages it is sending via the socket, and if wants to be eligible as destination for its own anycasts. A group is closed, i.e., sockets which have not joined a group will not be able to send messages to or receive messages from members of the group, and vice versa. Any member of a group can send multicast ('group broadcast') messages to all group members, optionally including itself, using the primitive send(). The messages are received via the recvmsg() primitive. A socket can only be member of one group at a time. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 182 insertions(+), 27 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index daf7c4d..64bbf9d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2016, Ericsson AB + * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -45,6 +45,7 @@ #include "socket.h" #include "bcast.h" #include "netlink.h" +#include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -78,7 +79,7 @@ enum { * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links - * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node @@ -109,6 +110,7 @@ struct tipc_sock { struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; + struct tipc_group *group; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -123,6 +125,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); +static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); @@ -559,6 +562,7 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; + tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); @@ -601,7 +605,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } - + if (tsk->group) { + res = -EACCES; + goto exit; + } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; @@ -698,6 +705,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; u32 mask = 0; sock_poll_wait(file, sk_sleep(sk), wait); @@ -718,8 +726,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); break; case TIPC_OPEN: - if (!tsk->cong_link_cnt) - mask |= POLLOUT; + if (!grp || tipc_group_size(grp)) + if (!tsk->cong_link_cnt) + mask |= POLLOUT; if (tipc_sk_type_connectionless(sk) && (!skb_queue_empty(&sk->sk_receive_queue))) mask |= (POLLIN | POLLRDNORM); @@ -757,6 +766,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_nlist dsts; int rc; + if (tsk->group) + return -EACCES; + /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) @@ -794,6 +806,64 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, } /** + * tipc_send_group_bcast - send message to all members in communication group + * @sk: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_mc_method *method = &tsk->mc_method; + struct tipc_msg *hdr = &tsk->phdr; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; + int rc = -EHOSTUNREACH; + + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + + /* Block or return if any destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nameinst(hdr, 0); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_mcast_xmit(net, &pkts, method, dsts, + &tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Update broadcast sequence number */ + tipc_group_update_bc_members(tsk->group); + + return dlen; +} + +/** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup @@ -803,13 +873,15 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - struct tipc_msg *msg; - struct list_head dports; - u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; - struct sk_buff_head tmpq; - uint hsz; + u32 self = tipc_own_addr(net); struct sk_buff *skb, *_skb; + u32 lower = 0, upper = ~0; + struct sk_buff_head tmpq; + u32 portid, oport, onode; + struct list_head dports; + struct tipc_msg *msg; + int hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -818,14 +890,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); hsz = skb_headroom(skb) + msg_hdr_sz(msg); - - if (in_own_node(net, msg_orignode(msg))) + oport = msg_origport(msg); + onode = msg_orignode(msg); + if (onode == self) scope = TIPC_NODE_SCOPE; /* Create destination port list and message clones: */ - tipc_nametbl_mc_translate(net, - msg_nametype(msg), msg_namelower(msg), - msg_nameupper(msg), scope, &dports); + if (!msg_in_group(msg)) { + lower = msg_namelower(msg); + upper = msg_nameupper(msg); + } + tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, + scope, &dports); while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { @@ -895,10 +971,6 @@ exit: kfree_skb(skb); } -static void tipc_sk_top_evt(struct tipc_sock *tsk, struct tipc_event *evt) -{ -} - /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -934,6 +1006,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; @@ -944,6 +1017,9 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (unlikely(grp)) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (unlikely(!dest)) { dest = &tsk->peer; if (!syn || dest->family != AF_TIPC) @@ -1543,6 +1619,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, struct sk_buff *skb = __skb_dequeue(inputq); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + struct tipc_group *grp = tsk->group; switch (msg_user(hdr)) { case CONN_MANAGER: @@ -1553,8 +1630,12 @@ static void tipc_sk_proto_rcv(struct sock *sk, tsk->cong_link_cnt--; sk->sk_write_space(sk); break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, hdr, xmitq); + break; case TOP_SRV: - tipc_sk_top_evt(tsk, (void *)msg_data(hdr)); + tipc_group_member_evt(tsk->group, skb, xmitq); + skb = NULL; break; default: break; @@ -1699,6 +1780,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, { bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); struct net *net = sock_net(sk); struct sk_buff_head inputq; @@ -1710,15 +1792,19 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) + else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG)) return kfree_skb(skb); + if (unlikely(grp)) + tipc_group_filter_msg(grp, &inputq, xmitq); + /* Validate and add to receive buffer if there is space */ while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); limit = rcvbuf_limit(sk, skb); if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || - (!sk_conn && msg_connected(hdr))) + (!sk_conn && msg_connected(hdr)) || + (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) err = TIPC_ERR_OVERLOAD; @@ -1837,7 +1923,6 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) sock_put(sk); continue; } - /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) @@ -1905,6 +1990,11 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, lock_sock(sk); + if (tsk->group) { + res = -EINVAL; + goto exit; + } + if (dst->family == AF_UNSPEC) { memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); if (!tipc_sk_type_connectionless(sk)) @@ -2341,6 +2431,52 @@ void tipc_sk_rht_destroy(struct net *net) rhashtable_destroy(&tn->sk_rht); } +static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) +{ + struct net *net = sock_net(&tsk->sk); + u32 domain = addr_domain(net, mreq->scope); + struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq seq; + int rc; + + if (mreq->type < TIPC_RESERVED_TYPES) + return -EACCES; + if (grp) + return -EACCES; + grp = tipc_group_create(net, tsk->portid, mreq); + if (!grp) + return -ENOMEM; + tsk->group = grp; + msg_set_lookup_scope(hdr, mreq->scope); + msg_set_nametype(hdr, mreq->type); + msg_set_dest_droppable(hdr, true); + seq.type = mreq->type; + seq.lower = mreq->instance; + seq.upper = seq.lower; + tipc_nametbl_build_group(net, grp, mreq->type, domain); + rc = tipc_sk_publish(tsk, mreq->scope, &seq); + if (rc) + tipc_group_delete(net, grp); + return rc; +} + +static int tipc_sk_leave(struct tipc_sock *tsk) +{ + struct net *net = sock_net(&tsk->sk); + struct tipc_group *grp = tsk->group; + struct tipc_name_seq seq; + int scope; + + if (!grp) + return -EINVAL; + tipc_group_self(grp, &seq, &scope); + tipc_group_delete(net, grp); + tsk->group = NULL; + tipc_sk_withdraw(tsk, scope, &seq); + return 0; +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -2359,6 +2495,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group_req mreq; u32 value = 0; int res = 0; @@ -2374,9 +2511,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + if (get_user(value, (u32 __user *)ov)) + return -EFAULT; + break; + case TIPC_GROUP_JOIN: + if (ol < sizeof(mreq)) + return -EINVAL; + if (copy_from_user(&mreq, ov, sizeof(mreq))) + return -EFAULT; break; default: if (ov || ol) @@ -2409,6 +2551,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; + case TIPC_GROUP_JOIN: + res = tipc_sk_join(tsk, &mreq); + break; + case TIPC_GROUP_LEAVE: + res = tipc_sk_leave(tsk); + break; default: res = -EINVAL; } @@ -2436,7 +2584,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - int len; + struct tipc_name_seq seq; + int len, scope; u32 value; int res; @@ -2470,6 +2619,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_GROUP_JOIN: + seq.type = 0; + if (tsk->group) + tipc_group_self(tsk->group, &seq, &scope); + value = seq.type; + break; default: res = -EINVAL; } -- cgit v1.1 From 31c82a2d9d51fccbb85cbd2be983eb115225301c Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:24 +0200 Subject: tipc: add second source address to recvmsg()/recvfrom() With group communication, it becomes important for a message receiver to identify not only from which socket (identfied by a node:port tuple) the message was sent, but also the logical identity (type:instance) of the sending member. We fix this by adding a second instance of struct sockaddr_tipc to the source address area when a message is read. The extra address struct is filled in with data found in the received message header (type,) and in the local member representation struct (instance.) Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 53 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 64bbf9d..25ecf12 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -62,6 +62,11 @@ enum { TIPC_CONNECTING = TCP_SYN_SENT, }; +struct sockaddr_pair { + struct sockaddr_tipc sock; + struct sockaddr_tipc member; +}; + /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API @@ -1222,26 +1227,38 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, } /** - * set_orig_addr - capture sender's address for received message + * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @msg: received message header + * @hdr: received message header * * Note: Address is not captured if not requested by receiver. */ -static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) -{ - DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name); - - if (addr) { - addr->family = AF_TIPC; - addr->addrtype = TIPC_ADDR_ID; - memset(&addr->addr, 0, sizeof(addr->addr)); - addr->addr.id.ref = msg_origport(msg); - addr->addr.id.node = msg_orignode(msg); - addr->addr.name.domain = 0; /* could leave uninitialized */ - addr->scope = 0; /* could leave uninitialized */ - m->msg_namelen = sizeof(struct sockaddr_tipc); - } +static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) +{ + DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name); + struct tipc_msg *hdr = buf_msg(skb); + + if (!srcaddr) + return; + + srcaddr->sock.family = AF_TIPC; + srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addr.id.ref = msg_origport(hdr); + srcaddr->sock.addr.id.node = msg_orignode(hdr); + srcaddr->sock.addr.name.domain = 0; + srcaddr->sock.scope = 0; + m->msg_namelen = sizeof(struct sockaddr_tipc); + + if (!msg_in_group(hdr)) + return; + + /* Group message users may also want to know sending member's id */ + srcaddr->member.family = AF_TIPC; + srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addr.name.name.type = msg_nametype(hdr); + srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; + srcaddr->member.addr.name.domain = 0; + m->msg_namelen = sizeof(*srcaddr); } /** @@ -1432,7 +1449,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } while (1); /* Collect msg meta data, including error code and rejected data */ - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (unlikely(rc)) goto exit; @@ -1526,7 +1543,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (rc) break; -- cgit v1.1 From ae236fb208a6fbbd2e7a6913385e8fb78ac807f8 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:25 +0200 Subject: tipc: receive group membership events via member socket Like with any other service, group members' availability can be subscribed for by connecting to be topology server. However, because the events arrive via a different socket than the member socket, there is a real risk that membership events my arrive out of synch with the actual JOIN/LEAVE action. I.e., it is possible to receive the first messages from a new member before the corresponding JOIN event arrives, just as it is possible to receive the last messages from a leaving member after the LEAVE event has already been received. Since each member socket is internally also subscribing for membership events, we now fix this problem by passing those events on to the user via the member socket. We leverage the already present member synch- ronization protocol to guarantee correct message/event order. An event is delivered to the user as an empty message where the two source addresses identify the new/lost member. Furthermore, we set the MSG_OOB bit in the message flags to mark it as an event. If the event is an indication about a member loss we also set the MSG_EOR bit, so it can be distinguished from a member addition event. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 25ecf12..0a2eac30 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -709,41 +709,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; - u32 mask = 0; + u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; + revents |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + revents |= POLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) - mask |= POLLOUT; + revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= (POLLIN | POLLRDNORM); + if (skb) + revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: if (!grp || tipc_group_size(grp)) if (!tsk->cong_link_cnt) - mask |= POLLOUT; - if (tipc_sk_type_connectionless(sk) && - (!skb_queue_empty(&sk->sk_receive_queue))) - mask |= (POLLIN | POLLRDNORM); + revents |= POLLOUT; + if (!tipc_sk_type_connectionless(sk)) + break; + if (!skb) + break; + revents |= POLLIN | POLLRDNORM; break; case TIPC_DISCONNECTING: - mask = (POLLIN | POLLRDNORM | POLLHUP); + revents = POLLIN | POLLRDNORM | POLLHUP; break; } - - return mask; + return revents; } /** @@ -1415,11 +1417,12 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *skb; - struct tipc_msg *hdr; bool connected = !tipc_sk_type_connectionless(sk); + struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_msg *hdr; + struct sk_buff *skb; + bool grp_evt; long timeout; /* Catch invalid receive requests */ @@ -1443,6 +1446,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); + grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); @@ -1469,11 +1473,20 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; + /* Mark message as group event if applicable */ + if (unlikely(grp_evt)) { + if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) + m->msg_flags |= MSG_EOR; + m->msg_flags |= MSG_OOB; + copy = 0; + } + /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; tsk_advance_rx_queue(sk); + if (likely(!connected)) goto exit; @@ -1648,10 +1661,10 @@ static void tipc_sk_proto_rcv(struct sock *sk, sk->sk_write_space(sk); break; case GROUP_PROTOCOL: - tipc_group_proto_rcv(grp, hdr, xmitq); + tipc_group_proto_rcv(grp, hdr, inputq, xmitq); break; case TOP_SRV: - tipc_group_member_evt(tsk->group, skb, xmitq); + tipc_group_member_evt(tsk->group, skb, inputq, xmitq); skb = NULL; break; default: -- cgit v1.1 From b7d42635517fde2b095deddd0fba37be2302a285 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:26 +0200 Subject: tipc: introduce flow control for group broadcast messages We introduce an end-to-end flow control mechanism for group broadcast messages. This ensures that no messages are ever lost because of destination receive buffer overflow, with minimal impact on performance. For now, the algorithm is based on the assumption that there is only one active transmitter at any moment in time. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 0a2eac30..50145c9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -201,6 +201,11 @@ static bool tsk_conn_cong(struct tipc_sock *tsk) return tsk->snt_unacked > tsk->snd_win; } +static u16 tsk_blocks(int len) +{ + return ((len / FLOWCTL_BLK_SZ) + 1); +} + /* tsk_blocks(): translate a buffer size in bytes to number of * advertisable blocks, taking into account the ratio truesize(len)/len * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ @@ -831,6 +836,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct tipc_group *grp = tsk->group; struct tipc_nlist *dsts = tipc_group_dests(grp); struct tipc_mc_method *method = &tsk->mc_method; + int blks = tsk_blocks(MCAST_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; int mtu = tipc_bcast_get_mtu(net); struct sk_buff_head pkts; @@ -839,14 +845,15 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, if (!dsts->local && !dsts->remote) return -EHOSTUNREACH; - /* Block or return if any destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + /* Block or return if any destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && + !tipc_group_bc_cong(grp, blks)); if (unlikely(rc)) return rc; /* Complete message header */ msg_set_type(hdr, TIPC_GRP_BCAST_MSG); - msg_set_hdr_sz(hdr, MCAST_H_SIZE); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); msg_set_nameinst(hdr, 0); @@ -864,9 +871,8 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, if (unlikely(rc)) return rc; - /* Update broadcast sequence number */ - tipc_group_update_bc_members(tsk->group); - + /* Update broadcast sequence number and send windows */ + tipc_group_update_bc_members(tsk->group, blks); return dlen; } @@ -1024,7 +1030,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; - if (unlikely(grp)) + if (unlikely(grp && !dest)) return tipc_send_group_bcast(sock, m, dlen, timeout); if (unlikely(!dest)) { @@ -1420,6 +1426,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; bool grp_evt; @@ -1436,8 +1443,8 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + /* Step rcv queue to first msg with data or error; wait if necessary */ do { - /* Look at first msg in receive queue; wait if necessary */ rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) goto exit; @@ -1485,12 +1492,21 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(flags & MSG_PEEK)) goto exit; + /* Send group flow control advertisement when applicable */ + if (tsk->group && msg_in_group(hdr) && !grp_evt) { + skb_queue_head_init(&xmitq); + tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), + msg_orignode(hdr), msg_origport(hdr), + &xmitq); + tipc_node_distr_xmit(sock_net(sk), &xmitq); + } + tsk_advance_rx_queue(sk); if (likely(!connected)) goto exit; - /* Send connection flow control ack when applicable */ + /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); @@ -1650,6 +1666,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); struct tipc_group *grp = tsk->group; + bool wakeup = false; switch (msg_user(hdr)) { case CONN_MANAGER: @@ -1658,19 +1675,23 @@ static void tipc_sk_proto_rcv(struct sock *sk, case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); tsk->cong_link_cnt--; - sk->sk_write_space(sk); + wakeup = true; break; case GROUP_PROTOCOL: - tipc_group_proto_rcv(grp, hdr, inputq, xmitq); + tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); break; case TOP_SRV: - tipc_group_member_evt(tsk->group, skb, inputq, xmitq); + tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, + skb, inputq, xmitq); skb = NULL; break; default: break; } + if (wakeup) + sk->sk_write_space(sk); + kfree_skb(skb); } @@ -1785,6 +1806,9 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + if (unlikely(msg_in_group(hdr))) + return sk->sk_rcvbuf; + if (unlikely(!msg_connected(hdr))) return sk->sk_rcvbuf << msg_importance(hdr); -- cgit v1.1 From 27bd9ec027f396457d1a147043c92ff22fc4c71e Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:27 +0200 Subject: tipc: introduce group unicast messaging We now make it possible to send connectionless unicast messages within a communication group. To send a message, the sender can use either a direct port address, aka port identity, or an indirect port name to be looked up. This type of messages are subject to the same start synchronization and flow control mechanism as group broadcast messages. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 10 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 50145c9..e71c8d2 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -818,6 +818,93 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, } /** + * tipc_send_group_msg - send a message to a member in the group + * @net: network namespace + * @m: message to send + * @mb: group member + * @dnode: destination node + * @dport: destination port + * @dlen: total length of message data + */ +static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, + struct msghdr *m, struct tipc_member *mb, + u32 dnode, u32 dport, int dlen) +{ + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + struct sk_buff_head pkts; + int mtu, rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_UCAST_MSG); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, dport); + msg_set_destnode(hdr, dnode); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tipc_dest_push(&tsk->cong_links, dnode, 0); + tsk->cong_link_cnt++; + } + + /* Update send window and sequence number */ + tipc_group_update_member(mb, blks); + + return dlen; +} + +/** + * tipc_send_group_unicast - send message to a member in the group + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + struct tipc_member *mb = NULL; + u32 node, port; + int rc; + + node = dest->addr.id.node; + port = dest->addr.id.ref; + if (!port && !node) + return -EHOSTUNREACH; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(&tsk->cong_links, node, 0) && + !tipc_group_cong(grp, node, port, blks, &mb)); + if (unlikely(rc)) + return rc; + + if (unlikely(!mb)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mb, node, port, dlen); + + return rc ? rc : dlen; +} + +/** * tipc_send_group_bcast - send message to all members in communication group * @sk: socket structure * @m: message to send @@ -1030,8 +1117,20 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; - if (unlikely(grp && !dest)) - return tipc_send_group_bcast(sock, m, dlen, timeout); + if (likely(dest)) { + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + } + + if (grp) { + if (!dest) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_ID) + return tipc_send_group_unicast(sock, m, dlen, timeout); + return -EINVAL; + } if (unlikely(!dest)) { dest = &tsk->peer; @@ -1039,12 +1138,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EDESTADDRREQ; } - if (unlikely(m->msg_namelen < sizeof(*dest))) - return -EINVAL; - - if (unlikely(dest->family != AF_TIPC)) - return -EINVAL; - if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; @@ -1077,7 +1170,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(hdr, TIPC_DIRECT_MSG); @@ -1846,7 +1938,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_GRP_BCAST_MSG)) + else if (unlikely(msg_type(hdr) > TIPC_GRP_UCAST_MSG)) return kfree_skb(skb); if (unlikely(grp)) -- cgit v1.1 From ee106d7f942dabce1352e01c6fe9ca4a720c2331 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:28 +0200 Subject: tipc: introduce group anycast messaging In this commit, we make it possible to send connectionless unicast messages to any member corresponding to the given member identity, when there is more than one such member. The sender must use a TIPC_ADDR_NAME address to achieve this effect. We also perform load balancing between the destinations, i.e., we primarily select one which has advertised sufficient send window to not cause a block/EAGAIN delay, if any. This mechanism is overlayed on the always present round-robin selection. Anycast messages are subject to the same start synchronization and flow control mechanism as group broadcast messages. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e71c8d2..66165e1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -905,6 +905,88 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, } /** + * tipc_send_group_anycast - send message to any member with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + struct list_head *cong_links = &tsk->cong_links; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_group *grp = tsk->group; + struct tipc_member *first = NULL; + struct tipc_member *mbr = NULL; + struct net *net = sock_net(sk); + u32 node, port, exclude; + u32 type, inst, domain; + struct list_head dsts; + int lookups = 0; + int dstcnt, rc; + bool cong; + + INIT_LIST_HEAD(&dsts); + + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + + while (++lookups < 4) { + first = NULL; + + /* Look for a non-congested destination member, if any */ + while (1) { + if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + &dstcnt, exclude, false)) + return -EHOSTUNREACH; + tipc_dest_pop(&dsts, &node, &port); + cong = tipc_group_cong(grp, node, port, blks, &mbr); + if (!cong) + break; + if (mbr == first) + break; + if (!first) + first = mbr; + } + + /* Start over if destination was not in member list */ + if (unlikely(!mbr)) + continue; + + if (likely(!cong && !tipc_dest_find(cong_links, node, 0))) + break; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(cong_links, node, 0) && + !tipc_group_cong(grp, node, port, + blks, &mbr)); + if (unlikely(rc)) + return rc; + + /* Send, unless destination disappeared while waiting */ + if (likely(mbr)) + break; + } + + if (unlikely(lookups >= 4)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen); + + return rc ? rc : dlen; +} + +/** * tipc_send_group_bcast - send message to all members in communication group * @sk: socket structure * @m: message to send @@ -1127,6 +1209,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (grp) { if (!dest) return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) + return tipc_send_group_anycast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_ID) return tipc_send_group_unicast(sock, m, dlen, timeout); return -EINVAL; -- cgit v1.1 From 5b8dddb63769587badc50725ec9857caaeba4de0 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:29 +0200 Subject: tipc: introduce group multicast messaging The previously introduced message transport to all group members is based on the tipc multicast service, but is logically a broadcast service within the group, and that is what we call it. We now add functionality for sending messages to all group members having a certain identity. Correspondingly, we call this feature 'group multicast'. The service is using unicast when only one destination is found, otherwise it will use the bearer broadcast service to transfer the messages. In the latter case, the receiving members filter arriving messages by looking at the intended destination instance. If there is no match, the message will be dropped, while still being considered received and read as seen by the flow control mechanism. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 66165e1..8fdd969 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -999,6 +999,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); @@ -1021,11 +1022,16 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return rc; /* Complete message header */ - msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + if (dest) { + msg_set_type(hdr, TIPC_GRP_MCAST_MSG); + msg_set_nameinst(hdr, dest->addr.name.name.instance); + } else { + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_nameinst(hdr, 0); + } msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); - msg_set_nameinst(hdr, 0); msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); /* Build message as chain of buffers */ @@ -1046,6 +1052,48 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, } /** + * tipc_send_group_mcast - send message to all members with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + u32 domain, exclude, dstcnt; + struct list_head dsts; + + INIT_LIST_HEAD(&dsts); + + if (seq->lower != seq->upper) + return -ENOTSUPP; + + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, + &dsts, &dstcnt, exclude, true)) + return -EHOSTUNREACH; + + if (dstcnt == 1) { + tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); + return tipc_send_group_unicast(sock, m, dlen, timeout); + } + + tipc_dest_list_purge(&dsts); + return tipc_send_group_bcast(sock, m, dlen, timeout); +} + +/** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup @@ -1213,6 +1261,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return tipc_send_group_anycast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_ID) return tipc_send_group_unicast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_send_group_mcast(sock, m, dlen, timeout); return -EINVAL; } @@ -2022,8 +2072,6 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); - else if (unlikely(msg_type(hdr) > TIPC_GRP_UCAST_MSG)) - return kfree_skb(skb); if (unlikely(grp)) tipc_group_filter_msg(grp, &inputq, xmitq); -- cgit v1.1 From b87a5ea31c935a7f7e11ca85df2ec7917921e96d Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:30 +0200 Subject: tipc: guarantee group unicast doesn't bypass group broadcast Group unicast messages don't follow the same path as broadcast messages, and there is a high risk that unicasts sent from a socket might bypass previously sent broadcasts from the same socket. We fix this by letting all unicast messages carry the sequence number of the next sent broadcast from the same node, but without updating this number at the receiver. This way, a receiver can check and if necessary re-order such messages before they are added to the socket receive buffer. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8fdd969..3276b7a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -830,6 +830,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, struct msghdr *m, struct tipc_member *mb, u32 dnode, u32 dport, int dlen) { + u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct sk_buff_head pkts; @@ -840,6 +841,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, dport); msg_set_destnode(hdr, dnode); + msg_set_grp_bc_seqno(hdr, bc_snd_nxt); /* Build message as chain of buffers */ skb_queue_head_init(&pkts); -- cgit v1.1 From 2f487712b89376fce267223bbb0db93d393d4b09 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:31 +0200 Subject: tipc: guarantee that group broadcast doesn't bypass group unicast We need a mechanism guaranteeing that group unicasts sent out from a socket are not bypassed by later sent broadcasts from the same socket. We do this as follows: - Each time a unicast is sent, we set a the broadcast method for the socket to "replicast" and "mandatory". This forces the first subsequent broadcast message to follow the same network and data path as the preceding unicast to a destination, hence preventing it from overtaking the latter. - In order to make the 'same data path' statement above true, we let group unicasts pass through the multicast link input queue, instead of as previously through the unicast link input queue. - In the first broadcast following a unicast, we set a new header flag, requiring all recipients to immediately acknowledge its reception. - During the period before all the expected acknowledges are received, the socket refuses to accept any more broadcast attempts, i.e., by blocking or returning EAGAIN. This period should typically not be longer than a few microseconds. - When all acknowledges have been received, the sending socket will open up for subsequent broadcasts, this time giving the link layer freedom to itself select the best transmission method. - The forced and/or abrupt transmission method changes described above may lead to broadcasts arriving out of order to the recipients. We remedy this by introducing code that checks and if necessary re-orders such messages at the receiving end. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3276b7a..b1f1c3c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -831,6 +831,7 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, u32 dnode, u32 dport, int dlen) { u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); + struct tipc_mc_method *method = &tsk->mc_method; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct sk_buff_head pkts; @@ -857,9 +858,12 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, tsk->cong_link_cnt++; } - /* Update send window and sequence number */ + /* Update send window */ tipc_group_update_member(mb, blks); + /* A broadcast sent within next EXPIRE period must follow same path */ + method->rcast = true; + method->mandatory = true; return dlen; } @@ -1008,6 +1012,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, struct tipc_group *grp = tsk->group; struct tipc_nlist *dsts = tipc_group_dests(grp); struct tipc_mc_method *method = &tsk->mc_method; + bool ack = method->mandatory && method->rcast; int blks = tsk_blocks(MCAST_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; int mtu = tipc_bcast_get_mtu(net); @@ -1036,6 +1041,9 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, msg_set_destnode(hdr, 0); msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + /* Avoid getting stuck with repeated forced replicasts */ + msg_set_grp_bc_ack_req(hdr, ack); + /* Build message as chain of buffers */ skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); @@ -1043,13 +1051,17 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return rc; /* Send message */ - rc = tipc_mcast_xmit(net, &pkts, method, dsts, - &tsk->cong_link_cnt); + rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt); if (unlikely(rc)) return rc; /* Update broadcast sequence number and send windows */ - tipc_group_update_bc_members(tsk->group, blks); + tipc_group_update_bc_members(tsk->group, blks, ack); + + /* Broadcast link is now free to choose method for next broadcast */ + method->mandatory = false; + method->expires = jiffies; + return dlen; } @@ -1113,7 +1125,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, u32 portid, oport, onode; struct list_head dports; struct tipc_msg *msg; - int hsz; + int user, mtyp, hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -1121,6 +1133,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); + user = msg_user(msg); + mtyp = msg_type(msg); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { + spin_lock_bh(&inputq->lock); + if (skb_peek(arrvq) == skb) { + __skb_dequeue(arrvq); + __skb_queue_tail(inputq, skb); + } + refcount_dec(&skb->users); + spin_unlock_bh(&inputq->lock); + continue; + } hsz = skb_headroom(skb) + msg_hdr_sz(msg); oport = msg_origport(msg); onode = msg_orignode(msg); -- cgit v1.1 From 399574d41963285e72ba28dd46783c96316a81d1 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 13 Oct 2017 11:04:32 +0200 Subject: tipc: guarantee delivery of UP event before first broadcast The following scenario is possible: - A user joins a group, and immediately sends out a broadcast message to its members. - The broadcast message, following a different data path than the initial JOIN message sent out during the joining procedure, arrives to a receiver before the latter.. - The receiver drops the message, since it is not ready to accept any messages until the JOIN has arrived. We avoid this by treating group protocol JOIN messages like unicast messages. - We let them pass through the recipient's multicast input queue, just like ordinary unicasts. - We force the first following broadacst to be sent as replicated unicast and being acknowledged by the recipient before accepting any more broadcast transmissions. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b1f1c3c..2bbab4f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2762,6 +2762,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) tipc_group_delete(net, grp); + + /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; return rc; } -- cgit v1.1 From cb4dc41eaad0cb336bb5ddd379ae0d2cc89cb62b Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Thu, 19 Oct 2017 16:42:04 +0200 Subject: tipc: fix broken tipc_poll() function In commit ae236fb208a6 ("tipc: receive group membership events via member socket") we broke the tipc_poll() function by checking the state of the receive queue before the call to poll_sock_wait(), while relying that state afterwards, when it might have changed. We restore this in this commit. Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 2bbab4f..357954c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -714,7 +714,6 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; u32 revents = 0; @@ -733,7 +732,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (skb) + if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: @@ -742,7 +741,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, revents |= POLLOUT; if (!tipc_sk_type_connectionless(sk)) break; - if (!skb) + if (skb_queue_empty(&sk->sk_receive_queue)) break; revents |= POLLIN | POLLRDNORM; break; -- cgit v1.1 From 0d5fcebf3c370eb27f9a0e8db454625e73cd1cb4 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Fri, 20 Oct 2017 11:21:32 +0200 Subject: tipc: refactor tipc_sk_timeout() function The function tipc_sk_timeout() is more complex than necessary, and even seems to contain an undetected bug. At one of the occurences where we renew the timer we just order it with (HZ / 20), instead of (jiffies + HZ / 20); In this commit we clean up the function. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 357954c..b3b72d8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -48,7 +48,7 @@ #include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ -#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ +#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 @@ -1472,7 +1472,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, msg_set_lookup_scope(msg, 0); msg_set_hdr_sz(msg, SHORT_H_SIZE); - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL); + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); @@ -2533,43 +2533,40 @@ static int tipc_shutdown(struct socket *sock, int how) static void tipc_sk_timeout(unsigned long data) { struct tipc_sock *tsk = (struct tipc_sock *)data; + u32 peer_port = tsk_peer_port(tsk); + u32 peer_node = tsk_peer_node(tsk); + u32 own_node = tsk_own_node(tsk); + u32 own_port = tsk->portid; struct sock *sk = &tsk->sk; + struct net *net = sock_net(sk); struct sk_buff *skb = NULL; - u32 peer_port, peer_node; - u32 own_node = tsk_own_node(tsk); bh_lock_sock(sk); - if (!tipc_sk_connected(sk)) { - bh_unlock_sock(sk); + if (!tipc_sk_connected(sk)) + goto exit; + + /* Try again later if socket is busy */ + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); goto exit; } - peer_port = tsk_peer_port(tsk); - peer_node = tsk_peer_node(tsk); if (tsk->probe_unacked) { - if (!sock_owned_by_user(sk)) { - tipc_set_sk_state(sk, TIPC_DISCONNECTING); - tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), - tsk_peer_port(tsk)); - sk->sk_state_change(sk); - } else { - /* Try again later */ - sk_reset_timer(sk, &sk->sk_timer, (HZ / 20)); - } - - bh_unlock_sock(sk); + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(net, peer_node, peer_port); + sk->sk_state_change(sk); goto exit; } - - skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, - INT_H_SIZE, 0, peer_node, own_node, - peer_port, tsk->portid, TIPC_OK); + /* Send new probe */ + skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, + peer_node, own_node, peer_port, own_port, + TIPC_OK); tsk->probe_unacked = true; - sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTERVAL); + sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); +exit: bh_unlock_sock(sk); if (skb) - tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); -exit: + tipc_node_xmit_skb(net, skb, peer_node, own_port); sock_put(sk); } -- cgit v1.1 From e233df01576bba9f5bafacccd571353b72152bd5 Mon Sep 17 00:00:00 2001 From: Cong Wang <xiyou.wangcong@gmail.com> Date: Tue, 24 Oct 2017 15:44:49 -0700 Subject: tipc: fix a dangling pointer tsk->group is set to grp earlier, but we forget to unset it after grp is freed. Fixes: 75da2163dbb6 ("tipc: introduce communication groups") Reported-by: syzkaller bot Cc: Jon Maloy <jon.maloy@ericsson.com> Cc: Ying Xue <ying.xue@windriver.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b3b72d8..ea61c32f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2756,8 +2756,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) seq.upper = seq.lower; tipc_nametbl_build_group(net, grp, mreq->type, domain); rc = tipc_sk_publish(tsk, mreq->scope, &seq); - if (rc) + if (rc) { tipc_group_delete(net, grp); + tsk->group = NULL; + } /* Eliminate any risk that a broadcast overtakes the sent JOIN */ tsk->mc_method.rcast = true; -- cgit v1.1 From 31b102bb501bea50ebc10f4aecf9d788305b8b87 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Mon, 30 Oct 2017 14:06:45 -0700 Subject: net: tipc: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Jon Maloy <jon.maloy@ericsson.com> Cc: Ying Xue <ying.xue@windriver.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: netdev@vger.kernel.org Cc: tipc-discussion@lists.sourceforge.net Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/socket.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/tipc/socket.c') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ea61c32f..5d18c0c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -125,7 +125,7 @@ static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); -static void tipc_sk_timeout(unsigned long data); +static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, @@ -464,7 +464,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, NAMED_H_SIZE, 0); msg_set_origport(msg, tsk->portid); - setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); + timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; @@ -2530,14 +2530,14 @@ static int tipc_shutdown(struct socket *sock, int how) return res; } -static void tipc_sk_timeout(unsigned long data) +static void tipc_sk_timeout(struct timer_list *t) { - struct tipc_sock *tsk = (struct tipc_sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); + struct tipc_sock *tsk = tipc_sk(sk); u32 peer_port = tsk_peer_port(tsk); u32 peer_node = tsk_peer_node(tsk); u32 own_node = tsk_own_node(tsk); u32 own_port = tsk->portid; - struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; -- cgit v1.1