From 666b805150efd62f05810ff0db08f44a2370c937 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 Apr 2015 20:26:46 -0400 Subject: tcp: fix FRTO undo on cumulative ACK of SACKed range On processing cumulative ACKs, the FRTO code was not checking the SACKed bit, meaning that there could be a spurious FRTO undo on a cumulative ACK of a previously SACKed skb. The FRTO code should only consider a cumulative ACK to indicate that an original/unretransmitted skb is newly ACKed if the skb was not yet SACKed. The effect of the spurious FRTO undo would typically be to make the connection think that all previously-sent packets were in flight when they really weren't, leading to a stall and an RTO. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Fixes: e33099f96d99c ("tcp: implement RFC5682 F-RTO") Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb4cf8b..f501ac04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3105,10 +3105,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!first_ackt.v64) first_ackt = last_ackt; - if (!(sacked & TCPCB_SACKED_ACKED)) + if (!(sacked & TCPCB_SACKED_ACKED)) { reord = min(pkts_acked, reord); - if (!after(scb->end_seq, tp->high_seq)) - flag |= FLAG_ORIG_SACK_ACKED; + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; + } } if (sacked & TCPCB_SACKED_ACKED) -- cgit v1.1 From ed785309c94445dd90e242370e1f7bb034e008fd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:45 -0700 Subject: ipv4: take rtnl_lock and mark mrt table as freed on namespace cleanup This is the IPv4 part for commit 905a6f96a1b1 (ipv6: take rtnl_lock and mark mrt6 table as freed on namespace cleanup). Cc: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9282544..bc40115 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,10 +278,12 @@ static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; + rtnl_lock(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } + rtnl_unlock(); fib_rules_unregister(net->ipv4.mr_rules_ops); } #else @@ -308,7 +310,10 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { + rtnl_lock(); ipmr_free_table(net->ipv4.mrt); + net->ipv4.mrt = NULL; + rtnl_unlock(); } #endif -- cgit v1.1 From 419df12fb5fa558451319276838c1842f2b11f8f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:46 -0700 Subject: net: move fib_rules_unregister() under rtnl lock We have to hold rtnl lock for fib_rules_unregister() otherwise the following race could happen: fib_rules_unregister(): fib_nl_delrule(): ... ... ... ops = lookup_rules_ops(); list_del_rcu(&ops->list); list_for_each_entry(ops->rules) { fib_rules_cleanup_ops(ops); ... list_del_rcu(); list_del_rcu(); } Note, net->rules_mod_lock is actually not needed at all, either upper layer netns code or rtnl lock guarantees we are safe. Cc: Alexander Duyck Cc: Thomas Graf Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- net/decnet/dn_rules.c | 2 ++ net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ipmr.c | 2 +- net/ipv6/fib6_rules.c | 2 ++ net/ipv6/ip6mr.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 44706e8..e4fdc9d 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -175,9 +175,9 @@ void fib_rules_unregister(struct fib_rules_ops *ops) spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); - fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); + fib_rules_cleanup_ops(ops); call_rcu(&ops->rcu, fib_rules_put_rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index faf7cc3..9d66a0f 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -248,7 +248,9 @@ void __init dn_fib_rules_init(void) void __exit dn_fib_rules_cleanup(void) { + rtnl_lock(); fib_rules_unregister(dn_fib_rules_ops); + rtnl_unlock(); rcu_barrier(); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57be71d..23b9b3e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1111,11 +1111,10 @@ static void ip_fib_net_exit(struct net *net) { unsigned int i; + rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif - - rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bc40115..fe54eba 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -283,8 +283,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_del(&mrt->list); ipmr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv4.mr_rules_ops); + rtnl_unlock(); } #else #define ipmr_for_each_table(mrt, net) \ diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 27ca796..70bc6ab 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -322,7 +322,9 @@ out_fib6_rules_ops: static void __net_exit fib6_rules_net_exit(struct net *net) { + rtnl_lock(); fib_rules_unregister(net->ipv6.fib6_rules_ops); + rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 52028f4..2f1fd9f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -267,8 +267,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) list_del(&mrt->list); ip6mr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv6.mr6_rules_ops); + rtnl_unlock(); } #else #define ip6mr_for_each_table(mrt, net) \ -- cgit v1.1 From 7ba0c47c34a1ea5bc7a24ca67309996cce0569b5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:47 -0700 Subject: ip6mr: call del_timer_sync() in ip6mr_free_table() We need to wait for the flying timers, since we are going to free the mrtable right after it. Cc: Hannes Frederic Sowa Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2f1fd9f..312e0ff 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -336,7 +336,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr6_table *mrt) { - del_timer(&mrt->ipmr_expire_timer); + del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt); kfree(mrt); } -- cgit v1.1 From 6d458f5b4ece8542a5c2281e40008823fec91814 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:36 +0200 Subject: Revert "netns: don't clear nsid too early on removal" This reverts commit 4217291e592d ("netns: don't clear nsid too early on removal"). This is not the right fix, it introduces races. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5221f97..cb5290b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -349,7 +349,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp, *peer; + struct net *net, *tmp; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -365,6 +365,14 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); + for_each_net(tmp) { + int id = __peernet2id(tmp, net, false); + + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + } + idr_destroy(&net->netns_ids); + } rtnl_unlock(); @@ -390,26 +398,12 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); - rtnl_lock(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { - /* Unreference net from all peers (no need to loop over - * net_exit_list because idr_destroy() will be called for each - * element of this list. - */ - for_each_net(peer) { - int id = __peernet2id(peer, net, false); - - if (id >= 0) - idr_remove(&peer->netns_ids, id); - } - idr_destroy(&net->netns_ids); - list_del_init(&net->exit_list); put_user_ns(net->user_ns); net_drop_ns(net); } - rtnl_unlock(); } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.1 From 576b7cd2f6ff1e90b3fc0a000d2fe74f8a50a4bb Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:37 +0200 Subject: netns: don't allocate an id for dead netns First, let's explain the problem. Suppose you have an ipip interface that stands in the netns foo and its link part in the netns bar (so the netns bar has an nsid into the netns foo). Now, you remove the netns bar: - the bar nsid into the netns foo is removed - the netns exit method of ipip is called, thus our ipip iface is removed: => a netlink message is built in the netns foo to advertise this deletion => this netlink message requests an nsid for bar, thus a new nsid is allocated for bar and never removed. This patch adds a check in peernet2id() so that an id cannot be allocated for a netns which is currently destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b..70d3450 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -198,8 +198,10 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) */ int peernet2id(struct net *net, struct net *peer) { - int id = __peernet2id(net, peer, true); + bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; + id = __peernet2id(net, peer, alloc); return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } EXPORT_SYMBOL(peernet2id); -- cgit v1.1 From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001 From: "hannes@stressinduktion.org" Date: Wed, 1 Apr 2015 17:07:44 +0200 Subject: ipv6: protect skb->sk accesses from recursive dereference inside the stack We should not consult skb->sk for output decisions in xmit recursion levels > 0 in the stack. Otherwise local socket settings could influence the result of e.g. tunnel encapsulation process. ipv6 does not conform with this in three places: 1) ip6_fragment: we do consult ipv6_npinfo for frag_size 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should loop the packet back to the local socket 3) ip6_skb_dst_mtu could query the settings from the user socket and force a wrong MTU Furthermore: In sk_mc_loop we could potentially land in WARN_ON(1) if we use a PF_PACKET socket ontop of an IPv6-backed vxlan device. Reuse xmit_recursion as we are currently only interested in protecting tunnel devices. Cc: Jiri Pirko Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- net/core/sock.c | 19 +++++++++++++++++++ net/ipv6/ip6_output.c | 3 ++- 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d..45109b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** diff --git a/net/core/sock.c b/net/core/sock.c index 78e89eb..71e3e5f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e80b61..36cf0ab 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; -- cgit v1.1 From 67e04c29ec0daad9ba29341b4dab4b89526994cf Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 3 Apr 2015 13:46:09 -0700 Subject: l2tp: unregister l2tp_net_ops on failure path Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 895348e..a29a504 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1871,6 +1871,7 @@ static int __init l2tp_init(void) l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0); if (!l2tp_wq) { pr_err("alloc_workqueue failed\n"); + unregister_pernet_device(&l2tp_net_ops); rc = -ENOMEM; goto out; } -- cgit v1.1 From 303038135afbd0520d1e241c02592be6e4ea7204 Mon Sep 17 00:00:00 2001 From: Pavel Nakonechny Date: Sun, 5 Apr 2015 00:46:21 +0300 Subject: net: dsa: fix filling routing table from OF description According to description in 'include/net/dsa.h', in cascade switches configurations where there are more than one interconnected devices, 'rtable' array in 'dsa_chip_data' structure is used to indicate which port on this switch should be used to send packets to that are destined for corresponding switch. However, dsa_of_setup_routing_table() fills 'rtable' with port numbers of the _target_ switch, but not current one. This commit removes redundant devicetree parsing and adds needed port number as a function argument. So dsa_of_setup_routing_table() now just looks for target switch number by parsing parent of 'link' device node. To remove possible misunderstandings with the way of determining target switch number, a corresponding comment was added to the source code and to the DSA device tree bindings documentation file. This was tested on a custom board with two Marvell 88E6095 switches with following corresponding routing tables: { -1, 10 } and { 8, -1 }. Signed-off-by: Pavel Nakonechny Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 2173402..4dea2e0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -501,12 +501,10 @@ static struct net_device *dev_to_net_device(struct device *dev) #ifdef CONFIG_OF static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, struct dsa_chip_data *cd, - int chip_index, + int chip_index, int port_index, struct device_node *link) { - int ret; const __be32 *reg; - int link_port_addr; int link_sw_addr; struct device_node *parent_sw; int len; @@ -519,6 +517,10 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, if (!reg || (len != sizeof(*reg) * 2)) return -EINVAL; + /* + * Get the destination switch number from the second field of its 'reg' + * property, i.e. for "reg = <0x19 1>" sw_addr is '1'. + */ link_sw_addr = be32_to_cpup(reg + 1); if (link_sw_addr >= pd->nr_chips) @@ -535,20 +537,9 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, memset(cd->rtable, -1, pd->nr_chips * sizeof(s8)); } - reg = of_get_property(link, "reg", NULL); - if (!reg) { - ret = -EINVAL; - goto out; - } - - link_port_addr = be32_to_cpup(reg); - - cd->rtable[link_sw_addr] = link_port_addr; + cd->rtable[link_sw_addr] = port_index; return 0; -out: - kfree(cd->rtable); - return ret; } static void dsa_of_free_platform_data(struct dsa_platform_data *pd) @@ -658,7 +649,7 @@ static int dsa_of_probe(struct platform_device *pdev) if (!strcmp(port_name, "dsa") && link && pd->nr_chips > 1) { ret = dsa_of_setup_routing_table(pd, cd, - chip_index, link); + chip_index, port_index, link); if (ret) goto out_free_chip; } -- cgit v1.1