From f555f3d76aaade29c7e221a37ee64fe722955c09 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Jan 2015 11:37:12 +0100
Subject: genetlink: document parallel_ops

The kernel-doc for the parallel_ops family struct member is
missing, add it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 8412508..2ea2c55 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -27,6 +27,8 @@ struct genl_info;
  * @maxattr: maximum number of attributes supported
  * @netnsok: set to true if the family can handle network
  *	namespaces and should be presented in all of them
+ * @parallel_ops: operations can be called in parallel and aren't
+ *	synchronized by the core genetlink code
  * @pre_doit: called before an operation's doit callback, it may
  *	do additional, common, filtering and return an error
  * @post_doit: called after an operation's doit callback, it may
-- 
cgit v1.1


From ee1c244219fd652964710a6cc3e4f922e86aa492 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Jan 2015 11:37:14 +0100
Subject: genetlink: synchronize socket closing and family removal

In addition to the problem Jeff Layton reported, I looked at the code
and reproduced the same warning by subscribing and removing the genl
family with a socket still open. This is a fairly tricky race which
originates in the fact that generic netlink allows the family to go
away while sockets are still open - unlike regular netlink which has
a module refcount for every open socket so in general this cannot be
triggered.

Trying to resolve this issue by the obvious locking isn't possible as
it will result in deadlocks between unregistration and group unbind
notification (which incidentally lockdep doesn't find due to the home
grown locking in the netlink table.)

To really resolve this, introduce a "closing socket" reference counter
(for generic netlink only, as it's the only affected family) in the
core netlink code and use that in generic netlink to wait for all the
sockets that are being closed at the same time as a generic netlink
family is removed.

This fixes the race that when a socket is closed, it will should call
the unbind, but if the family is removed at the same time the unbind
will not find it, leading to the warning. The real problem though is
that in this case the unbind could actually find a new family that is
registered to have a multicast group with the same ID, and call its
mcast_unbind() leading to confusing.

Also remove the warning since it would still trigger, but is now no
longer a problem.

This also moves the code in af_netlink.c to before unreferencing the
module to avoid having the same problem in the normal non-genl case.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 2ea2c55..6c92415 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -35,7 +35,10 @@ struct genl_info;
  *	undo operations done by pre_doit, for example release locks
  * @mcast_bind: a socket bound to the given multicast group (which
  *	is given as the offset into the groups array)
- * @mcast_unbind: a socket was unbound from the given multicast group
+ * @mcast_unbind: a socket was unbound from the given multicast group.
+ *	Note that unbind() will not be called symmetrically if the
+ *	generic netlink family is removed while there are still open
+ *	sockets.
  * @attrbuf: buffer to store parsed attributes
  * @family_list: family list
  * @mcgrps: multicast groups used by this family (private)
-- 
cgit v1.1


From 75e8d06d4308436055d1a78a2c02bf6328ba724d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 14 Jan 2015 15:33:57 +0100
Subject: netfilter: nf_tables: validate hooks in NAT expressions

The user can crash the kernel if it uses any of the existing NAT
expressions from the wrong hook, so add some code to validate this
when loading the rule.

This patch introduces nft_chain_validate_hooks() which is based on
an existing function in the bridge version of the reject expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3ae969e..9eaaa78 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -530,6 +530,8 @@ enum nft_chain_type {
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
 				  enum nft_chain_type type);
+int nft_chain_validate_hooks(const struct nft_chain *chain,
+                             unsigned int hook_flags);
 
 struct nft_stats {
 	u64			bytes;
-- 
cgit v1.1


From df4d92549f23e1c037e83323aff58a21b3de7fe0 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Fri, 23 Jan 2015 12:01:26 +0100
Subject: ipv4: try to cache dst_entries which would cause a redirect

Not caching dst_entries which cause redirects could be exploited by hosts
on the same subnet, causing a severe DoS attack. This effect aggravated
since commit f88649721268999 ("ipv4: fix dst race in sk_dst_get()").

Lookups causing redirects will be allocated with DST_NOCACHE set which
will force dst_release to free them via RCU.  Unfortunately waiting for
RCU grace period just takes too long, we can end up with >1M dst_entries
waiting to be released and the system will run OOM. rcuos threads cannot
catch up under high softirq load.

Attaching the flag to emit a redirect later on to the specific skb allows
us to cache those dst_entries thus reducing the pressure on allocation
and deallocation.

This issue was discovered by Marcelo Leitner.

Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0bb6207..f7cbd70 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -39,11 +39,12 @@ struct inet_skb_parm {
 	struct ip_options	opt;		/* Compiled IP options		*/
 	unsigned char		flags;
 
-#define IPSKB_FORWARDED		1
-#define IPSKB_XFRM_TUNNEL_SIZE	2
-#define IPSKB_XFRM_TRANSFORMED	4
-#define IPSKB_FRAG_COMPLETE	8
-#define IPSKB_REROUTED		16
+#define IPSKB_FORWARDED		BIT(0)
+#define IPSKB_XFRM_TUNNEL_SIZE	BIT(1)
+#define IPSKB_XFRM_TRANSFORMED	BIT(2)
+#define IPSKB_FRAG_COMPLETE	BIT(3)
+#define IPSKB_REROUTED		BIT(4)
+#define IPSKB_DOREDIRECT	BIT(5)
 
 	u16			frag_max_size;
 };
-- 
cgit v1.1


From e73ebb0881ea5534ce606c1d71b4ac44db5c6930 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:35 -0500
Subject: tcp: stretch ACK fixes prep

LRO, GRO, delayed ACKs, and middleboxes can cause "stretch ACKs" that
cover more than the RFC-specified maximum of 2 packets. These stretch
ACKs can cause serious performance shortfalls in common congestion
control algorithms that were designed and tuned years ago with
receiver hosts that were not using LRO or GRO, and were instead
politely ACKing every other packet.

This patch series fixes Reno and CUBIC to handle stretch ACKs.

This patch prepares for the upcoming stretch ACK bug fix patches. It
adds an "acked" parameter to tcp_cong_avoid_ai() to allow for future
fixes to tcp_cong_avoid_ai() to correctly handle stretch ACKs, and
changes all congestion control algorithms to pass in 1 for the ACKed
count. It also changes tcp_slow_start() to return the number of packet
ACK "credits" that were not processed in slow start mode, and can be
processed by the congestion control module in additive increase mode.

In future patches we will fix tcp_cong_avoid_ai() to handle stretch
ACKs, and fix Reno and CUBIC handling of stretch ACKs in slow start
and additive increase mode.

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf..9d9111e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name);
-void tcp_slow_start(struct tcp_sock *tp, u32 acked);
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
-- 
cgit v1.1


From 0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Jan 2015 17:30:12 -0800
Subject: net: sched: fix panic in rate estimators

Doing the following commands on a non idle network device
panics the box instantly, because cpu_bstats gets overwritten
by stats.

tc qdisc add dev eth0 root <your_favorite_qdisc>
... some traffic (one packet is enough) ...
tc qdisc replace dev eth0 root est 1sec 4sec <your_favorite_qdisc>

[  325.355596] BUG: unable to handle kernel paging request at ffff8841dc5a074c
[  325.362609] IP: [<ffffffff81541c9e>] __gnet_stats_copy_basic+0x3e/0x90
[  325.369158] PGD 1fa7067 PUD 0
[  325.372254] Oops: 0000 [#1] SMP
[  325.375514] Modules linked in: ...
[  325.398346] CPU: 13 PID: 14313 Comm: tc Not tainted 3.19.0-smp-DEV #1163
[  325.412042] task: ffff8800793ab5d0 ti: ffff881ff2fa4000 task.ti: ffff881ff2fa4000
[  325.419518] RIP: 0010:[<ffffffff81541c9e>]  [<ffffffff81541c9e>] __gnet_stats_copy_basic+0x3e/0x90
[  325.428506] RSP: 0018:ffff881ff2fa7928  EFLAGS: 00010286
[  325.433824] RAX: 000000000000000c RBX: ffff881ff2fa796c RCX: 000000000000000c
[  325.440988] RDX: ffff8841dc5a0744 RSI: 0000000000000060 RDI: 0000000000000060
[  325.448120] RBP: ffff881ff2fa7948 R08: ffffffff81cd4f80 R09: 0000000000000000
[  325.455268] R10: ffff883ff223e400 R11: 0000000000000000 R12: 000000015cba0744
[  325.462405] R13: ffffffff81cd4f80 R14: ffff883ff223e460 R15: ffff883feea0722c
[  325.469536] FS:  00007f2ee30fa700(0000) GS:ffff88407fa20000(0000) knlGS:0000000000000000
[  325.477630] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  325.483380] CR2: ffff8841dc5a074c CR3: 0000003feeae9000 CR4: 00000000001407e0
[  325.490510] Stack:
[  325.492524]  ffff883feea0722c ffff883fef719dc0 ffff883feea0722c ffff883ff223e4a0
[  325.499990]  ffff881ff2fa79a8 ffffffff815424ee ffff883ff223e49c 000000015cba0744
[  325.507460]  00000000f2fa7978 0000000000000000 ffff881ff2fa79a8 ffff883ff223e4a0
[  325.514956] Call Trace:
[  325.517412]  [<ffffffff815424ee>] gen_new_estimator+0x8e/0x230
[  325.523250]  [<ffffffff815427aa>] gen_replace_estimator+0x4a/0x60
[  325.529349]  [<ffffffff815718ab>] tc_modify_qdisc+0x52b/0x590
[  325.535117]  [<ffffffff8155edd0>] rtnetlink_rcv_msg+0xa0/0x240
[  325.540963]  [<ffffffff8155ed30>] ? __rtnl_unlock+0x20/0x20
[  325.546532]  [<ffffffff8157f811>] netlink_rcv_skb+0xb1/0xc0
[  325.552145]  [<ffffffff8155b355>] rtnetlink_rcv+0x25/0x40
[  325.557558]  [<ffffffff8157f0d8>] netlink_unicast+0x168/0x220
[  325.563317]  [<ffffffff8157f47c>] netlink_sendmsg+0x2ec/0x3e0

Lets play safe and not use an union : percpu 'pointers' are mostly read
anyway, and we have typically few qdiscs per host.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3d282cb..c605d30 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -79,6 +79,9 @@ struct Qdisc {
 	struct netdev_queue	*dev_queue;
 
 	struct gnet_stats_rate_est64	rate_est;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct gnet_stats_queue	__percpu *cpu_qstats;
+
 	struct Qdisc		*next_sched;
 	struct sk_buff		*gso_skb;
 	/*
@@ -86,15 +89,9 @@ struct Qdisc {
 	 */
 	unsigned long		state;
 	struct sk_buff_head	q;
-	union {
-		struct gnet_stats_basic_packed bstats;
-		struct gnet_stats_basic_cpu __percpu *cpu_bstats;
-	} __packed;
+	struct gnet_stats_basic_packed bstats;
 	unsigned int		__state;
-	union {
-		struct gnet_stats_queue	qstats;
-		struct gnet_stats_queue	__percpu *cpu_qstats;
-	} __packed;
+	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
 	int			padded;
 	atomic_t		refcnt;
-- 
cgit v1.1


From bdbbb8527b6f6a358dbcb70dac247034d665b8e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Jan 2015 21:35:05 -0800
Subject: ipv4: tcp: get rid of ugly unicast_sock

In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible :

commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.

commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.

commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         | 2 +-
 include/net/netns/ipv4.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index f7cbd70..09cf5ae 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 24945ce..0ffef1a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -52,6 +52,7 @@ struct netns_ipv4 {
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_log;
+	struct sock  * __percpu	*tcp_sk;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
-- 
cgit v1.1


From 0508c07f5e0c94f38afd5434e8b2a55b84553077 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Tue, 3 Feb 2015 16:36:15 -0500
Subject: ipv6: Select fragment id during UFO segmentation if not set.

If the IPv6 fragment id has not been set and we perform
fragmentation due to UFO, select a new fragment id.
We now consider a fragment id of 0 as unset and if id selection
process returns 0 (after all the pertrubations), we set it to
0x80000000, thus giving us ample space not to create collisions
with the next packet we may have to fragment.

When doing UFO integrity checking, we also select the
fragment id if it has not be set yet.   This is stored into
the skb_shinfo() thus allowing UFO to function correclty.

This patch also removes duplicate fragment id generation code
and moves ipv6_select_ident() into the header as it may be
used during GSO.

Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..9bf85d3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -671,6 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
+u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst,
+			struct in6_addr *src);
+void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
 void ipv6_proxy_select_ident(struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
-- 
cgit v1.1


From f4575d3534617eec98c7eb8701185cec96b4374b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Feb 2015 13:31:54 -0800
Subject: flow_keys: n_proto type should be __be16

(struct flow_keys)->n_proto is in network order, use
proper type for this.

Fixes following sparse errors :

net/core/flow_dissector.c:139:39: warning: incorrect type in assignment (different base types)
net/core/flow_dissector.c:139:39:    expected unsigned short [unsigned] [usertype] n_proto
net/core/flow_dissector.c:139:39:    got restricted __be16 [assigned] [usertype] proto
net/core/flow_dissector.c:237:23: warning: incorrect type in assignment (different base types)
net/core/flow_dissector.c:237:23:    expected unsigned short [unsigned] [usertype] n_proto
net/core/flow_dissector.c:237:23:    got restricted __be16 [assigned] [usertype] proto

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_keys.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 7ee2df0..dc8fd81 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -22,9 +22,9 @@ struct flow_keys {
 		__be32 ports;
 		__be16 port16[2];
 	};
-	u16 thoff;
-	u16 n_proto;
-	u8 ip_proto;
+	u16	thoff;
+	__be16	n_proto;
+	u8	ip_proto;
 };
 
 bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
-- 
cgit v1.1


From 677651462c774b5866be2bc42601303a76b021a0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Feb 2015 15:03:25 -0800
Subject: ipv6: fix sparse errors in ip6_make_flowlabel()

include/net/ipv6.h:713:22: warning: incorrect type in assignment (different base types)
include/net/ipv6.h:713:22:    expected restricted __be32 [usertype] hash
include/net/ipv6.h:713:22:    got unsigned int
include/net/ipv6.h:719:25: warning: restricted __be32 degrades to integer
include/net/ipv6.h:719:22: warning: invalid assignment: ^=
include/net/ipv6.h:719:22:    left side has type restricted __be32
include/net/ipv6.h:719:22:    right side has type unsigned int

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 9bf85d3..6e416f6 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -711,7 +711,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 					__be32 flowlabel, bool autolabel)
 {
 	if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) {
-		__be32 hash;
+		u32 hash;
 
 		hash = skb_get_hash(skb);
 
@@ -721,7 +721,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
 		 */
 		hash ^= hash >> 12;
 
-		flowlabel = hash & IPV6_FLOWLABEL_MASK;
+		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
 	}
 
 	return flowlabel;
-- 
cgit v1.1