From 693c56491fb720087437a635e6eaf440659b922f Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Thu, 22 Dec 2016 07:22:29 -0500
Subject: tipc: don't send FIN message from connectionless socket

In commit 6f00089c7372 ("tipc: remove SS_DISCONNECTING state") the
check for socket type is in the wrong place, causing a closing socket
to always send out a FIN message even when the socket was never
connected. This is normally harmless, since the destination node for
such messages most often is zero, and the message will be dropped, but
it is still a wrong and confusing behavior.

We fix this in this commit.

Reviewed-by: Parthasarathy Bhuvaragan
Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/socket.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 333c5da..800caaa 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -441,15 +441,19 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 		if (TIPC_SKB_CB(skb)->bytes_read) {
 			kfree_skb(skb);
-		} else {
-			if (!tipc_sk_type_connectionless(sk) &&
-			    sk->sk_state != TIPC_DISCONNECTING) {
-				tipc_set_sk_state(sk, TIPC_DISCONNECTING);
-				tipc_node_remove_conn(net, dnode, tsk->portid);
-			}
-			tipc_sk_respond(sk, skb, error);
+			continue;
+		}
+		if (!tipc_sk_type_connectionless(sk) &&
+		    sk->sk_state != TIPC_DISCONNECTING) {
+			tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+			tipc_node_remove_conn(net, dnode, tsk->portid);
 		}
+		tipc_sk_respond(sk, skb, error);
 	}
+
+	if (tipc_sk_type_connectionless(sk))
+		return;
+
 	if (sk->sk_state != TIPC_DISCONNECTING) {
 		skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG,
 				      SHORT_H_SIZE, 0, dnode,
@@ -457,10 +461,8 @@ static void __tipc_shutdown(struct socket *sock, int error)
 				      tsk->portid, error);
 		if (skb)
 			tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
-		if (!tipc_sk_type_connectionless(sk)) {
-			tipc_node_remove_conn(net, dnode, tsk->portid);
-			tipc_set_sk_state(sk, TIPC_DISCONNECTING);
-		}
+		tipc_node_remove_conn(net, dnode, tsk->portid);
+		tipc_set_sk_state(sk, TIPC_DISCONNECTING);
 	}
 }
--
cgit v1.1
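For context, a minimal user-space sketch of the case the patch addresses: a
connectionless TIPC socket that is closed without ever being connected. The
program below is an illustration only, not part of the patch, and assumes the
TIPC module is loaded on the running kernel.

/* Illustration only: close() on a never-connected, connectionless TIPC
 * socket ends up in __tipc_shutdown(); with this fix no FIN/connection
 * message is generated for it.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_TIPC, SOCK_RDM, 0);	/* datagram socket, never connected */

	if (fd < 0) {
		perror("socket(AF_TIPC, SOCK_RDM)");	/* TIPC may not be available */
		return 1;
	}
	close(fd);	/* previously this could queue a stray FIN message */
	return 0;
}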
From 628185cfddf1dfb701c4efe2cfd72cf5b09f5702 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 21 Dec 2016 18:04:11 +0100
Subject: net, sched: fix soft lockup in tc_classify

Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.

What happens is that in tc_ctl_tfilter(), thread A allocates a new tp,
initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating
-EAGAIN and destroying its tp instance which never got linked, we goto
replay and redo A's request.

This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error. tp_created was never cleared in the second round, thus
the kernel thinks that we need to link it into the classifier chain
(once again). tp and *back point to the same object due to the match
we had earlier on. Thus, for thread B's already public tp, we reset
tp->next to tp itself and link it into the chain, which eventually
causes the mentioned endless loop in tc_classify() once a packet hits
the data path.

The fix is to clear tp_created at the beginning of each request, also
when we replay it. On the paths that can cause -EAGAIN we already
destroy the original tp instance we had and on replay we really need
to start from scratch. It seems that this issue was first introduced
in commit 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto
keeps chaining and avoid kernel panic when we use cls_cgroup").

Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein
Signed-off-by: Daniel Borkmann
Cc: Cong Wang
Acked-by: Eric Dumazet
Tested-by: Shahar Klein
Signed-off-by: David S. Miller
---
 net/sched/cls_api.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3fbba79..1ecdf80 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -148,13 +148,15 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	unsigned long cl;
 	unsigned long fh;
 	int err;
-	int tp_created = 0;
+	int tp_created;
 
 	if ((n->nlmsg_type != RTM_GETTFILTER) &&
 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 replay:
+	tp_created = 0;
+
 	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
 	if (err < 0)
 		return err;
--
cgit v1.1
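A stand-alone user-space model may make the replay race easier to follow.
Everything below (struct filter, attach(), the chain) is invented for
illustration and only mimics the shape of the kernel logic; it is not the
kernel API.

/* Model of the tc_ctl_tfilter() replay bug: a stale "created" flag on
 * replay links an already-linked node to itself, so the chain loops.
 */
#include <stdio.h>
#include <stdlib.h>

struct filter {
	int prio;
	struct filter *next;
};

static struct filter *chain;	/* head of the classifier chain */

/* Link f in front of the position 'back' points to, as the kernel does. */
static void attach(struct filter **back, struct filter *f)
{
	f->next = *back;
	*back = f;
}

int main(void)
{
	struct filter *b = calloc(1, sizeof(*b));

	b->prio = 1;
	attach(&chain, b);		/* "thread B" already linked prio 1 */

	/* "thread A" replays its request with a stale created flag. */
	int created = 1;		/* BUG: not reset at the replay label */
	struct filter **back = &chain;
	struct filter *tp = *back;	/* priority match: tp == b */

	if (created)
		attach(back, tp);	/* tp->next = tp; chain now loops */

	/* The walk tc_classify() performs no longer terminates: */
	int hops = 0;
	for (struct filter *f = chain; f; f = f->next) {
		if (++hops > 5) {
			puts("chain loops back on itself -> soft lockup");
			break;
		}
	}
	return 0;
}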
From 56ab6b93007e5000a8812985aec1833c4a6a9ce0 Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Sun, 25 Dec 2016 14:33:16 +0800
Subject: ipv4: Namespaceify tcp_tw_reuse knob

Different namespaces might have different requirements for reusing
TIME-WAIT sockets for new connections. This might be needed where
applications run in separate namespaces and their TIME_WAIT socket
reuse has to be tuned independently of the host.

Signed-off-by: Haishuang Yan
Signed-off-by: David S. Miller
---
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b..22cbd61 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -433,13 +433,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &tcp_adv_win_scale_max,
 	},
 	{
-		.procname	= "tcp_tw_reuse",
-		.data		= &sysctl_tcp_tw_reuse,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{
 		.procname	= "tcp_frto",
 		.data		= &sysctl_tcp_frto,
 		.maxlen		= sizeof(int),
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_tw_reuse",
+		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 30d81f5..fe9da4f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,7 +84,6 @@
 #include
 #include
 
-int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (sysctl_tcp_tw_reuse &&
+	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
@@ -2456,6 +2455,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+	net->ipv4.sysctl_tcp_tw_reuse = 0;
 	return 0;
 
 fail:
--
cgit v1.1
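As a usage sketch (not part of the patch), the per-namespace knob can be
exercised from user space roughly like this. It assumes root privileges
(CAP_SYS_ADMIN) and a mounted /proc; only the fresh namespace created by
unshare() is affected, the host's value stays at its default.

/* Hedged sketch: toggle tcp_tw_reuse in a new network namespace only. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_knob(const char *val)
{
	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror("tcp_tw_reuse");
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	if (unshare(CLONE_NEWNET)) {	/* enter a fresh network namespace */
		perror("unshare");
		return 1;
	}
	write_knob("1");	/* enabled here only */
	puts("tcp_tw_reuse=1 in the new netns; host default (0) untouched");
	return 0;
}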
From df30f7408b187929dbde72661c7f7c615268f1d0 Mon Sep 17 00:00:00 2001
From: pravin shelar
Date: Mon, 26 Dec 2016 08:31:27 -0800
Subject: openvswitch: upcall: Fix vlan handling.

The networking stack accelerates vlan tag handling by keeping the
topmost vlan header in skb metadata rather than in the packet data.
This works as long as the packet remains in the OVS datapath. But
during an OVS upcall the vlan header is pushed onto the packet. When
such a packet is sent back to the OVS datapath, the core networking
stack might not handle it correctly.

The following patch avoids this issue by accelerating the vlan tag
during flow key extraction. This simplifies the datapath by bringing
uniform packet processing to packets from all code paths.

Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets").
CC: Jarno Rajahalme
CC: Jiri Benc
Signed-off-by: Pravin B Shelar
Signed-off-by: David S. Miller
---
 net/openvswitch/datapath.c |  1 -
 net/openvswitch/flow.c     | 54 +++++++++++++++++++++++-----------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2d4c4d3..9c62b63 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -606,7 +606,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	rcu_assign_pointer(flow->sf_acts, acts);
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
-	packet->protocol = flow->key.eth.type;
 
 	rcu_read_lock();
 	dp = get_dp_rcu(net, ovs_header->dp_ifindex);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 08aa926..2c0a00f 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
  * Returns 0 if it encounters a non-vlan or incomplete packet.
  * Returns 1 after successfully parsing vlan tag.
  */
-static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
+			  bool untag_vlan)
 {
 	struct vlan_head *vh = (struct vlan_head *)skb->data;
 
@@ -330,7 +331,20 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
 	key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
 	key_vh->tpid = vh->tpid;
 
-	__skb_pull(skb, sizeof(struct vlan_head));
+	if (unlikely(untag_vlan)) {
+		int offset = skb->data - skb_mac_header(skb);
+		u16 tci;
+		int err;
+
+		__skb_push(skb, offset);
+		err = __skb_vlan_pop(skb, &tci);
+		__skb_pull(skb, offset);
+		if (err)
+			return err;
+		__vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
+	} else {
+		__skb_pull(skb, sizeof(struct vlan_head));
+	}
 	return 1;
 }
 
@@ -351,13 +365,13 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 		key->eth.vlan.tpid = skb->vlan_proto;
 	} else {
 		/* Parse outer vlan tag in the non-accelerated case. */
-		res = parse_vlan_tag(skb, &key->eth.vlan);
+		res = parse_vlan_tag(skb, &key->eth.vlan, true);
 		if (res <= 0)
 			return res;
 	}
 
 	/* Parse inner vlan tag. */
-	res = parse_vlan_tag(skb, &key->eth.cvlan);
+	res = parse_vlan_tag(skb, &key->eth.cvlan, false);
 	if (res <= 0)
 		return res;
 
@@ -800,29 +814,15 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 	if (err)
 		return err;
 
-	if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
-		/* key_extract assumes that skb->protocol is set-up for
-		 * layer 3 packets which is the case for other callers,
-		 * in particular packets recieved from the network stack.
-		 * Here the correct value can be set from the metadata
-		 * extracted above.
-		 */
-		skb->protocol = key->eth.type;
-	} else {
-		struct ethhdr *eth;
-
-		skb_reset_mac_header(skb);
-		eth = eth_hdr(skb);
-
-		/* Normally, setting the skb 'protocol' field would be
-		 * handled by a call to eth_type_trans(), but it assumes
-		 * there's a sending device, which we may not have.
-		 */
-		if (eth_proto_is_802_3(eth->h_proto))
-			skb->protocol = eth->h_proto;
-		else
-			skb->protocol = htons(ETH_P_802_2);
-	}
+	/* key_extract assumes that skb->protocol is set-up for
+	 * layer 3 packets which is the case for other callers,
+	 * in particular packets received from the network stack.
+	 * Here the correct value can be set from the metadata
+	 * extracted above.
+	 * For L2 packet key eth type would be zero. skb protocol
+	 * would be set to correct value later during key-extact.
+	 */
+	skb->protocol = key->eth.type;
 
 	return key_extract(skb, key);
 }
--
cgit v1.1
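For readers less familiar with the "accelerated" tag: the 802.1Q header that
the upcall pushes back into the packet data (and that __skb_vlan_pop() strips
out again) sits between the MAC addresses and the inner EtherType. The snippet
below is a stand-alone illustration with made-up frame bytes and invented
names, not OVS code.

/* Stand-alone illustration of the on-wire 802.1Q tag layout. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct dot1q_tag {
	uint16_t tpid;	/* 0x8100 for 802.1Q */
	uint16_t tci;	/* PCP(3) | DEI(1) | VLAN ID(12) */
};

int main(void)
{
	uint8_t frame[] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* dst MAC */
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* src MAC */
		0x81, 0x00,				/* TPID: 802.1Q */
		0xa0, 0x64,				/* TCI: prio 5, vid 100 */
		0x08, 0x00,				/* inner EtherType: IPv4 */
	};
	struct dot1q_tag tag;

	memcpy(&tag, frame + 12, sizeof(tag));
	if (ntohs(tag.tpid) == 0x8100)
		printf("vlan id %u, prio %u\n",
		       ntohs(tag.tci) & 0x0fff, ntohs(tag.tci) >> 13);
	return 0;
}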
From be26727772cd86979255dfaf1eea967338dc0c9b Mon Sep 17 00:00:00 2001
From: Jason Wang
Date: Tue, 27 Dec 2016 10:49:54 +0800
Subject: net: xdp: remove unused bpf_warn_invalid_xdp_buffer()

After commit 73b62bd085f4737679ea9afc7867fa5f99ba7d1b ("virtio-net:
remove the warning before XDP linearizing"), there are no users of
bpf_warn_invalid_xdp_buffer(), so remove it.

This is a revert for commit f23bc46c30ca5ef58b8549434899fcbac41b2cfc.

Cc: Daniel Borkmann
Cc: John Fastabend
Signed-off-by: Jason Wang
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 net/core/filter.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7190bd6..b146170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2972,12 +2972,6 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-void bpf_warn_invalid_xdp_buffer(void)
-{
-	WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n");
-}
-EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
-
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					int src_reg, int ctx_off,
 					struct bpf_insn *insn_buf,
--
cgit v1.1