From 693c56491fb720087437a635e6eaf440659b922f Mon Sep 17 00:00:00 2001
From: Jon Paul Maloy
Date: Thu, 22 Dec 2016 07:22:29 -0500
Subject: tipc: don't send FIN message from connectionless socket

In commit 6f00089c7372 ("tipc: remove SS_DISCONNECTING state") the
check for socket type is in the wrong place, causing a closing socket
to always send out a FIN message even when the socket was never
connected. This is normally harmless, since the destination node for
such messages most often is zero, and the message will be dropped, but
it is still a wrong and confusing behavior.

We fix this in this commit.

Reviewed-by: Parthasarathy Bhuvaragan
Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/socket.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 333c5da..800caaa 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -441,15 +441,19 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 		if (TIPC_SKB_CB(skb)->bytes_read) {
 			kfree_skb(skb);
-		} else {
-			if (!tipc_sk_type_connectionless(sk) &&
-			    sk->sk_state != TIPC_DISCONNECTING) {
-				tipc_set_sk_state(sk, TIPC_DISCONNECTING);
-				tipc_node_remove_conn(net, dnode, tsk->portid);
-			}
-			tipc_sk_respond(sk, skb, error);
+			continue;
+		}
+		if (!tipc_sk_type_connectionless(sk) &&
+		    sk->sk_state != TIPC_DISCONNECTING) {
+			tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+			tipc_node_remove_conn(net, dnode, tsk->portid);
 		}
+		tipc_sk_respond(sk, skb, error);
 	}
+
+	if (tipc_sk_type_connectionless(sk))
+		return;
+
 	if (sk->sk_state != TIPC_DISCONNECTING) {
 		skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG,
 				      SHORT_H_SIZE, 0, dnode,
@@ -457,10 +461,8 @@ static void __tipc_shutdown(struct socket *sock, int error)
 				      tsk->portid, error);
 		if (skb)
 			tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
-		if (!tipc_sk_type_connectionless(sk)) {
-			tipc_node_remove_conn(net, dnode, tsk->portid);
-			tipc_set_sk_state(sk, TIPC_DISCONNECTING);
-		}
+		tipc_node_remove_conn(net, dnode, tsk->portid);
+		tipc_set_sk_state(sk, TIPC_DISCONNECTING);
 	}
 }
--
cgit v1.1
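For context, a minimal user-space sketch of the case the patch addresses: a
connectionless TIPC socket that is closed without ever being connected. The
program below is an illustration only, not part of the patch, and assumes the
TIPC module is loaded on the running kernel.

/* Illustration only: close() on a never-connected, connectionless TIPC
 * socket ends up in __tipc_shutdown(); with this fix no FIN/connection
 * message is generated for it.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_TIPC, SOCK_RDM, 0);	/* datagram socket, never connected */

	if (fd < 0) {
		perror("socket(AF_TIPC, SOCK_RDM)");	/* TIPC may not be available */
		return 1;
	}
	close(fd);	/* previously this could queue a stray FIN message */
	return 0;
}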
From 628185cfddf1dfb701c4efe2cfd72cf5b09f5702 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 21 Dec 2016 18:04:11 +0100
Subject: net, sched: fix soft lockup in tc_classify

Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.

What happens is that in tc_ctl_tfilter(), thread A allocates a new tp,
initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating
-EAGAIN and destroying its tp instance which never got linked, we goto
replay and redo A's request.

This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error. tp_created was never cleared in the second round, thus
the kernel thinks that we need to link it into the classifier chain
(once again). tp and *back point to the same object due to the match
we had earlier on. Thus, for thread B's already public tp, we reset
tp->next to tp itself and link it into the chain, which eventually
causes the mentioned endless loop in tc_classify() once a packet hits
the data path.

The fix is to clear tp_created at the beginning of each request, also
when we replay it. On the paths that can cause -EAGAIN we already
destroy the original tp instance we had and on replay we really need
to start from scratch. It seems that this issue was first introduced
in commit 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto
keeps chaining and avoid kernel panic when we use cls_cgroup").

Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein
Signed-off-by: Daniel Borkmann
Cc: Cong Wang
Acked-by: Eric Dumazet
Tested-by: Shahar Klein
Signed-off-by: David S. Miller
---
 net/sched/cls_api.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3fbba79..1ecdf80 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -148,13 +148,15 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	unsigned long cl;
 	unsigned long fh;
 	int err;
-	int tp_created = 0;
+	int tp_created;
 
 	if ((n->nlmsg_type != RTM_GETTFILTER) &&
 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 replay:
+	tp_created = 0;
+
 	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
 	if (err < 0)
 		return err;
--
cgit v1.1
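A stand-alone user-space model may make the replay race easier to follow.
Everything below (struct filter, attach(), the chain) is invented for
illustration and only mimics the shape of the kernel logic; it is not the
kernel API.

/* Model of the tc_ctl_tfilter() replay bug: a stale "created" flag on
 * replay links an already-linked node to itself, so the chain loops.
 */
#include <stdio.h>
#include <stdlib.h>

struct filter {
	int prio;
	struct filter *next;
};

static struct filter *chain;	/* head of the classifier chain */

/* Link f in front of the position 'back' points to, as the kernel does. */
static void attach(struct filter **back, struct filter *f)
{
	f->next = *back;
	*back = f;
}

int main(void)
{
	struct filter *b = calloc(1, sizeof(*b));

	b->prio = 1;
	attach(&chain, b);		/* "thread B" already linked prio 1 */

	/* "thread A" replays its request with a stale created flag. */
	int created = 1;		/* BUG: not reset at the replay label */
	struct filter **back = &chain;
	struct filter *tp = *back;	/* priority match: tp == b */

	if (created)
		attach(back, tp);	/* tp->next = tp; chain now loops */

	/* The walk tc_classify() performs no longer terminates: */
	int hops = 0;
	for (struct filter *f = chain; f; f = f->next) {
		if (++hops > 5) {
			puts("chain loops back on itself -> soft lockup");
			break;
		}
	}
	return 0;
}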
From 56ab6b93007e5000a8812985aec1833c4a6a9ce0 Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Sun, 25 Dec 2016 14:33:16 +0800
Subject: ipv4: Namespaceify tcp_tw_reuse knob

Different namespaces might have different requirements for reusing
TIME-WAIT sockets for new connections. This might be needed where
applications run in separate namespaces and their TIME_WAIT socket
reuse has to be tuned independently of the host.

Signed-off-by: Haishuang Yan
Signed-off-by: David S. Miller
---
 net/ipv4/sysctl_net_ipv4.c | 14 +++++++-------
 net/ipv4/tcp_ipv4.c        |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 80bc36b..22cbd61 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -433,13 +433,6 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &tcp_adv_win_scale_max,
 	},
 	{
-		.procname	= "tcp_tw_reuse",
-		.data		= &sysctl_tcp_tw_reuse,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{
 		.procname	= "tcp_frto",
 		.data		= &sysctl_tcp_frto,
 		.maxlen		= sizeof(int),
@@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_tw_reuse",
+		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 30d81f5..fe9da4f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,7 +84,6 @@
 #include
 #include
 
-int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -120,7 +119,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (sysctl_tcp_tw_reuse &&
+	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
@@ -2456,6 +2455,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+	net->ipv4.sysctl_tcp_tw_reuse = 0;
 	return 0;
 
 fail:
--
cgit v1.1
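As a usage sketch (not part of the patch), the per-namespace knob can be
exercised from user space roughly like this. It assumes root privileges
(CAP_SYS_ADMIN) and a mounted /proc; only the fresh namespace created by
unshare() is affected, the host's value stays at its default.

/* Hedged sketch: toggle tcp_tw_reuse in a new network namespace only. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_knob(const char *val)
{
	int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror("tcp_tw_reuse");
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	if (unshare(CLONE_NEWNET)) {	/* enter a fresh network namespace */
		perror("unshare");
		return 1;
	}
	write_knob("1");	/* enabled here only */
	puts("tcp_tw_reuse=1 in the new netns; host default (0) untouched");
	return 0;
}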
From df30f7408b187929dbde72661c7f7c615268f1d0 Mon Sep 17 00:00:00 2001
From: pravin shelar
Date: Mon, 26 Dec 2016 08:31:27 -0800
Subject: openvswitch: upcall: Fix vlan handling.

The networking stack accelerates vlan tag handling by keeping the
topmost vlan header in skb metadata rather than in the packet data.
This works as long as the packet remains in the OVS datapath. But
during an OVS upcall the vlan header is pushed onto the packet. When
such a packet is sent back to the OVS datapath, the core networking
stack might not handle it correctly.

The following patch avoids this issue by accelerating the vlan tag
during flow key extraction. This simplifies the datapath by bringing
uniform packet processing to packets from all code paths.

Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets").
CC: Jarno Rajahalme
CC: Jiri Benc
Signed-off-by: Pravin B Shelar
Signed-off-by: David S. Miller
---
 net/openvswitch/datapath.c |  1 -
 net/openvswitch/flow.c     | 54 +++++++++++++++++++++++-----------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2d4c4d3..9c62b63 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -606,7 +606,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	rcu_assign_pointer(flow->sf_acts, acts);
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
-	packet->protocol = flow->key.eth.type;
 
 	rcu_read_lock();
 	dp = get_dp_rcu(net, ovs_header->dp_ifindex);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 08aa926..2c0a00f 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -312,7 +312,8 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
  * Returns 0 if it encounters a non-vlan or incomplete packet.
  * Returns 1 after successfully parsing vlan tag.
  */
-static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
+static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
+			  bool untag_vlan)
 {
 	struct vlan_head *vh = (struct vlan_head *)skb->data;
 
@@ -330,7 +331,20 @@ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh)
 	key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT);
 	key_vh->tpid = vh->tpid;
 
-	__skb_pull(skb, sizeof(struct vlan_head));
+	if (unlikely(untag_vlan)) {
+		int offset = skb->data - skb_mac_header(skb);
+		u16 tci;
+		int err;
+
+		__skb_push(skb, offset);
+		err = __skb_vlan_pop(skb, &tci);
+		__skb_pull(skb, offset);
+		if (err)
+			return err;
+		__vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
+	} else {
+		__skb_pull(skb, sizeof(struct vlan_head));
+	}
 	return 1;
 }
 
@@ -351,13 +365,13 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 		key->eth.vlan.tpid = skb->vlan_proto;
 	} else {
 		/* Parse outer vlan tag in the non-accelerated case. */
-		res = parse_vlan_tag(skb, &key->eth.vlan);
+		res = parse_vlan_tag(skb, &key->eth.vlan, true);
 		if (res <= 0)
 			return res;
 	}
 
 	/* Parse inner vlan tag. */
-	res = parse_vlan_tag(skb, &key->eth.cvlan);
+	res = parse_vlan_tag(skb, &key->eth.cvlan, false);
 	if (res <= 0)
 		return res;
 
@@ -800,29 +814,15 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
 	if (err)
 		return err;
 
-	if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
-		/* key_extract assumes that skb->protocol is set-up for
-		 * layer 3 packets which is the case for other callers,
-		 * in particular packets recieved from the network stack.
-		 * Here the correct value can be set from the metadata
-		 * extracted above.
-		 */
-		skb->protocol = key->eth.type;
-	} else {
-		struct ethhdr *eth;
-
-		skb_reset_mac_header(skb);
-		eth = eth_hdr(skb);
-
-		/* Normally, setting the skb 'protocol' field would be
-		 * handled by a call to eth_type_trans(), but it assumes
-		 * there's a sending device, which we may not have.
-		 */
-		if (eth_proto_is_802_3(eth->h_proto))
-			skb->protocol = eth->h_proto;
-		else
-			skb->protocol = htons(ETH_P_802_2);
-	}
+	/* key_extract assumes that skb->protocol is set-up for
+	 * layer 3 packets which is the case for other callers,
+	 * in particular packets received from the network stack.
+	 * Here the correct value can be set from the metadata
+	 * extracted above.
+	 * For L2 packet key eth type would be zero. skb protocol
+	 * would be set to correct value later during key-extact.
+	 */
+	skb->protocol = key->eth.type;
 
 	return key_extract(skb, key);
 }
--
cgit v1.1
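For readers less familiar with the "accelerated" tag: the 802.1Q header that
the upcall pushes back into the packet data (and that __skb_vlan_pop() strips
out again) sits between the MAC addresses and the inner EtherType. The snippet
below is a stand-alone illustration with made-up frame bytes and invented
names, not OVS code.

/* Stand-alone illustration of the on-wire 802.1Q tag layout. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct dot1q_tag {
	uint16_t tpid;	/* 0x8100 for 802.1Q */
	uint16_t tci;	/* PCP(3) | DEI(1) | VLAN ID(12) */
};

int main(void)
{
	uint8_t frame[] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff,	/* dst MAC */
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* src MAC */
		0x81, 0x00,				/* TPID: 802.1Q */
		0xa0, 0x64,				/* TCI: prio 5, vid 100 */
		0x08, 0x00,				/* inner EtherType: IPv4 */
	};
	struct dot1q_tag tag;

	memcpy(&tag, frame + 12, sizeof(tag));
	if (ntohs(tag.tpid) == 0x8100)
		printf("vlan id %u, prio %u\n",
		       ntohs(tag.tci) & 0x0fff, ntohs(tag.tci) >> 13);
	return 0;
}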
From be26727772cd86979255dfaf1eea967338dc0c9b Mon Sep 17 00:00:00 2001
From: Jason Wang
Date: Tue, 27 Dec 2016 10:49:54 +0800
Subject: net: xdp: remove unused bpf_warn_invalid_xdp_buffer()

After commit 73b62bd085f4737679ea9afc7867fa5f99ba7d1b ("virtio-net:
remove the warning before XDP linearizing"), there are no users of
bpf_warn_invalid_xdp_buffer(), so remove it.

This is a revert for commit f23bc46c30ca5ef58b8549434899fcbac41b2cfc.

Cc: Daniel Borkmann
Cc: John Fastabend
Signed-off-by: Jason Wang
Acked-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 net/core/filter.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7190bd6..b146170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2972,12 +2972,6 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-void bpf_warn_invalid_xdp_buffer(void)
-{
-	WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n");
-}
-EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
-
 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					int src_reg, int ctx_off,
 					struct bpf_insn *insn_buf,
--
cgit v1.1