From 864d9664245565a6b9df86a68c7664a25a4fcd58 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Jul 2017 18:49:45 +0200 Subject: net/socket: fix type in assignment and trim long line The commit ffb07550c76f ("copy_msghdr_from_user(): get rid of field-by-field copyin") introduce a new sparse warning: net/socket.c:1919:27: warning: incorrect type in assignment (different address spaces) net/socket.c:1919:27: expected void *msg_control net/socket.c:1919:27: got void [noderef] *[addressable] msg_control and a line above 80 chars, let's fix them Fixes: ffb07550c76f ("copy_msghdr_from_user(): get rid of field-by-field copyin") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/socket.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index bf21226..ad22df1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1916,7 +1916,7 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = msg.msg_control; + kmsg->msg_control = (void __force *)msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -1935,7 +1935,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (msg.msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(msg.msg_name, kmsg->msg_namelen, + err = move_addr_to_kernel(msg.msg_name, + kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; -- cgit v1.1 From 69ec932e364b1ba9c3a2085fe96b76c8a3f71e7c Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 23 Jul 2017 17:52:23 +0800 Subject: openvswitch: fix potential out of bound access in parse_ct Before the 'type' is validated, we shouldn't use it to fetch the ovs_ct_attr_lens's minlen and maxlen, else, out of bound access may happen. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Liping Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e3c4c6c..03859e3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1310,8 +1310,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, nla_for_each_nested(a, attr, rem) { int type = nla_type(a); - int maxlen = ovs_ct_attr_lens[type].maxlen; - int minlen = ovs_ct_attr_lens[type].minlen; + int maxlen; + int minlen; if (type > OVS_CT_ATTR_MAX) { OVS_NLERR(log, @@ -1319,6 +1319,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, type, OVS_CT_ATTR_MAX); return -EINVAL; } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; if (nla_len(a) < minlen || nla_len(a) > maxlen) { OVS_NLERR(log, "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", -- cgit v1.1 From c800aaf8d869f2b9b47b10c5c312fe19f0a94042 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 24 Jul 2017 10:07:32 -0700 Subject: packet: fix use-after-free in prb_retire_rx_blk_timer_expired() There are multiple reports showing we have a use-after-free in the timer prb_retire_rx_blk_timer_expired(), where we use struct tpacket_kbdq_core::pkbdq, a pg_vec, after it gets freed by free_pg_vec(). The interesting part is it is not freed via packet_release() but via packet_setsockopt(), which means we are not closing the socket. Looking into the big and fat function packet_set_ring(), this could happen if we satisfy the following conditions: 1. closing == 0, not on packet_release() path 2. req->tp_block_nr == 0, we don't allocate a new pg_vec 3. rx_ring->pg_vec is already set as V3, which means we already called packet_set_ring() wtih req->tp_block_nr > 0 previously 4. req->tp_frame_nr == 0, pass sanity check 5. po->mapped == 0, never called mmap() In this scenario we are clearing the old rx_ring->pg_vec, so we need to free this pg_vec, but we don't stop the timer on this path because of closing==0. The timer has to be stopped as long as we need to free pg_vec, therefore the check on closing!=0 is wrong, we should check pg_vec!=NULL instead. Thanks to liujian for testing different fixes. Reported-by: alexander.levin@verizon.com Reported-by: Dave Jones Reported-by: liujian (CE) Tested-by: liujian (CE) Cc: Ding Tianhong Cc: Willem de Bruijn Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 008bb34..0615c2a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4329,7 +4329,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, register_prot_hook(sk); } spin_unlock(&po->bind_lock); - if (closing && (po->tp_version > TPACKET_V2)) { + if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); -- cgit v1.1 From 9f9e772da2db279c8d8c9206d271b8009c9093f4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 24 Jul 2017 10:49:23 -0700 Subject: net: dsa: Initialize ds->cpu_port_mask earlier The mt7530 driver has its dsa_switch_ops::get_tag_protocol function check ds->cpu_port_mask to issue a warning in case the configured CPU port is not capable of supporting tags. After commit 14be36c2c96c ("net: dsa: Initialize all CPU and enabled ports masks in dsa_ds_parse()") we slightly re-arranged the initialization such that this was no longer working. Just make sure that ds->cpu_port_mask is set prior to the first call to get_tag_protocol, thus restoring the expected contract. In case of error, the CPU port bit is cleared. Fixes: 14be36c2c96c ("net: dsa: Initialize all CPU and enabled ports masks in dsa_ds_parse()") Reported-by: Sean Wang Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 56e4609..c442051 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -509,21 +509,22 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->netdev = ethernet_dev; } + /* Initialize cpu_port_mask now for drv->setup() + * to have access to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. + */ + ds->cpu_port_mask |= BIT(index); + tag_protocol = ds->ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); + ds->cpu_port_mask &= ~BIT(index); return PTR_ERR(dst->tag_ops); } dst->rcv = dst->tag_ops->rcv; - /* Initialize cpu_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - ds->cpu_port_mask |= BIT(index); - return 0; } -- cgit v1.1 From dce4551cb2adb1ac9a30f8ab5299d614392b3cff Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 25 Jul 2017 17:57:47 +0200 Subject: udp: preserve head state for IP_CMSG_PASSSEC Paul Moore reported a SELinux/IP_PASSSEC regression caused by missing skb->sp at recvmsg() time. We need to preserve the skb head state to process the IP_CMSG_PASSSEC cmsg. With this commit we avoid releasing the skb head state in the BH even if a secpath is attached to the current skb, and stores the skb status (with/without head states) in the scratch area, so that we can access it at skb deallocation time, without incurring in cache-miss penalties. This also avoids misusing the skb CB for ipv6 packets, as introduced by the commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing"). Clean a bit the scratch area helpers implementation, to reduce the code differences between 32 and 64 bits build. Reported-by: Paul Moore Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue") Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Tested-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/udp.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b057653..d243772 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1163,34 +1163,32 @@ out: return ret; } -#if BITS_PER_LONG == 64 +#define UDP_SKB_IS_STATELESS 0x80000000 + static void udp_set_dev_scratch(struct sk_buff *skb) { - struct udp_dev_scratch *scratch; + struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); - scratch = (struct udp_dev_scratch *)&skb->dev_scratch; - scratch->truesize = skb->truesize; + scratch->_tsize_state = skb->truesize; +#if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); +#endif + if (likely(!skb->_skb_refdst)) + scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static int udp_skb_truesize(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; -} -#else -static void udp_set_dev_scratch(struct sk_buff *skb) -{ - skb->dev_scratch = skb->truesize; + return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } -static int udp_skb_truesize(struct sk_buff *skb) +static bool udp_skb_has_head_state(struct sk_buff *skb) { - return skb->dev_scratch; + return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } -#endif /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, @@ -1388,10 +1386,10 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } - /* we cleared the head states previously only if the skb lacks any IP - * options, see __udp_queue_rcv_skb(). + /* In the more common cases we cleared the head states previously, + * see __udp_queue_rcv_skb(). */ - if (unlikely(IPCB(skb)->opt.optlen > 0)) + if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); consume_stateless_skb(skb); } @@ -1784,11 +1782,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* At recvmsg() time we need skb->dst to process IP options-related - * cmsg, elsewhere can we clear all pending head states while they are - * hot in the cache + /* At recvmsg() time we may access skb->dst or skb->sp depending on + * the IP options and the cmsg flags, elsewhere can we clear all + * pending head states while they are hot in the cache */ - if (likely(IPCB(skb)->opt.optlen == 0)) + if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp)) skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); -- cgit v1.1 From afce615aaabfbaad02550e75c0bec106dafa1adf Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 24 Jul 2017 23:14:28 +0200 Subject: ipv6: Don't increase IPSTATS_MIB_FRAGFAILS twice in ip6_fragment() RFC 2465 defines ipv6IfStatsOutFragFails as: "The number of IPv6 datagrams that have been discarded because they needed to be fragmented at this output interface but could not be." The existing implementation, instead, would increase the counter twice in case we fail to allocate room for single fragments: once for the fragment, once for the datagram. This didn't look intentional though. In one of the two affected affected failure paths, the double increase was simply a result of a new 'goto fail' statement, introduced to avoid a skb leak. The other path appears to be affected since at least 2.6.12-rc2. Reported-by: Sabrina Dubroca Fixes: 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1422d6c..162efba 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -673,8 +673,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } @@ -789,8 +787,6 @@ slow_path: frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + hroom + troom, GFP_ATOMIC); if (!frag) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } -- cgit v1.1 From 9688f9b020263c4a17471d09bb18e34eccc1c0a5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 26 Jul 2017 11:33:49 +0200 Subject: udp: unbreak build lacking CONFIG_XFRM We must use pre-processor conditional block or suitable accessors to manipulate skb->sp elsewhere builds lacking the CONFIG_XFRM will break. Fixes: dce4551cb2ad ("udp: preserve head state for IP_CMSG_PASSSEC") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d243772..fac7cb9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1786,7 +1786,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * the IP options and the cmsg flags, elsewhere can we clear all * pending head states while they are hot in the cache */ - if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp)) + if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb))) skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); -- cgit v1.1 From 0c3a8f8b8fabff4f3ad2dd7b95ae0e90cdd1aebb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 25 Jul 2017 11:36:25 -0700 Subject: netpoll: Fix device name check in netpoll_setup() Apparently netpoll_setup() assumes that netpoll.dev_name is a pointer when checking if the device name is set: if (np->dev_name) { ... However the field is a character array, therefore the condition always yields true. Check instead whether the first byte of the array has a non-zero value. Signed-off-by: Matthias Kaehlcke Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 8357f16..912731b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -666,7 +666,7 @@ int netpoll_setup(struct netpoll *np) int err; rtnl_lock(); - if (np->dev_name) { + if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } -- cgit v1.1 From 0c2232b0a71db0ac1d22f751aa1ac0cadb950fd2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:09 +0800 Subject: dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly In dccp_v6_conn_request, after reqsk gets alloced and hashed into ehash table, reqsk's refcnt is set 3. one is for req->rsk_timer, one is for hlist, and the other one is for current using. The problem is when dccp_v6_conn_request returns and finishes using reqsk, it doesn't put reqsk. This will cause reqsk refcnt leaks and reqsk obj never gets freed. Jianlin found this issue when running dccp_memleak.c in a loop, the system memory would run out. dccp_memleak.c: int s1 = socket(PF_INET6, 6, IPPROTO_IP); bind(s1, &sa1, 0x20); listen(s1, 0x9); int s2 = socket(PF_INET6, 6, IPPROTO_IP); connect(s2, &sa1, 0x20); close(s1); close(s2); This patch is to put the reqsk before dccp_v6_conn_request returns, just as what tcp_conn_request does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index c376af5..1b58eac 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -380,6 +380,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: -- cgit v1.1 From b7953d3c0e30a5fc944f6b7bd0bcceb0794bcd85 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:46 +0800 Subject: dccp: fix a memleak that dccp_ipv4 doesn't put reqsk properly The patch "dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly" fixed reqsk refcnt leak for dccp_ipv6. The same issue exists on dccp_ipv4. This patch is to fix it for dccp_ipv4. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f85d901..1b202f1 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -631,6 +631,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: -- cgit v1.1 From e90ce2fc27cad7e7b1e72b9e66201a7a4c124c2b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:20:15 +0800 Subject: dccp: fix a memleak for dccp_feat_init err process In dccp_feat_init, when ccid_get_builtin_ccids failsto alloc memory for rx.val, it should free tx.val before returning an error. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/feat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 1704948..f227f00 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1471,9 +1471,12 @@ int dccp_feat_init(struct sock *sk) * singleton values (which always leads to failure). * These settings can still (later) be overridden via sockopts. */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) + if (ccid_get_builtin_ccids(&tx.val, &tx.len)) return -ENOBUFS; + if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { + kfree(tx.val); + return -ENOBUFS; + } if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) -- cgit v1.1 From 0254e0c632bfe4a0cf610e2d90397474144f00d2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 26 Jul 2017 15:22:06 -0700 Subject: net: check dev->addr_len for dev_set_mac_address() Historically, dev_ifsioc() uses struct sockaddr as mac address definition, this is why dev_set_mac_address() accepts a struct sockaddr pointer as input but now we have various types of mac addresse whose lengths are up to MAX_ADDR_LEN, longer than struct sockaddr, and saved in dev->addr_len. It is too late to fix dev_ifsioc() due to API compatibility, so just reject those larger than sizeof(struct sockaddr), otherwise we would read and use some random bytes from kernel stack. Fortunately, only a few IPv6 tunnel devices have addr_len larger than sizeof(struct sockaddr) and they don't support ndo_set_mac_addr(). But with team driver, in lb mode, they can still be enslaved to a team master and make its mac addr length as the same. Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 06b147d..709a4e6 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -263,6 +263,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: + if (dev->addr_len > sizeof(struct sockaddr)) + return -EINVAL; return dev_set_mac_address(dev, &ifr->ifr_hwaddr); case SIOCSIFHWBROADCAST: -- cgit v1.1 From c9f2c1ae123a751d4e4f949144500219354d5ee1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 27 Jul 2017 14:45:09 +0200 Subject: udp6: fix socket leak on early demux When an early demuxed packet reaches __udp6_lib_lookup_skb(), the sk reference is retrieved and used, but the relevant reference count is leaked and the socket destructor is never called. Beyond leaking the sk memory, if there are pending UDP packets in the receive queue, even the related accounted memory is leaked. In the long run, this will cause persistent forward allocation errors and no UDP skbs (both ipv4 and ipv6) will be able to reach the user-space. Fix this by explicitly accessing the early demux reference before the lookup, and properly decreasing the socket reference count after usage. Also drop the skb_steal_sock() in __udp6_lib_lookup_skb(), and the now obsoleted comment about "socket cache". The newly added code is derived from the current ipv4 code for the similar path. v1 -> v2: fixed the __udp6_lib_rcv() return code for resubmission, as suggested by Eric Reported-by: Sam Edwards Reported-by: Marc Haber Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 27 ++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fac7cb9..e6276fa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1928,7 +1928,7 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; @@ -1937,6 +1937,7 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) dst_release(old); } } +EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4a3e656..98fe456 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -291,11 +291,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct udp_table *udptable) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk; - sk = skb_steal_sock(skb); - if (unlikely(sk)) - return sk; return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), udptable, skb); @@ -804,6 +800,24 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp6_csum_init(skb, uh, proto)) goto csum_error; + /* Check if the socket is already available, e.g. due to early demux */ + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; + + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); + + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input */ + if (ret > 0) + return ret; + return 0; + } + /* * Multicast receive code */ @@ -812,11 +826,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); /* Unicast */ - - /* - * check socket cache ... must talk to Alan about his plans - * for sock caches... i'll skip this for now. - */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { int ret; -- cgit v1.1 From efe967cdec32af93e15839cc639695ec5f637771 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 28 Jul 2017 16:41:37 +0200 Subject: tcp: avoid bogus gcc-7 array-bounds warning When using CONFIG_UBSAN_SANITIZE_ALL, the TCP code produces a false-positive warning: net/ipv4/tcp_output.c: In function 'tcp_connect': net/ipv4/tcp_output.c:2207:40: error: array subscript is below array bounds [-Werror=array-bounds] tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; ^~ net/ipv4/tcp_output.c:2207:40: error: array subscript is below array bounds [-Werror=array-bounds] tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~ I have opened a gcc bug for this, but distros have already shipped compilers with this problem, and it's not clear yet whether there is a way for gcc to avoid the warning. As the problem is related to the bitfield access, this introduces a temporary variable to store the old enum value. I did not notice this warning earlier, since UBSAN is disabled when building with COMPILE_TEST, and that was always turned on in both allmodconfig and randconfig tests. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81601 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4e985de..2f1588b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,9 +2202,10 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; - if (tp->chrono_type > TCP_CHRONO_UNSPEC) - tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + if (old > TCP_CHRONO_UNSPEC) + tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } -- cgit v1.1 From 54e22f265e872ae140755b3318521d400a094605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 6 Jul 2017 07:02:25 +0200 Subject: batman-adv: fix TT sync flag inconsistencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes an issue in the translation table code potentially leading to a TT Request + Response storm. The issue may occur for nodes involving BLA and an inconsistent configuration of the batman-adv AP isolation feature. However, since the new multicast optimizations, a single, malformed packet may lead to a mesh-wide, persistent Denial-of-Service, too. The issue occurs because nodes are currently OR-ing the TT sync flags of all originators announcing a specific MAC address via the translation table. When an intermediate node now receives a TT Request and wants to answer this on behalf of the destination node, then this intermediate node now responds with an altered flag field and broken CRC. The next OGM of the real destination will lead to a CRC mismatch and triggering a TT Request and Response again. Furthermore, the OR-ing is currently never undone as long as at least one originator announcing the according MAC address remains, leading to the potential persistency of this issue. This patch fixes this issue by storing the flags used in the CRC calculation on a a per TT orig entry basis to be able to respond with the correct, original flags in an intermediate TT Response for one thing. And to be able to correctly unset sync flags once all nodes announcing a sync flag vanish for another. Fixes: e9c00136a475 ("batman-adv: fix tt_global_entries flags update") Signed-off-by: Linus Lüssing Acked-by: Antonio Quartulli [sw: typo in commit message] Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 60 ++++++++++++++++++++++++++++++++------ net/batman-adv/types.h | 2 ++ 2 files changed, 53 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index e1133bc..8a3ce79 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1549,9 +1549,41 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, return found; } +/** + * batadv_tt_global_sync_flags - update TT sync flags + * @tt_global: the TT global entry to update sync flags in + * + * Updates the sync flag bits in the tt_global flag attribute with a logical + * OR of all sync flags from any of its TT orig entries. + */ +static void +batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) +{ + struct batadv_tt_orig_list_entry *orig_entry; + const struct hlist_head *head; + u16 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + head = &tt_global->orig_list; + hlist_for_each_entry_rcu(orig_entry, head, list) + flags |= orig_entry->flags; + rcu_read_unlock(); + + flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); + tt_global->common.flags = flags; +} + +/** + * batadv_tt_global_orig_entry_add - add or update a TT orig entry + * @tt_global: the TT global entry to add an orig entry in + * @orig_node: the originator to add an orig entry for + * @ttvn: translation table version number of this changeset + * @flags: TT sync flags + */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, - struct batadv_orig_node *orig_node, int ttvn) + struct batadv_orig_node *orig_node, int ttvn, + u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; @@ -1561,7 +1593,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; - goto out; + orig_entry->flags = flags; + goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); @@ -1573,6 +1606,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; + orig_entry->flags = flags; kref_init(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); @@ -1582,6 +1616,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); +sync_flags: + batadv_tt_global_sync_flags(tt_global); out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); @@ -1703,10 +1739,10 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } /* the change can carry possible "attribute" flags like the - * TT_CLIENT_WIFI, therefore they have to be copied in the + * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ - common->flags |= flags; + common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a @@ -1723,7 +1759,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } add_orig_entry: /* add the new orig_entry (if needed) or update it */ - batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn); + batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, + flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", @@ -1946,6 +1983,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_orig_list_entry *orig, bool best) { + u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; @@ -1975,7 +2013,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || - nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) @@ -2589,6 +2627,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; + struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; @@ -2627,8 +2666,9 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* find out if this global entry is announced by this * originator */ - if (!batadv_tt_global_entry_has_orig(tt_global, - orig_node)) + tt_orig = batadv_tt_global_orig_entry_find(tt_global, + orig_node); + if (!tt_orig) continue; /* use network order to read the VID: this ensures that @@ -2640,10 +2680,12 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* compute the CRC on flags that have to be kept in sync * among nodes */ - flags = tt_common->flags & BATADV_TT_SYNC_MASK; + flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); + + batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ea43a64..a627958 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1260,6 +1260,7 @@ struct batadv_tt_global_entry { * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client * @orig_node: pointer to orig node announcing this non-mesh client * @ttvn: translation table version number which added the non-mesh client + * @flags: per orig entry TT sync flags * @list: list node for batadv_tt_global_entry::orig_list * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -1267,6 +1268,7 @@ struct batadv_tt_global_entry { struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; + u8 flags; struct hlist_node list; struct kref refcount; struct rcu_head rcu; -- cgit v1.1 From 71ed7ee35ad2c5300f4b51634185a0193b4fb0fa Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 28 Jul 2017 23:27:44 +0300 Subject: ipv4: fib: Fix NULL pointer deref during fib_sync_down_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michał reported a NULL pointer deref during fib_sync_down_dev() when unregistering a netdevice. The problem is that we don't check for 'in_dev' being NULL, which can happen in very specific cases. Usually routes are flushed upon NETDEV_DOWN sent in either the netdev or the inetaddr notification chains. However, if an interface isn't configured with any IP address, then it's possible for host routes to be flushed following NETDEV_UNREGISTER, after NULLing dev->ip_ptr in inetdev_destroy(). To reproduce: $ ip link add type dummy $ ip route add local 1.1.1.0/24 dev dummy0 $ ip link del dev dummy0 Fix this by checking for the presence of 'in_dev' before referencing it. Fixes: 982acb97560c ("ipv4: fib: Notify about nexthop status changes") Signed-off-by: Ido Schimmel Reported-by: Michał Mirosław Tested-by: Michał Mirosław Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 22210010..b8d1817 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1452,7 +1452,7 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) return call_fib_notifiers(dev_net(fib_nh->nh_dev), -- cgit v1.1 From cb891fa6a1d5f52c5f5c07b6f7f4c6d65ea55fc0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 31 Jul 2017 16:52:36 +0200 Subject: udp6: fix jumbogram reception Since commit 67a51780aebb ("ipv6: udp: leverage scratch area helpers") udp6_recvmsg() read the skb len from the scratch area, to avoid a cache miss. But the UDP6 rx path support RFC 2675 UDPv6 jumbograms, and their length exceeds the 16 bits available in the scratch area. As a side effect the length returned by recvmsg() is: % (1<<16) This commit addresses the issue allocating one more bit in the IP6CB flags field and setting it for incoming jumbograms. Such field is still in the first cacheline, so at recvmsg() time we can check it and fallback to access skb->len if required, without a measurable overhead. Fixes: 67a51780aebb ("ipv6: udp: leverage scratch area helpers") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 1 + net/ipv6/udp.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 4996d73..3cec529 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -756,6 +756,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; + IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 98fe456..578142b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -328,6 +328,15 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif +/* do not use the scratch area len for jumbogram: their length execeeds the + * scratch area space; note that the IP6CB flags is still in the first + * cacheline, so checking for jumbograms is cheap + */ +static int udp6_skb_len(struct sk_buff *skb) +{ + return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -358,7 +367,7 @@ try_again: if (!skb) return err; - ulen = udp_skb_len(skb); + ulen = udp6_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; -- cgit v1.1 From 986e89898acb3d8f750f259a90cb73afca426b58 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 25 Jul 2017 14:40:03 +0200 Subject: libceph: make encode_request_*() work with r_mempool requests Messages allocated out of ceph_msgpool have a fixed front length (pool->front_len). Asserting that the entire front has been filled while encoding is thus wrong. Fixes: 8cb441c0545d ("libceph: MOSDOp v8 encoding (actual spgid + full hash)") Reported-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Reviewed-by: "Yan, Zheng" --- net/ceph/osd_client.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 901bb82..b5f016c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1918,10 +1918,12 @@ static void encode_request_partial(struct ceph_osd_request *req, } ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ - BUG_ON(p != end - 8); /* space for features */ + BUG_ON(p > end - 8); /* space for features */ msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ /* front_len is finalized in encode_request_finish() */ + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.data_len = cpu_to_le32(data_len); /* * The header "data_off" is a hint to the receiver allowing it @@ -1937,11 +1939,12 @@ static void encode_request_partial(struct ceph_osd_request *req, static void encode_request_finish(struct ceph_msg *msg) { void *p = msg->front.iov_base; + void *const partial_end = p + msg->front.iov_len; void *const end = p + msg->front_alloc_len; if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { /* luminous OSD -- encode features and be done */ - p = end - 8; + p = partial_end; ceph_encode_64(&p, msg->con->peer_features); } else { struct { @@ -1984,7 +1987,7 @@ static void encode_request_finish(struct ceph_msg *msg) oid_len = p - oid; tail = p; - tail_len = (end - p) - 8; + tail_len = partial_end - p; p = msg->front.iov_base; ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); -- cgit v1.1 From 4690faf00cf838392ce038202a85ac0d5f1df598 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 26 Jul 2017 09:59:15 +0200 Subject: libceph: don't call ->reencode_message() more than once per message Reencoding an already reencoded message is a bad idea. This could happen on Policy::stateful_server connections (!CEPH_MSG_CONNECT_LOSSY), such as MDS sessions. This didn't pop up in testing because currently only OSD requests are reencoded and OSD sessions are always lossy. Fixes: 98ad5ebd1505 ("libceph: ceph_connection_operations::reencode_message() method") Signed-off-by: Ilya Dryomov Reviewed-by: "Yan, Zheng" --- net/ceph/messenger.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b7cc615..a67298c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1287,10 +1287,10 @@ static void prepare_write_message(struct ceph_connection *con) if (m->needs_out_seq) { m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; - } - if (con->ops->reencode_message) - con->ops->reencode_message(m); + if (con->ops->reencode_message) + con->ops->reencode_message(m); + } dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.1 From e17e8969f5c59a10083af5e260bdad6026872203 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Jul 2017 16:43:49 +0200 Subject: libceph: fallback for when there isn't a pool-specific choose_arg There is now a fallback to a choose_arg index of -1 if there isn't a pool-specific choose_arg set. If you create a per-pool weight-set, that works for that pool. Otherwise we try the compat/default one. If that doesn't exist either, then we use the normal CRUSH weights. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 64ae9f8..eb57a06 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2301,10 +2301,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, } } +/* + * Magic value used for a "default" fallback choose_args, used if the + * crush_choose_arg_map passed to do_crush() does not exist. If this + * also doesn't exist, fall back to canonical weights. + */ +#define CEPH_DEFAULT_CHOOSE_ARGS -1 + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - u64 choose_args_index) + s64 choose_args_index) { struct crush_choose_arg_map *arg_map; int r; @@ -2313,6 +2320,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, arg_map = lookup_choose_arg_map(&map->crush->choose_args, choose_args_index); + if (!arg_map) + arg_map = lookup_choose_arg_map(&map->crush->choose_args, + CEPH_DEFAULT_CHOOSE_ARGS); mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, -- cgit v1.1 From c7ed1a4bf4b446317eefa0f4916d94b1f6d3ada5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Jul 2017 15:49:52 +0200 Subject: crush: assume weight_set != null imples weight_set_size > 0 Reflects ceph.git commit 5e8fa3e06b68fae1582c9230a3a8d1abc6146286. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 2 +- net/ceph/osdmap.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 746b145..417df67 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -306,7 +306,7 @@ static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, const struct crush_choose_arg *arg, int position) { - if (!arg || !arg->weight_set || arg->weight_set_size == 0) + if (!arg || !arg->weight_set) return bucket->item_weights; if (position >= arg->weight_set_size) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index eb57a06..2586e55 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -295,6 +295,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) ret = decode_choose_arg(p, end, arg); if (ret) goto fail; + + if (arg->ids_size && + arg->ids_size != c->buckets[bucket_index]->size) + goto e_inval; } insert_choose_arg_map(&c->choose_args, arg_map); -- cgit v1.1 From f53b7665c8cec40c8a638b55ee098b721e6be20c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 27 Jul 2017 15:16:39 +0200 Subject: libceph: upmap semantic changes - apply both pg_upmap and pg_upmap_items - allow bidirectional swap of pg-upmap-items Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2586e55..0bec71f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2437,40 +2437,23 @@ static void apply_upmap(struct ceph_osdmap *osdmap, for (i = 0; i < pg->pg_upmap.len; i++) raw->osds[i] = pg->pg_upmap.osds[i]; raw->size = pg->pg_upmap.len; - return; + /* check and apply pg_upmap_items, if any */ } pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - /* - * Note: this approach does not allow a bidirectional swap, - * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. - */ - for (i = 0; i < pg->pg_upmap_items.len; i++) { - int from = pg->pg_upmap_items.from_to[i][0]; - int to = pg->pg_upmap_items.from_to[i][1]; - int pos = -1; - bool exists = false; - - /* make sure replacement doesn't already appear */ - for (j = 0; j < raw->size; j++) { - int osd = raw->osds[j]; - - if (osd == to) { - exists = true; + for (i = 0; i < raw->size; i++) { + for (j = 0; j < pg->pg_upmap_items.len; j++) { + int from = pg->pg_upmap_items.from_to[j][0]; + int to = pg->pg_upmap_items.from_to[j][1]; + + if (from == raw->osds[i]) { + if (!(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) + raw->osds[i] = to; break; } - /* ignore mapping if target is marked out */ - if (osd == from && pos < 0 && - !(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) { - pos = j; - } - } - if (!exists && pos >= 0) { - raw->osds[pos] = to; - return; } } } -- cgit v1.1 From ae78dd8139ce93a528beb7f3914531b7a7be9e30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 27 Jul 2017 17:59:14 +0200 Subject: libceph: make RECOVERY_DELETES feature create a new interval This is needed so that the OSDs can regenerate the missing set at the start of a new interval where support for recovery deletes changed. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 5 +++++ net/ceph/osdmap.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b5f016c..dcfbdd7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool legacy_change; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = ceph_osdmap_flag(osdc, + CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; int ret; @@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, pi->pg_num, t->sort_bitwise, sort_bitwise, + t->recovery_deletes, + recovery_deletes, &last_pgid)) force_resend = true; @@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->pg_num = pi->pg_num; t->pg_num_mask = pi->pg_num_mask; t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; t->osd = acting.primary; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0bec71f..f358d0b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2082,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || @@ -2089,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, old_size != new_size || old_min_size != new_min_size || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || - old_sort_bitwise != new_sort_bitwise; + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) -- cgit v1.1 From 40413955ee265a5e42f710940ec78f5450d49149 Mon Sep 17 00:00:00 2001 From: "yujuan.qi" Date: Mon, 31 Jul 2017 11:23:01 +0800 Subject: Cipso: cipso_v4_optptr enter infinite loop in for(),if((optlen > 0) && (optptr[1] == 0)), enter infinite loop. Test: receive a packet which the ip length > 20 and the first byte of ip option is 0, produce this issue Signed-off-by: yujuan.qi Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c4c6e19..2ae8f54 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1523,9 +1523,17 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { - if (optptr[0] == IPOPT_CIPSO) + switch (optptr[0]) { + case IPOPT_CIPSO: return optptr; - taglen = optptr[1]; + case IPOPT_END: + return NULL; + case IPOPT_NOOP: + taglen = 1; + break; + default: + taglen = optptr[1]; + } optlen -= taglen; optptr += taglen; } -- cgit v1.1 From 1bff8a0c1f8c236209ee369b7952751c04eaa71a Mon Sep 17 00:00:00 2001 From: "K. Den" Date: Tue, 1 Aug 2017 01:05:39 +0900 Subject: gue: fix remcsum when GRO on and CHECKSUM_PARTIAL boundary is outer UDP In the case that GRO is turned on and the original received packet is CHECKSUM_PARTIAL, if the outer UDP header is exactly at the last csum-unnecessary point, which for instance could occur if the packet comes from another Linux guest on the same Linux host, we have to do either remcsum_adjust or set up CHECKSUM_PARTIAL again with its csum_start properly reset considering RCO. However, since b7fe10e5ebac ("gro: Fix remcsum offload to deal with frags in GRO") that barrier in such case could be skipped if GRO turned on, hence we pass over it and the inner L4 validation mistakenly reckons it as a bad csum. This patch makes remcsum_offload being reset at the same time of GRO remcsum cleanup, so as to make it work in such case as before. Fixes: b7fe10e5ebac ("gro: Fix remcsum offload to deal with frags in GRO") Signed-off-by: Koichiro Den Signed-off-by: David S. Miller --- net/ipv4/fou.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 8e0257d..1540db6 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -450,6 +450,7 @@ out_unlock: out: NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; return pp; } -- cgit v1.1 From ed254971edea92c3ac5c67c6a05247a92aa6075e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 1 Aug 2017 13:22:32 -0700 Subject: tcp: avoid setting cwnd to invalid ssthresh after cwnd reduction states If the sender switches the congestion control during ECN-triggered cwnd-reduction state (CA_CWR), upon exiting recovery cwnd is set to the ssthresh value calculated by the previous congestion control. If the previous congestion control is BBR that always keep ssthresh to TCP_INIFINITE_SSTHRESH, cwnd ends up being infinite. The safe step is to avoid assigning invalid ssthresh value when recovery ends. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0c..dad026f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2520,8 +2520,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } -- cgit v1.1 From 2dda640040876cd8ae646408b69eea40c24f9ae9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Aug 2017 23:10:46 -0700 Subject: net: fix keepalive code vs TCP_FASTOPEN_CONNECT syzkaller was able to trigger a divide by 0 in TCP stack [1] Issue here is that keepalive timer needs to be updated to not attempt to send a probe if the connection setup was deferred using TCP_FASTOPEN_CONNECT socket option added in linux-4.11 [1] divide error: 0000 [#1] SMP CPU: 18 PID: 0 Comm: swapper/18 Not tainted task: ffff986f62f4b040 ti: ffff986f62fa2000 task.ti: ffff986f62fa2000 RIP: 0010:[] [] __tcp_select_window+0x8d/0x160 Call Trace: [] tcp_transmit_skb+0x11/0x20 [] tcp_xmit_probe_skb+0xc1/0xe0 [] tcp_write_wakeup+0x68/0x160 [] tcp_keepalive_timer+0x17b/0x230 [] call_timer_fn+0x39/0xf0 [] run_timer_softirq+0x1d7/0x280 [] __do_softirq+0xcb/0x257 [] irq_exit+0x9c/0xb0 [] smp_apic_timer_interrupt+0x6a/0x80 [] apic_timer_interrupt+0x7f/0x90 [] ? cpuidle_enter_state+0x13a/0x3b0 [] ? cpuidle_enter_state+0x11d/0x3b0 Tested: Following packetdrill no longer crashes the kernel `echo 0 >/proc/sys/net/ipv4/tcp_timestamps` // Cache warmup: send a Fast Open cookie request 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) +0 > S 0:0(0) +.01 < S. 123:123(0) ack 1 win 14600 +0 > . 1:1(0) ack 1 +0 close(3) = 0 +0 > F. 1:1(0) ack 1 +0 < F. 1:1(0) ack 2 win 92 +0 > . 2:2(0) ack 2 +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0 +.01 connect(4, ..., ...) = 0 +0 setsockopt(4, SOL_TCP, TCP_KEEPIDLE, [5], 4) = 0 +10 close(4) = 0 `echo 1 >/proc/sys/net/ipv4/tcp_timestamps` Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Wei Wang Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeee..e906014 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -652,7 +652,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); -- cgit v1.1 From b91d532928dff2141ea9c107c3e73104d9843767 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 14:13:46 +0800 Subject: ipv6: set rt6i_protocol properly in the route when it is installed After commit c2ed1880fd61 ("net: ipv6: check route protocol when deleting routes"), ipv6 route checks rt protocol when trying to remove a rt entry. It introduced a side effect causing 'ip -6 route flush cache' not to work well. When flushing caches with iproute, all route caches get dumped from kernel then removed one by one by sending DELROUTE requests to kernel for each cache. The thing is iproute sends the request with the cache whose proto is set with RTPROT_REDIRECT by rt6_fill_node() when kernel dumps it. But in kernel the rt_cache protocol is still 0, which causes the cache not to be matched and removed. So the real reason is rt6i_protocol in the route is not set when it is allocated. As David Ahern's suggestion, this patch is to set rt6i_protocol properly in the route when it is installed and remove the codes setting rtm_protocol according to rt6i_flags in rt6_fill_node. This is also an improvement to keep rt6i_protocol consistent with rtm_protocol. Fixes: c2ed1880fd61 ("net: ipv6: check route protocol when deleting routes") Reported-by: Jianlin Shi Suggested-by: David Ahern Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d30c96..a640fbc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2351,6 +2351,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; + nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) @@ -2461,6 +2462,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -2513,6 +2515,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), @@ -3424,14 +3427,6 @@ static int rt6_fill_node(struct net *net, rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags & RTF_DYNAMIC) - rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) - rtm->rtm_protocol = RTPROT_RA; - else - rtm->rtm_protocol = RTPROT_KERNEL; - } if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; -- cgit v1.1 From e1a10ef7fa876f8510aaec36ea5c0cf34baba410 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:52 -0400 Subject: tcp: introduce tcp_rto_delta_us() helper for xmit timer fix Pure refactor. This helper will be required in the xmit timer fix later in the patch series. (Because the TLP logic will want to make this calculation.) Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad026f..b9ba8d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3004,10 +3004,7 @@ void tcp_rearm_rto(struct sock *sk) /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ -- cgit v1.1 From a2815817ffa68c7933a43eb55836d6e789bd4389 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:53 -0400 Subject: tcp: enable xmit timer fix by having TLP use time when RTO should fire Have tcp_schedule_loss_probe() base the TLP scheduling decision based on when the RTO *should* fire. This is to enable the upcoming xmit timer fix in this series, where tcp_schedule_loss_probe() cannot assume that the last timer installed was an RTO timer (because we are no longer doing the "rearm RTO, rearm RTO, rearm TLP" dance on every ACK). So tcp_schedule_loss_probe() must independently figure out when an RTO would want to fire. In the new TLP implementation following in this series, we cannot assume that icsk_timeout was set based on an RTO; after processing a cumulative ACK the icsk_timeout we see can be from a previous TLP or RTO. So we need to independently recalculate the RTO time (instead of reading it out of icsk_timeout). Removing this dependency on the nature of icsk_timeout makes things a little easier to reason about anyway. Note that the old and new code should be equivalent, since they are both saying: "if the RTO is in the future, but at an earlier time than the normal TLP time, then set the TLP timer to fire when the RTO would have fired". Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f1588b..cd8e257 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2377,8 +2377,8 @@ bool tcp_schedule_loss_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + u32 timeout, rto_delta_us; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { @@ -2417,14 +2417,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) (rtt + (rtt >> 1) + TCP_DELACK_MAX)); timeout = max_t(u32, timeout, msecs_to_jiffies(10)); - /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_jiffies32 + timeout; - rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; - if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_jiffies32; - if (delta > 0) - timeout = delta; - } + /* If the RTO formula yields an earlier time, then use that time. */ + rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); -- cgit v1.1 From df92c8394e6ea0469e8056946ef8add740ab8046 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:54 -0400 Subject: tcp: fix xmit timer to only be reset if data ACKed/SACKed Fix a TCP loss recovery performance bug raised recently on the netdev list, in two threads: (i) July 26, 2017: netdev thread "TCP fast retransmit issues" (ii) July 26, 2017: netdev thread: "[PATCH V2 net-next] TLP: Don't reschedule PTO when there's one outstanding TLP retransmission" The basic problem is that incoming TCP packets that did not indicate forward progress could cause the xmit timer (TLP or RTO) to be rearmed and pushed back in time. In certain corner cases this could result in the following problems noted in these threads: - Repeated ACKs coming in with bogus SACKs corrupted by middleboxes could cause TCP to repeatedly schedule TLPs forever. We kept sending TLPs after every ~200ms, which elicited bogus SACKs, which caused more TLPs, ad infinitum; we never fired an RTO to fill in the holes. - Incoming data segments could, in some cases, cause us to reschedule our RTO or TLP timer further out in time, for no good reason. This could cause repeated inbound data to result in stalls in outbound data, in the presence of packet loss. This commit fixes these bugs by changing the TLP and RTO ACK processing to: (a) Only reschedule the xmit timer once per ACK. (b) Only reschedule the xmit timer if tcp_clean_rtx_queue() deems the ACK indicates sufficient forward progress (a packet was cumulatively ACKed, or we got a SACK for a packet that was sent before the most recent retransmit of the write queue head). This brings us back into closer compliance with the RFCs, since, as the comment for tcp_rearm_rto() notes, we should only restart the RTO timer after forward progress on the connection. Previously we were restarting the xmit timer even in these cases where there was no forward progress. As a side benefit, this commit simplifies and speeds up the TCP timer arming logic. We had been calling inet_csk_reset_xmit_timer() three times on normal ACKs that cumulatively acknowledged some data: 1) Once near the top of tcp_ack() to switch from TLP timer to RTO: if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); 2) Once in tcp_clean_rtx_queue(), to update the RTO: if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); 3) Once in tcp_ack() after tcp_fastretrans_alert() to switch from RTO to TLP: if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); This commit, by only rescheduling the xmit timer once per ACK, simplifies the code and reduces CPU overhead. This commit was tested in an A/B test with Google web server traffic. SNMP stats and request latency metrics were within noise levels, substantiating that for normal web traffic patterns this is a rare issue. This commit was also tested with packetdrill tests to verify that it fixes the timer behavior in the corner cases discussed in the netdev threads mentioned above. This patch is a bug fix patch intended to be queued for -stable relases. Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Reported-by: Klavs Klavsen Reported-by: Mao Wenan Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++++++++++--------- net/ipv4/tcp_output.c | 9 --------- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b9ba8d5..53de142 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@ -3016,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk) } } +/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ +static void tcp_set_xmit_timer(struct sock *sk) +{ + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); +} + /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3177,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3205,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ } if (icsk->icsk_ca_ops->pkts_acked) { @@ -3577,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@ -3644,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cd8e257..276406a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2380,21 +2380,12 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); u32 timeout, rto_delta_us; - /* No consecutive loss probes. */ - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { - tcp_rearm_rto(sk); - return false; - } /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (tp->fastopen_rsk) return false; - /* TLP is only scheduled when next timer event is RTO. */ - if (icsk->icsk_pending != ICSK_TIME_RETRANS) - return false; - /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ -- cgit v1.1 From ec0acb09313074ba1a4976945791d9c6815f39fb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Aug 2017 15:25:25 +0800 Subject: net: sched: set xt_tgchk_param par.net properly in ipt_init_target Now xt_tgchk_param par in ipt_init_target is a local varibale, par.net is not initialized there. Later when xt_check_target calls target's checkentry in which it may access par.net, it would cause kernel panic. Jaroslav found this panic when running: # ip link add TestIface type dummy # tc qd add dev TestIface ingress handle ffff: # tc filter add dev TestIface parent ffff: u32 match u32 0 0 \ action xt -j CONNMARK --set-mark 4 This patch is to pass net param into ipt_init_target and set par.net with it properly in there. v1->v2: As Wang Cong pointed, I missed ipt_net_id != xt_net_id, so fix it by also passing net_id to __tcf_ipt_init. v2->v3: Missed the fixes tag, so add it. Fixes: ecb2421b5ddf ("netfilter: add and use nf_ct_netns_get/put") Reported-by: Jaroslav Aster Signed-off-by: Xin Long Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 36f0ced..94ba5cf 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -36,8 +36,8 @@ static struct tc_action_ops act_ipt_ops; static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; -static int ipt_init_target(struct xt_entry_target *t, char *table, - unsigned int hook) +static int ipt_init_target(struct net *net, struct xt_entry_target *t, + char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -49,6 +49,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, return PTR_ERR(target); t->u.kernel.target = target; + par.net = net; par.table = table; par.entryinfo = NULL; par.target = target; @@ -91,10 +92,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, +static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -159,7 +161,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (unlikely(!t)) goto err2; - err = ipt_init_target(t, tname, hook); + err = ipt_init_target(net, t, tname, hook); if (err < 0) goto err3; @@ -193,18 +195,16 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, + bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, xt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, + bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, -- cgit v1.1 From 8ba60924710cde564a3905588b6219741d6356d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2017 01:41:58 -0700 Subject: tcp: fastopen: tcp_connect() must refresh the route With new TCP_FASTOPEN_CONNECT socket option, there is a possibility to call tcp_connect() while socket sk_dst_cache is either NULL or invalid. +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(4, ..., ...) = 0 << sk->sk_dst_cache becomes obsolete, or even set to NULL >> +1 sendto(4, ..., 1000, MSG_FASTOPEN, ..., ...) = 1000 We need to refresh the route otherwise bad things can happen, especially when syzkaller is running on the host :/ Fixes: 19f6d3f3c8422 ("net/tcp-fastopen: Add new API support") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Yuchung Cheng Acked-by: Wei Wang Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 276406a..b7661a6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3436,6 +3436,10 @@ int tcp_connect(struct sock *sk) int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { -- cgit v1.1 From 05bfd7dbb53a10be4a3e7aebaeec04b558198d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Tue, 8 Aug 2017 11:13:32 +0200 Subject: rds: Reintroduce statistics counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 7e3f2952eeb1 ("rds: don't let RDS shutdown a connection while senders are present"), refilling the receive queue was removed from rds_ib_recv(), along with the increment of s_ib_rx_refill_from_thread. Commit 73ce4317bf98 ("RDS: make sure we post recv buffers") re-introduces filling the receive queue from rds_ib_recv(), but does not add the statistics counter. rds_ib_recv() was later renamed to rds_ib_recv_path(). This commit reintroduces the statistics counting of s_ib_rx_refill_from_thread and s_ib_rx_refill_from_cq. Signed-off-by: Håkon Bugge Reviewed-by: Knut Omang Reviewed-by: Wei Lin Guay Reviewed-by: Shamir Rabinovitch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e10624a..9722bf8 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1015,8 +1015,10 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - if (rds_ib_ring_low(&ic->i_recv_ring)) + if (rds_ib_ring_low(&ic->i_recv_ring)) { rds_ib_recv_refill(conn, 0, GFP_NOWAIT); + rds_ib_stats_inc(s_ib_rx_refill_from_cq); + } } int rds_ib_recv_path(struct rds_conn_path *cp) @@ -1029,6 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); rds_ib_recv_refill(conn, 0, GFP_KERNEL); + rds_ib_stats_inc(s_ib_rx_refill_from_thread); } return ret; -- cgit v1.1 From 8d63bee643f1fb53e472f0e135cae4eb99d62d19 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Aug 2017 14:22:55 -0400 Subject: net: avoid skb_warn_bad_offload false positives on UFO skb_warn_bad_offload triggers a warning when an skb enters the GSO stack at __skb_gso_segment that does not have CHECKSUM_PARTIAL checksum offload set. Commit b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") observed that SKB_GSO_DODGY producers can trigger the check and that passing those packets through the GSO handlers will fix it up. But, the software UFO handler will set ip_summed to CHECKSUM_NONE. When __skb_gso_segment is called from the receive path, this triggers the warning again. Make UFO set CHECKSUM_UNNECESSARY instead of CHECKSUM_NONE. On Tx these two are equivalent. On Rx, this better matches the skb state (checksum computed), as CHECKSUM_NONE here means no checksum computed. See also this thread for context: http://patchwork.ozlabs.org/patch/799015/ Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8515f8f..ce15a06 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 7812501..0932c85 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a2267f8..e7d378c 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -72,7 +72,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in -- cgit v1.1 From 04c2cf34362f133be09878bd752f8b014318b59a Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Tue, 11 Jul 2017 10:07:25 +0300 Subject: mac80211: add api to start ba session timer expired flow Some drivers handle rx buffer reordering internally (and by extension handle also the rx ba session timer internally), but do not ofload the addba/delba negotiation. Add an api for these drivers to properly tear-down the ba session, including sending a delba. Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- net/mac80211/agg-rx.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 8708cbe..2b36eff 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -466,3 +466,23 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_manage_rx_ba_offl); + +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get_bss(sdata, addr); + if (!sta) + goto unlock; + + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_rx_ba_timer_expired); -- cgit v1.1 From ed43594aede9719e56eca72fc6a9a200c60b60e6 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Tue, 8 Aug 2017 22:23:56 +0200 Subject: tipc: remove premature ESTABLISH FSM event at link synchronization When a link between two nodes come up, both endpoints will initially send out a STATE message to the peer, to increase the probability that the peer endpoint also is up when the first traffic message arrives. Thereafter, if the establishing link is the second link between two nodes, this first "traffic" message is a TUNNEL_PROTOCOL/SYNCH message, helping the peer to perform initial synchronization between the two links. However, the initial STATE message may be lost, in which case the SYNCH message will be the first one arriving at the peer. This should also work, as the SYNCH message itself will be used to take up the link endpoint before initializing synchronization. Unfortunately the code for this case is broken. Currently, the link is brought up through a tipc_link_fsm_evt(ESTABLISHED) when a SYNCH arrives, whereupon __tipc_node_link_up() is called to distribute the link slots and take the link into traffic. But, __tipc_node_link_up() is itself starting with a test for whether the link is up, and if true, returns without action. Clearly, the tipc_link_fsm_evt(ESTABLISHED) call is unnecessary, since tipc_node_link_up() is itself issuing such an event, but also harmful, since it inhibits tipc_node_link_up() to perform the test of its tasks, and the link endpoint in question hence is never taken into traffic. This problem has been exposed when we set up dual links between pre- and post-4.4 kernels, because the former ones don't send out the initial STATE message described above. We fix this by removing the unnecessary event call. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index aeef801..9b4dcb6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1455,10 +1455,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { syncpt = iseqno + exp_pkts - 1; - if (!tipc_link_is_up(l)) { - tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); - } if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); -- cgit v1.1 From 1714020e42b17135032c8606f7185b3fb2ba5d78 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 9 Aug 2017 14:38:04 +0300 Subject: igmp: Fix regression caused by igmp sysctl namespace code. Commit dcd87999d415 ("igmp: net: Move igmp namespace init to correct file") moved the igmp sysctls initialization from tcp_sk_init to igmp_net_init. This function is only called as part of per-namespace initialization, only if CONFIG_IP_MULTICAST is defined, otherwise igmp_mc_init() call in ip_init is compiled out, casuing the igmp pernet ops to not be registerd and those sysctl being left initialized with 0. However, there are certain functions, such as ip_mc_join_group which are always compiled and make use of some of those sysctls. Let's do a partial revert of the aforementioned commit and move the sysctl initialization into inet_init_net, that way they will always have sane values. Fixes: dcd87999d415 ("igmp: net: Move igmp namespace init to correct file") Link: https://bugzilla.kernel.org/show_bug.cgi?id=196595 Reported-by: Gerardo Exequiel Pozzi Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 +++++++ net/ipv4/igmp.c | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76c2077c..2e548ec 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1731,6 +1731,13 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif + /* Some igmp sysctl, whose values are always used */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; + return 0; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 28f14af..498706b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2974,12 +2974,6 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } - /* Sysctl initialization */ - net->ipv4.sysctl_igmp_max_memberships = 20; - net->ipv4.sysctl_igmp_max_msf = 10; - /* IGMP reports for link-local multicast groups are enabled by default */ - net->ipv4.sysctl_igmp_llm_reports = 1; - net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: -- cgit v1.1 From 96d9703050a0036a3360ec98bb41e107c90664fe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Aug 2017 18:15:19 +0800 Subject: net: sched: set xt_tgchk_param par.nft_compat as 0 in ipt_init_target Commit 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") introduced a member nft_compat to xt_tgchk_param structure. But it didn't set it's value for ipt_init_target. With unexpected value in par.nft_compat, it may return unexpected result in some target's checkentry. This patch is to set all it's fields as 0 and only initialize the non-zero fields in ipt_init_target. v1->v2: As Wang Cong's suggestion, fix it by setting all it's fields as 0 and only initializing the non-zero fields. Fixes: 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") Suggested-by: Cong Wang Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 94ba5cf..d516ba8 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -49,9 +49,9 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, return PTR_ERR(target); t->u.kernel.target = target; + memset(&par, 0, sizeof(par)); par.net = net; par.table = table; - par.entryinfo = NULL; par.target = target; par.targinfo = t->data; par.hook_mask = hook; -- cgit v1.1 From 85f1bd9a7b5a79d5baa8bf44af19658f7bf77bfa Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:29:19 -0400 Subject: udp: consistently apply ufo or fragmentation When iteratively building a UDP datagram with MSG_MORE and that datagram exceeds MTU, consistently choose UFO or fragmentation. Once skb_is_gso, always apply ufo. Conversely, once a datagram is split across multiple skbs, do not consider ufo. Sendpage already maintains the first invariant, only add the second. IPv6 does not have a sendpage implementation to modify. A gso skb must have a partial checksum, do not follow sk_no_check_tx in udp_send_skb. Found by syzkaller. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 +++++--- net/ipv4/udp.c | 2 +- net/ipv6/ip6_output.c | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 50c74cd..e153c40 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -965,11 +965,12 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + (skb ? skb->len : fragheaderlen)) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); @@ -1288,6 +1289,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, return -EINVAL; if ((size + skb->len > mtu) && + (skb_queue_len(&sk->sk_write_queue) == 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { if (skb->ip_summed != CHECKSUM_PARTIAL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e6276fa..a7c804f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -802,7 +802,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 162efba..2dfe50d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1381,11 +1381,12 @@ emsgsize: */ cork->length += length; - if ((((length + (skb ? skb->len : headersize)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + (skb ? skb->len : headersize)) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, transhdrlen, mtu, flags, fl6); -- cgit v1.1 From c27927e372f0785f3303e8fad94b85945e2c97b7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:41:58 -0400 Subject: packet: fix tp_reserve race in packet_set_ring Updates to tp_reserve can race with reads of the field in packet_set_ring. Avoid this by holding the socket lock during updates in setsockopt PACKET_RESERVE. This bug was discovered by syzkaller. Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0615c2a..008a45c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3700,14 +3700,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; - po->tp_reserve = val; - return 0; + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_reserve = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_LOSS: { -- cgit v1.1 From e71cb9e00922902ba0519f37d09145f117dc02b3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 9 Aug 2017 16:46:09 -0400 Subject: net: dsa: ksz: fix skb freeing The DSA layer frees the original skb when an xmit function returns NULL, meaning an error occurred. But if the tagging code copied the original skb, it is responsible of freeing the copy if an error occurs. The ksz tagging code currently has two issues: if skb_put_padto fails, the skb copy is not freed, and the original skb will be freed twice. To fix that, move skb_put_padto inside both branches of the skb_tailroom condition, before freeing the original skb, and free the copy on error. Signed-off-by: Vivien Didelot Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- net/dsa/tag_ksz.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index fab41de..de66ca8 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -42,6 +42,9 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + if (skb_put_padto(skb, skb->len + padlen)) + return NULL; + nskb = skb; } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + @@ -56,13 +59,15 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); + + if (skb_put_padto(nskb, nskb->len + padlen)) { + kfree_skb(nskb); + return NULL; + } + kfree_skb(skb); } - /* skb is freed when it fails */ - if (skb_put_padto(nskb, nskb->len + padlen)) - return NULL; - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); tag[0] = 0; tag[1] = 1 << p->dp->index; /* destination port */ -- cgit v1.1 From 8d55373875052b891ae72c9bcaf9c2d7178676c0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 10 Aug 2017 12:31:40 +0300 Subject: net/sched/hfsc: allocate tcf block for hfsc root class Without this filters cannot be attached. Signed-off-by: Konstantin Khlebnikov Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b52f746..3ad02bb 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1428,6 +1428,10 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; + err = tcf_block_get(&q->root.block, &q->root.filter_list); + if (err) + goto err_tcf; + q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; q->root.sched = q; @@ -1447,6 +1451,10 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); return 0; + +err_tcf: + qdisc_class_hash_destroy(&q->clhash); + return err; } static int -- cgit v1.1 From 2ed46ce45ec02f6b2188419acdf372a144e06fb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 11 Aug 2017 18:31:25 +0200 Subject: bpf: fix two missing target_size settings in bpf_convert_ctx_access When CONFIG_NET_SCHED or CONFIG_NET_RX_BUSY_POLL is /not/ set and we try a narrow __sk_buff load of tc_index or napi_id, respectively, then verifier rightfully complains that it's misconfigured, because we need to set target_size in each of the two cases. The rewrite for the ctx access is just a dummy op, but needs to pass, so fix this up. Fixes: f96da09473b5 ("bpf: simplify narrower ctx access") Reported-by: Shubham Bansal Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index f44fc22..6280a60 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3505,6 +3505,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else @@ -3520,6 +3521,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else + *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; -- cgit v1.1 From 2c87d63ac853550e734edfd45e1be5e5aa44fbcc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Aug 2017 00:52:58 +0200 Subject: ipv4: route: fix inet_rtm_getroute induced crash "ip route get $daddr iif eth0 from $saddr" causes: BUG: KASAN: use-after-free in ip_route_input_rcu+0x1535/0x1b50 Call Trace: ip_route_input_rcu+0x1535/0x1b50 ip_route_input_noref+0xf9/0x190 tcp_v4_early_demux+0x1a4/0x2b0 ip_rcv+0xbcb/0xc05 __netif_receive_skb+0x9c/0xd0 netif_receive_skb_internal+0x5a8/0x890 Problem is that inet_rtm_getroute calls either ip_route_input_rcu (if an iif was provided) or ip_route_output_key_hash_rcu. But ip_route_input_rcu, unlike ip_route_output_key_hash_rcu, already associates the dst_entry with the skb. This clears the SKB_DST_NOREF bit (i.e. skb_dst_drop will release/free the entry while it should not). Thus only set the dst if we called ip_route_output_key_hash_rcu(). I tested this patch by running: while true;do ip r get 10.0.1.2;done > /dev/null & while true;do ip r get 10.0.1.2 iif eth0 from 10.0.1.1;done > /dev/null & ... and saw no crash or memory leak. Cc: Roopa Prabhu Cc: David Ahern Fixes: ba52d61e0ff ("ipv4: route: restore skb_dst_set in inet_rtm_getroute") Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0383e66f..7effa62 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2750,12 +2750,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); + else + skb_dst_set(skb, &rt->dst); } if (err) goto errout_free; - skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; -- cgit v1.1 From fed5f5718c4989a03b1b4cdc0c7f273c3c74ee9e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 14 Aug 2017 17:55:56 +0200 Subject: tipc: accept PACKET_MULTICAST packets On L2 bearers, the TIPC broadcast function is sending out packets using the corresponding L2 broadcast address. At reception, we filter such packets under the assumption that they will also be delivered as broadcast packets. This assumption doesn't always hold true. Under high load, we have seen that a switch may convert the destination address and deliver the packet as a PACKET_MULTICAST, something leading to inadvertently dropped packets and a stale and reset broadcast link. We fix this by extending the reception filtering to accept packets of type PACKET_MULTICAST. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d174ee3..767e053 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -596,7 +596,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); b = rcu_dereference_rtnl(dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && - (skb->pkt_type <= PACKET_BROADCAST))) { + (skb->pkt_type <= PACKET_MULTICAST))) { skb->next = NULL; tipc_rcv(dev_net(dev), skb, b); rcu_read_unlock(); -- cgit v1.1 From 59a361bc6f6e91d57f25ff0aebb0e646beb3b41d Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 14 Aug 2017 18:28:49 +0200 Subject: tipc: avoid inheriting msg_non_seq flag when message is returned In the function msg_reverse(), we reverse the header while trying to reuse the original buffer whenever possible. Those rejected/returned messages are always transmitted as unicast, but the msg_non_seq field is not explicitly set to zero as it should be. We have seen cases where multicast senders set the message type to "NOT dest_droppable", meaning that a multicast message shorter than one MTU will be returned, e.g., during receive buffer overflow, by reusing the original buffer. This has the effect that even the 'msg_non_seq' field is inadvertently inherited by the rejected message, although it is now sent as a unicast message. This again leads the receiving unicast link endpoint to steer the packet toward the broadcast link receive function, where it is dropped. The affected unicast link is thereafter (after 100 failed retransmissions) declared 'stale' and reset. We fix this by unconditionally setting the 'msg_non_seq' flag to zero for all rejected/returned messages. Reported-by: Canh Duc Luu Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ab30876..dcd90e6 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -513,6 +513,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); + msg_set_non_seq(hdr, 0); msg_set_origport(hdr, msg_destport(&ohdr)); msg_set_destport(hdr, msg_origport(&ohdr)); msg_set_destnode(hdr, msg_prevnode(&ohdr)); -- cgit v1.1 From 539a06baedd06127389b254f6b9f016ca072da13 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 14 Aug 2017 18:04:24 +0200 Subject: tcp: ulp: avoid module refcnt leak in tcp_set_ulp __tcp_ulp_find_autoload returns tcp_ulp_ops after taking a reference on the module. Then, if ->init fails, tcp_set_ulp propagates the error but nothing releases that reference. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv4/tcp_ulp.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2417f55..6bb9e14 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -122,14 +122,14 @@ int tcp_set_ulp(struct sock *sk, const char *name) ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) - err = -ENOENT; - else - err = ulp_ops->init(sk); + return -ENOENT; - if (err) - goto out; + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } icsk->icsk_ulp_ops = ulp_ops; - out: - return err; + return 0; } -- cgit v1.1 From 36f41f8fc6d8aa9f8c9072d66ff7cf9055f5e69b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 10:16:45 -0700 Subject: af_key: do not use GFP_KERNEL in atomic contexts pfkey_broadcast() might be called from non process contexts, we can not use GFP_KERNEL in these cases [1]. This patch partially reverts commit ba51b6be38c1 ("net: Fix RCU splat in af_key"), only keeping the GFP_ATOMIC forcing under rcu_read_lock() section. [1] : syzkaller reported : in_atomic(): 1, irqs_disabled(): 0, pid: 2932, name: syzkaller183439 3 locks held by syzkaller183439/2932: #0: (&net->xfrm.xfrm_cfg_mutex){+.+.+.}, at: [] pfkey_sendmsg+0x4c8/0x9f0 net/key/af_key.c:3649 #1: (&pfk->dump_lock){+.+.+.}, at: [] pfkey_do_dump+0x76/0x3f0 net/key/af_key.c:293 #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] spin_lock_bh include/linux/spinlock.h:304 [inline] #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] xfrm_policy_walk+0x192/0xa30 net/xfrm/xfrm_policy.c:1028 CPU: 0 PID: 2932 Comm: syzkaller183439 Not tainted 4.13.0-rc4+ #24 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:5994 __might_sleep+0x95/0x190 kernel/sched/core.c:5947 slab_pre_alloc_hook mm/slab.h:416 [inline] slab_alloc mm/slab.c:3383 [inline] kmem_cache_alloc+0x24b/0x6e0 mm/slab.c:3559 skb_clone+0x1a0/0x400 net/core/skbuff.c:1037 pfkey_broadcast_one+0x4b2/0x6f0 net/key/af_key.c:207 pfkey_broadcast+0x4ba/0x770 net/key/af_key.c:281 dump_sp+0x3d6/0x500 net/key/af_key.c:2685 xfrm_policy_walk+0x2f1/0xa30 net/xfrm/xfrm_policy.c:1042 pfkey_dump_sp+0x42/0x50 net/key/af_key.c:2695 pfkey_do_dump+0xaa/0x3f0 net/key/af_key.c:299 pfkey_spddump+0x1a0/0x210 net/key/af_key.c:2722 pfkey_process+0x606/0x710 net/key/af_key.c:2814 pfkey_sendmsg+0x4d6/0x9f0 net/key/af_key.c:3650 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x755/0x890 net/socket.c:2035 __sys_sendmsg+0xe5/0x210 net/socket.c:2069 SYSC_sendmsg net/socket.c:2080 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2076 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x445d79 RSP: 002b:00007f32447c1dc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000445d79 RDX: 0000000000000000 RSI: 000000002023dfc8 RDI: 0000000000000008 RBP: 0000000000000086 R08: 00007f32447c2700 R09: 00007f32447c2700 R10: 00007f32447c2700 R11: 0000000000000202 R12: 0000000000000000 R13: 00007ffe33edec4f R14: 00007f32447c29c0 R15: 0000000000000000 Fixes: ba51b6be38c1 ("net: Fix RCU splat in af_key") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: David Ahern Acked-by: David Ahern Signed-off-by: David S. Miller --- net/key/af_key.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index ca9d3ae..98f4d82 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, +static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1389,7 +1389,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } @@ -1476,7 +1476,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1589,7 +1589,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1694,8 +1694,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); - + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, + sock_net(sk)); return 0; } @@ -1712,7 +1712,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, + sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1733,7 +1734,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1790,7 +1791,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1878,7 +1879,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2206,7 +2207,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2426,7 +2427,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2682,7 +2683,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2739,7 +2740,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2803,7 +2804,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -3024,7 +3025,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); return 0; } @@ -3212,7 +3214,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3408,7 +3411,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3599,7 +3603,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; -- cgit v1.1 From e5645f51ba99738b0e5d708edf9c6454f33b9310 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 14 Aug 2017 10:44:59 -0700 Subject: ipv6: release rt6->rt6i_idev properly during ifdown When a dst is created by addrconf_dst_alloc() for a host route or an anycast route, dst->dev points to loopback dev while rt6->rt6i_idev points to a real device. When the real device goes down, the current cleanup code only checks for dst->dev and assumes rt6->rt6i_idev->dev is the same. This causes the refcount leak on the real device in the above situation. This patch makes sure to always release the refcount taken on rt6->rt6i_idev during dst_dev_put(). Fixes: 587fea741134 ("ipv6: mark DST_NOGC and remove the operation of dst_free()") Reported-by: John Stultz Tested-by: John Stultz Tested-by: Martin KaFai Lau Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a640fbc..99d4727 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -417,14 +417,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct net_device *loopback_dev = dev_net(dev)->loopback_dev; - if (dev != loopback_dev) { - if (idev && idev->dev == dev) { - struct inet6_dev *loopback_idev = - in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; - in6_dev_put(idev); - } + if (idev && idev->dev != loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); } } } -- cgit v1.1 From 7749d4ff88d31b0be17c8683143135adaaadc6a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 14:10:25 -0700 Subject: dccp: purge write queue in dccp_destroy_sock() syzkaller reported that DCCP could have a non empty write queue at dismantle time. WARNING: CPU: 1 PID: 2953 at net/core/stream.c:199 sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 2953 Comm: syz-executor0 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 RSP: 0018:ffff8801d182f108 EFLAGS: 00010297 RAX: ffff8801d1144140 RBX: ffff8801d13cb280 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff85137b00 RDI: ffff8801d13cb280 RBP: ffff8801d182f148 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d13cb4d0 R13: ffff8801d13cb3b8 R14: ffff8801d13cb300 R15: ffff8801d13cb3b8 inet_csk_destroy_sock+0x175/0x3f0 net/ipv4/inet_connection_sock.c:835 dccp_close+0x84d/0xc10 net/dccp/proto.c:1067 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 sock_release+0x8d/0x1e0 net/socket.c:597 sock_close+0x16/0x20 net/socket.c:1126 __fput+0x327/0x7e0 fs/file_table.c:210 ____fput+0x15/0x20 fs/file_table.c:246 task_work_run+0x18a/0x260 kernel/task_work.c:116 exit_task_work include/linux/task_work.h:21 [inline] do_exit+0xa32/0x1b10 kernel/exit.c:865 do_group_exit+0x149/0x400 kernel/exit.c:969 get_signal+0x7e8/0x17e0 kernel/signal.c:2330 do_signal+0x94/0x1ee0 arch/x86/kernel/signal.c:808 exit_to_usermode_loop+0x21c/0x2d0 arch/x86/entry/common.c:157 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline] syscall_return_slowpath+0x3a7/0x450 arch/x86/entry/common.c:263 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/dccp/proto.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9fe25bf..86bc40b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -201,10 +201,7 @@ void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - /* - * DCCP doesn't use sk_write_queue, just sk_send_head - * for retransmissions - */ + __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; -- cgit v1.1 From d624d276d1ddacbcb12ad96832ce0c7b82cd25db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 17:44:43 -0700 Subject: tcp: fix possible deadlock in TCP stack vs BPF filter Filtering the ACK packet was not put at the right place. At this place, we already allocated a child and put it into accept queue. We absolutely need to call tcp_child_process() to release its spinlock, or we will deadlock at accept() or close() time. Found by syzkaller team (Thanks a lot !) Fixes: 8fac365f63c8 ("tcp: Add a tcp_filter hook before handle ack packet") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Chenbo Feng Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f0..e9252c7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1722,6 +1722,8 @@ process: */ sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1729,8 +1731,6 @@ process: } if (nsk == sk) { reqsk_put(req); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690..2062101 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,8 @@ process: } sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1464,8 +1466,6 @@ process: if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); goto discard_and_relse; -- cgit v1.1 From 12d94a804946af291e24b80fc53ec86264765781 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 04:09:51 -0700 Subject: ipv6: fix NULL dereference in ip6_route_dev_notify() Based on a syzkaller report [1], I found that a per cpu allocation failure in snmp6_alloc_dev() would then lead to NULL dereference in ip6_route_dev_notify(). It seems this is a very old bug, thus no Fixes tag in this submission. Let's add in6_dev_put_clear() helper, as we will probably use it elsewhere (once available/present in net-next) [1] kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 17294 Comm: syz-executor6 Not tainted 4.13.0-rc2+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff88019f456680 task.stack: ffff8801c6e58000 RIP: 0010:__read_once_size include/linux/compiler.h:250 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: 0018:ffff8801c6e5f1b0 EFLAGS: 00010202 RAX: 0000000000000037 RBX: dffffc0000000000 RCX: ffffc90005d25000 RDX: ffff8801c6e5f218 RSI: ffffffff82342bbf RDI: 0000000000000001 RBP: ffff8801c6e5f240 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff10038dcbe37 R13: 0000000000000006 R14: 0000000000000001 R15: 00000000000001b8 FS: 00007f21e0429700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001d632b000 CR4: 00000000001426e0 DR0: 0000000020000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211 in6_dev_put include/net/addrconf.h:335 [inline] ip6_route_dev_notify+0x1c9/0x4a0 net/ipv6/route.c:3732 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x51/0x90 net/core/dev.c:1678 call_netdevice_notifiers net/core/dev.c:1694 [inline] rollback_registered_many+0x91c/0xe80 net/core/dev.c:7107 rollback_registered+0x1be/0x3c0 net/core/dev.c:7149 register_netdevice+0xbcd/0xee0 net/core/dev.c:7587 register_netdev+0x1a/0x30 net/core/dev.c:7669 loopback_net_init+0x76/0x160 drivers/net/loopback.c:214 ops_init+0x10a/0x570 net/core/net_namespace.c:118 setup_net+0x313/0x710 net/core/net_namespace.c:294 copy_net_ns+0x27c/0x580 net/core/net_namespace.c:418 create_new_namespaces+0x425/0x880 kernel/nsproxy.c:107 unshare_nsproxy_namespaces+0xae/0x1e0 kernel/nsproxy.c:206 SYSC_unshare kernel/fork.c:2347 [inline] SyS_unshare+0x653/0xfa0 kernel/fork.c:2297 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512c9 RSP: 002b:00007f21e0428c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 0000000000718150 RCX: 00000000004512c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000062020200 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b973d R13: 00000000ffffffff R14: 000000002001d000 R15: 00000000000002dd Code: 50 2b 34 82 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 c7 40 08 f3 f3 f3 f3 e8 a1 43 39 ff 4c 89 f8 48 8b 95 70 ff ff ff 48 c1 e8 03 <0f> b6 0c 18 4c 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: __read_once_size include/linux/compiler.h:250 [inline] RSP: ffff8801c6e5f1b0 RIP: atomic_read arch/x86/include/asm/atomic.h:26 [inline] RSP: ffff8801c6e5f1b0 RIP: refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: ffff8801c6e5f1b0 ---[ end trace e441d046c6410d31 ]--- Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 99d4727..94d6a13 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3721,10 +3721,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); - in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } -- cgit v1.1 From 187e5b3ac84d3421d2de3aca949b2791fbcad554 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 05:26:17 -0700 Subject: ipv4: fix NULL dereference in free_fib_info_rcu() If fi->fib_metrics could not be allocated in fib_create_info() we attempt to dereference a NULL pointer in free_fib_info_rcu() : m = fi->fib_metrics; if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) kfree(m); Before my recent patch, we used to call kfree(NULL) and nothing wrong happened. Instead of using RCU to defer freeing while we are under memory stress, it seems better to take immediate action. This was reported by syzkaller team. Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b8d1817..ec3a9ce 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1083,15 +1083,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (!fi->fib_metrics) - goto failure; + if (unlikely(!fi->fib_metrics)) { + kfree(fi); + return ERR_PTR(err); + } atomic_set(&fi->fib_metrics->refcnt, 1); - } else + } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; - + } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; -- cgit v1.1 From 898904226b5a6dee657f23cf51e385f50da22596 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:35:21 +0300 Subject: net_sched: reset pointers to tcf blocks in classful qdiscs' destructors Traffic filters could keep direct pointers to classes in classful qdisc, thus qdisc destruction first removes all filters before freeing classes. Class destruction methods also tries to free attached filters but now this isn't safe because tcf_block_put() unlike to tcf_destroy_chain() cannot be called second time. This patch set class->block to NULL after first tcf_block_put() and turn second call into no-op. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Konstantin Khlebnikov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 4 +++- net/sched/sch_cbq.c | 4 +++- net/sched/sch_hfsc.c | 4 +++- net/sched/sch_htb.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 572fe25..c403c87 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -572,8 +572,10 @@ static void atm_tc_destroy(struct Qdisc *sch) struct atm_flow_data *flow, *tmp; pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) + list_for_each_entry(flow, &p->flows, list) { tcf_block_put(flow->block); + flow->block = NULL; + } list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 481036f..780db43 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1431,8 +1431,10 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. --TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ad02bb..fd15200 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1530,8 +1530,10 @@ hfsc_destroy_qdisc(struct Qdisc *sch) unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 203286a..5d65ec5 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1258,8 +1258,10 @@ static void htb_destroy(struct Qdisc *sch) tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], -- cgit v1.1 From 325d5dc3f7e7c2840b65e4a2988c082c2c0025c5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:37:04 +0300 Subject: net_sched/sfq: update hierarchical backlog when drop packet When sfq_enqueue() drops head packet or packet from another queue it have to update backlog at upper qdiscs too. Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: Konstantin Khlebnikov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index f80ea2c..82469ef 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -437,6 +437,7 @@ congestion_drop: qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); + qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } @@ -468,8 +469,10 @@ enqueue: /* Return Congestion Notification only if we dropped a packet * from this flow. */ - if (qlen != slot->qlen) + if (qlen != slot->qlen) { + qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; + } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); -- cgit v1.1 From c90e95147c27b1780e76c6e8fea1b5c78d7d387f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:39:05 +0300 Subject: net_sched: remove warning from qdisc_hash_add It was added in commit e57a784d8cae ("pkt_sched: set root qdisc before change() in attach_default_qdiscs()") to hide duplicates from "tc qdisc show" for incative deivices. After 59cc1f61f ("net: sched: convert qdisc linked list to hashtable") it triggered when classful qdisc is added to inactive device because default qdiscs are added before switching root qdisc. Anyway after commit ea3274695353 ("net: sched: avoid duplicates in qdisc dump") duplicates are filtered right in dumper. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_api.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bd24a55..a3fa144 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -286,9 +286,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { - struct Qdisc *root = qdisc_dev(q)->qdisc; - - WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); if (invisible) -- cgit v1.1 From c7b725be84985532161bcb4fbecd056326998a77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Aug 2017 18:38:42 -0700 Subject: net: igmp: Use ingress interface rather than vrf device Anuradha reported that statically added groups for interfaces enslaved to a VRF device were not persisting. The problem is that igmp queries and reports need to use the data in the in_dev for the real ingress device rather than the VRF device. Update igmp_rcv accordingly. Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast") Reported-by: Anuradha Karuppiah Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 498706b..caf2f11 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct net_device *dev = skb->dev; + struct in_device *in_dev; int len = skb->len; bool dropped = true; + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); + if (!dev) + goto drop; + } + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; -- cgit v1.1 From 494bea39f3201776cdfddc232705f54a0bd210c4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 16 Aug 2017 13:30:07 +0800 Subject: openvswitch: fix skb_panic due to the incorrect actions attrlen For sw_flow_actions, the actions_len only represents the kernel part's size, and when we dump the actions to the userspace, we will do the convertions, so it's true size may become bigger than the actions_len. But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len to alloc the skbuff, so the user_skb's size may become insufficient and oops will happen like this: skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head: ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:129! [...] Call Trace: [] skb_put+0x43/0x44 [] skb_zerocopy+0x6c/0x1f4 [] queue_userspace_packet+0x3a3/0x448 [openvswitch] [] ovs_dp_upcall+0x30/0x5c [openvswitch] [] output_userspace+0x132/0x158 [openvswitch] [] ? ip6_rcv_finish+0x74/0x77 [ipv6] [] do_execute_actions+0xcc1/0xdc8 [openvswitch] [] ovs_execute_actions+0x74/0x106 [openvswitch] [] ovs_dp_process_packet+0xe1/0xfd [openvswitch] [] ? key_extract+0x63c/0x8d5 [openvswitch] [] ovs_vport_receive+0xa1/0xc3 [openvswitch] [...] Also we can find that the actions_len is much little than the orig_len: crash> struct sw_flow_actions 0xffff8812f539d000 struct sw_flow_actions { rcu = { next = 0xffff8812f5398800, func = 0xffffe3b00035db32 }, orig_len = 1384, actions_len = 592, actions = 0xffff8812f539d01c } So as a quick fix, use the orig_len instead of the actions_len to alloc the user_skb. Last, this oops happened on our system running a relative old kernel, but the same risk still exists on the mainline, since we use the wrong actions_len from the beginning. Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace") Cc: Neil McKee Signed-off-by: Liping Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 1 + net/openvswitch/datapath.c | 7 ++++--- net/openvswitch/datapath.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e461067..a54a556 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, goto out; } + OVS_CB(skb)->acts_origlen = acts->orig_len; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45fe8c8..6b44fe4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, - unsigned int hdrlen) + unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ @@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) - size += nla_total_size(upcall_info->actions_len); + size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) @@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen - cutlen); + len = upcall_msg_size(upcall_info, hlen - cutlen, + OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 5d8dcd8..4806006 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -99,11 +99,13 @@ struct datapath { * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. + * @acts_origlen: The netlink size of the flow actions applied to this skb. * @cutlen: The number of bytes from the packet end to be removed. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u16 acts_origlen; u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) -- cgit v1.1 From 120e9dabaf551c6dc03d3a10a1f026376cb1811c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 07:03:15 -0700 Subject: dccp: defer ccid_hc_tx_delete() at dismantle time syszkaller team reported another problem in DCCP [1] Problem here is that the structure holding RTO timer (ccid2_hc_tx_rto_expire() handler) is freed too soon. We can not use del_timer_sync() to cancel the timer since this timer wants to grab socket lock (that would risk a dead lock) Solution is to defer the freeing of memory when all references to the socket were released. Socket timers do own a reference, so this should fix the issue. [1] ================================================================== BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365 CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events_unbound call_usermodehelper_exec_work Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:638 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702 RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline] RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline] RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343 RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10 RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006 RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98 RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60 R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0 release_task+0xe9e/0x1a40 kernel/exit.c:220 wait_task_zombie kernel/exit.c:1162 [inline] wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389 do_wait_thread kernel/exit.c:1452 [inline] do_wait+0x441/0xa90 kernel/exit.c:1523 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665 SYSC_wait4+0x134/0x140 kernel/exit.c:1677 SyS_wait4+0x2c/0x40 kernel/exit.c:1673 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline] call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097 worker_thread+0x223/0x1860 kernel/workqueue.c:2231 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425 Allocated by task 21267: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561 ccid_new+0x20e/0x390 net/dccp/ccid.c:151 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline] dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __release_sock+0x124/0x360 net/core/sock.c:2269 release_sock+0xa4/0x2a0 net/core/sock.c:2784 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline] __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682 SYSC_connect+0x204/0x470 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3049: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3763 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833 dccp_done+0xb7/0xd0 net/dccp/proto.c:145 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:248 [inline] ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:477 [inline] ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:248 [inline] ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455 process_backlog+0x203/0x740 net/core/dev.c:5130 napi_poll net/core/dev.c:5527 [inline] net_rx_action+0x792/0x1910 net/core/dev.c:5593 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 The buggy address belongs to the object at ffff8801d2660100 which belongs to the cache ccid2_hc_tx_sock of size 1240 The buggy address is located 1088 bytes inside of 1240-byte region [ffff8801d2660100, ffff8801d26605d8) The buggy address belongs to the page: page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0 flags: 0x200000000008100(slab|head) raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005 raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/proto.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 86bc40b..b68168f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); +static void dccp_sk_destruct(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = NULL; + inet_sock_destruct(sk); +} + int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); @@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; @@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); -- cgit v1.1 From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 11:09:12 -0700 Subject: ipv4: better IP_MAX_MTU enforcement While working on yet another syzkaller report, I found that our IP_MAX_MTU enforcements were not properly done. gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and final result can be bigger than IP_MAX_MTU :/ This is a problem because device mtu can be changed on other cpus or threads. While this patch does not fix the issue I am working on, it is probably worth addressing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7effa62..fe877a4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) -- cgit v1.1 From acc8b31665b4cc17b35c4fa445427f7e2f6dc86b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 18 Aug 2017 10:10:43 +0200 Subject: net: sched: fix p_filter_chain check in tcf_chain_flush The dereference before check is wrong and leads to an oops when p_filter_chain is NULL. The check needs to be done on the pointer to prevent NULL dereference. Fixes: f93e1cdcf42c ("net/sched: fix filter flushing") Signed-off-by: Jiri Pirko Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 39da0c5..9fd44c2 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -205,7 +205,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (*chain->p_filter_chain) + if (chain->p_filter_chain) RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); -- cgit v1.1 From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 18 Aug 2017 15:04:54 -0400 Subject: datagram: When peeking datagrams with offset < 0 don't skip empty skbs Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove headers from UDP packets before queueing"), when udp packets are being peeked the requested extra offset is always 0 as there is no need to skip the udp header. However, when the offset is 0 and the next skb is of length 0, it is only returned once. The behaviour can be seen with the following python script: from socket import *; f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); f.bind(('::', 0)); addr=('::1', f.getsockname()[1]); g.sendto(b'', addr) g.sendto(b'b', addr) print(f.recvfrom(10, MSG_PEEK)); print(f.recvfrom(10, MSG_PEEK)); Where the expected output should be the empty string twice. Instead, make sk_peek_offset return negative values, and pass those values to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset to __skb_try_recv_from_queue is negative, the checked skb is never skipped. __skb_try_recv_from_queue will then ensure the offset is reset back to 0 if a peek is requested without an offset, unless no packets are found. Also simplify the if condition in __skb_try_recv_from_queue. If _off is greater then 0, and off is greater then or equal to skb->len, then (_off || skb->len) must always be true assuming skb->len >= 0 is always true. Also remove a redundant check around a call to sk_peek_offset in af_unix.c, as it double checked if MSG_PEEK was set in the flags. V2: - Moved the negative fixup into __skb_try_recv_from_queue, and remove now redundant checks - Fix peeking in udp{,v6}_recvmsg to report the right value when the offset is 0 V3: - Marked new branch in __skb_try_recv_from_queue as unlikely. Signed-off-by: Matthew Dawson Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/datagram.c | 12 +++++++++--- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 3 ++- net/unix/af_unix.c | 5 +---- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647b..a21ca8d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, int *peeked, int *off, int *err, struct sk_buff **last) { + bool peek_at_off = false; struct sk_buff *skb; - int _off = *off; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { _off -= skb->len; continue; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f..cd1d044 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b..20039c8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a38..be8982b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, */ mutex_lock(&u->iolock); - if (flags & MSG_PEEK) - skip = sk_peek_offset(sk, flags); - else - skip = 0; + skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; -- cgit v1.1 From 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 09:41:54 -0700 Subject: tipc: fix use-after-free syszkaller reported use-after-free in tipc [1] When msg->rep skb is freed, set the pointer to NULL, so that caller does not free it again. [1] ================================================================== BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466 Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115 CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 skb_push+0xd4/0xe0 net/core/skbuff.c:1466 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9 RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76 R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000 Allocated by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219 alloc_skb include/linux/skbuff.h:903 [inline] tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3763 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622 __kfree_skb net/core/skbuff.c:682 [inline] kfree_skb+0x165/0x4c0 net/core/skbuff.c:699 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe The buggy address belongs to the object at ffff8801c6e71dc0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 208 bytes inside of 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0) The buggy address belongs to the page: page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0 flags: 0x200000000000100(slab) raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 9bfe886..750949d 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { kfree_skb(msg->rep); + msg->rep = NULL; return -ENOMEM; } err = __tipc_nl_compat_dumpit(cmd, msg, arg); - if (err) + if (err) { kfree_skb(msg->rep); - + msg->rep = NULL; + } kfree_skb(arg); return err; -- cgit v1.1 From 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 16 Aug 2017 20:16:40 +0200 Subject: sctp: fully initialize the IPv6 address in sctp_v6_to_addr() KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below). Make sure all fields of an IPv6 address are initialized, which guarantees that the IPv4 fields are also initialized. ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== Signed-off-by: Alexander Potapenko Reviewed-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2a186b20..a4b6ffb 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -512,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; + addr->v6.sin6_scope_id = 0; } /* Compare addresses exactly. -- cgit v1.1 From 383143f31d7d3525a1dbff733d52fff917f82f15 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 16 Aug 2017 11:18:09 -0700 Subject: ipv6: reset fn->rr_ptr when replacing route syzcaller reported the following use-after-free issue in rt6_select(): BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8 BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8 Read of size 4 by task syz-executor1/439628 CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380 Call Trace: [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0xc1/0x124 lib/dump_stack.c:51 sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option. Use struct sctp_sack_info instead [] kasan_object_err+0x21/0x70 mm/kasan/report.c:158 [] print_address_description mm/kasan/report.c:196 [inline] [] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285 [] kasan_report mm/kasan/report.c:305 [inline] [] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325 [] rt6_select net/ipv6/route.c:755 [inline] [] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 [] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203 [] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95 [] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223 [] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41 [] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224 [] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943 [] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079 [] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91 [] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline] [] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272 [] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284 [] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564 [] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582 [] SyS_connect+0x29/0x30 net/socket.c:1563 [] entry_SYSCALL_64_fastpath+0x12/0x17 Object at ffff8800bc699380, in cache ip6_dst_cache size: 384 The root cause of it is that in fib6_add_rt2node(), when it replaces an existing route with the new one, it does not update fn->rr_ptr. This commit resets fn->rr_ptr to NULL when it points to a route which is replaced in fib6_add_rt2node(). Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ebb299c..e6f8780 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -914,6 +914,8 @@ add: } nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { @@ -926,6 +928,8 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { -- cgit v1.1 From bc3aae2bbac46dd894c89db5d5e98f7f0ef9e205 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 16 Aug 2017 12:38:52 -0700 Subject: net: check and errout if res->fi is NULL when RTM_F_FIB_MATCH is set Syzkaller hit 'general protection fault in fib_dump_info' bug on commit 4.13-rc5.. Guilty file: net/ipv4/fib_semantics.c kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff880078562700 task.stack: ffff880078110000 RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP: 0018:ffff880078117010 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 00000000000000fe RCX: 0000000000000002 RDX: 0000000000000006 RSI: ffff880078117084 RDI: 0000000000000030 RBP: ffff880078117268 R08: 000000000000000c R09: ffff8800780d80c8 R10: 0000000058d629b4 R11: 0000000067fce681 R12: 0000000000000000 R13: ffff8800784bd540 R14: ffff8800780d80b5 R15: ffff8800780d80a4 FS: 00000000022fa940(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004387d0 CR3: 0000000079135000 CR4: 00000000000006f0 Call Trace: inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766 rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217 netlink_rcv_skb+0x340/0x470 net/netlink/af_netlink.c:2397 rtnetlink_rcv+0x28/0x30 net/core/rtnetlink.c:4223 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4c4/0x6e0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0x8c4/0xca0 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x779/0x8d0 net/socket.c:2035 __sys_sendmsg+0xd1/0x170 net/socket.c:2069 SYSC_sendmsg net/socket.c:2080 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2076 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: 0033:0x4512e9 RSP: 002b:00007ffc75584cc8 EFLAGS: 00000216 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00000000004512e9 RDX: 0000000000000000 RSI: 0000000020f2cfc8 RDI: 0000000000000003 RBP: 000000000000000e R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: fffffffffffffffe R13: 0000000000718000 R14: 0000000020c44ff0 R15: 0000000000000000 Code: 00 0f b6 8d ec fd ff ff 48 8b 85 f0 fd ff ff 88 48 17 48 8b 45 28 48 8d 78 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e cb 0c 00 00 48 8b 45 28 44 RIP: fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP: ffff880078117010 ---[ end trace 254a7af28348f88b ]--- This patch adds a res->fi NULL check. example run: $ip route get 0.0.0.0 iif virt1-0 broadcast 0.0.0.0 dev lo cache iif virt1-0 $ip route get 0.0.0.0 iif virt1-0 fibmatch RTNETLINK answers: No route to host Reported-by: idaifish Reported-by: Dmitry Vyukov Fixes: b61798130f1b ("net: ipv4: RTM_GETROUTE: return matched fib result when requested") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv4/route.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fe877a4..2331de2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2763,14 +2763,21 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - if (rtm->rtm_flags & RTM_F_FIB_MATCH) + if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + if (!res.fi) { + err = fib_props[res.type].error; + if (!err) + err = -EHOSTUNREACH; + goto errout_free; + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, table_id, rt->rt_type, res.prefix, res.prefixlen, fl4.flowi4_tos, res.fi, 0); - else + } else { err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + } if (err < 0) goto errout_free; -- cgit v1.1 From cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 16 Aug 2017 17:53:36 -0400 Subject: tcp: when rearming RTO, if RTO time is in past then fire RTO ASAP In some situations tcp_send_loss_probe() can realize that it's unable to send a loss probe (TLP), and falls back to calling tcp_rearm_rto() to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto() realizes that the RTO was eligible to fire immediately or at some point in the past (delta_us <= 0). Previously in such cases tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now + icsk_rto, which caused needless delays of hundreds of milliseconds (and non-linear behavior that made reproducible testing difficult). This commit changes the logic to schedule "overdue" RTOs ASAP, rather than at now + icsk_rto. Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Suggested-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53de142..bab7f04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3009,8 +3009,7 @@ void tcp_rearm_rto(struct sock *sk) /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta_us > 0) - rto = usecs_to_jiffies(delta_us); + rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); -- cgit v1.1 From b024d949a3c24255a7ef1a470420eb478949aa4c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Aug 2017 23:14:58 +0100 Subject: irda: do not leak initialized list.dev to userspace list.dev has not been initialized and so the copy_to_user is copying data from the stack back to user space which is a potential information leak. Fix this ensuring all of list is initialized to zero. Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/irda/af_irda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 2e6990f..23fa7c8 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2213,7 +2213,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - struct irda_device_list list; + struct irda_device_list list = { 0 }; struct irda_device_info *discoveries; struct irda_ias_set * ias_opt; /* IAS get/query params */ struct ias_object * ias_obj; /* Object in IAS */ -- cgit v1.1 From 9a19bad70cf16b0cdf3576efda7deb490e7aa529 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Aug 2017 00:19:42 +0100 Subject: rxrpc: Fix oops when discarding a preallocated service call rxrpc_service_prealloc_one() doesn't set the socket pointer on any new call it preallocates, but does add it to the rxrpc net namespace call list. This, however, causes rxrpc_put_call() to oops when the call is discarded when the socket is closed. rxrpc_put_call() needs the socket to be able to reach the namespace so that it can use a lock held therein. Fix this by setting a call's socket pointer immediately before discarding it. This can be triggered by unloading the kafs module, resulting in an oops like the following: BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 IP: rxrpc_put_call+0x1e2/0x32d PGD 0 P4D 0 Oops: 0000 [#1] SMP Modules linked in: kafs(E-) CPU: 3 PID: 3037 Comm: rmmod Tainted: G E 4.12.0-fscache+ #213 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 task: ffff8803fc92e2c0 task.stack: ffff8803fef74000 RIP: 0010:rxrpc_put_call+0x1e2/0x32d RSP: 0018:ffff8803fef77e08 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8803fab99ac0 RCX: 000000000000000f RDX: ffffffff81c50a40 RSI: 000000000000000c RDI: ffff8803fc92ea88 RBP: ffff8803fef77e30 R08: ffff8803fc87b941 R09: ffffffff82946d20 R10: ffff8803fef77d10 R11: 00000000000076fc R12: 0000000000000005 R13: ffff8803fab99c20 R14: 0000000000000001 R15: ffffffff816c6aee FS: 00007f915a059700(0000) GS:ffff88041fb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 00000003fef39000 CR4: 00000000001406e0 Call Trace: rxrpc_discard_prealloc+0x325/0x341 rxrpc_listen+0xf9/0x146 kernel_listen+0xb/0xd afs_close_socket+0x3e/0x173 [kafs] afs_exit+0x1f/0x57 [kafs] SyS_delete_module+0x10f/0x19a do_syscall_64+0x8a/0x149 entry_SYSCALL64_slow_path+0x25/0x25 Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_accept.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index dd30d74..ec3383f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -223,6 +223,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; + call->socket = rx; if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); -- cgit v1.1 From 4f8a881acc9d1adaf1e552349a0b1df28933a04c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Aug 2017 11:01:36 +0800 Subject: net: sched: fix NULL pointer dereference when action calls some targets As we know in some target's checkentry it may dereference par.entryinfo to check entry stuff inside. But when sched action calls xt_check_target, par.entryinfo is set with NULL. It would cause kernel panic when calling some targets. It can be reproduce with: # tc qd add dev eth1 ingress handle ffff: # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \ -j ECN --ecn-tcp-remove It could also crash kernel when using target CLUSTERIP or TPROXY. By now there's no proper value for par.entryinfo in ipt_init_target, but it can not be set with NULL. This patch is to void all these panics by setting it with an ipt_entry obj with all members = 0. Note that this issue has been there since the very beginning. Signed-off-by: Xin Long Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d516ba8..5417078 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, { struct xt_tgchk_param par; struct xt_target *target; + struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, @@ -52,6 +53,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t, memset(&par, 0, sizeof(par)); par.net = net; par.table = table; + par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = hook; -- cgit v1.1 From 348a4002729ccab8b888b38cbc099efa2f2a2036 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 18 Aug 2017 17:14:49 -0700 Subject: ipv6: repair fib6 tree in failure case In fib6_add(), it is possible that fib6_add_1() picks an intermediate node and sets the node's fn->leaf to NULL in order to add this new route. However, if fib6_add_rt2node() fails to add the new route for some reason, fn->leaf will be left as NULL and could potentially cause crash when fn->leaf is accessed in fib6_locate(). This patch makes sure fib6_repair_tree() is called to properly repair fn->leaf in the above failure case. Here is the syzkaller reported general protection fault in fib6_locate: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000 RIP: 0010:[] [] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline] RIP: 0010:[] [] ipv6_prefix_equal include/net/ipv6.h:492 [inline] RIP: 0010:[] [] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline] RIP: 0010:[] [] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233 RSP: 0018:ffff8801d01a36a8 EFLAGS: 00010202 RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000 RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100 RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000 FS: 00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988 Call Trace: [] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109 [] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075 [] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450 [] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281 [] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456 [] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline] [] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232 [] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778 [] sock_sendmsg_nosec net/socket.c:609 [inline] [] sock_sendmsg+0xcf/0x110 net/socket.c:619 [] sock_write_iter+0x222/0x3a0 net/socket.c:834 [] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478 [] __vfs_write+0xe4/0x110 fs/read_write.c:491 [] vfs_write+0x178/0x4b0 fs/read_write.c:538 [] SYSC_write fs/read_write.c:585 [inline] [] SyS_write+0xd9/0x1b0 fs/read_write.c:577 [] entry_SYSCALL_64_fastpath+0x12/0x17 Note: there is no "Fixes" tag as this seems to be a bug introduced very early. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e6f8780..5cc0ea0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1018,7 +1018,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Create subtree root node */ sfn = node_alloc(); if (!sfn) - goto st_failure; + goto failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); @@ -1035,12 +1035,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { /* If it is failed, discard just allocated - root, and then (in st_failure) stale node + root, and then (in failure) stale node in main tree. */ node_free(sfn); err = PTR_ERR(sn); - goto st_failure; + goto failure; } /* Now link new subtree to main tree */ @@ -1055,7 +1055,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { err = PTR_ERR(sn); - goto st_failure; + goto failure; } } @@ -1096,18 +1096,17 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); + goto failure; } return err; -#ifdef CONFIG_IPV6_SUBTREES - /* Subtree creation failed, probably main tree node - is orphan. If it is, shoot it. +failure: + /* fn->leaf could be NULL if fn is an intermediate node and we + * failed to add the new route to it in both subtree creation + * failure and fib6_add_rt2node() failure case. + * In both cases, fib6_repair_tree() should be called to fix + * fn->leaf. */ -st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); /* Always release dst as dst->__refcnt is guaranteed @@ -1115,7 +1114,6 @@ st_failure: */ dst_release_immediate(&rt->dst); return err; -#endif } /* -- cgit v1.1 From eebe53e87f97975ee58a21693e44797608bf679c Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Mon, 21 Aug 2017 07:23:07 -0400 Subject: net: sunrpc: svcsock: fix NULL-pointer exception While running nfs/connectathon tests kernel NULL-pointer exception has been observed due to races in svcsock.c. Race is appear when kernel accepts connection by kernel_accept (which creates new socket) and start queuing ingress packets to new socket. This happens in ksoftirq context which could run concurrently on a different core while new socket setup is not done yet. The fix is to re-order socket user data init sequence and add write/read barrier calls to be sure that we got proper values for callback pointers before actually calling them. Test results: nfs/connectathon reports '0' failed tests for about 200+ iterations. Crash log: ---<-snip->--- [ 6708.638984] Unable to handle kernel NULL pointer dereference at virtual address 00000000 [ 6708.647093] pgd = ffff0000094e0000 [ 6708.650497] [00000000] *pgd=0000010ffff90003, *pud=0000010ffff90003, *pmd=0000010ffff80003, *pte=0000000000000000 [ 6708.660761] Internal error: Oops: 86000005 [#1] SMP [ 6708.665630] Modules linked in: nfsv3 nfnetlink_queue nfnetlink_log nfnetlink rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache overlay xt_CONNSECMARK xt_SECMARK xt_conntrack iptable_security ip_tables ah4 xfrm4_mode_transport sctp tun binfmt_misc ext4 jbd2 mbcache loop tcp_diag udp_diag inet_diag rpcrdma ib_isert iscsi_target_mod ib_iser rdma_cm iw_cm libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib ib_ucm ib_uverbs ib_umad ib_cm ib_core nls_koi8_u nls_cp932 ts_kmp nf_conntrack_ipv4 nf_defrag_ipv4 nf_conntrack vfat fat ghash_ce sha2_ce sha1_ce cavium_rng_vf i2c_thunderx sg thunderx_edac i2c_smbus edac_core cavium_rng nfsd auth_rpcgss nfs_acl lockd grace sunrpc xfs libcrc32c nicvf nicpf ast i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops [ 6708.736446] ttm drm i2c_core thunder_bgx thunder_xcv mdio_thunder mdio_cavium dm_mirror dm_region_hash dm_log dm_mod [last unloaded: stap_3c300909c5b3f46dcacd49aab3334af_87021] [ 6708.752275] CPU: 84 PID: 0 Comm: swapper/84 Tainted: G W OE 4.11.0-4.el7.aarch64 #1 [ 6708.760787] Hardware name: www.cavium.com CRB-2S/CRB-2S, BIOS 0.3 Mar 13 2017 [ 6708.767910] task: ffff810006842e80 task.stack: ffff81000689c000 [ 6708.773822] PC is at 0x0 [ 6708.776739] LR is at svc_data_ready+0x38/0x88 [sunrpc] [ 6708.781866] pc : [<0000000000000000>] lr : [] pstate: 60000145 [ 6708.789248] sp : ffff810ffbad3900 [ 6708.792551] x29: ffff810ffbad3900 x28: ffff000008c73d58 [ 6708.797853] x27: 0000000000000000 x26: ffff81000bbe1e00 [ 6708.803156] x25: 0000000000000020 x24: ffff800f7410bf28 [ 6708.808458] x23: ffff000008c63000 x22: ffff000008c63000 [ 6708.813760] x21: ffff800f7410bf28 x20: ffff81000bbe1e00 [ 6708.819063] x19: ffff810012412400 x18: 00000000d82a9df2 [ 6708.824365] x17: 0000000000000000 x16: 0000000000000000 [ 6708.829667] x15: 0000000000000000 x14: 0000000000000001 [ 6708.834969] x13: 0000000000000000 x12: 722e736f622e676e [ 6708.840271] x11: 00000000f814dd99 x10: 0000000000000000 [ 6708.845573] x9 : 7374687225000000 x8 : 0000000000000000 [ 6708.850875] x7 : 0000000000000000 x6 : 0000000000000000 [ 6708.856177] x5 : 0000000000000028 x4 : 0000000000000000 [ 6708.861479] x3 : 0000000000000000 x2 : 00000000e5000000 [ 6708.866781] x1 : 0000000000000000 x0 : ffff81000bbe1e00 [ 6708.872084] [ 6708.873565] Process swapper/84 (pid: 0, stack limit = 0xffff81000689c000) [ 6708.880341] Stack: (0xffff810ffbad3900 to 0xffff8100068a0000) [ 6708.886075] Call trace: [ 6708.888513] Exception stack(0xffff810ffbad3710 to 0xffff810ffbad3840) [ 6708.894942] 3700: ffff810012412400 0001000000000000 [ 6708.902759] 3720: ffff810ffbad3900 0000000000000000 0000000060000145 ffff800f79300000 [ 6708.910577] 3740: ffff000009274d00 00000000000003ea 0000000000000015 ffff000008c63000 [ 6708.918395] 3760: ffff810ffbad3830 ffff800f79300000 000000000000004d 0000000000000000 [ 6708.926212] 3780: ffff810ffbad3890 ffff0000080f88dc ffff800f79300000 000000000000004d [ 6708.934030] 37a0: ffff800f7930093c ffff000008c63000 0000000000000000 0000000000000140 [ 6708.941848] 37c0: ffff000008c2c000 0000000000040b00 ffff81000bbe1e00 0000000000000000 [ 6708.949665] 37e0: 00000000e5000000 0000000000000000 0000000000000000 0000000000000028 [ 6708.957483] 3800: 0000000000000000 0000000000000000 0000000000000000 7374687225000000 [ 6708.965300] 3820: 0000000000000000 00000000f814dd99 722e736f622e676e 0000000000000000 [ 6708.973117] [< (null)>] (null) [ 6708.977824] [] tcp_data_queue+0x754/0xc5c [ 6708.983386] [] tcp_rcv_established+0x1a0/0x67c [ 6708.989384] [] tcp_v4_do_rcv+0x15c/0x22c [ 6708.994858] [] tcp_v4_rcv+0xaf0/0xb58 [ 6709.000077] [] ip_local_deliver_finish+0x10c/0x254 [ 6709.006419] [] ip_local_deliver+0xf0/0xfc [ 6709.011980] [] ip_rcv_finish+0x208/0x3a4 [ 6709.017454] [] ip_rcv+0x2dc/0x3c8 [ 6709.022328] [] __netif_receive_skb_core+0x2f8/0xa0c [ 6709.028758] [] __netif_receive_skb+0x38/0x84 [ 6709.034580] [] netif_receive_skb_internal+0x68/0xdc [ 6709.041010] [] napi_gro_receive+0xcc/0x1a8 [ 6709.046690] [] nicvf_cq_intr_handler+0x59c/0x730 [nicvf] [ 6709.053559] [] nicvf_poll+0x38/0xb8 [nicvf] [ 6709.059295] [] net_rx_action+0x2f8/0x464 [ 6709.064771] [] __do_softirq+0x11c/0x308 [ 6709.070164] [] irq_exit+0x12c/0x174 [ 6709.075206] [] __handle_domain_irq+0x78/0xc4 [ 6709.081027] [] gic_handle_irq+0x94/0x190 [ 6709.086501] Exception stack(0xffff81000689fdf0 to 0xffff81000689ff20) [ 6709.092929] fde0: 0000810ff2ec0000 ffff000008c10000 [ 6709.100747] fe00: ffff000008c70ef4 0000000000000001 0000000000000000 ffff810ffbad9b18 [ 6709.108565] fe20: ffff810ffbad9c70 ffff8100169d3800 ffff810006843ab0 ffff81000689fe80 [ 6709.116382] fe40: 0000000000000bd0 0000ffffdf979cd0 183f5913da192500 0000ffff8a254ce4 [ 6709.124200] fe60: 0000ffff8a254b78 0000aaab10339808 0000000000000000 0000ffff8a0c2a50 [ 6709.132018] fe80: 0000ffffdf979b10 ffff000008d6d450 ffff000008c10000 ffff000008d6d000 [ 6709.139836] fea0: 0000000000000054 ffff000008cd3dbc 0000000000000000 0000000000000000 [ 6709.147653] fec0: 0000000000000000 0000000000000000 0000000000000000 ffff81000689ff20 [ 6709.155471] fee0: ffff000008085240 ffff81000689ff20 ffff000008085244 0000000060000145 [ 6709.163289] ff00: ffff81000689ff10 ffff00000813f1e4 ffffffffffffffff ffff00000813f238 [ 6709.171107] [] el1_irq+0xb4/0x140 [ 6709.175976] [] arch_cpu_idle+0x44/0x11c [ 6709.181368] [] default_idle_call+0x20/0x30 [ 6709.187020] [] do_idle+0x158/0x1e4 [ 6709.191973] [] cpu_startup_entry+0x2c/0x30 [ 6709.197624] [] secondary_start_kernel+0x13c/0x160 [ 6709.203878] [<0000000001bc71c4>] 0x1bc71c4 [ 6709.207967] Code: bad PC value [ 6709.211061] SMP: stopping secondary CPUs [ 6709.218830] Starting crashdump kernel... [ 6709.222749] Bye! ---<-snip>--- Signed-off-by: Vadim Lomovtsev Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2b720fa..e185001 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -421,6 +421,9 @@ static void svc_data_ready(struct sock *sk) dprintk("svc: socket %p(inet %p), busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_odata(sk); if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); @@ -437,6 +440,9 @@ static void svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } @@ -760,8 +766,12 @@ static void svc_tcp_listen_data_ready(struct sock *sk) dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); - if (svsk) + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_odata(sk); + } + /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -794,6 +804,8 @@ static void svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_ostate(sk); if (sk->sk_state != TCP_ESTABLISHED) { set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1381,12 +1393,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return ERR_PTR(err); } - inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; + /* + * This barrier is necessary in order to prevent race condition + * with svc_data_ready(), svc_listen_data_ready() and others + * when calling callbacks above. + */ + wmb(); + inet->sk_user_data = svsk; /* Initialize the socket */ if (sock->type == SOCK_DGRAM) -- cgit v1.1